Example #1
def preview(filepath_vcf, mode):
    # chunk_length=2 keeps the preview cheap: one chunk holds just two records
    fields, samples, headers, chunks = allel.iter_vcf_chunks(filepath_vcf,
                                                             fields=['*'],
                                                             chunk_length=2)
    # JSON preview: convert only the first chunk
    recordstring = ''
    for chunker in chunks:
        recordstring = chunker2string(chunker, fields, samples, mode)
        recordstring = RenameJsonKey(recordstring)
        break

    # VCF preview: the #CHROM header line plus the first two records
    linenum = 0
    vcfline = ''
    with open(filepath_vcf) as file:
        for line in file:
            if not line.startswith('##'):  # skip meta lines, keep #CHROM and data
                vcfline += line
                linenum += 1
                if linenum == 3:
                    break

    result = {"vcf": vcfline, "json": recordstring}
    return result
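
A minimal usage sketch (the file name and mode value are assumptions; chunker2string and RenameJsonKey are project helpers not shown in this snippet):

# Hypothetical call; 'sample.vcf' and the mode flag are assumptions
result = preview('sample.vcf', mode='explicit')
print(result['vcf'])   # #CHROM line plus the first two records
print(result['json'])  # the same records rendered as JSON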
Example #2
    def vcf2json_multi2(self,
                        filepath_vcf,
                        filepath_json,
                        md5,
                        mode,
                        IsAddHead=True):
        fields, samples, headers, chunks = allel.iter_vcf_chunks(
            filepath_vcf,
            fields=['variants/*', 'calldata/*'],
            chunk_length=500)

        if os.path.exists(filepath_json):
            os.remove(filepath_json)
        # Write the original VCF header into the JSON file so the conversion can be reversed
        if IsAddHead:
            self.addhead(headers[0], filepath_json)

        tmpfile = "value_" + md5 + ".dat"
        with open(tmpfile, "wb") as f:
            pickle.dump(fields, f)
            pickle.dump(samples, f)
            pickle.dump(headers, f)
            pickle.dump(filepath_json, f)

        cores = multiprocessing.cpu_count()
        processnum = max(cores // 2, 2)  # use half the cores, at least two workers
        # Schedule the chunk iterator manually to keep memory usage bounded
        pool = multiprocessing.Pool(processes=processnum)
        index = 0
        tmpchunks = []
        first = True
        realchunks = []
        for chunker in chunks:
            index += 1
            tmpchunks.append(chunker)
            if index % (processnum * 10) == 0:  # dispatch in batches of processnum * 10 chunks
                if not first:
                    AppResult.get()  # wait for the previous async batch to finish
                    realchunks.clear()
                realchunks = copy.deepcopy(tmpchunks)
                tmpchunks.clear()
                first = False
                AppResult = pool.map_async(
                    partial(self.IoOperat_multi, tmpfile, mode), realchunks)

        if "AppResult" in locals().keys():
            AppResult.get()

        pool.map(partial(self.IoOperat_multi, tmpfile, mode), tmpchunks)
        tmpchunks.clear()
        if realchunks:
            realchunks.clear()
        pool.close()
        pool.join()  # block the main process until all workers have exited
        # Strip the trailing ',' and '\n', then close the JSON object with '}'
        if IsAddHead:
            self.addEnd(filepath_json)
        os.remove(tmpfile)  # remove the temporary file to save space
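
addhead and addEnd are project helpers that do not appear in these examples. A minimal sketch consistent with the comments above (the exact JSON layout is an assumption):

import json
import os

def addhead(header_lines, filepath_json):
    # Hypothetical sketch: open the JSON object and embed the raw VCF header
    # so the file can later be converted back to VCF
    with open(filepath_json, 'w') as fp:
        fp.write('{"header": ' + json.dumps(header_lines) + ',\n')

def addEnd(filepath_json):
    # Hypothetical sketch matching the comment above: drop the trailing
    # ',' and '\n' left after the last record, then close the object
    with open(filepath_json, 'rb+') as fp:
        fp.seek(-2, os.SEEK_END)
        fp.truncate()
        fp.write(b'\n}')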
Example #3
def vcf2json_Single(filepath_vcf, filepath_json, mode):
    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['*'], chunk_length=50)

    if os.path.exists(filepath_json):
        os.remove(filepath_json)
    addhead(headers[0], filepath_json)

    # Convert one chunk at a time and append it to the JSON file
    with open(filepath_json, 'a') as fp:
        for chunker in chunks:
            recordstring = chunker2string(chunker, fields, samples, mode)
            fp.write(recordstring)
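
A single-process conversion is then one call (file names and the mode flag are assumptions):

# Hypothetical call; file names and mode are assumptions
vcf2json_Single('sample.vcf', 'sample.json', mode='explicit')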
Example #4
    def preview(self, filepath_vcf, mode):
        fields, samples, headers, chunks = allel.iter_vcf_chunks(
            filepath_vcf, fields=['*'], chunk_length=2)
        # JSON preview: convert only the first chunk (two records)
        recordstring = ''
        for chunker in chunks:
            recordstring = self.chunker2string(chunker, fields, samples, mode)
            recordstring = RenameJsonKey(recordstring)
            break

        # VCF preview: the #CHROM header line plus the first two records
        linenum = 0
        vcfline = ''
        if filepath_vcf.endswith('gz'):  # .vcf.gz
            with gzip.open(filepath_vcf, 'rb') as file:
                for line in file:
                    strline = bytes.decode(line)
                    if not strline.startswith('##'):
                        vcfline += strline
                        linenum += 1
                        if linenum == 3:
                            break
        else:  # .vcf
            # Open in text mode: the original 'rb' here mixed bytes with str
            with open(filepath_vcf) as file:
                for line in file:
                    if not line.startswith('##'):
                        vcfline += line
                        linenum += 1
                        if linenum == 3:
                            break

        return {"vcf": vcfline, "json": recordstring}
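
The two branches could also be collapsed, since gzip.open accepts text mode; a sketch of that variant (not part of the original):

import gzip

def open_vcf_text(filepath_vcf):
    # Sketch: one text-mode handle for both .vcf and .vcf.gz inputs
    if filepath_vcf.endswith('gz'):
        return gzip.open(filepath_vcf, 'rt')
    return open(filepath_vcf)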
Example #5
    def __init__(self, filepath):
        self.filepath = filepath
        # Only the VCF metadata is kept; the chunk iterator itself is discarded
        self.fields, self.samples, self.header, _ = allel.iter_vcf_chunks(
            filepath, fields='*')
        self.features = None
        self.variants = None
        self.logger = logging.getLogger(self.__class__.__name__)
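
A hypothetical instantiation (the enclosing class is not shown; the name VcfDataset and the file name are assumptions):

# Hypothetical usage; VcfDataset and 'sample.vcf' are assumptions
dataset = VcfDataset('sample.vcf')
print(dataset.samples)  # sample IDs parsed from the VCF header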
Example #6
def vcf2json_multi2(filepath_vcf, filepath_json, md5, mode):
    # Shared statistics counters: [chrom, info, infoSpecial] (see the summary line below)
    time_start = time.time()
    manager = multiprocessing.Manager()
    statisticArr = manager.Array("i", [0, 0, 0])

    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['variants/*', 'calldata/*'], chunk_length=500)
    print(filepath_vcf)
    if os.path.exists(filepath_json):
        os.remove(filepath_json)
    # Write the original VCF header into the JSON file so the conversion can be reversed
    addhead(headers[0], filepath_json)

    tmpfile = "value_" + md5 + ".dat"
    with open(tmpfile, "wb") as f:
        pickle.dump(fields, f)
        pickle.dump(samples, f)
        pickle.dump(headers, f)
        pickle.dump(filepath_json, f)

    cores = multiprocessing.cpu_count()
    processnum = max(cores // 2, 2)  # use half the cores, at least two workers

    # Schedule the chunk iterator manually to keep memory usage bounded
    pool = multiprocessing.Pool(processes=processnum)
    index = 0
    tmpchunks = []
    first = True
    realchunks = []

    for chunker in chunks:
        index += 1
        tmpchunks.append(chunker)
        if index % (processnum * 10) == 0:  # dispatch in batches of processnum * 10 chunks
            if not first:
                AppResult.get()  # wait for the previous async batch to finish
                realchunks.clear()
            realchunks = copy.deepcopy(tmpchunks)
            tmpchunks.clear()
            first = False
            AppResult = pool.map_async(
                partial(IoOperat_multi, tmpfile, mode, statisticArr),
                realchunks)

    if "AppResult" in locals().keys():
        AppResult.get()
    #print("last section")
    pool.map(partial(IoOperat_multi, tmpfile, mode, statisticArr), tmpchunks)
    tmpchunks.clear()
    if realchunks:
        realchunks.clear()
    pool.close()
    pool.join()  # block the main process until all workers have exited
    os.remove(tmpfile)  # remove the temporary file to save space

    # Save run statistics
    filesize = os.path.getsize(filepath_json)
    time_end = time.time()
    time_cost = time_end - time_start
    statisticFile = "vcf2json_results.txt"
    with open(statisticFile, mode='a') as fp:
        result = (filepath_vcf + '\t' + 'chrom: {0}\tinfo: {1}\tsample: {2}\t'
                  'total cost: {3}\tjsonfilesize: {4}\tinfoSpecial: {5}\n').format(
                      statisticArr[0], statisticArr[1], samples.size,
                      time_cost, filesize, statisticArr[2])
        fp.write(result)
    os.remove(filepath_json)  # remove the JSON output to save space; only the statistics are kept
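
IoOperat_multi is the per-chunk worker used by the pool calls above but is not shown in these examples. A plausible sketch based only on how it is called (the pickle layout mirrors the dumps above; the statistics updates are omitted because their exact rules are not visible here):

def IoOperat_multi(tmpfile, mode, statisticArr, chunker):
    # Hypothetical worker sketch; the signature follows
    # partial(IoOperat_multi, tmpfile, mode, statisticArr) mapped over chunks
    with open(tmpfile, "rb") as f:
        fields = pickle.load(f)
        samples = pickle.load(f)
        headers = pickle.load(f)
        filepath_json = pickle.load(f)
    recordstring = chunker2string(chunker, fields, samples, mode)
    with open(filepath_json, 'a') as fp:  # each worker appends its converted chunk
        fp.write(recordstring)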