import gzip
import logging
import multiprocessing
import os
import pickle
import time
from functools import partial

import allel  # scikit-allel


def preview(filepath_vcf, mode):
    # Parse only the first two records for a quick preview.
    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['*'], chunk_length=2)

    # Render the first chunk as JSON.
    for chunker in chunks:
        recordstring = chunker2string(chunker, fields, samples, mode)
        recordstring = RenameJsonKey(recordstring)
        break

    # Collect the first three non-meta VCF lines (the '#CHROM' header plus
    # two records); lines starting with '##' are metadata and are skipped.
    linenum = 0
    vcfline = ''
    with open(filepath_vcf) as file:
        for line in file:
            if not line.startswith('##'):
                vcfline += line
                linenum += 1
                if linenum == 3:
                    break

    return {"vcf": vcfline, "json": recordstring}
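# chunker2string() and RenameJsonKey() are used above but defined elsewhere in
# the project. A minimal sketch of what chunker2string plausibly does, assuming
# each item yielded by scikit-allel's chunk iterator carries a dict of numpy
# arrays keyed by field name; the record layout and trailing ',\n' are
# assumptions, not the project's actual implementation.
import json

import numpy as np


def chunker2string_sketch(chunker, fields, samples, mode):
    # iter_vcf_chunks yields tuples whose first element is the chunk dict.
    chunk = chunker[0] if isinstance(chunker, tuple) else chunker
    n = len(chunk['variants/POS'])  # one JSON object per variant row
    records = []
    for i in range(n):
        record = {field: np.asarray(chunk[field][i]).tolist()
                  for field in fields if field in chunk}
        records.append(json.dumps(record))
    # The trailing ',\n' matches the cleanup addEnd() performs (see below).
    return ',\n'.join(records) + ',\n'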
def vcf2json_multi2(self, filepath_vcf, filepath_json, md5, mode, IsAddHead=True):
    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['variants/*', 'calldata/*'], chunk_length=500)
    if os.path.exists(filepath_json):
        os.remove(filepath_json)
    # Prepend the original VCF header so the conversion can be reversed.
    if IsAddHead:
        self.addhead(headers[0], filepath_json)

    # Pickle the shared state once so each worker can reload it cheaply.
    tmpfile = "value_" + md5 + ".dat"
    with open(tmpfile, "wb") as f:
        pickle.dump(fields, f)
        pickle.dump(samples, f)
        pickle.dump(headers, f)
        pickle.dump(filepath_json, f)

    cores = multiprocessing.cpu_count()
    processnum = max(int(cores / 2), 2)

    # Drive the chunk iterator by hand, feeding the pool one batch of
    # processnum * 10 chunks at a time, so the whole VCF is never held in
    # memory at once.
    pool = multiprocessing.Pool(processes=processnum)
    index = 0
    tmpchunks = []
    AppResult = None
    for chunker in chunks:
        index += 1
        tmpchunks.append(chunker)
        if index % (processnum * 10) == 0:
            if AppResult is not None:
                AppResult.get()  # wait for the previous batch to finish
            # Hand the accumulated batch to the pool and start a fresh one.
            realchunks = tmpchunks
            tmpchunks = []
            AppResult = pool.map_async(
                partial(self.IoOperat_multi, tmpfile, mode), realchunks)
    if AppResult is not None:
        AppResult.get()
    # Process whatever is left over from the last partial batch.
    pool.map(partial(self.IoOperat_multi, tmpfile, mode), tmpchunks)
    pool.close()
    pool.join()  # block until every worker has exited

    # Strip the trailing ',' and '\n' after the last record and append the
    # closing '}'.
    if IsAddHead:
        self.addEnd(filepath_json)
    os.remove(tmpfile)  # remove the temp file to save space
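# IoOperat_multi is the per-chunk worker dispatched above; it is not defined
# in this file. A minimal sketch, assuming it reloads the pickled state
# written by vcf2json_multi2 and appends each converted chunk to the shared
# JSON file (chunker2string_sketch is the same assumed helper as above):
def IoOperat_multi_sketch(tmpfile, mode, chunker):
    with open(tmpfile, "rb") as f:
        fields = pickle.load(f)
        samples = pickle.load(f)
        headers = pickle.load(f)  # must be consumed to reach the next value
        filepath_json = pickle.load(f)
    recordstring = chunker2string_sketch(chunker, fields, samples, mode)
    # Workers share one output file; each append is a single write, which is
    # presumably what the original relies on for interleaving safety.
    with open(filepath_json, "a") as fp:
        fp.write(recordstring)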
def vcf2json_Single(filepath_vcf, filepath_json, mode):
    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['*'], chunk_length=50)
    if os.path.exists(filepath_json):
        os.remove(filepath_json)
    addhead(headers[0], filepath_json)
    # Single-process path: convert and append one chunk at a time.
    with open(filepath_json, 'a') as fp:
        for chunker in chunks:
            recordstring = chunker2string(chunker, fields, samples, mode)
            fp.write(recordstring)
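# addhead() and addEnd() are referenced throughout but not shown here.
# Minimal sketches, assuming headers[0] is scikit-allel's list of raw '##'
# header lines and that records are written with a trailing ',\n'; both the
# bodies and the JSON layout are assumptions.
def addhead_sketch(headlines, filepath_json):
    # Open the JSON object and stash the raw VCF header for reverse
    # conversion.
    with open(filepath_json, 'a') as fp:
        fp.write('{"head": ' + json.dumps(headlines) + ',\n')


def addEnd_sketch(filepath_json):
    # Drop the trailing ',' and '\n' left by the last record and close the
    # object, mirroring the "delete two last character" cleanup above.
    with open(filepath_json, 'rb+') as fp:
        fp.seek(-2, os.SEEK_END)
        fp.truncate()
        fp.write(b'}')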
def preview(self, filepath_vcf, mode):
    # Parse only the first two records for a quick preview.
    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['*'], chunk_length=2)

    # Render the first chunk as JSON.
    for chunker in chunks:
        recordstring = self.chunker2string(chunker, fields, samples, mode)
        recordstring = RenameJsonKey(recordstring)
        break

    # Collect the first three non-meta VCF lines; lines starting with '##'
    # are metadata and are skipped.
    linenum = 0
    vcfline = ''
    if filepath_vcf.endswith('gz'):  # .vcf.gz
        with gzip.open(filepath_vcf, 'rb') as file:
            for line in file:
                strline = bytes.decode(line)
                if not strline.startswith('##'):
                    vcfline += strline
                    linenum += 1
                    if linenum == 3:
                        break
    else:  # plain .vcf; open in text mode so the lines concatenate as str
        with open(filepath_vcf) as file:
            for line in file:
                if not line.startswith('##'):
                    vcfline += line
                    linenum += 1
                    if linenum == 3:
                        break
    return {"vcf": vcfline, "json": recordstring}
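# Hypothetical use of preview(); "converter" stands for an instance of the
# (unnamed) class these methods belong to, the path is a placeholder, and
# mode is passed straight through to chunker2string (0 is a placeholder).
peek = converter.preview("example.vcf.gz", mode=0)
print(peek["vcf"])   # '#CHROM' header line plus the first two records
print(peek["json"])  # JSON rendering of the same two records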
def __init__(self, filepath):
    self.filepath = filepath
    # Parse field names, sample IDs, and headers up front; the chunk
    # iterator itself is discarded here.
    self.fields, self.samples, self.header, _ = allel.iter_vcf_chunks(
        filepath, fields='*')
    self.features = None
    self.variants = None
    self.logger = logging.getLogger(self.__class__.__name__)
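# Hedged illustration of what the constructor above captures; the class name
# VcfMeta is hypothetical (the source only shows the __init__ body) and the
# path is a placeholder.
meta = VcfMeta("example.vcf")
print(meta.samples)    # numpy array of sample IDs
print(meta.header[0])  # raw header lines, as used by addhead() elsewhere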
def vcf2json_multi2(filepath_vcf, filepath_json, md5, mode):
    # Shared counters for the statistics written at the end:
    # [chrom fields, info fields, "special" info fields].
    time_start = time.time()
    manager = multiprocessing.Manager()
    statisticArr = manager.Array("i", [0, 0, 0])

    fields, samples, headers, chunks = allel.iter_vcf_chunks(
        filepath_vcf, fields=['variants/*', 'calldata/*'], chunk_length=500)
    print(filepath_vcf)
    if os.path.exists(filepath_json):
        os.remove(filepath_json)
    # Prepend the original VCF header so the conversion can be reversed.
    addhead(headers[0], filepath_json)

    # Pickle the shared state once so each worker can reload it cheaply.
    tmpfile = "value_" + md5 + ".dat"
    with open(tmpfile, "wb") as f:
        pickle.dump(fields, f)
        pickle.dump(samples, f)
        pickle.dump(headers, f)
        pickle.dump(filepath_json, f)

    cores = multiprocessing.cpu_count()
    processnum = max(int(cores / 2), 2)

    # Drive the chunk iterator by hand, one batch of processnum * 10 chunks
    # at a time, to avoid holding the whole VCF in memory.
    pool = multiprocessing.Pool(processes=processnum)
    index = 0
    tmpchunks = []
    AppResult = None
    for chunker in chunks:
        index += 1
        tmpchunks.append(chunker)
        if index % (processnum * 10) == 0:
            if AppResult is not None:
                AppResult.get()  # wait for the previous batch to finish
            realchunks = tmpchunks
            tmpchunks = []
            AppResult = pool.map_async(
                partial(IoOperat_multi, tmpfile, mode, statisticArr),
                realchunks)
    if AppResult is not None:
        AppResult.get()
    # Process whatever is left over from the last partial batch.
    pool.map(partial(IoOperat_multi, tmpfile, mode, statisticArr), tmpchunks)
    pool.close()
    pool.join()  # block until every worker has exited
    os.remove(tmpfile)  # remove the temp file to save space

    # Save the run statistics for this file.
    filesize = os.path.getsize(filepath_json)
    time_cost = time.time() - time_start
    statisticFile = "vcf2json_results.txt"
    with open(statisticFile, mode='a') as fp:
        result = (filepath_vcf + '\t' +
                  'chrom: {0}\t' +
                  'info: {1}\t' +
                  'sample: {2}\t' +
                  'total cost: {3}\t' +
                  'jsonfilesize: {4}\t' +
                  'infoSpecial: {5}\n').format(
                      statisticArr[0], statisticArr[1], samples.size,
                      time_cost, filesize, statisticArr[2])
        fp.write(result)
    os.remove(filepath_json)  # delete the output file to save space
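# Hypothetical driver for the converter above; the paths are placeholders,
# mode=0 is a placeholder passed through to the workers, and the input's md5
# is computed here only because the function uses it to name its temp file.
if __name__ == "__main__":
    import hashlib
    vcf = "example.vcf"
    with open(vcf, "rb") as f:
        file_md5 = hashlib.md5(f.read()).hexdigest()
    vcf2json_multi2(vcf, "example.json", file_md5, mode=0)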