def call_consensus(alignment, minproportion=0.5):
    """Call a majority-rule consensus sequence from a FASTA multiple alignment.

    Walks every record yielded by ``formFASTA(alignment)`` (pairs of
    ``(name, seq)``), tallies the non-gap base counts per alignment column,
    and emits, for each column covered by at least ``minproportion`` of the
    records, the most frequent base (ties broken by first base observed).

    Fixes vs. the previous revision:
      * columns are now keyed 1..len(seq) — the old code incremented the
        index *before* use, so the first column was keyed 2 (harmless for
        the returned string, but wrong if positions are ever reported);
      * guards against an empty alignment instead of raising
        ZeroDivisionError.

    :param alignment: FASTA-formatted alignment text accepted by formFASTA.
    :param minproportion: minimum fraction of records that must have a
        non-gap base at a column for it to enter the consensus.
    :return: tuple ``(consensus_sequence, number_of_records)``.
    """
    pos_base = OrderedDict()  # column index -> {base: count}
    record_size = 0
    for name, seq in formFASTA(alignment):
        record_size += 1
        for index, base in enumerate(seq, start=1):
            if index not in pos_base:
                pos_base[index] = defaultdict(int)
            if base != "-":  # gaps do not vote
                pos_base[index][base] += 1

    if record_size == 0:  # empty alignment: nothing to call
        return "", 0

    con_seq = ""
    # pos_base preserves column order, so concatenation is positional.
    for pos, count in pos_base.items():
        total = float(sum(count.values()))
        if total / record_size >= minproportion and count:
            # max() over the dict keeps the original tie-break:
            # the first base reaching the top count wins.
            con_seq += max(count, key=count.get)
    return con_seq, record_size
def group_reads(fa_file):
    """Group FASTA sequences by barcode and UMI.

    Each record header is expected to carry ``BC=<barcode>`` and
    ``UMI=<umi>`` tokens (space-delimited). Returns a nested mapping
    ``{barcode: {umi: [seq, ...]}}``.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for name, seq in formFASTA(fa_file):
        # Pull the value between the tag and the next space in the header.
        barcode = name.split("BC=")[1].split(" ")[0]
        umi = name.split("UMI=")[1].split(" ")[0]
        grouped[barcode][umi].append(seq)
    return grouped
def getCCSreadDF(CCS_result_path):
    """Load CCS reads from a FASTA file into a pandas DataFrame.

    Fix: the previous revision used ``open(path).read()`` and never closed
    the file handle; a ``with`` block now guarantees closure.

    :param CCS_result_path: path to a FASTA file of CCS reads.
    :return: DataFrame with columns ``name`` (header, re-prefixed with
        ``>``) and ``sequence``.
    """
    with open(CCS_result_path, 'r') as fh:
        CCS_fa = fh.read()

    readName = []
    readSeq = []
    for name, seq in formFASTA(CCS_fa):
        readName.append(name)
        readSeq.append(seq)

    # build dataframe; restore the FASTA ">" prefix on every header
    ccs_df = pd.DataFrame({"name": readName, "sequence": readSeq})
    ccs_df["name"] = ">" + ccs_df["name"]
    return ccs_df
outSta = open(options.statistics, 'w') # write statistics header outSta.write("{}\t{}\t{}\t{}\n".format("zwID", "BC", "UMI", "pass_filter")) ## some input parameters P2 = options.FP P4 = options.RP inFP = options.inFP inRP = options.inRP polyAwin = options.pAw pAmismatch = options.pMs mismatch = options.mismatch ## get file handle if options.format == "FASTQ": file_handle = fqTOfa(inputFile) elif options.format == "FASTA": file_handle = formFASTA(inputFile.read()) ## run filter pfq = multiprocessing.Pool(options.numcpu) for record in file_handle: header, raw_seq = record prf = pfq.apply_async(filterCCS, args=( header, raw_seq, P2, P4, inFP, inRP, polyAwin, pAmismatch, mismatch,
def main():
    """Drive edit-event calling over an aligned FASTA of (ref, read) pairs.

    Reads target positions, dispatches analyzeEdit() across a process
    pool for each reference/read pair, then writes one tab-separated
    line per read (BC, UMI, per-target edit string) to *.Events.txt.
    """
    options = Parsers()
    # whole alignment is read into memory; records are consumed pairwise below
    alignFa = open(options.align, 'r').read()
    targetPos = open(options.targetpos, 'r')
    ### output file
    output = open(os.path.join(options.dir, options.name + ".Events.txt"), 'w')
    # editfre = open(os.path.join(options.dir, options.name + ".EditFrequency.txt"), 'w')
    # eventcount = open(os.path.join(options.dir, options.name + ".EventCounts.txt"), 'w')
    # eventcountDic = open(os.path.join(options.dir, options.name + ".EventCounts.pkl"), "wb")
    next(targetPos)  # skip the header line of the target-position table
    tarPos = []
    header = []
    ### parser target positions
    # each row: name <TAB> start <TAB> end — presumably 1-based; TODO confirm
    for eachline in targetPos:
        sple = eachline.strip().split('\t')
        header.append(sple[0])
        tarPos.append((int(sple[1]), int(sple[2])))
    ### build a dict to count the edit frequency in every base
    # pos = list(range(1, options.len))
    # Edit_fre = {}
    # for p in pos:
    #     Edit_fre[p] = [0,0]
    ### write the output header
    #editfre.write("{}\t{}\t{}\t{}\t{}\n".format("Position", "Insertion", "InserPercent", "Deletion", "DeletPercent"))
    output.write("{}\t{}\t{}\n".format("BC", "UMI", '\t'.join(header)))
    #editfre.flush()
    output.flush()
    ### get the output data
    processPool = multiprocessing.Pool(options.cpu)
    results = []
    all_seq = formFASTA(alignFa)
    # The alignment interleaves reference/read pairs: consume two records
    # per iteration until the generator is exhausted.
    while True:
        try:
            _, refseq = next(all_seq)
            readname, readseq = next(all_seq)
            editresult = processPool.apply_async(analyzeEdit, args=(
                readname,
                refseq,
                readseq,
            ))
            results.append(editresult)
        except StopIteration:
            break
    # NOTE(review): 'waite' looks like a typo for 'wait' — left unchanged.
    print('waite')
    processPool.close()
    processPool.join()
    print('done')
    # analyzeEdit returns a list whose last element is the read name; key
    # the edits by that name, preserving submission order.
    editEventDict = collections.OrderedDict()
    for each_result in results:
        rawEditlist = each_result.get()
        editEventDict[rawEditlist[-1]] = rawEditlist[:-1]
    #EventCounts = defaultdict(int)
    #total = 0
    for key, value in editEventDict.items():
        #frequency = int(key.strip().split('_')[-1])
        # barcode / UMI are embedded in the read name as "BC=..."/"UMI=..."
        BC = key.split("BC=")[1].split(" ")[0]
        UMI = key.split("UMI=")[1].split(" ")[0]
        stringline = outputEdit(value, tarPos)
        ### write out edit events
        output.write("{}\t{}\t{}\n".format(BC, UMI, stringline))
        #total += 1
        # for align in value:
        #     EventCounts[align.toEditString()] += 1
        #     if align.indicator == "D":
        #         for i in range(align.refpos, align.refpos + len(align.readbase)):
        #             Edit_fre[i][1] += 1
        #     elif align.indicator == "I":
        #         Edit_fre[align.refpos][0] += 1
    # ### write out the edit frequency
    # for k, v in Edit_fre.items():
    #     editfre.write("{}\t{}\t{}\t{}\t{}\n".format(k, v[0], v[0]/total, v[1], v[1]/total))
    ### write out the Edit events counts
    # eventcount.write("{}\t{}\n".format("Editevent", "counts"))
    # clean_EventCounts = {}
    # for eve, num in EventCounts.items():
    #     if "M" not in eve:
    #         clean_EventCounts[eve] = num
    #     eventcount.write("{}\t{}\n".format(eve, num))
    # pickle.dump(clean_EventCounts,eventcountDic)
    #editfre.close()
    output.close()