def Write_Single_Field(filename=None, outfile_location=None, field=None,
                       count_field=None, file_format=None,
                       contains_header=True):
    """
    Extract one field from a file and write it to an output file.

    Every record whose *field* value is non-empty produces one output line of
    the form ``value<TAB>count`` (no header line is written).

    If *count_field* is None, or the record has no non-empty value for it,
    the count written is ``'1'``; otherwise the value stored in that column
    is written as the count.

    If *field* is None the input is opened as a headerless TAB file and
    ``'Column 1'`` is extracted.

    If *outfile_location* is None the output is written next to the input as
    ``<filename>_singlefield.txt``.

    Returns ``[total_data, total_field]``: the number of non-empty records
    read and the number of lines written.

    Raises ``Exception`` when *filename* is missing, empty, or not a file.
    Errors raised while reading or writing are reported through
    ``print_error`` and the partially written output file is deleted.
    """
    total_data = 0
    total_field = 0

    # Validate the path first: the default output name is derived from it, and
    # deriving it from None/'' used to fail with an unrelated error.
    if not filename or not os.path.isfile(filename):
        raise Exception("The pathname of the file is invalid")

    if outfile_location is None:
        outfile_location = filename + '_singlefield.txt'

    if field is None:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype='TAB',
                                           contains_header=False,
                                           mode='r')
        print("Warning no field name was provided. This file will be treated as a tab file and the first column will be selected")
        field = 'Column 1'
    else:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype=file_format,
                                           contains_header=contains_header,
                                           mode='r')
        # NOTE(review): the source indentation was lost; this warning is
        # assumed to belong to the explicit-field branch only -- confirm.
        if file_format is None:
            guessedFiletype = IF_file.getFiletype()
            print("Warning, no file type for this file was provided. The file was predicted to be a " + guessedFiletype + " file.")

    outfile = None
    try:
        outfile = open(outfile_location, 'w')
        while not IF_file.IFclass.eof:
            data = IF_file.IFclass.read()  # each record is read as a dictionary
            if not data:
                continue
            total_data += 1
            if field in data and data[field]:
                value = data[field]
                if count_field is not None and count_field in data and data[count_field]:
                    # number of times this value was observed
                    count = data[count_field]
                else:
                    count = '1'
                outfile.write(value + '\t' + count + '\n')
                total_field += 1
    except Exception as e:
        # Only delete what this call created, and close it before deleting.
        if outfile is not None:
            outfile.close()
            outfile = None
            os.remove(outfile_location)
        print_error(e)
    finally:
        if outfile is not None:
            outfile.close()

    return [total_data, total_field]
def parseMIXCR(originalfileloc, resultfileloc, inputype, outfile=None,
               header_var='document_header', sequence_var='sequence',
               command_val={}):
    """
    Combine a MiXCR alignment export with the sequence file it was run on and
    write one tab-delimited annotation row per input sequence.

    originalfileloc -- sequence file that was given to MiXCR
    resultfileloc   -- MiXCR alignment export (read as a TAB file)
    inputype        -- file type of *originalfileloc*
    outfile         -- output path; defaults to
                       ``<resultfileloc>-parsed.annotation``
    header_var      -- field of the input records holding the header
    sequence_var    -- field of the input records holding the sequence
    command_val     -- settings stored (as JSON) in the 'Command' column;
                       never modified here

    Input records and MiXCR rows are walked in parallel. A MiXCR row is
    matched to the current input record by its 'Read id' (compared with a
    counter of records processed so far), else by 'Description R1' (compared
    with the record header), else by ``match_sequence`` on the read sequence.
    When the row does not match, the input record is written as "not found"
    and the same MiXCR row is kept for the next input record.

    Side files ``<resultfileloc>.errorlog.txt`` and
    ``<resultfileloc>.notfound.txt`` are created and deleted again when they
    end up empty of entries. Errors on a single record are logged and do not
    stop the run.

    Returns the path of the annotation file.
    """
    command_string = json.dumps(command_val) if command_val else json.dumps({'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation" % resultfileloc
    print('Parsing mixcr file')

    number_of_annotation_lines = useful.file_line_count(resultfileloc)
    # the original file used as an input file for mixcr annotation
    seqfile = immunogrepFile(originalfileloc, inputype)
    # the mixcr generated alignment file
    iffile = immunogrepFile(resultfileloc, 'TAB', None)

    errorlog_path = resultfileloc + '.errorlog.txt'
    notfound_path = resultfileloc + '.notfound.txt'
    error_file = open(errorlog_path, 'w')
    unfound_seqs = open(notfound_path, 'w')

    notfound = 0
    seq_num = 0
    errors = 0
    needcapture = True
    looper = useful.LoopStatusGen(number_of_annotation_lines, 10)
    t1 = time.time()

    # try/finally so every handle is released even when something outside the
    # per-record error handling fails (e.g. writing the header lines).
    try:
        with open(outfile, "w") as f:
            # translator line: tells downstream code how to add results to the database
            f.write(descriptor_symbol + json.dumps(DatabaseTranslator()) + '\n')
            f.write('\t'.join(presetlabels) + '\n')

            for fastseq in seqfile.read():
                try:
                    content = {}
                    if not fastseq:
                        continue

                    seq = fastseq[sequence_var].upper()
                    # extract sequence header and the SEQ_ID field from input file
                    header, seq_id = GetHeaderInfo(fastseq, header_var)

                    if needcapture:
                        # the previous MiXCR row was consumed: fetch the next one
                        if iffile.IFclass.eof:
                            mixcr_data = None
                        else:
                            mixcr_data = iffile.IFclass.read()
                            # print percent status completed
                            # NOTE(review): source indentation was lost; the
                            # status tick is assumed to run once per row read.
                            looper.next()

                    # check whether mixcr data matches the current sequence
                    if mixcr_data:
                        if 'Read id' in mixcr_data:
                            matched_seqs = int(mixcr_data['Read id']) == seq_num
                        elif 'Description R1' in mixcr_data:
                            matched_seqs = (mixcr_data['Description R1'].strip() ==
                                            fastseq[header_var].strip())
                        else:
                            mixcr_data['Read(s) sequence'] = mixcr_data['Read(s) sequence'].upper()
                            matched_seqs, _strand = match_sequence(seq, mixcr_data['Read(s) sequence'])
                        mixcr_seq = mixcr_data['Read(s) sequence']
                    else:
                        mixcr_seq = ''
                        matched_seqs = False

                    needcapture = True
                    # '== False' kept on purpose: match_sequence is opaque and
                    # may return values for which 'not x' would differ.
                    if matched_seqs == False:
                        # This sequence probably yielded no MiXCR result, so
                        # keep the current MiXCR row for the next sequence.
                        needcapture = False
                        content['Sequence'] = seq
                        content['Seqheader'] = header
                        content['Notes'] = 'Sequence not found in mixcr file;'
                        content[idIdentifier] = seq_id
                        unfound_seqs.write('\t'.join([content['Seqheader'], content['Sequence'], mixcr_seq]) + '\n')
                        content['Command'] = command_string
                        content = defaultdict(str, content)
                        output_line = [str(content[lab]) for lab in presetlabels]
                        f.write('\t'.join(output_line) + '\n')
                        notfound += 1
                        seq_num += 1
                        continue

                    seq_num += 1
                    # in the next iteration we will need a fresh mixcr result
                    needcapture = True

                    content = mixcr_data
                    content['Notes'] = ''
                    content[idIdentifier] = seq_id
                    content['Seqheader'] = header
                    r_j = ''
                    r_v = ''
                    chain_v = ''
                    content['Sequence'] = seq
                    content['Strand corrected sequence'] = content['Read(s) sequence']

                    (content['Full NT'],
                     content['5_Prime_Annotation'],
                     content['3_Prime_Annotation'],
                     missing_fields) = return_full_nt(content)

                    if missing_fields:
                        content['Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                        content['3_Prime_Annotation'] = content['3_Prime_Annotation'] + '*'
                        content['Full length'] = 'FALSE'
                    elif content['5_Prime_Annotation'] == 'FR1' and content['3_Prime_Annotation'] == 'FR4':
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                    content['Full AA'], content['Productivity'] = GetFullAA(content, missing_fields)

                    if content['AA. seq. CDR3'] and content['AA. seq. CDR3'] in content['Full AA']:
                        content['CDR3_Junction_In_Frame'] = 'TRUE'
                    else:
                        content['CDR3_Junction_In_Frame'] = 'FALSE'

                    if content['All V hits']:
                        vgenelist, vscorelist, vlocus, chain_v, r_v = extractScores(content['All V hits'])
                        content['All V hits'] = ','.join(vgenelist)
                        content['All V scores'] = ','.join(vscorelist)
                        content['FirstVgene'] = vgenelist[0]
                        content['Locus'] = vlocus
                    else:
                        content['All V hits'] = ''
                        content['All V scores'] = ''
                        content['FirstVgene'] = ''
                        content['Locus'] = ''

                    if content['All D hits']:
                        dgenelist, dscorelist, _, _, _ = extractScores(content['All D hits'])
                        content['All D hits'] = ','.join(dgenelist)
                        content['All D scores'] = ','.join(dscorelist)
                        content['FirstDgene'] = dgenelist[0]
                    else:
                        content['All D hits'] = ''
                        content['All D scores'] = ''
                        content['FirstDgene'] = ''

                    if content['All J hits']:
                        jgenelist, jscorelist, _, _, r_j = extractScores(content['All J hits'])
                        content['All J hits'] = ','.join(jgenelist)
                        content['All J scores'] = ','.join(jscorelist)
                        content['FirstJgene'] = jgenelist[0]
                    else:
                        # no J call: fall back to the V call so the check below passes
                        r_j = r_v
                        content['All J hits'] = ''
                        content['All J scores'] = ''
                        content['FirstJgene'] = ''

                    if content['All C hits']:
                        cgenelist, cscorelist, _, _, _ = extractScores(content['All C hits'])
                        content['All C hits'] = ','.join(cgenelist)
                        content['All C scores'] = ','.join(cscorelist)
                    else:
                        content['All C hits'] = ''
                        content['All C scores'] = ''

                    # only report a recombination type when V and J agree
                    if r_j == r_v:
                        content['Recombination Type'] = r_v
                        content['Chain'] = chain_v
                    else:
                        content['Recombination Type'] = ''
                        content['Chain'] = ''

                    if content['All V alignment']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All V alignment'])
                        content['VGENE: Query start'] = query_start
                        content['VGENE: Query end'] = query_end
                        content['VGENE: Germline start'] = germ_start
                        content['VGENE: Germline end'] = germ_end
                        content['VGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['VGENE: Mismatch'] = num_mismatch
                        content['VGENE: Insertion'] = num_ins
                        content['VGENE: Deletion'] = num_del
                        content['VGENE: Alignment'] = alignment_string
                        content['VGENE: Shm.per'] = round(100 * shm, 3)
                        content['VGENE: Alignment length'] = algn_len
                        content['AB end'] = query_end
                        content['AB start'] = query_start

                    if content['All J alignment']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All J alignment'])
                        content['JGENE: Query start'] = query_start
                        content['JGENE: Query end'] = query_end
                        content['JGENE: Germline start'] = germ_start
                        content['JGENE: Germline end'] = germ_end
                        content['JGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['JGENE: Mismatch'] = num_mismatch
                        content['JGENE: Insertion'] = num_ins
                        content['JGENE: Deletion'] = num_del
                        content['JGENE: Alignment'] = alignment_string
                        content['JGENE: Shm.per'] = round(100 * shm, 3)
                        content['JGENE: Alignment length'] = algn_len
                        # the J alignment always defines the end; the start
                        # only when no V alignment set it
                        content['AB end'] = query_end
                        if 'AB start' not in content:
                            content['AB start'] = query_start

                    content['Orientation'] = guess_strand(content['Full NT'], content['Sequence'])
                    content['Command'] = command_string
                    # missing labels are written as empty strings
                    content = defaultdict(str, content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line) + '\n')

                except Exception as e:
                    errors += 1
                    print('There was an error in sequence: ' + str(seq_num))
                    print('Error: ' + str(e))
                    error_file.write('****ERROR FOUND IN SEQUENCE:{0} ****\n'.format(str(seq_num)))
                    error_file.write(useful.print_error_string(e) + '\n')
                    error_file.write('MIXCR DATA: \n')
                    error_file.write(json.dumps(content, indent=4) + '\n')
                    error_file.write('*************END OF ERROR*********\n')
    finally:
        iffile.IFclass.close()
        seqfile.IFclass.close()
        error_file.close()
        unfound_seqs.close()

    # drop the side files when nothing was recorded in them
    if errors == 0:
        os.remove(errorlog_path)
    if notfound == 0:
        os.remove(notfound_path)

    t2 = time.time()
    print(str(t2 - t1))
    return outfile
def isotype_sequences(input_file, input_file_type, barcode_file='',
                      output_file=None, output_format='TAB',
                      seq_var='sequence', header_var='header',
                      helper_fields={}, alignment_settings={},
                      analysis_name=None):
    """
    Align every sequence of *input_file* against a set of isotype barcodes
    and write one annotation record per sequence.

    input_file / input_file_type -- sequence file and its type
    barcode_file      -- barcode definitions; the default barcodes are used
                         when it is empty or does not exist
    output_file       -- output path; when None or equal to *input_file* it
                         becomes ``<input without extension>.isotype.annotation``
    output_format     -- passed to ``writeSeqResult``; 'TAB' and 'CSV' also
                         get a tab-joined header line
    seq_var / header_var -- record fields holding sequence and header
    helper_fields     -- optional field names: 'recombination_var',
                         'end_of_ab_field' (missing names default to '')
    alignment_settings -- optional overrides: 'penalize_truncations' (True),
                         'minimum_alignment_length' (15), 'search_rc' (2),
                         'allowed_mismatches_in_alignment' (2),
                         'strand_corrected' (False)
    analysis_name     -- stored upper-cased in the translator line when given

    Neither *helper_fields* nor *alignment_settings* is modified (both are
    deep-copied). A summary is printed at the end.

    Returns the path of the output file.
    """
    helper = defaultdict(str, copy.deepcopy(helper_fields))
    recombination_var = helper['recombination_var']
    end_of_ab_field = helper['end_of_ab_field']

    settings = copy.deepcopy(alignment_settings)
    penalize_truncations = settings.get('penalize_truncations', True)
    minimum_alignment_length = settings.get('minimum_alignment_length', 15)
    # 0 => only consider barcodes as provided
    # 1 => only consider the reverse complement of barcodes provided
    # 2 => consider both strands
    search_rc = settings.get('search_rc', 2)
    allowed_mismatches_in_alignment = settings.get('allowed_mismatches_in_alignment', 2)
    # True: the sequence field already holds the sense strand.
    # False: both the sequence and its reverse complement are tried.
    strand_corrected = settings.get('strand_corrected', False)

    translator_field = copy.deepcopy(translator)
    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()
    translator_field = {translation_var: translator_field}

    if output_file is None or output_file == input_file:
        output_file = useful.removeFileExtension(input_file) + '.isotype.annotation'

    overlap_len = 10
    guessed_num_bases_after_jgene = 60

    outHandle = open(output_file, 'w')
    # try/finally so the output handle is released on every exit path
    try:
        # translator line: tells downstream code how to add results to the database
        outHandle.write(descriptor_symbol + json.dumps(translator_field) + '\n')
        if output_format == 'TAB' or output_format == 'CSV':
            outHandle.write('\t'.join(FileDelimFields) + '\n')

        if not barcode_file:
            barcodeSeqList = defaultBarcodes()
        elif not os.path.isfile(barcode_file):
            print('Barcode file not found! Using default barcodes')
            barcodeSeqList = defaultBarcodes()
        else:
            barcodeSeqList = readBarcodeFile(barcode_file)

        command_string = json.dumps({
            'Barcodes': barcodeSeqList,
            'mismatch_cutoff': allowed_mismatches_in_alignment,
            'penalize_truncations': penalize_truncations,
            'minimum_length_cutoff': minimum_alignment_length
        })

        # first pass over the file: longest sequence and number of sequences
        iffile = readwrite.immunogrepFile(filelocation=input_file,
                                          filetype=input_file_type)
        maxLen, numSeq = maxSeqLen(iffile, seq_var)

        isotype_predictor = fft_tools.BarcodeAligner(
            barcodeSeqList, penalize_truncations, search_rc,
            allowed_mismatches_in_alignment, minimum_alignment_length,
            nmax=maxLen, nmin=guessed_num_bases_after_jgene)

        counter = 0
        startPer = 0
        found = 0
        summary_data = {
            'found': 0,
            'top_isotype': defaultdict(int),
            'average_mismatch': 0,
            'average_num_isotype_found': 0
        }

        print("Starting isotyping analysis for {0} sequences".format(numSeq))
        a = int(round(time.time()))

        # second pass: the file is reopened and each sequence is aligned
        iffile = readwrite.immunogrepFile(filelocation=input_file,
                                          filetype=input_file_type)
        for line_row in iffile.read():
            jsonVar = {}
            if not line_row:
                continue

            if header_var in line_row:
                if idIdentifier in line_row:
                    jsonVar[idIdentifier] = line_row[idIdentifier]
                    jsonVar['Header'] = line_row[header_var]
                else:
                    header, seq_id = GrabAdditionalHeaderInfo(line_row[header_var])
                    jsonVar[idIdentifier] = seq_id
                    jsonVar['Header'] = header

            if seq_var not in line_row or line_row[seq_var] == '':
                jsonVar['Sequence'] = ''
                jsonVar['Notes'] = 'No sequence found'
                writeSeqResult(outHandle, jsonVar, output_format)
                continue

            # lets the user monitor what percent of the sequences is processed
            startPer = useful.LoopStatus(counter, numSeq, 10, startPer)

            jsonVar['Sequence'] = line_row[seq_var]
            jsonVar['Command'] = command_string
            counter += 1

            seqFwd = jsonVar['Sequence']
            if strand_corrected:
                all_seqs = [seqFwd]
            else:
                all_seqs = [seqFwd, str(Seq(seqFwd).reverse_complement())]

            # Position where the antibody region ends, when the input provides
            # one; unparsable values count as 0. Parsed once per record since
            # it does not depend on the strand.
            end_of_ab = None
            if end_of_ab_field in line_row and line_row[end_of_ab_field] != '':
                try:
                    end_of_ab = int(line_row[end_of_ab_field])
                except (TypeError, ValueError, OverflowError):
                    end_of_ab = 0

            found_strand = ''
            for pos, each_seq in enumerate(all_seqs):
                # only consider nucleotides AFTER the end of the ab field
                # NOTE(review): the guard uses end_of_ab - overlap_len but the
                # slice starts at end_of_ab, and the same offset is applied to
                # the reverse complement -- confirm both are intended.
                if end_of_ab is not None and 0 <= end_of_ab - overlap_len < len(each_seq):
                    each_seq = each_seq[end_of_ab:]
                isotypes_results = isotype_predictor.AlignToSeq(each_seq)
                if isotypes_results:
                    found_strand = strand_orientation_list[pos]
                    break

            if isotypes_results:
                found += 1
                # alignment results override same-named keys already present
                jsonVar.update(isotypes_results)
                jsonVar['Sequence strand'] = found_strand
                if recombination_var in line_row and line_row[recombination_var]:
                    # always trust the recombination type from input file IF provided
                    jsonVar['Recombination type'] = line_row[recombination_var]
                else:
                    # otherwise guess it from the best isotype
                    jsonVar['Recombination type'] = GuessRecombType(jsonVar['Isotype'][0])
                summary_data['top_isotype'][jsonVar['Isotype'][0]] += 1
                summary_data['average_num_isotype_found'] += len(jsonVar['Isotype'])
                summary_data['average_mismatch'] += jsonVar['Mismatches'][0]
            else:
                if recombination_var in line_row and line_row[recombination_var]:
                    # always trust the recombination type from input file IF provided
                    jsonVar['Recombination type'] = line_row[recombination_var]
                jsonVar['Isotype'] = ''
                jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
                summary_data['top_isotype']['NotFound'] += 1

            writeSeqResult(outHandle, jsonVar, output_format)

        b = int(round(time.time()))
        summary_data['found'] = found
        if found:
            # turn the running sums into per-hit averages
            summary_data['average_mismatch'] = summary_data['average_mismatch'] / float(found)
            summary_data['average_num_isotype_found'] = summary_data['average_num_isotype_found'] / float(found)
        totaltime = b - a

        # single-argument print(...) gives the same output on Python 2 and 3
        print("time: ")
        print(totaltime)
        print("Summary of identified isotypes:")
        print(summary_data)
    finally:
        outHandle.close()

    return output_file
def Write_Single_Field(filename=None,
                       outfile_location=None,
                       field=None,
                       count_field=None,
                       file_format=None,
                       contains_header=True):
    """
    Extract one field from a file and write it to an output file.

    Every record whose *field* value is non-empty produces one output line of
    the form ``value<TAB>count`` (no header line is written).

    If *count_field* is None, or the record has no non-empty value for it,
    the count written is ``'1'``; otherwise the value stored in that column
    is written as the count.

    If *field* is None the input is opened as a headerless TAB file and
    ``'Column 1'`` is extracted.

    If *outfile_location* is None the output is written next to the input as
    ``<filename>_singlefield.txt``.

    Returns ``[total_data, total_field]``: the number of non-empty records
    read and the number of lines written.

    Raises ``Exception`` when *filename* is missing, empty, or not a file.
    Errors raised while reading or writing are reported through
    ``print_error`` and the partially written output file is deleted.
    """
    total_data = 0
    total_field = 0

    # Validate the path first: the default output name is derived from it, and
    # deriving it from None/'' used to fail with an unrelated error.
    if not filename or not os.path.isfile(filename):
        raise Exception("The pathname of the file is invalid")

    if outfile_location is None:
        outfile_location = filename + '_singlefield.txt'

    if field is None:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype='TAB',
                                           contains_header=False,
                                           mode='r')
        print(
            "Warning no field name was provided. This file will be treated as a tab file and the first column will be selected"
        )
        field = 'Column 1'
    else:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype=file_format,
                                           contains_header=contains_header,
                                           mode='r')
        # NOTE(review): the source indentation was lost; this warning is
        # assumed to belong to the explicit-field branch only -- confirm.
        if file_format is None:
            guessedFiletype = IF_file.getFiletype()
            print(
                "Warning, no file type for this file was provided. The file was predicted to be a "
                + guessedFiletype + " file.")

    outfile = None
    try:
        outfile = open(outfile_location, 'w')
        while not IF_file.IFclass.eof:
            data = IF_file.IFclass.read()  # each record is read as a dictionary
            if not data:
                continue
            total_data += 1
            if field in data and data[field]:
                value = data[field]
                if count_field is not None and count_field in data and data[count_field]:
                    # number of times this value was observed
                    count = data[count_field]
                else:
                    count = '1'
                outfile.write(value + '\t' + count + '\n')
                total_field += 1
    except Exception as e:
        # Only delete what this call created, and close it before deleting.
        if outfile is not None:
            outfile.close()
            outfile = None
            os.remove(outfile_location)
        print_error(e)
    finally:
        if outfile is not None:
            outfile.close()

    return [total_data, total_field]
def parseMIXCR(originalfileloc,
               resultfileloc,
               inputype,
               outfile=None,
               header_var='document_header',
               sequence_var='sequence',
               quality_var='phred',
               command_val={}):
    """
    Combine a MiXCR alignment export with the sequence file it was run on and
    write one tab-delimited annotation row per input sequence.

    originalfileloc -- sequence file that was given to MiXCR
    resultfileloc   -- MiXCR alignment export (read as a TAB file)
    inputype        -- file type of *originalfileloc*
    outfile         -- output path; defaults to
                       ``<resultfileloc>-parsed.annotation``
    header_var      -- field of the input records holding the header
    sequence_var    -- field of the input records holding the sequence
    quality_var     -- field of the input records holding the quality string;
                       copied into "not found" rows when present
    command_val     -- settings stored (as JSON) in the 'Command' column;
                       never modified here

    Input records and MiXCR rows are walked in parallel. A MiXCR row is
    matched to the current input record by its 'Read id' (compared with a
    counter of records processed so far), else by 'Description R1' (compared
    with the record header), else by ``match_sequence`` on the read sequence.
    When the row does not match, the input record is written as "not found"
    and the same MiXCR row is kept for the next input record.

    Side files ``<resultfileloc>.errorlog.txt`` and
    ``<resultfileloc>.notfound.txt`` are created and deleted again when they
    end up empty of entries. Errors on a single record are logged and do not
    stop the run.

    Returns the path of the annotation file.
    """
    command_string = json.dumps(command_val) if command_val else json.dumps(
        {'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation" % resultfileloc
    print('Parsing mixcr file')

    number_of_annotation_lines = useful.file_line_count(resultfileloc)
    # the original file used as an input file for mixcr annotation
    seqfile = immunogrepFile(originalfileloc, inputype)
    # the mixcr generated alignment file
    iffile = immunogrepFile(resultfileloc, 'TAB', None)

    errorlog_path = resultfileloc + '.errorlog.txt'
    notfound_path = resultfileloc + '.notfound.txt'
    error_file = open(errorlog_path, 'w')
    unfound_seqs = open(notfound_path, 'w')

    notfound = 0
    seq_num = 0
    errors = 0
    needcapture = True
    looper = useful.LoopStatusGen(number_of_annotation_lines, 10)
    t1 = time.time()

    # try/finally so every handle is released even when something outside the
    # per-record error handling fails (e.g. writing the header lines).
    try:
        with open(outfile, "w") as f:
            # translator line: tells downstream code how to add results to the database
            f.write(descriptor_symbol + json.dumps(DatabaseTranslator()) + '\n')
            f.write('\t'.join(presetlabels) + '\n')

            for fastseq in seqfile.read():
                try:
                    content = {}
                    if not fastseq:
                        continue

                    seq = fastseq[sequence_var].upper()
                    # extract sequence header and the SEQ_ID field from input file
                    header, seq_id = GetHeaderInfo(fastseq, header_var)

                    if needcapture:
                        # the previous MiXCR row was consumed: fetch the next one
                        if iffile.IFclass.eof:
                            mixcr_data = None
                        else:
                            mixcr_data = iffile.IFclass.read()
                            # print percent status completed
                            # NOTE(review): source indentation was lost; the
                            # status tick is assumed to run once per row read.
                            looper.next()

                    # check whether mixcr data matches the current sequence
                    if mixcr_data:
                        if 'Read id' in mixcr_data:
                            matched_seqs = int(mixcr_data['Read id']) == seq_num
                        elif 'Description R1' in mixcr_data:
                            matched_seqs = (mixcr_data['Description R1'].strip() ==
                                            fastseq[header_var].strip())
                        else:
                            mixcr_data['Read(s) sequence'] = mixcr_data[
                                'Read(s) sequence'].upper()
                            matched_seqs, _strand = match_sequence(
                                seq, mixcr_data['Read(s) sequence'])
                        mixcr_seq = mixcr_data['Read(s) sequence']
                    else:
                        mixcr_seq = ''
                        matched_seqs = False

                    needcapture = True
                    # '== False' kept on purpose: match_sequence is opaque and
                    # may return values for which 'not x' would differ.
                    if matched_seqs == False:
                        # This sequence probably yielded no MiXCR result, so
                        # keep the current MiXCR row for the next sequence.
                        needcapture = False
                        content['Sequence'] = seq
                        content['Seqheader'] = header
                        if quality_var in fastseq:
                            content['Read(s) sequence qualities'] = fastseq[quality_var]
                        content['Notes'] = 'Sequence not found in mixcr file;'
                        content[idIdentifier] = seq_id
                        unfound_seqs.write('\t'.join([
                            content['Seqheader'], content['Sequence'], mixcr_seq
                        ]) + '\n')
                        content['Command'] = command_string
                        content = defaultdict(str, content)
                        output_line = [str(content[lab]) for lab in presetlabels]
                        f.write('\t'.join(output_line) + '\n')
                        notfound += 1
                        seq_num += 1
                        continue

                    seq_num += 1
                    # in the next iteration we will need a fresh mixcr result
                    needcapture = True

                    content = mixcr_data
                    content['Notes'] = ''
                    content[idIdentifier] = seq_id
                    content['Seqheader'] = header
                    r_j = ''
                    r_v = ''
                    chain_v = ''
                    content['Sequence'] = seq
                    content['Strand corrected sequence'] = content['Read(s) sequence']

                    (content['Full NT'],
                     content['5_Prime_Annotation'],
                     content['3_Prime_Annotation'],
                     missing_fields) = return_full_nt(content)

                    if missing_fields:
                        content['Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                        content['3_Prime_Annotation'] = content['3_Prime_Annotation'] + '*'
                        content['Full length'] = 'FALSE'
                    elif content['5_Prime_Annotation'] == 'FR1' and content['3_Prime_Annotation'] == 'FR4':
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                    content['Full AA'], content['Productivity'] = GetFullAA(
                        content, missing_fields)

                    if content['AA. Seq. CDR3'] and content['AA. Seq. CDR3'] in content['Full AA']:
                        content['CDR3_Junction_In_Frame'] = 'TRUE'
                    else:
                        content['CDR3_Junction_In_Frame'] = 'FALSE'

                    if content['All V hits']:
                        vgenelist, vscorelist, vlocus, chain_v, r_v = extractScores(
                            content['All V hits'])
                        content['All V hits'] = ','.join(vgenelist)
                        content['All V scores'] = ','.join(vscorelist)
                        content['FirstVgene'] = vgenelist[0]
                        content['Locus'] = vlocus
                    else:
                        content['All V hits'] = ''
                        content['All V scores'] = ''
                        content['FirstVgene'] = ''
                        content['Locus'] = ''

                    if content['All D hits']:
                        dgenelist, dscorelist, _, _, _ = extractScores(
                            content['All D hits'])
                        content['All D hits'] = ','.join(dgenelist)
                        content['All D scores'] = ','.join(dscorelist)
                        content['FirstDgene'] = dgenelist[0]
                    else:
                        content['All D hits'] = ''
                        content['All D scores'] = ''
                        content['FirstDgene'] = ''

                    if content['All J hits']:
                        jgenelist, jscorelist, _, _, r_j = extractScores(
                            content['All J hits'])
                        content['All J hits'] = ','.join(jgenelist)
                        content['All J scores'] = ','.join(jscorelist)
                        content['FirstJgene'] = jgenelist[0]
                    else:
                        # no J call: fall back to the V call so the check below passes
                        r_j = r_v
                        content['All J hits'] = ''
                        content['All J scores'] = ''
                        content['FirstJgene'] = ''

                    if content['All C hits']:
                        cgenelist, cscorelist, _, _, _ = extractScores(
                            content['All C hits'])
                        content['All C hits'] = ','.join(cgenelist)
                        content['All C scores'] = ','.join(cscorelist)
                    else:
                        content['All C hits'] = ''
                        content['All C scores'] = ''

                    # only report a recombination type when V and J agree
                    if r_j == r_v:
                        content['Recombination Type'] = r_v
                        content['Chain'] = chain_v
                    else:
                        content['Recombination Type'] = ''
                        content['Chain'] = ''

                    if content['All V alignments']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All V alignments'])
                        content['VGENE: Query start'] = query_start
                        content['VGENE: Query end'] = query_end
                        content['VGENE: Germline start'] = germ_start
                        content['VGENE: Germline end'] = germ_end
                        content['VGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['VGENE: Mismatch'] = num_mismatch
                        content['VGENE: Insertion'] = num_ins
                        content['VGENE: Deletion'] = num_del
                        content['VGENE: Alignment'] = alignment_string
                        content['VGENE: Shm.per'] = round(100 * shm, 3)
                        content['VGENE: Alignment length'] = algn_len
                        content['AB end'] = query_end
                        content['AB start'] = query_start

                    if content['All J alignments']:
                        (query_start, query_end, germ_start, germ_end, algn_len,
                         num_mismatch, num_ins, num_del, shm,
                         alignment_string) = ParseAlignment(content['All J alignments'])
                        content['JGENE: Query start'] = query_start
                        content['JGENE: Query end'] = query_end
                        content['JGENE: Germline start'] = germ_start
                        content['JGENE: Germline end'] = germ_end
                        content['JGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                        content['JGENE: Mismatch'] = num_mismatch
                        content['JGENE: Insertion'] = num_ins
                        content['JGENE: Deletion'] = num_del
                        content['JGENE: Alignment'] = alignment_string
                        content['JGENE: Shm.per'] = round(100 * shm, 3)
                        content['JGENE: Alignment length'] = algn_len
                        # the J alignment always defines the end; the start
                        # only when no V alignment set it
                        content['AB end'] = query_end
                        if 'AB start' not in content:
                            content['AB start'] = query_start

                    content['Orientation'] = guess_strand(content['Full NT'],
                                                          content['Sequence'])
                    content['Command'] = command_string
                    # missing labels are written as empty strings
                    content = defaultdict(str, content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line) + '\n')

                except Exception as e:
                    errors += 1
                    print('There was an error in sequence: ' + str(seq_num))
                    print('Error: ' + str(e))
                    error_file.write(
                        '****ERROR FOUND IN SEQUENCE:{0} ****\n'.format(
                            str(seq_num)))
                    error_file.write(useful.print_error_string(e) + '\n')
                    error_file.write('MIXCR DATA: \n')
                    error_file.write(json.dumps(content, indent=4) + '\n')
                    error_file.write('*************END OF ERROR*********\n')
    finally:
        iffile.IFclass.close()
        seqfile.IFclass.close()
        error_file.close()
        unfound_seqs.close()

    # drop the side files when nothing was recorded in them
    if errors == 0:
        os.remove(errorlog_path)
    if notfound == 0:
        os.remove(notfound_path)

    t2 = time.time()
    print(str(t2 - t1))
    return outfile
def _guess_recombination_type(vgene, jgene):
    """Guess a recombination type from a V gene call, falling back to the J gene.

    For each non-empty gene call (V first, then J) the name returned by
    ``ProcessGene`` is looked up in ``recomb_call`` using its first three
    characters and, failing that, its first two characters.

    Returns the first truthy value found, or ``''`` when nothing matches.
    """
    for gene_call in (vgene, jgene):
        if not gene_call:
            continue
        gene_name = ProcessGene(gene_call)
        guess = ''
        # Check the three-letter prefix first, then the two-letter prefix
        # (IGBLAST reports TA rather than TRA, for example).
        for prefix in (gene_name[:3], gene_name[:2]):
            if prefix in recomb_call:
                guess = recomb_call[prefix]
                break
        if guess:
            return guess
    return ''


def Descriptive_Statistics(list_of_files,input_file_type,analysis_name='',exp_names = [],output_file_prefix='',fields={},statistics_to_run=['ab_aa','cdr3','vgene','jgene','vjgene']):
    """Run descriptive statistics over one or more annotated sequence files.

    Parameters
    ----------
    list_of_files : a file path or a list of them (for ``'IMGT'`` input each
        entry may itself be a list of files).
    input_file_type : file type string passed to ``readfile.immunogrepFile``.
    analysis_name : name used to look up default field names in
        ``fields_for_analysis`` (upper-cased before lookup).
    exp_names : one label per input file; ignored (and regenerated from the
        file base names for non-IMGT input) when its length does not match.
    output_file_prefix : prefix for every generated file; defaults to the
        first input file without its extension.
    fields : user overrides mapping internal field keys (``'vgene'``,
        ``'cdr3'``, ...) to the column names used in the input files.
    statistics_to_run : any of ``'ab_aa'``, ``'cdr3'``, ``'vgene'``,
        ``'jgene'``, ``'vjgene'``.

    The default list/dict arguments are only read, never mutated.

    Returns
    -------
    dict with ``'files'`` (paths of the text files reported as generated)
    and ``'figures'`` (paths of the plots reported as created).

    Raises
    ------
    Exception
        If the analysis is unknown/custom and no ``fields`` were provided.
    """
    analysis_name = analysis_name.upper()
    if input_file_type=='IMGT' and not isinstance(list_of_files[0],list):
        list_of_files = [list_of_files]
    elif not isinstance(list_of_files,list):
        list_of_files = [list_of_files]

    if len(exp_names)!=len(list_of_files):
        exp_names = []

    # By default, save results to the same folder as the input file
    if not output_file_prefix:
        output_file_prefix = useful.removeFileExtension(list_of_files[0])

    supported_analyses = fields_for_analysis.keys()
    if (not analysis_name or analysis_name=='CUSTOM' or analysis_name not in supported_analyses) and not fields:
        raise Exception('The required fields for the provided analysis, {0}, is not currently automated. Please explicity provide the fields names'.format(str(analysis_name)))

    # First use the default fields defined for this analysis...
    if analysis_name in supported_analyses:
        fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name])
    else:
        fields_to_use = {}
    # ...then layer the user-defined fields on top
    for field_key,name in fields.items():
        fields_to_use[field_key] = name

    filenames_to_use = [f[0] if isinstance(f,list) else f for f in list_of_files]
    print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now())))
    print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use)))

    unique_aa_file = None
    unique_cdr3_file = None
    v_gene_analysis = None
    j_gene_analysis = None
    vj_gene_analysis = None
    gene_analysis_plot = output_file_prefix
    plots_created = []
    gene_summary_file = output_file_prefix+'.summary_of_stats.txt'

    aa_files = ['AB AA SEQUENCE','RECOMBINATION_TYPE','LOCUS','CDR1','CDR2','CDR3','STOP CODONS','PRODUCTIVE','VGENES','DGENES','JGENES','TOTAL COUNTS']
    fields_order = ['full_len_ab','recomb','locus','cdr1','cdr2','cdr3','stopc','functionality','vgene','dgene','jgene']
    num_exp= len(list_of_files)

    if not exp_names and input_file_type!='IMGT':
        # Label each experiment by file base name, adding _1, _2, ... on clashes
        exp_names = []
        for file_path in list_of_files:
            count = 1
            str_file = os.path.basename(file_path)
            while str_file in exp_names:
                str_file = os.path.basename(file_path)+'_'+str(count)
                count+=1
            exp_names.append(str_file)

    cdr3analysis = 'cdr3' in statistics_to_run
    aaanalysis = 'ab_aa' in statistics_to_run

    aa_handle = None
    if aaanalysis:
        # Records are first written to a temp/intermediate file
        intermediate_file = output_file_prefix+'.unique_aa_file_temp'
        aa_handle = open(intermediate_file,'w')

    vjgene_dict=defaultdict(lambda:defaultdictgenes(num_exp))
    cdr3_dict_vdj = defaultdict(lambda:defaultdictcdr3(num_exp))
    cdr3_dict_vj = defaultdict(lambda:defaultdictcdr3(num_exp))
    cdr3_dict_unk = defaultdict(lambda:defaultdictcdr3(num_exp))

    # Snapshot the field names requested from the reader BEFORE the synthetic
    # 'stopc'/'recomb' entries are added below (a list, so it stays a snapshot
    # on Python 3 where dict.values() is a live view).
    use_these_fields = list(fields_to_use.values())
    fields_to_use['stopc'] = 'stopc'

    num_results = [0]*(num_exp)
    num_cdr3 = [0]*(num_exp)
    num_stop_codon = [0]*(num_exp)
    num_vdj = [0]*(num_exp)
    num_vj = [0]*(num_exp)
    num_sequences = [0]*(num_exp)

    # The user may never have defined a field for recombination type; then it
    # has to be guessed per sequence from the gene calls via recomb_call.
    # .get() so that a missing key is treated like an empty one.
    if not fields_to_use.get('recomb'):
        recomb_not_defined = True
        fields_to_use['recomb'] = 'recomb'
    else:
        recomb_not_defined = False

    full_len_field = fields_to_use['full_len_ab']
    vgene_field = fields_to_use['vgene']
    dgene_field = fields_to_use['dgene']
    jgene_field = fields_to_use['jgene']
    cdr3_field = fields_to_use['cdr3']
    recomb_field = fields_to_use['recomb']

    print('Reading through sequences in file(s)')
    seqnum=1
    # Go through all of the files and tally the relevant fields. When a unique
    # amino acid file is requested, also report these fields to the temp file.
    try:
        for fnum,each_file in enumerate(list_of_files):
            annotated_file = readfile.immunogrepFile(each_file,input_file_type,field_names = use_these_fields)
            for seq_lines in annotated_file.read():
                if not seq_lines:
                    continue
                if seqnum%500000==0:
                    print('Read {0} sequences'.format(str(seqnum)))
                seqnum+=1
                num_sequences[fnum]+=1
                seq_lines = defaultdict(str,seq_lines)

                # NOTE(review): the original indentation was lost; only this
                # counter is assumed to be guarded by the test - confirm.
                if seq_lines[full_len_field]:
                    num_results[fnum]+=1

                # Only keep the first gene in the list and drop the allele name ('*')
                seq_lines[vgene_field] = seq_lines[vgene_field].split(',')[0].split('*')[0]
                seq_lines[dgene_field] = seq_lines[dgene_field].split(',')[0].split('*')[0]

                # If no recombination type is found or provided, guess it
                # from the V gene call, then from the J gene call
                if recomb_not_defined or not seq_lines[recomb_field]:
                    seq_lines[recomb_field] = _guess_recombination_type(seq_lines[vgene_field],seq_lines[jgene_field])

                if not seq_lines[recomb_field]:
                    continue

                if seq_lines[recomb_field] == 'VDJ':
                    num_vdj[fnum]+=1
                elif seq_lines[recomb_field] == 'VJ':
                    num_vj[fnum]+=1

                seq_lines[jgene_field] = seq_lines[jgene_field].split(',')[0].split('*')[0]
                seq_lines['stopc'] = 'YES' if '*' in seq_lines[full_len_field] else 'NO'
                if seq_lines['stopc'] == 'YES':
                    num_stop_codon[fnum]+=1

                if aaanalysis:
                    # Write only the wanted fields, in a fixed order, followed
                    # by the 1-based experiment number so that the sorted file
                    # still records which experiment each sequence came from.
                    exp_str = str(fnum+1)
                    aa_handle.write('\t'.join([seq_lines[fields_to_use[f]] for f in fields_order])+'\t'+str(exp_str)+'\n')

                if seq_lines[vgene_field] or seq_lines[jgene_field]:
                    key_v =delim.join([seq_lines[vgene_field],seq_lines[jgene_field],seq_lines[recomb_field]])
                    vjgene_dict[key_v][fnum]+=1

                if not seq_lines[cdr3_field]:
                    # No CDR3 found
                    continue

                num_cdr3[fnum]+=1
                if cdr3analysis:
                    key = seq_lines[cdr3_field]
                    if seq_lines[recomb_field]=='VDJ':
                        cdr3_dict_vdj[key][fnum]+=1
                    elif seq_lines[recomb_field]=='VJ':
                        cdr3_dict_vj[key][fnum]+=1
                    else:
                        print('unknown recombination types: ',seq_lines[recomb_field])
                        cdr3_dict_unk[key][fnum]+=1

                # NOTE(review): this stops reading once more than 10000 records
                # have been seen; it looks like a debugging leftover and its
                # exact nesting could not be recovered - confirm it is intended.
                if seqnum>10000:
                    break
    finally:
        # Never leak the temp file handle, even if reading fails part-way
        if aa_handle is not None:
            aa_handle.close()

    if aaanalysis:
        print('Generating a file of unique AB amino acid sequences')
        unique_aa_file = output_file_prefix+'.unique_aa_file.txt'
        # Build the unique amino acid file from the intermediate file
        GenerateAAFile(intermediate_file,unique_aa_file,aa_files,exp_names)

    if {'vgene','jgene','vjgene'} & set(statistics_to_run):
        # vjgene_dict format:
        #   key   = vgene, jgene and recombination type joined by delim
        #   value = list of counts, one entry per provided file/experiment
        gene_df = pd.DataFrame(vjgene_dict).transpose()
        if 'VGENE' not in gene_df.columns:
            gene_df['VGENE'] = ''
        if 'JGENE' not in gene_df.columns:
            gene_df['JGENE'] = ''
        if 'recomb' not in gene_df.columns:
            gene_df['recomb'] = ''
        gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1)
        gene_df = gene_df.reset_index()
        gene_df = gene_df.apply(ModifyPDTable,axis=1,args=(['VGENE','JGENE','recomb'],delim))

        # Rename the columns 0,1,...,num experiments to the experiment names
        gene_df = gene_df.rename(columns=dict(enumerate(exp_names)))
        # gene_df now has: one column per experiment, then VGENE, JGENE,
        # recomb and TOTAL_COUNTS, with a default integer index.
        # NOTE(review): DataFrame.sort() below only exists in old pandas
        # releases - confirm the pinned pandas version before upgrading.

        if 'vgene' in statistics_to_run:
            print('Performing V gene analysis')
            v_gene_analysis = output_file_prefix+'.vgenes.txt'
            sorted_v_counts = gene_df.groupby(['recomb','VGENE']).sum()
            vgene_level = sorted_v_counts.index.names.index('VGENE')
            # Remove results where VGENE is empty
            if '' in list(sorted_v_counts.index.levels[vgene_level]):
                sorted_v_counts = sorted_v_counts.drop('',level='VGENE')
            ignore_counts = ['TOTAL_COUNTS','JGENE']
            keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts]
            g = sorted_v_counts[keep_col]
            recomb_values = list(g.index.levels[g.index.names.index('recomb')])
            # Plot the frequency for every experiment (PlotGeneDist adds '.png')
            if 'VDJ' in recomb_values:
                vdj_g = g.xs('VDJ',level='recomb')
                PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.vgenes','VH Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)
                plots_created.append(gene_analysis_plot+'.vdj.vgenes.png')
            if 'VJ' in recomb_values:
                vj_g = g.xs('VJ',level='recomb')
                PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.vgenes','VL Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)
                plots_created.append(gene_analysis_plot+'.vj.vgenes.png')
            sorted_v_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(v_gene_analysis,sep='\t',index=False)

        # Same as above, except for J genes this time
        if 'jgene' in statistics_to_run:
            print('Performing J gene analysis')
            j_gene_analysis = output_file_prefix+'.jgenes.txt'
            sorted_j_counts = gene_df.groupby(['recomb','JGENE']).sum()
            jgene_level = sorted_j_counts.index.names.index('JGENE')
            if '' in list(sorted_j_counts.index.levels[jgene_level]):
                sorted_j_counts.drop('',level='JGENE',inplace=True)
            ignore_counts = ['TOTAL_COUNTS','VGENE']
            keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts]
            g = sorted_j_counts[keep_col]
            sorted_j_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(j_gene_analysis,sep='\t',index=False)
            recomb_values = list(g.index.levels[g.index.names.index('recomb')])
            if 'VDJ' in recomb_values:
                vdj_g = g.xs('VDJ',level='recomb')
                PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.jgenes','JH Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
                plots_created.append(gene_analysis_plot+'.vdj.jgenes.png')
            if 'VJ' in recomb_values:
                vj_g = g.xs('VJ',level='recomb')
                PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.jgenes','JL Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
                plots_created.append(gene_analysis_plot+'.vj.jgenes.png')

        # Now perform a V-J gene analysis (heat map) for each experiment
        if 'vjgene' in statistics_to_run:
            print('Performing V-J gene analysis')
            vj_gene_analysis = output_file_prefix+'.v_and_jgene_analysis.txt'
            # Rename empty V and J genes as ' No call', then group by
            # recombination type, V gene and J gene and sum each column
            vj_df = gene_df.replace([''],[' No call']).groupby(['recomb','VGENE','JGENE']).sum()
            vj_df.to_csv(vj_gene_analysis,sep='\t')
            vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True)
            # Look the level up on vj_df itself: the previous code used the
            # frame `g` from the V/J sections, which is undefined when only
            # 'vjgene' is requested.
            recomb_values = list(vj_df.index.levels[vj_df.index.names.index('recomb')])
            # Calculate the frequency for each recombination type
            if 'VDJ' in recomb_values:
                v1 = vj_df.loc['VDJ',:]/vj_df.loc['VDJ',:].sum()
                PlotVJGeneHeatMap(v1,gene_analysis_plot+'.vdj.v_and_jgene_analysis',max_val=None,min_val=None)
                plots_created.append(gene_analysis_plot+'.vdj.v_and_jgene_analysis.png')
            if 'VJ' in recomb_values:
                v2 = vj_df.loc['VJ',:]/vj_df.loc['VJ',:].sum()
                PlotVJGeneHeatMap(v2,gene_analysis_plot+'.vj.v_and_jgene_analysis',max_val=None,min_val=None)
                plots_created.append(gene_analysis_plot+'.vj.v_and_jgene_analysis.png')
            del vj_df
        del gene_df

    # CDR3 analysis
    cdr3_length_stats = {}
    diversity_measurements = {}
    if cdr3analysis:
        unique_cdr3_file = output_file_prefix+'.unique_cdr3_counts.txt'
        print('Performing CDR3 analyisis')
        if sum(num_cdr3)>0:
            # Build one dataframe per recombination type from the unique CDR3
            # counts and stack them under a 'recomb' index level
            print('Loading CDR3s into a dataframe')
            cdr3_df_list = [pd.DataFrame.from_dict(c,orient='index') for c in [cdr3_dict_vdj,cdr3_dict_vj,cdr3_dict_unk]]
            keys=['VDJ','VJ','UNK']
            cdr3_df = pd.concat(cdr3_df_list,keys=keys)
            cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1)
            print('Dataframe created')
            cdr3_df.index.names = ['recomb','CDR3']
            cdr3_df = cdr3_df.reset_index()
            cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len)
            # Rename the count columns to match the experiment names
            cdr3_df = cdr3_df.rename(columns=dict(enumerate(exp_names)))
            cdr3_df.sort(['recomb','TOTAL_COUNTS'],ascending=[1,0],inplace=True)
            cdr3_df.set_index(['recomb','CDR3'],inplace=True)
            # Save the dataframe as a tab-delimited file
            cdr3_df.to_csv(unique_cdr3_file,sep='\t')
            cdr3_length_stats = PlotCDR3Histogram(cdr3_df,gene_analysis_plot+'.cdr3_length_histogram')
            plots_created.append(gene_analysis_plot+'.cdr3_length_histogram.png')
            diversity_measurements = CalculateDiversities(cdr3_df,gene_analysis_plot+'.cdr3_diversity_plots')
            plots_created.append(gene_analysis_plot+'.cdr3_diversity_plots.png')
            del cdr3_df

    print('Writing summary to file')
    # Finally make a results text file that summarizes all the information
    GenerateResultsSummaryFile(gene_summary_file,statistics_to_run,list_of_files,exp_names,unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements)

    files_generated = [gene_summary_file]
    for optional_file in (unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis):
        if optional_file:
            files_generated.append(optional_file)

    print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now())))
    gc.collect()
    return {'files':files_generated,'figures':plots_created}
def isotype_sequences(input_file,input_file_type,barcode_file='',output_file=None,output_format='TAB',seq_var='sequence',header_var='header',helper_fields = {},alignment_settings = {},analysis_name = None):
    """Align every sequence in *input_file* against isotype barcodes and write the results.

    Parameters
    ----------
    input_file, input_file_type : location and type of the sequence file,
        passed to ``readwrite.immunogrepFile``.
    barcode_file : barcode file to read; the default barcodes are used when
        it is empty or does not exist.
    output_file : destination path; defaults to the input file name without
        its extension plus ``'.isotype.annotation'`` (also used when it equals
        *input_file*).
    output_format : format handed to ``writeSeqResult``; a tab-joined header
        row is written first for ``'TAB'`` and ``'CSV'``.
    seq_var, header_var : names of the sequence and header fields in a record.
    helper_fields : optional record field names, read under the keys
        ``'recombination_var'`` and ``'end_of_ab_field'``.
    alignment_settings : optional overrides for ``'penalize_truncations'``
        (True), ``'minimum_alignment_length'`` (15), ``'search_rc'`` (2),
        ``'allowed_mismatches_in_alignment'`` (2), ``'strand_corrected'`` (False).
    analysis_name : if given, stored upper-cased as ``ANALYSIS_NAME`` in the
        translator line written at the top of the output.

    Neither default dict argument is mutated.

    Returns
    -------
    The path of the output file that was written.
    """
    recombination_var = helper_fields.get('recombination_var','')
    end_of_ab_field = helper_fields.get('end_of_ab_field','')

    penalize_truncations = alignment_settings.get('penalize_truncations',True)
    minimum_alignment_length = alignment_settings.get('minimum_alignment_length',15)
    # 0 => only consider barcodes as provided
    # 1 => only consider the reverse complement of the barcodes provided
    # 2 => consider both strands
    search_rc = alignment_settings.get('search_rc',2)
    allowed_mismatches_in_alignment = alignment_settings.get('allowed_mismatches_in_alignment',2)
    # True => the sequence field already holds the sense strand; when False
    # both the sequence and its reverse complement are considered
    strand_corrected = alignment_settings.get('strand_corrected',False)

    translator_field = copy.deepcopy(translator)
    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()
    translator_field = {translation_var:translator_field}

    if output_file is None or output_file==input_file:
        output_file = useful.removeFileExtension(input_file)+'.isotype.annotation'

    outHandle = open(output_file,'w')
    # try/finally so the output handle is released even if alignment fails
    try:
        # Write a translator line so that we know how to add results to the database
        outHandle.write(descriptor_symbol+json.dumps(translator_field)+'\n')
        if output_format == 'TAB' or output_format == 'CSV':
            outHandle.write('\t'.join(FileDelimFields)+'\n')

        if not barcode_file:
            barcodeSeqList = defaultBarcodes()
        elif not(os.path.isfile(barcode_file)):
            print('Barcode file not found! Using default barcodes')
            barcodeSeqList = defaultBarcodes()
        else:
            barcodeSeqList = readBarcodeFile(barcode_file)

        command_string = json.dumps({'Barcodes':barcodeSeqList,'mismatch_cutoff':allowed_mismatches_in_alignment,'penalize_truncations':penalize_truncations,'minimum_length_cutoff':minimum_alignment_length})

        # First pass over the file: maximum sequence length and sequence count
        iffile = readwrite.immunogrepFile(filelocation=input_file,filetype=input_file_type)
        [maxLen,numSeq] = maxSeqLen(iffile,seq_var)

        guessed_num_bases_after_jgene = 60
        isotype_predictor =fft_tools.BarcodeAligner(barcodeSeqList,penalize_truncations,search_rc,allowed_mismatches_in_alignment,minimum_alignment_length,nmax=maxLen,nmin=guessed_num_bases_after_jgene)

        overlap_len = 10
        counter = 0
        startPer = 0
        found = 0
        print("Starting isotyping analysis for {0} sequences".format(numSeq))
        a = int(round(time.time()))

        # Second pass: re-open the file and align each sequence
        iffile = readwrite.immunogrepFile(filelocation=input_file,filetype=input_file_type)
        summary_data = {'found':0,'top_isotype':defaultdict(int),'average_mismatch':0,'average_num_isotype_found':0}
        for line_row in iffile.read():
            jsonVar = {}
            if not line_row:
                continue

            if header_var in line_row:
                if idIdentifier in line_row:
                    jsonVar[idIdentifier] = line_row[idIdentifier]
                    jsonVar['Header'] = line_row[header_var]
                else:
                    [header,seq_id] = GrabAdditionalHeaderInfo(line_row[header_var])
                    jsonVar[idIdentifier] = seq_id
                    jsonVar['Header'] = header

            if seq_var not in line_row or line_row[seq_var]=='':
                jsonVar['Sequence']=''
                jsonVar['Notes'] = 'No sequence found'
                writeSeqResult(outHandle,jsonVar,output_format)
                continue

            # Allow the user to monitor what percent of the sequences have been processed
            startPer = useful.LoopStatus(counter,numSeq,10,startPer)

            jsonVar['Sequence'] = line_row[seq_var]
            jsonVar['Command'] = command_string
            counter+=1

            seqFwd = jsonVar['Sequence']
            if strand_corrected:
                all_seqs = [seqFwd]
            else:
                all_seqs = [seqFwd,str(Seq(seqFwd).reverse_complement())]

            found_strand =''
            for pos,each_seq in enumerate(all_seqs):
                # Only consider nucleotides AFTER the end of the antibody
                # field when the record provides that position
                if end_of_ab_field in line_row and line_row[end_of_ab_field]!='':
                    try:
                        end_of_ab = int(line_row[end_of_ab_field])
                    except (TypeError, ValueError):
                        end_of_ab = 0
                    # NOTE(review): the bounds test uses end_of_ab-overlap_len
                    # but the slice starts at end_of_ab - confirm intended.
                    if end_of_ab-overlap_len<len(each_seq) and end_of_ab-overlap_len>=0:
                        each_seq = each_seq[end_of_ab:]
                isotypes_results = isotype_predictor.AlignToSeq(each_seq)
                if isotypes_results:
                    found_strand = strand_orientation_list[pos]
                    break

            if isotypes_results:
                found += 1
                # Alignment results override same-named keys already in jsonVar
                jsonVar.update(isotypes_results.items())
                jsonVar['Sequence strand'] = found_strand
                if recombination_var in line_row and line_row[recombination_var]:
                    # Always trust the recombination type from the input file IF provided
                    jsonVar['Recombination type'] = line_row[recombination_var]
                else:
                    # Otherwise attempt to guess it from the top isotype
                    jsonVar['Recombination type'] = GuessRecombType(jsonVar['Isotype'][0])
                summary_data['top_isotype'][jsonVar['Isotype'][0]]+=1
                summary_data['average_num_isotype_found']+=len(jsonVar['Isotype'])
                summary_data['average_mismatch']+=jsonVar['Mismatches'][0]
            else:
                if recombination_var in line_row and line_row[recombination_var]:
                    # Always trust the recombination type from the input file IF provided
                    jsonVar['Recombination type'] = line_row[recombination_var]
                jsonVar['Isotype'] = ''
                jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
                summary_data['top_isotype']['NotFound']+=1

            writeSeqResult(outHandle,jsonVar,output_format)

        b = int(round(time.time()))
        summary_data['found'] = found
        if found:
            summary_data['average_mismatch'] = summary_data['average_mismatch']/float(found)
            summary_data['average_num_isotype_found'] = summary_data['average_num_isotype_found']/float(found)
        totaltime=(b-a)

        # Single-argument print calls give the same output on Python 2 and 3
        print("time: ")
        print(totaltime)
        print("Summary of identified isotypes:")
        print(summary_data)
    finally:
        outHandle.close()

    return output_file
def pandas_read_chunks(input_file, filetype, fields, chunks=10000):
    '''
    Generator yielding pandas dataframes built from the provided input file.

    At most `chunks` rows are read from the file at a time.
    If the input file is not a delimited file (not CSV, TAB), the file is read with our
    own reader class and the dataframes are created from those results.

    Parameters:
        input_file: path of the file to read (the first element is printed when a list is given)
        filetype: file type string; when empty it is guessed with readfile.immunogrepFile
        fields: dict where key => field name we use in program, value => field name in provided file
        chunks: number of rows per yielded dataframe

    Yields:
        One dataframe per chunk whose columns are renamed to the program's field names.
        Each has an integer 'READS' column (1 where absent or missing) and a
        'RECOMBINATIONTYPE' column, and all remaining missing values are replaced by ''.
    '''
    # FLIPPED FIELDS -> flip the structure of `fields` (file field name => program field name)
    # to make it easier to parse the input files
    flipped_fields = {value: key for key, value in fields.items()}

    each_file = input_file
    if isinstance(each_file, list):
        print('Reading file: ', each_file[0])
    else:
        print('Reading file: ', each_file)

    # First figure out how the file will be read: directly as a delimited file,
    # or through our own methods for reading files
    if not filetype:
        # We need to guess the filetype
        temp = readfile.immunogrepFile(each_file)
        guessed_type = temp.getFiletype()
    else:
        guessed_type = filetype

    # Next figure out what fields we want to load from the file
    if guessed_type == 'IMGT':
        # we will need these fields from IMGT
        field_names = ['V-D-J-REGION_5', 'V-J-REGION_5', 'CDR1-IMGT_5', 'CDR2-IMGT_5', 'CDR3-IMGT_5', 'Functionality_1', 'V-GENE and allele_1', 'J-GENE and allele_1', 'V-REGION Nb of mutations_8']
    else:
        field_names = list(flipped_fields.keys())

    if guessed_type in ['TAB', 'CSV']:
        # this is easy to read into a pandas dataframe
        seps = {'TAB': '\t', 'CSV': ','}
        skip_lines = 0
        # Count the leading comment lines to skip, because we aren't using our immunogrep class reader here
        with open(each_file) as handle:
            for line in handle:
                if not line.strip().startswith(comment_line):
                    break
                skip_lines += 1
        # NOTE(review): the header is always read as 'TAB', even for CSV input - confirm
        tmp = readfile.immunogrepFile(each_file, 'TAB')
        header_names_in_file = tmp.getDescription()
        # Only request columns that actually exist in the file
        field_names = [f for f in field_names if f in header_names_in_file]
        tmp.IFclass.close()
        reader = pd.read_table(each_file, sep=seps[guessed_type], chunksize=chunks, usecols=field_names, dtype=object, skip_blank_lines=True, skiprows=skip_lines)
    elif guessed_type == 'IMGT':
        # We need to use our class for reading files
        reader = iffile_to_pandas(readfile.immunogrepFile(each_file, 'IMGT', required_files=[1, 5, 8], field_names=field_names, chunk_size=chunks))
    else:
        reader = iffile_to_pandas(readfile.immunogrepFile(each_file, guessed_type, field_names=field_names, chunk_size=chunks))

    for table in reader:
        # Rename the column names based on the translator provided (this df will have column names as we expect in program)
        table.rename(columns=flipped_fields, inplace=True)
        if 'READS' not in table.columns:
            table['READS'] = 1
        else:
            # Fill missing counts BEFORE casting: astype(int) raises on NaN, and a
            # chained inplace fillna on a column may not write back to the frame
            table['READS'] = table['READS'].fillna(1).astype(int)
        if 'RECOMBINATIONTYPE' not in table.columns:
            table['RECOMBINATIONTYPE'] = ''
        table.fillna('', inplace=True)
        yield table