def Write_Single_Field(filename=None,outfile_location=None,field=None,count_field = None, file_format=None,contains_header=True):
	"""
			This will extract a specific field from a file and write it to the output file as a single column (without a header.)
			If *count_field* = None, then we assume there are no counts associated with the field of interest.
			If *count_field* =! None, then we assume that column refers to the number of counts a sequence has occurred
	"""

	total_data = 0
	total_field = 0

	if outfile_location == None:
		outfile_location = filename+'_singlefield.txt'

	if filename is None or not os.path.isfile(filename):
		raise Exception("The pathname of the file is invalid")
	if field is None:
		IF_file = readwrite.immunogrepFile(filelocation=filename,filetype='TAB',contains_header=False,mode='r')
		print("Warning: no field name was provided. This file will be treated as a TAB file and the first column will be selected")
		field = 'Column 1'
	else:
		IF_file = readwrite.immunogrepFile(filelocation=filename,filetype=file_format,contains_header=contains_header,mode='r')

		if file_format is None:
			guessedFiletype = IF_file.getFiletype()
			print("Warning: no file type was provided. The file was predicted to be a "+guessedFiletype+" file.")
	outfile = open(outfile_location,'w')
	try:
		while not IF_file.IFclass.eof:
			data = IF_file.IFclass.read() #read in each line as a dictionary
			if data:
				total_data+=1
				if field in data and data[field]:
					value = data[field]
					if count_field is not None and count_field in data and data[count_field]:
						count = data[count_field] #this defines the number of times the value occurred
					else:
						count = '1'
					outfile.write(value+'\t'+count+'\n') #write the value and its count to the new file
					total_field+=1
		outfile.close()
	except Exception as e:
		outfile.close()
		os.remove(outfile_location) #remove the partially written output file
		print_error(e)

	return [total_data,total_field]
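# A minimal usage sketch for Write_Single_Field; the input filename and the
# field names 'CDR3_AA' and 'Counts' are hypothetical and must match the
# header of your own tab-delimited file.
total_lines, lines_written = Write_Single_Field(filename='annotations.txt',
                                                field='CDR3_AA',
                                                count_field='Counts',
                                                file_format='TAB')
print("Read "+str(total_lines)+" lines, wrote "+str(lines_written)+" values")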
def parseMIXCR(originalfileloc,resultfileloc,inputype,outfile=None,header_var='document_header',sequence_var='sequence',command_val = {}):
								
	command_string = json.dumps(command_val) if command_val else json.dumps({'MIXCR V1.3': 'Unknown settings'})
	if not outfile:
		outfile = "%s-parsed.annotation"%resultfileloc
	
	print('Parsing mixcr file')
	number_of_annotation_lines = useful.file_line_count(resultfileloc)	
	
	seqfile=immunogrepFile(originalfileloc,inputype) #the original file used as an input file for mixcr annotation  	
	iffile=immunogrepFile(resultfileloc,'TAB',None)#,"\t",True,"r") #the mixcr generated alignment file 
	
	parent_folder = '/'.join(resultfileloc.split('/')[:-1])+'/'
	error_file = open(resultfileloc+'.errorlog.txt','w')
	unfound_seqs = open(resultfileloc+'.notfound.txt','w') 
	notfound=0
	seq_num=0
	errors=0
	needcapture = True
	
	looper = useful.LoopStatusGen(number_of_annotation_lines,10)
	t1 = time.time()
	with open(outfile,"w") as f:
		f.write(descriptor_symbol+json.dumps(DatabaseTranslator())+'\n')#write a translator line to this file so that we know how to add results to database 
		f.write('\t'.join(presetlabels)+'\n')		
		
		#read each input sequence/file
		for fastseq in seqfile.read():			
			try:
				content={}
				if not fastseq:
					continue
				
				#read in the annotation information from mixcr
				seq = fastseq[sequence_var].upper()
				#extract sequence header and the SEQ_ID field from input file 
				[header, id] = GetHeaderInfo(fastseq,header_var)
				
				if needcapture:
					#we need to match this sequence to mixcr program output
					if iffile.IFclass.eof:
						mixcr_data = None
					else:
						mixcr_data = iffile.IFclass.read()
						#print percent status completed
						looper.next()
				
				#check whether mixcr data matches the current sequence 
				strand=''
				if mixcr_data:
					if 'Read id' in mixcr_data:
						if int(mixcr_data['Read id']) == seq_num:
							matched_seqs = True
						else:
							matched_seqs=False							
					elif 'Description R1' in mixcr_data:
						if mixcr_data['Description R1'].strip() == fastseq[header_var].strip():
							matched_seqs=True							
						else:
							matched_seqs = False							
					else:
						mixcr_data['Read(s) sequence'] = mixcr_data['Read(s) sequence'].upper()				
						[matched_seqs,strand] = match_sequence(seq,mixcr_data['Read(s) sequence'])					
					mixcr_seq = mixcr_data['Read(s) sequence']
				else:
					mixcr_seq = ''
					matched_seqs=False
					strand=''
					needcapture=True
								
				if matched_seqs==False:
					#these results did not match the mixcr sequence, so this input sequence probably did not yield any results;
					#keep the current mixcr result rather than capturing a new one on the next iteration
					needcapture = False
					content['Sequence']=seq
					content['Seqheader']=header
					content['Notes'] = 'Sequence not found in mixcr file;'
					content[idIdentifier] = id
					unfound_seqs.write('\t'.join([content['Seqheader'],content['Sequence'],mixcr_seq])+'\n')														
					content['Command'] = command_string
					content = defaultdict(str,content)
					output_line = [str(content[lab]) for lab in presetlabels]
					f.write('\t'.join(output_line)+'\n')					
					notfound+=1
					seq_num+=1		
					continue
				
				seq_num+=1		
				#in the next iteration of the code, we will need to get a fresh mixcr result
				needcapture=True						
				content = mixcr_data
				content['Notes'] = ''
				content[idIdentifier] = id
				content['Seqheader'] = header
				r_j = ''
				r_v = ''
				chain_v = ''
				
				content['Sequence']=seq
				content['Strand corrected sequence'] = content['Read(s) sequence']				
				[content['Full NT'],content['5_Prime_Annotation'],content['3_Prime_Annotation'],missing_fields]=return_full_nt(content)				
				if missing_fields:
					content['Notes']+='The sequence is missing features between the 5 prime and 3 prime region;'
					content['3_Prime_Annotation']=content['3_Prime_Annotation']+'*'				
					content['Full length'] = 'FALSE'
				else:
					if content['5_Prime_Annotation'] == 'FR1' and content['3_Prime_Annotation'] == 'FR4':						
						content['Full length'] = 'TRUE'
					else:
						content['Full length'] = 'FALSE'
					
				[content['Full AA'],content['Productivity']] =GetFullAA(content,missing_fields)
				if content['AA. seq. CDR3'] and content['AA. seq. CDR3'] in content['Full AA']:
					content['CDR3_Junction_In_Frame']= 'TRUE'
				else:
					content['CDR3_Junction_In_Frame']= 'FALSE'
					
				if content['All V hits']:
					[vgenelist,vscorelist,vlocus,chain_v,r_v]=extractScores(content['All V hits'])
					content['All V hits']=','.join(vgenelist)
					content['All V scores']=','.join(vscorelist)
					content['FirstVgene']=vgenelist[0]
					content['Locus']=vlocus
																	
				else:
					content['All V hits']=''
					content['All V scores']=''
					content['FirstVgene']=''
					content['Locus']=''
					
				if content['All D hits']:
					[dgenelist,dscorelist,dlocus,chain,recomb]=extractScores(content['All D hits'])
					content['All D hits']=','.join(dgenelist)
					content['All D scores']=','.join(dscorelist)
					content['FirstDgene']=dgenelist[0]
				else:
					content['All D hits']=''
					content['All D scores']=''
					content['FirstDgene']=''
					
					
				if content['All J hits']:
					[jgenelist,jscorelist,jlocus,chain,r_j]=extractScores(content['All J hits'])
					content['All J hits']=','.join(jgenelist)
					content['All J scores']=','.join(jscorelist)
					content['FirstJgene']=jgenelist[0]
				else:
					r_j = r_v
					content['All J hits']=''
					content['All J scores']=''
					content['FirstJgene']=''
					
				if content['All C hits']:
					[cgenelist,cscorelist,clocus,chain,recomb]=extractScores(content['All C hits'])
					content['All C hits']=','.join(cgenelist)
					content['All C scores']=','.join(cscorelist)
				else:
					content['All C hits']=''
					content['All C scores']=''
				if r_j == r_v:
					content['Recombination Type'] = r_v
					content['Chain'] = chain_v
				else:
					content['Recombination Type'] = ''
					content['Chain'] = ''
				
				if content['All V alignment']:
					[query_start,query_end,germ_start,germ_end,algn_len,num_mismatch,num_ins,num_del,shm,alignment_string] = ParseAlignment(content['All V alignment'])
					content['VGENE: Query start'] = query_start
					content['VGENE: Query end'] = query_end
					content['VGENE: Germline start'] = germ_start
					content['VGENE: Germline end'] = germ_end
					content['VGENE: Shm.nt'] = num_ins+num_del+num_mismatch
					content['VGENE: Mismatch'] = num_mismatch
					content['VGENE: Insertion'] = num_ins
					content['VGENE: Deletion'] = num_del
					content['VGENE: Alignment'] = alignment_string
					content['VGENE: Shm.per'] = round(100*shm,3)
					content['VGENE: Alignment length'] = algn_len						
					content['AB end'] = query_end
					content['AB start'] = query_start
					
				if content['All J alignment']:
					[query_start,query_end,germ_start,germ_end,algn_len,num_mismatch,num_ins,num_del,shm,alignment_string] = ParseAlignment(content['All J alignment'])
					content['JGENE: Query start'] = query_start
					content['JGENE: Query end'] = query_end
					content['JGENE: Germline start'] = germ_start
					content['JGENE: Germline end'] = germ_end
					content['JGENE: Shm.nt'] = num_ins+num_del+num_mismatch
					content['JGENE: Mismatch'] = num_mismatch
					content['JGENE: Insertion'] = num_ins
					content['JGENE: Deletion'] = num_del
					content['JGENE: Alignment'] = alignment_string
					content['JGENE: Shm.per'] = round(100*shm,3)
					content['JGENE: Alignment length'] = algn_len
					content['AB end'] = query_end
					if 'AB start' not in content:
						content['AB start'] = query_start
								
				content['Orientation'] = guess_strand(content['Full NT'],content['Sequence'])
				content['Command'] = command_string
				content = defaultdict(str,content)
				output_line = [str(content[lab]) for lab in presetlabels]
				f.write('\t'.join(output_line)+'\n')
																				
			except Exception as e:
				errors+=1
				print('There was an error in sequence: '+str(seq_num))
				print('Error: '+str(e))				
				error_file.write('****ERROR FOUND IN SEQUENCE:{0}  ****\n'.format(str(seq_num)))
				error_file.write(useful.print_error_string(e)+'\n')
				error_file.write('MIXCR DATA: \n')
				error_file.write(json.dumps(content,indent=4)+'\n')
				error_file.write('*************END OF ERROR*********\n')
														
					
	iffile.IFclass.close()
	seqfile.IFclass.close()
	error_file.close()
	unfound_seqs.close()
	
	if errors==0:
		os.remove(resultfileloc+'.errorlog.txt')
	
	if notfound==0:
		os.remove(resultfileloc+'.notfound.txt')
	t2 = time.time()
	print('Parsing completed in '+str(t2-t1)+' seconds')
	return outfile
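# A minimal sketch of calling parseMIXCR, assuming a FASTQ input file and the
# tab-delimited alignment export produced by MiXCR for that file; both paths
# and the command settings shown are hypothetical.
parsed = parseMIXCR('reads.fastq',
                    'reads.mixcr.alignments.txt',
                    'FASTQ',
                    command_val={'MIXCR V1.3': 'align --chains IGH'})
print('Parsed annotation written to '+parsed)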
Example #3
def isotype_sequences(input_file,
                      input_file_type,
                      barcode_file='',
                      output_file=None,
                      output_format='TAB',
                      seq_var='sequence',
                      header_var='header',
                      helper_fields={},
                      alignment_settings={},
                      analysis_name=None):
    #####OVERHEAD FUNCTIONS

    help_1 = defaultdict(str, copy.deepcopy(helper_fields))
    recombination_var = help_1['recombination_var']
    strand_field = help_1['strand_field']
    end_of_ab_field = help_1['end_of_ab_field']

    al_1 = copy.deepcopy(alignment_settings)

    penalize_truncations = al_1[
        'penalize_truncations'] if 'penalize_truncations' in al_1 else True

    minimum_alignment_length = al_1[
        'minimum_alignment_length'] if 'minimum_alignment_length' in al_1 else 15

    #0 => only consider barcodes as provided
    #1 => only consider the reverse complement of the barcodes provided
    #2 => consider both strands
    search_rc = al_1['search_rc'] if 'search_rc' in al_1 else 2

    allowed_mismatches_in_alignment = al_1[
        'allowed_mismatches_in_alignment'] if 'allowed_mismatches_in_alignment' in al_1 else 2

    #the sequence field provided is the sequence of the SENSE AB gene, not the antisense;
    #when False, consider both the forward and reverse complement of the sequence
    strand_corrected = al_1[
        'strand_corrected'] if 'strand_corrected' in al_1 else False

    #file locations
    seq_fasta_location = input_file  #  functionVars["folder_location"]+functionVars["input_file"] #location of input file

    translator_field = copy.deepcopy(translator)

    if analysis_name:
        translator_field['ANALYSIS_NAME'] = analysis_name.upper()

    translator_field = {translation_var: translator_field}
    if output_file == None or output_file == input_file:
        output_file = useful.removeFileExtension(
            input_file) + '.isotype.annotation'

    output_file_location = output_file

    output_file_format = output_format  #functionVars['write_format']
    #seqHandle = open(seq_fasta_location,"rU")

    outHandle = open(output_file_location, 'w')
    outHandle.write(
        descriptor_symbol + json.dumps(translator_field) + '\n'
    )  #write a translator line to this file so that we know how to add results to database
    if output_format == 'TAB' or output_format == 'CSV':
        outHandle.write('\t'.join(FileDelimFields) + '\n')

    if not barcode_file:  #no barcode file given; fall back to the defaults
        #manually using these primers
        barcodeSeqList = defaultBarcodes()
    elif not (os.path.isfile(barcode_file)):
        print('Barcode file not found! Using default barcodes')
        #manually using these primers
        barcodeSeqList = defaultBarcodes()
    else:
        barcodeSeqList = readBarcodeFile(barcode_file)

    command_string = json.dumps({
        'Barcodes':
        barcodeSeqList,
        'mismatch_cutoff':
        allowed_mismatches_in_alignment,
        'penalize_truncations':
        penalize_truncations,
        'minimum_length_cutoff':
        minimum_alignment_length
    })

    iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,
                                      filetype=input_file_type)

    #get maximum length of sequences in file
    [maxLen, numSeq] = maxSeqLen(iffile, seq_var)

    #make a call to the generator for aligning sequences to isotypes
    guessed_num_bases_after_jgene = 60
    isotype_predictor = fft_tools.BarcodeAligner(
        barcodeSeqList,
        penalize_truncations,
        search_rc,
        allowed_mismatches_in_alignment,
        minimum_alignment_length,
        nmax=maxLen,
        nmin=guessed_num_bases_after_jgene)

    ###END OF OVERHEAD FUNCTIONS

    #now lets read through the sequences and start aligning
    algnLim = 10
    currentSeq = 0
    overlap_len = 10

    #seqHandle=open(seq_fasta_location,"rU")
    counter = 0
    startPer = 0

    num_isotype_found = {}
    total_isotype_found = 0
    total_found_score = 0
    total_notfound_score = 0

    print("Starting isotyping analysis for {0} sequences".format(numSeq))

    totaltime = 0
    a = int(round(time.time()))
    found = 0

    iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,
                                      filetype=input_file_type)
    summary_data = {
        'found': 0,
        'top_isotype': defaultdict(int),
        'average_mismatch': 0,
        'average_num_isotype_found': 0
    }

    for line_row in iffile.read():
        jsonVar = {}
        if not line_row:
            continue

        if header_var in line_row:
            if idIdentifier in line_row:
                jsonVar[idIdentifier] = line_row[idIdentifier]
                jsonVar['Header'] = line_row[header_var]
            else:
                [header, id] = GrabAdditionalHeaderInfo(line_row[header_var])
                jsonVar[idIdentifier] = id
                jsonVar['Header'] = header

        if seq_var not in line_row or line_row[seq_var] == '':
            jsonVar['Sequence'] = ''
            jsonVar['Notes'] = 'No sequence found'
            writeSeqResult(outHandle, jsonVar, output_format)
            continue

        #allow the user to monitor what percent of the sequences have been processed
        startPer = useful.LoopStatus(counter, numSeq, 10, startPer)

        bestScore = 0
        bestBarcode = -1

        jsonVar['Sequence'] = line_row[seq_var]
        jsonVar['Command'] = command_string
        counter += 1

        seqFwd = jsonVar['Sequence']

        if strand_corrected:
            all_seqs = [seqFwd]
        else:
            all_seqs = [seqFwd, str(Seq(seqFwd).reverse_complement())]

        found_strand = ''
        for pos, each_seq in enumerate(all_seqs):
            #determine if we should take a substring of the sequence
            #basically, only consider nucleotides AFTER the end of the ab field
            if end_of_ab_field in line_row and line_row[end_of_ab_field] != '':
                try:
                    end_of_ab = int(line_row[end_of_ab_field])
                except:
                    end_of_ab = 0
                #take substring
                if end_of_ab - overlap_len < len(
                        each_seq) and end_of_ab - overlap_len >= 0:
                    each_seq = each_seq[end_of_ab:]

            isotypes_results = isotype_predictor.AlignToSeq(each_seq)
            if isotypes_results:
                found_strand = strand_orientation_list[pos]
                break

        if isotypes_results:
            found += 1
            jsonVar = dict(jsonVar.items() + isotypes_results.items())

            jsonVar['Sequence strand'] = found_strand

            if recombination_var in line_row and line_row[recombination_var]:
                #always trust the recombination type from input file IF provided
                jsonVar['Recombination type'] = line_row[recombination_var]
            else:
                #if there is no result, then attempt to guess it ourselves
                jsonVar['Recombination type'] = GuessRecombType(
                    jsonVar['Isotype'][0])

            summary_data['top_isotype'][jsonVar['Isotype'][0]] += 1
            summary_data['average_num_isotype_found'] += len(
                jsonVar['Isotype'])
            summary_data['average_mismatch'] += jsonVar['Mismatches'][0]
        else:
            if recombination_var in line_row and line_row[recombination_var]:
                #always trust the recombination type from input file IF provided
                jsonVar['Recombination type'] = line_row[recombination_var]

            jsonVar['Isotype'] = ''
            jsonVar[
                'Notes'] = 'Could not identify isotype with alignment score above threshold'
            summary_data['top_isotype']['NotFound'] += 1

        writeSeqResult(outHandle, jsonVar, output_format)

    b = int(round(time.time()))

    summary_data['found'] = found
    if found:
        summary_data['average_mismatch'] = summary_data[
            'average_mismatch'] / float(found)
        summary_data['average_num_isotype_found'] = summary_data[
            'average_num_isotype_found'] / float(found)

    totaltime = (b - a)

    print "time: "
    print totaltime

    print "Summary of identified isotypes:"
    print summary_data

    #if total_isotype_found>0:
    #	print "\nAverage score for identified isotypes:"
    #	print str(total_found_score/float(total_isotype_found))

    #if numSeq-total_isotype_found>0:
    #	print "\nAverage score for unidentified isotypes:"
    #	print str(total_notfound_score/float(numSeq-total_isotype_found))

    outHandle.close()
    #if output_file_format=="txt":
    #	JSON_to_TXT(output_file_location, output_file_location, True,{'Header':1,'Seq':2,'dir':3,'isotype':4,'algnPos':5,'maxscore':6,'bestscore':7})
    return output_file
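# A minimal sketch of calling isotype_sequences on a FASTQ file with the
# default barcodes; the helper_fields keys are the ones the function reads,
# while the column names assigned to them ('Recombination Type', 'AB end',
# 'Orientation') are hypothetical and must exist in your annotated input.
out = isotype_sequences('reads.fastq',
                        'FASTQ',
                        helper_fields={'recombination_var': 'Recombination Type',
                                       'strand_field': 'Orientation',
                                       'end_of_ab_field': 'AB end'},
                        alignment_settings={'allowed_mismatches_in_alignment': 2,
                                            'minimum_alignment_length': 15,
                                            'search_rc': 2})
print('Isotype annotation written to '+out)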
Example #4
def Write_Single_Field(filename=None,
                       outfile_location=None,
                       field=None,
                       count_field=None,
                       file_format=None,
                       contains_header=True):
    """
			This will extract a specific field from a file and write it to the output file as a single column (without a header.)
			If *count_field* = None, then we assume there are no counts associated with the field of interest.
			If *count_field* =! None, then we assume that column refers to the number of counts a sequence has occurred
	"""

    total_data = 0
    total_field = 0

    if outfile_location == None:
        outfile_location = filename + '_singlefield.txt'

    if filename is None or not os.path.isfile(filename):
        raise Exception("The pathname of the file is invalid")
    if field is None:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype='TAB',
                                           contains_header=False,
                                           mode='r')
        print(
            "Warning: no field name was provided. This file will be treated as a TAB file and the first column will be selected"
        )
        field = 'Column 1'
    else:
        IF_file = readwrite.immunogrepFile(filelocation=filename,
                                           filetype=file_format,
                                           contains_header=contains_header,
                                           mode='r')

        if file_format is None:
            guessedFiletype = IF_file.getFiletype()
            print("Warning: no file type was provided. The file was predicted to be a "
                  + guessedFiletype + " file.")
    outfile = open(outfile_location, 'w')
    try:
        while not IF_file.IFclass.eof:
            data = IF_file.IFclass.read()  #read in each line as a dictionary
            if data:
                total_data += 1
                if field in data and data[field]:
                    value = data[field]
                    if count_field is not None and count_field in data and data[count_field]:
                        count = data[count_field]  #this defines the number of times the value occurred
                    else:
                        count = '1'
                    outfile.write(value + '\t' + count + '\n')  #write the value and its count to the new file
                    total_field += 1
        outfile.close()
    except Exception as e:
        outfile.close()
        os.remove(outfile_location)  #remove the partially written output file
        print_error(e)

    return [total_data, total_field]
Example #5
def parseMIXCR(originalfileloc,
               resultfileloc,
               inputype,
               outfile=None,
               header_var='document_header',
               sequence_var='sequence',
               quality_var='phred',
               command_val={}):

    command_string = json.dumps(command_val) if command_val else json.dumps(
        {'MIXCR V1.3': 'Unknown settings'})
    if not outfile:
        outfile = "%s-parsed.annotation" % resultfileloc

    print('Parsing mixcr file')
    number_of_annotation_lines = useful.file_line_count(resultfileloc)

    seqfile = immunogrepFile(
        originalfileloc, inputype
    )  #the original file used as an input file for mixcr annotation
    iffile = immunogrepFile(
        resultfileloc, 'TAB',
        None)  #,"\t",True,"r") #the mixcr generated alignment file

    parent_folder = '/'.join(resultfileloc.split('/')[:-1]) + '/'
    error_file = open(resultfileloc + '.errorlog.txt', 'w')
    unfound_seqs = open(resultfileloc + '.notfound.txt', 'w')
    notfound = 0
    seq_num = 0
    errors = 0
    needcapture = True

    looper = useful.LoopStatusGen(number_of_annotation_lines, 10)
    t1 = time.time()
    with open(outfile, "w") as f:
        f.write(
            descriptor_symbol + json.dumps(DatabaseTranslator()) + '\n'
        )  #write a translator line to this file so that we know how to add results to database
        f.write('\t'.join(presetlabels) + '\n')

        #read each input sequence/file
        for fastseq in seqfile.read():
            try:
                content = {}
                if not fastseq:
                    continue

                #read in the annotation information from mixcr
                seq = fastseq[sequence_var].upper()
                #extract sequence header and the SEQ_ID field from input file
                [header, id] = GetHeaderInfo(fastseq, header_var)

                if needcapture:
                    #we need to match this sequence to mixcr program output
                    if iffile.IFclass.eof:
                        mixcr_data = None
                    else:
                        mixcr_data = iffile.IFclass.read()
                        #print percent status completed
                        looper.next()

                #check whether mixcr data matches the current sequence
                strand = ''
                if mixcr_data:
                    if 'Read id' in mixcr_data:
                        if int(mixcr_data['Read id']) == seq_num:
                            matched_seqs = True
                        else:
                            matched_seqs = False
                    elif 'Description R1' in mixcr_data:
                        if mixcr_data['Description R1'].strip(
                        ) == fastseq[header_var].strip():
                            matched_seqs = True
                        else:
                            matched_seqs = False
                    else:
                        mixcr_data['Read(s) sequence'] = mixcr_data[
                            'Read(s) sequence'].upper()
                        [matched_seqs, strand
                         ] = match_sequence(seq,
                                            mixcr_data['Read(s) sequence'])
                    mixcr_seq = mixcr_data['Read(s) sequence']
                else:
                    mixcr_seq = ''
                    matched_seqs = False
                    strand = ''
                    needcapture = True

                if matched_seqs == False:
                    #these results did not match the mixcr sequence, so this input sequence probably did not yield any results;
                    #keep the current mixcr result rather than capturing a new one on the next iteration
                    needcapture = False
                    content['Sequence'] = seq
                    content['Seqheader'] = header
                    if quality_var in fastseq:
                        content['Read(s) sequence qualities'] = fastseq[
                            quality_var]
                    content['Notes'] = 'Sequence not found in mixcr file;'
                    content[idIdentifier] = id
                    unfound_seqs.write('\t'.join([
                        content['Seqheader'], content['Sequence'], mixcr_seq
                    ]) + '\n')
                    content['Command'] = command_string
                    content = defaultdict(str, content)
                    output_line = [str(content[lab]) for lab in presetlabels]
                    f.write('\t'.join(output_line) + '\n')
                    notfound += 1
                    seq_num += 1
                    continue

                seq_num += 1
                #in the next iteration of the code, we will need to get a fresh mixcr result
                needcapture = True
                content = mixcr_data
                content['Notes'] = ''
                content[idIdentifier] = id
                content['Seqheader'] = header
                r_j = ''
                r_v = ''
                chain_v = ''

                content['Sequence'] = seq
                content['Strand corrected sequence'] = content[
                    'Read(s) sequence']
                [
                    content['Full NT'], content['5_Prime_Annotation'],
                    content['3_Prime_Annotation'], missing_fields
                ] = return_full_nt(content)
                if missing_fields:
                    content[
                        'Notes'] += 'The sequence is missing features between the 5 prime and 3 prime region;'
                    content['3_Prime_Annotation'] = content[
                        '3_Prime_Annotation'] + '*'
                    content['Full length'] = 'FALSE'
                else:
                    if content['5_Prime_Annotation'] == 'FR1' and content[
                            '3_Prime_Annotation'] == 'FR4':
                        content['Full length'] = 'TRUE'
                    else:
                        content['Full length'] = 'FALSE'

                [content['Full AA'],
                 content['Productivity']] = GetFullAA(content, missing_fields)
                if content['AA. Seq. CDR3'] and content[
                        'AA. Seq. CDR3'] in content['Full AA']:
                    content['CDR3_Junction_In_Frame'] = 'TRUE'
                else:
                    content['CDR3_Junction_In_Frame'] = 'FALSE'

                if content['All V hits']:
                    [vgenelist, vscorelist, vlocus, chain_v,
                     r_v] = extractScores(content['All V hits'])
                    content['All V hits'] = ','.join(vgenelist)
                    content['All V scores'] = ','.join(vscorelist)
                    content['FirstVgene'] = vgenelist[0]
                    content['Locus'] = vlocus

                else:
                    content['All V hits'] = ''
                    content['All V scores'] = ''
                    content['FirstVgene'] = ''
                    content['Locus'] = ''

                if content['All D hits']:
                    [dgenelist, dscorelist, dlocus, chain,
                     recomb] = extractScores(content['All D hits'])
                    content['All D hits'] = ','.join(dgenelist)
                    content['All D scores'] = ','.join(dscorelist)
                    content['FirstDgene'] = dgenelist[0]
                else:
                    content['All D hits'] = ''
                    content['All D scores'] = ''
                    content['FirstDgene'] = ''

                if content['All J hits']:
                    [jgenelist, jscorelist, jlocus, chain,
                     r_j] = extractScores(content['All J hits'])
                    content['All J hits'] = ','.join(jgenelist)
                    content['All J scores'] = ','.join(jscorelist)
                    content['FirstJgene'] = jgenelist[0]
                else:
                    r_j = r_v
                    content['All J hits'] = ''
                    content['All J scores'] = ''
                    content['FirstJgene'] = ''

                if content['All C hits']:
                    [cgenelist, cscorelist, clocus, chain,
                     recomb] = extractScores(content['All C hits'])
                    content['All C hits'] = ','.join(cgenelist)
                    content['All C scores'] = ','.join(cscorelist)
                else:
                    content['All C hits'] = ''
                    content['All C scores'] = ''
                if r_j == r_v:
                    content['Recombination Type'] = r_v
                    content['Chain'] = chain_v
                else:
                    content['Recombination Type'] = ''
                    content['Chain'] = ''

                if content['All V alignments']:
                    [
                        query_start, query_end, germ_start, germ_end, algn_len,
                        num_mismatch, num_ins, num_del, shm, alignment_string
                    ] = ParseAlignment(content['All V alignments'])
                    content['VGENE: Query start'] = query_start
                    content['VGENE: Query end'] = query_end
                    content['VGENE: Germline start'] = germ_start
                    content['VGENE: Germline end'] = germ_end
                    content['VGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                    content['VGENE: Mismatch'] = num_mismatch
                    content['VGENE: Insertion'] = num_ins
                    content['VGENE: Deletion'] = num_del
                    content['VGENE: Alignment'] = alignment_string
                    content['VGENE: Shm.per'] = round(100 * shm, 3)
                    content['VGENE: Alignment length'] = algn_len
                    content['AB end'] = query_end
                    content['AB start'] = query_start

                if content['All J alignments']:
                    [
                        query_start, query_end, germ_start, germ_end, algn_len,
                        num_mismatch, num_ins, num_del, shm, alignment_string
                    ] = ParseAlignment(content['All J alignments'])
                    content['JGENE: Query start'] = query_start
                    content['JGENE: Query end'] = query_end
                    content['JGENE: Germline start'] = germ_start
                    content['JGENE: Germline end'] = germ_end
                    content['JGENE: Shm.nt'] = num_ins + num_del + num_mismatch
                    content['JGENE: Mismatch'] = num_mismatch
                    content['JGENE: Insertion'] = num_ins
                    content['JGENE: Deletion'] = num_del
                    content['JGENE: Alignment'] = alignment_string
                    content['JGENE: Shm.per'] = round(100 * shm, 3)
                    content['JGENE: Alignment length'] = algn_len
                    content['AB end'] = query_end
                    if 'AB start' not in content:
                        content['AB start'] = query_start

                content['Orientation'] = guess_strand(content['Full NT'],
                                                      content['Sequence'])
                content['Command'] = command_string
                content = defaultdict(str, content)
                output_line = [str(content[lab]) for lab in presetlabels]
                f.write('\t'.join(output_line) + '\n')

            except Exception as e:
                errors += 1
                print('There was an error in sequence: ' + str(seq_num))
                print('Error: ' + str(e))
                error_file.write(
                    '****ERROR FOUND IN SEQUENCE:{0}  ****\n'.format(
                        str(seq_num)))
                error_file.write(useful.print_error_string(e) + '\n')
                error_file.write('MIXCR DATA: \n')
                error_file.write(json.dumps(content, indent=4) + '\n')
                error_file.write('*************END OF ERROR*********\n')

    iffile.IFclass.close()
    seqfile.IFclass.close()
    error_file.close()
    unfound_seqs.close()
    if errors == 0:
        os.remove(resultfileloc + '.errorlog.txt')
    if notfound == 0:
        os.remove(resultfileloc + '.notfound.txt')
    t2 = time.time()
    print('Parsing completed in ' + str(t2 - t1) + ' seconds')
    return outfile
def Descriptive_Statistics(list_of_files,input_file_type,analysis_name='',exp_names = [],output_file_prefix='',fields={},statistics_to_run=['ab_aa','cdr3','vgene','jgene','vjgene']):
	analysis_name = analysis_name.upper()
	if input_file_type=='IMGT' and not isinstance(list_of_files[0],list):
		list_of_files = [list_of_files]
	elif not isinstance(list_of_files,list):
		list_of_files = [list_of_files]
		
	if len(exp_names)!=len(list_of_files):
		exp_names = []
	
	#by default, save results to the same folder as the input file
	if not output_file_prefix:		
		output_file_prefix = useful.removeFileExtension(list_of_files[0])
	
	supported_analyses = fields_for_analysis.keys()
	if (not analysis_name or analysis_name=='CUSTOM' or analysis_name not in supported_analyses) and not fields:
		raise Exception('The required fields for the provided analysis, {0}, are not currently automated. Please explicitly provide the field names'.format(str(analysis_name)))
	
	#first we use the default fields defined here
	if analysis_name in supported_analyses:
		fields_to_use = copy.deepcopy(fields_for_analysis[analysis_name])
	else:
		fields_to_use = {}
	#next we add in user defined fields just in case there are any changes/mistakes
	for f,name in fields.iteritems():
		fields_to_use[f] = name 
	
	
	
	filenames_to_use = [f[0] if isinstance(f,list) else f for f in list_of_files]
	print('Performing descriptive statistics at {0}.'.format(str(datetime.datetime.now())))
	print('Analyzing the following files:\n\t {0}'.format('\n\t'.join(filenames_to_use)))
	unique_aa_file = None 
	unique_cdr3_file = None 	
	v_gene_analysis = None
	j_gene_analysis = None
	vj_gene_analysis = None
	gene_analysis_plot = output_file_prefix
	plots_created = []
	gene_summary_file = output_file_prefix+'.summary_of_stats.txt'
	
	
	output_file_names = {}
	
	aa_files = ['AB AA SEQUENCE','RECOMBINATION_TYPE','LOCUS','CDR1','CDR2','CDR3','STOP CODONS','PRODUCTIVE','VGENES','DGENES','JGENES','TOTAL COUNTS']
	fields_order = ['full_len_ab','recomb','locus','cdr1','cdr2','cdr3','stopc','functionality','vgene','dgene','jgene']
	num_exp= len(list_of_files)
	if not exp_names:
		if input_file_type=='IMGT':
			pass
		else:			
			exp_names = []
			for file in list_of_files:
				count = 1
				str_file = os.path.basename(file)
				while True:					
					if str_file in exp_names:
						str_file = os.path.basename(file)+'_'+str(count)
						count+=1
					else:
						exp_names.append(str_file)
						break			
		
	if 'ab_aa' in statistics_to_run:
		intermediate_file =  output_file_prefix+'.unique_aa_file_temp'
		#first we will use a temporary/intermediate file 
		output_file_names['ab_aa'] = open(intermediate_file,'w')
		#output_file_names['ab_aa'].write('\t'.join(aa_files)+'\n')
	
	
	cdr3analysis = 'cdr3' in statistics_to_run
	aaanalysis = 'ab_aa' in statistics_to_run
	
	vjgene_dict=defaultdict(lambda:defaultdictgenes(num_exp))
	
	#cdr3_dict=defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_vdj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_vj = defaultdict(lambda:defaultdictcdr3(num_exp))	
	cdr3_dict_unk = defaultdict(lambda:defaultdictcdr3(num_exp))	
	
	use_these_fields = fields_to_use.values()
	fields_to_use['stopc'] = 'stopc'
	num_results = [0]*(num_exp)
	num_cdr3 = [0]*(num_exp)
	num_stop_codon = [0]*(num_exp)
	num_vdj = [0]*(num_exp)
	num_vj = [0]*(num_exp)
	num_sequences = [0]*(num_exp)
	
	
	if not fields_to_use['recomb']:
		#maybe the user never defined a field for recombination type; that could be a problem because we will have to guess it using the variable at the top of the script: recomb_call
		recomb_not_defined = True	
		fields_to_use['recomb'] = 'recomb'
	else:
		recomb_not_defined = False
	
	
	print('Reading through sequences in file(s)')
	seqnum=1
	#go through all of the files and report the relevant fields 
	#if we are creating a unique amino acid file, then report these fields to a temp file
	for fnum,each_file in enumerate(list_of_files):				
		annotated_file = readfile.immunogrepFile(each_file,input_file_type,field_names = use_these_fields)
		#loop through each file 
		for seq_lines in annotated_file.read():						
			if not seq_lines:
				continue
			if seqnum%500000==0:
				print('Read {0} sequences'.format(str(seqnum)))
			seqnum+=1
			num_sequences[fnum]+=1			
			seq_lines = defaultdict(str,seq_lines)
			if seq_lines[fields_to_use['full_len_ab']]:
				#a full length antibody sequence was found
				num_results[fnum]+=1
												
			#only select the first gene in the list; also remove the allele designation ('*')
			seq_lines[fields_to_use['vgene']] = seq_lines[fields_to_use['vgene']].split(',')[0].split('*')[0]
			seq_lines[fields_to_use['dgene']] = seq_lines[fields_to_use['dgene']].split(',')[0].split('*')[0]
						
			#IF NO RECOMBINATION TYPE IS FOUND or provided, THEN guess it using the vgene or jgene call
			if recomb_not_defined or not seq_lines[fields_to_use['recomb']]:
				r = '' #not sure what the recombination type is yet
				#try to guess the recombination type 				
				if seq_lines[fields_to_use['vgene']]:
					#use vgene if present
					# look at the first three characters in the V gene to predict recombination type
					gn = ProcessGene(seq_lines[fields_to_use['vgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]
				if not r and seq_lines[fields_to_use['jgene']]:
					#still no recombination type found, so use the J gene 
					gn = ProcessGene(seq_lines[fields_to_use['jgene']])
					if gn[:3] in recomb_call:
						r = recomb_call[gn[:3]]
					elif gn[:2] in recomb_call: #next check the first two letters (IGBLAST reports TA rather than TRA, for example)
						r = recomb_call[gn[:2]]					
				
				#update recomb result 
				seq_lines[fields_to_use['recomb']] = r								
			
			if not seq_lines[fields_to_use['recomb']]:
				continue
								
			if seq_lines[fields_to_use['recomb']] == 'VDJ':
				num_vdj[fnum]+=1								
			elif seq_lines[fields_to_use['recomb']] == 'VJ':				
				num_vj[fnum]+=1
				
			
			seq_lines[fields_to_use['jgene']] = seq_lines[fields_to_use['jgene']].split(',')[0].split('*')[0]
			seq_lines['stopc'] = 'YES' if '*' in seq_lines[fields_to_use['full_len_ab']] else 'NO'			
			if seq_lines['stopc'] == 'YES':
				num_stop_codon[fnum]+=1
			if aaanalysis:
				exp_str = str(fnum+1)
				#make an intermediate file where we only put the fields we want in the proper order from any file 
				#we will use this field for sorting afterwards
				#also output exp_num to account for which sequence came from which experiment 
				output_file_names['ab_aa'].write('\t'.join([seq_lines[fields_to_use[f]] for f in fields_order])+'\t'+str(exp_str)+'\n')
			if seq_lines[fields_to_use['vgene']] or seq_lines[fields_to_use['jgene']]:							
				key_v =delim.join([seq_lines[fields_to_use['vgene']],seq_lines[fields_to_use['jgene']],seq_lines[fields_to_use['recomb']]])
				vjgene_dict[key_v][fnum]+=1
			
			if not seq_lines[fields_to_use['cdr3']]:
				#no cdr3 found 	
				continue
			
			#add unique cdr3_recomb and vjgene info to dictionaires
			num_cdr3[fnum]+=1
			
			if cdr3analysis:
				key = seq_lines[fields_to_use['cdr3']]
				#key_cdr3 = delim.join([],seq_lines[fields_to_use['recomb']]])
				if seq_lines[fields_to_use['recomb']]=='VDJ':
					cdr3_dict_vdj[key][fnum]+=1
				elif seq_lines[fields_to_use['recomb']]=='VJ':
					cdr3_dict_vj[key][fnum]+=1					
				else:
					print('unknown recombination types: ',seq_lines[fields_to_use['recomb']])
					cdr3_dict_unk[key][fnum]+=1 
									  
				
					
	if aaanalysis:
		output_file_names['ab_aa'].close()
		print('Generating a file of unique AB amino acid sequences')
		unique_aa_file = output_file_prefix+'.unique_aa_file.txt'
		#Use some bash to make a unique amino acid file using sorting and then some awk 
		GenerateAAFile(intermediate_file,unique_aa_file,aa_files,exp_names)
		#number of amino acid sequences observed
		if not os.path.isfile(unique_aa_file):
			num_unique_aa = 0 
		else:
			num_unique_aa = useful.file_line_count(unique_aa_file)-1 #-1 => remove header row count
	
	#Now have some fun with pandas 	
	if set(['vgene','jgene','vjgene']) & set(statistics_to_run):
		#vjgene_dict format = {
			#key = 'vgene' + delim + 'jgene' + delim + 'recombtype'
			#value = [count, count, ...] => a list of counts for the presence of that key in EACH provided file/experiment; length of list = number of experiments
		#}
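		#illustrative entry (assuming delim is '_'): {'IGHV1-69_IGHJ4_VDJ': [12, 7]}
		#i.e. that V/J/recombination combination was seen 12 times in experiment 1 and 7 times in experiment 2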
		gene_df = pd.DataFrame(vjgene_dict).transpose()
		if 'VGENE' not in gene_df.columns:
			gene_df['VGENE'] = ''
		if 'JGENE' not in gene_df.columns:
			gene_df['JGENE'] = ''
		if 'recomb' not in gene_df.columns:
			gene_df['recomb'] = ''
		gene_df['TOTAL_COUNTS'] = gene_df.sum(axis=1)		
		gene_df = gene_df.reset_index()				
		gene_df = gene_df.apply(ModifyPDTable,axis=1,args=(['VGENE','JGENE','recomb'],delim))
		
		
		new_names = {}
		for f,v in enumerate(exp_names):
			new_names[f]=v
			#key = experiment index number
			#value = new name

		#rename the columns 0,1,...num experiments to match the experiment names 
		gene_df = gene_df.rename(columns=new_names)
		
		#format of gene_df:
			#index => no index set, just use default numbers
			#columns => start with column for each experiment, then add the following columns: VGENE, JGENE, recomb, TOTAL_COUNTS

		if 'vgene' in statistics_to_run:
			print('Performing V gene analysis')
			
			v_gene_analysis = output_file_prefix+'.vgenes.txt'
			#group elements by VH GENE CALLS and VL gene calls 
			sorted_v_counts =  gene_df.groupby(['recomb','VGENE']).sum()#.count()#.sort('VGENE',ascending=1)						
			
			#find out which level in the multilevel index corresponds to 'VGENE' => looking at the above code, it should be level 1 (recomb should be level 0)
			vgene_level = sorted_v_counts.index.names.index('VGENE')			
			
			#remove results where the V gene is empty
			if '' in list(sorted_v_counts.index.levels[vgene_level]):
				sorted_v_counts = sorted_v_counts.drop('',level='VGENE')			
			
			ignore_counts = ['TOTAL_COUNTS','JGENE']
			keep_col = [n for n in sorted_v_counts.columns if n not in ignore_counts]
			g = sorted_v_counts[keep_col]			
			
			#NOW PLOT the FREQUENCY for every experiment 
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vdj_g = g.xs('VDJ',level='recomb')
				
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.vgenes','VH Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)
				
				plots_created.append(gene_analysis_plot+'.vdj.vgenes.png') #.png extension is added in the function plotgenedist
				
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				
				vj_g = g.xs('VJ',level='recomb')
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.vgenes','VL Gene Distribution','Frequency','V Gene',max_val=None,min_val=0)			
				plots_created.append(gene_analysis_plot+'.vj.vgenes.png') #.png extension is added in the function plotgenedist
			sorted_v_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(v_gene_analysis,sep='\t',index=False)			
			
		
		#do the same as above, except for J genes this time 
		if 'jgene' in statistics_to_run:
			print('Performing J gene analysis')
			j_gene_analysis = output_file_prefix+'.jgenes.txt'
			sorted_j_counts =  gene_df.groupby(['recomb','JGENE']).sum()#.sort('VGENE',ascending=1)						
			jgene_level = sorted_j_counts.index.names.index('JGENE')			
			if '' in list(sorted_j_counts.index.levels[jgene_level]):
				sorted_j_counts.drop('',level='JGENE',inplace=True)			
			ignore_counts = ['TOTAL_COUNTS','VGENE']
			keep_col = [n for n in sorted_j_counts.columns if n not in ignore_counts]
			g = sorted_j_counts[keep_col]			
			sorted_j_counts.reset_index().sort(['recomb','TOTAL_COUNTS'],ascending=[1,0]).iloc[:,:-1].to_csv(j_gene_analysis,sep='\t',index=False)
			
			#NOW CALCULATE FREQUENCY for every experiment 
			if 'VDJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vdj_g = g.xs('VDJ',level='recomb')
				PlotGeneDist(vdj_g/vdj_g.sum(),gene_analysis_plot+'.vdj.jgenes','JH Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vdj.jgenes.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(g.index.levels[g.index.names.index('recomb')]):
				vj_g = g.xs('VJ',level='recomb')			
				PlotGeneDist(vj_g/vj_g.sum(),gene_analysis_plot+'.vj.jgenes','JL Gene Distribution','Frequency','J Gene',max_val=None,min_val=0,step=5)
				plots_created.append(gene_analysis_plot+'.vj.jgenes.png') #.png extension is added in the function plotgenedist		
		
		#now perform a V-J gene analysis (heat map) for each experiment 
		if 'vjgene' in statistics_to_run:
			print('Performing V-J gene analysis')
			vj_gene_analysis = output_file_prefix+'.v_and_jgene_analysis.txt'
			#group dataframe by recombination, vgene, and jgene 
			#first rename all V and J genes that are empty as 'No call' 
			#then group H/L results by V and J genes and take the sum of each column in the group 
			vj_df =  gene_df.replace([''],[' No call']).groupby(['recomb','VGENE','JGENE']).sum()
			vj_df.to_csv(vj_gene_analysis,sep='\t')			
			
			#remove TOTAL_COUNTS			
			vj_df.drop('TOTAL_COUNTS', axis=1, inplace=True)
			
			#calculate frequency for each recomb type 
			if 'VDJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):						
				v1 =  vj_df.loc['VDJ',:]/vj_df.loc['VDJ',:].sum()
				PlotVJGeneHeatMap(v1,gene_analysis_plot+'.vdj.v_and_jgene_analysis',max_val=None,min_val=None)
				plots_created.append(gene_analysis_plot+'.vdj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			if 'VJ' in list(vj_df.index.levels[vj_df.index.names.index('recomb')]):
				v2 =  vj_df.loc['VJ',:]/vj_df.loc['VJ',:].sum()
				PlotVJGeneHeatMap(v2,gene_analysis_plot+'.vj.v_and_jgene_analysis',max_val=None,min_val=None)																							
				plots_created.append(gene_analysis_plot+'.vj.v_and_jgene_analysis.png') #.png extension is added in the function plotgenedist
			del vj_df
		del gene_df
		
	#lets do some cdr3 analysis 									
	cdr3_length_stats = {}
	diversity_measurements = {}
	if cdr3analysis:	
		unique_cdr3_file = output_file_prefix+'.unique_cdr3_counts.txt' 
		print('Performing CDR3 analysis')
		if sum(num_cdr3)>0:
			#again create a pandas dataframe but this time using the unique cdr3 calls 
			print('Loading CDR3s into a dataframe')
			cdr3_df_list = [pd.DataFrame.from_dict(c,orient='index') for c in [cdr3_dict_vdj,cdr3_dict_vj,cdr3_dict_unk]]
			#merge all dataframes together
			keys=['VDJ','VJ','UNK']
			cdr3_df = pd.concat(cdr3_df_list,keys=keys)
			#cdr3_df = pd.DataFrame(cdr3_dict).transpose()			
			cdr3_df['TOTAL_COUNTS'] = cdr3_df.sum(axis=1)
			print('Dataframe created')
			
			cdr3_df.index.names = ['recomb','CDR3']
			cdr3_df = cdr3_df.reset_index()				
			#cdr3_df['CDR3'] = ''
			#cdr3_df['recomb'] = ''
			#cdr3_df = cdr3_df.apply(ModifyPDTable,axis=1,raw=True,reduce=True,args=(['CDR3','recomb'],delim))			
			
			new_names = {}
			#compute the length of each CDR3
			cdr3_df['CDR3_LENGTH'] = cdr3_df.CDR3.map(len) 
			for f,v in enumerate(exp_names):
				new_names[f]=v
			#rename the columns to match the experiment names 
			
			cdr3_df = cdr3_df.rename(columns=new_names)
			cdr3_df.sort(['recomb','TOTAL_COUNTS'],ascending=[1,0],inplace=True)
			cdr3_df.set_index(['recomb','CDR3'],inplace=True)					
			
			#save dataframe as a tab-delimited file 
			cdr3_df.to_csv(unique_cdr3_file,sep='\t')									
			
			cdr3_length_stats = PlotCDR3Histogram(cdr3_df,gene_analysis_plot+'.cdr3_length_histogram')
			plots_created.append(gene_analysis_plot+'.cdr3_length_histogram.png')
			
			diversity_measurements = CalculateDiversities(cdr3_df,gene_analysis_plot+'.cdr3_diversity_plots')
			plots_created.append(gene_analysis_plot+'.cdr3_diversity_plots.png')			
			del cdr3_df
	
	print('Writing summary to file')
	#finally make a results text file that summarizes all the information	
	GenerateResultsSummaryFile(gene_summary_file,statistics_to_run,list_of_files,exp_names,unique_aa_file,unique_cdr3_file,v_gene_analysis,j_gene_analysis,vj_gene_analysis,plots_created,num_sequences,num_results,num_vdj,num_vj,num_cdr3,num_stop_codon,cdr3_length_stats,diversity_measurements)	
	
	files_generated = [gene_summary_file]
	if unique_aa_file:
		files_generated.append(unique_aa_file)
	if unique_cdr3_file:
		files_generated.append(unique_cdr3_file)
	if v_gene_analysis:
		files_generated.append(v_gene_analysis)
	if j_gene_analysis:
		files_generated.append(j_gene_analysis)
	if vj_gene_analysis:
		files_generated.append(vj_gene_analysis)
	
	print('Descriptive statistics completed at {0}.'.format(str(datetime.datetime.now())))
	
	gc.collect()

	
	return {'files':files_generated,'figures':plots_created}
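# A minimal sketch of running Descriptive_Statistics over two hypothetical
# annotation files; 'MIXCR' is assumed to be a key of fields_for_analysis.
results = Descriptive_Statistics(['expA.annotation', 'expB.annotation'],
                                 'TAB',
                                 analysis_name='MIXCR',
                                 exp_names=['Experiment A', 'Experiment B'],
                                 statistics_to_run=['cdr3', 'vgene', 'jgene'])
print(results['files'])
print(results['figures'])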
Example #7
def isotype_sequences(input_file,input_file_type,barcode_file='',output_file=None,output_format='TAB',seq_var='sequence',header_var='header',helper_fields = {},alignment_settings = {},analysis_name = None):		
	#####OVERHEAD FUNCTIONS
	
	help_1 = defaultdict(str,copy.deepcopy(helper_fields))
	recombination_var = help_1['recombination_var']
	strand_field = help_1['strand_field']
	end_of_ab_field = help_1['end_of_ab_field']
		
	
	al_1 = copy.deepcopy(alignment_settings)
	
	penalize_truncations = al_1['penalize_truncations'] if 'penalize_truncations' in al_1 else True
	
	minimum_alignment_length = al_1['minimum_alignment_length'] if 'minimum_alignment_length' in al_1 else 15
	
	#0 => only consider barcodes as provided
	#1 => only consider the reverse complement of the barcodes provided 
	#2 => consider both strands 
	search_rc = al_1['search_rc'] if 'search_rc' in al_1 else 2
	
	allowed_mismatches_in_alignment = al_1['allowed_mismatches_in_alignment'] if 'allowed_mismatches_in_alignment' in al_1 else 2
	
	#the sequence field provided is the sequence of the SENSE AB gene, not the antisense;
	#when False, consider both the forward and reverse complement of the sequence 
	strand_corrected = al_1['strand_corrected'] if 'strand_corrected' in al_1 else False
		
				
	#file locations
	seq_fasta_location =input_file#  functionVars["folder_location"]+functionVars["input_file"] #location of input file
	
	translator_field = copy.deepcopy(translator)
	
	if analysis_name:
		translator_field['ANALYSIS_NAME'] = analysis_name.upper()
	
	
	translator_field = {translation_var:translator_field}
	if output_file == None or output_file==input_file:
		output_file = useful.removeFileExtension(input_file)+'.isotype.annotation'
	
	output_file_location = output_file
		
		
	output_file_format = output_format #functionVars['write_format']
	#seqHandle = open(seq_fasta_location,"rU")
		
	outHandle = open(output_file_location,'w')		
	outHandle.write(descriptor_symbol+json.dumps(translator_field)+'\n')#write a translator line to this file so that we know how to add results to database 
	if output_format == 'TAB' or output_format == 'CSV':
		outHandle.write('\t'.join(FileDelimFields)+'\n')
	
	if not barcode_file: #no barcode file given; fall back to the defaults
		#manually using these primers
		barcodeSeqList = defaultBarcodes()
	elif not(os.path.isfile(barcode_file)):
		print('Barcode file not found! Using default barcodes')		
		#manually using these primers
		barcodeSeqList = defaultBarcodes()
	else:
		barcodeSeqList = readBarcodeFile(barcode_file)
		
	command_string = json.dumps({'Barcodes':barcodeSeqList,'mismatch_cutoff':allowed_mismatches_in_alignment,'penalize_truncations':penalize_truncations,'minimum_length_cutoff':minimum_alignment_length})
	
	
	
	iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,filetype=input_file_type)
		
	#get maximum length of sequences in file 
	[maxLen,numSeq] = maxSeqLen(iffile,seq_var) 	
	
	#make a call to the generator for aligning sequences to isotypes 
	guessed_num_bases_after_jgene = 60
	isotype_predictor =fft_tools.BarcodeAligner(barcodeSeqList,penalize_truncations,search_rc,allowed_mismatches_in_alignment,minimum_alignment_length,nmax=maxLen,nmin=guessed_num_bases_after_jgene)		
					
	###END OF OVERHEAD FUNCTIONS
	
	
	#now lets read through the sequences and start aligning
	algnLim = 10
	currentSeq = 0
	overlap_len = 10
	
	#seqHandle=open(seq_fasta_location,"rU")
	counter = 0
	startPer = 0
	
	num_isotype_found = {}
	total_isotype_found = 0
	total_found_score=0
	total_notfound_score=0
	
	print("Starting isotyping analysis for {0} sequences".format(numSeq))

	
	totaltime = 0
	a = int(round(time.time()))
	found = 0 
	
	iffile = readwrite.immunogrepFile(filelocation=seq_fasta_location,filetype=input_file_type)
	summary_data = {'found':0,'top_isotype':defaultdict(int),'average_mismatch':0,'average_num_isotype_found':0}
	
	for line_row in iffile.read():			
		jsonVar = {}
		if not line_row:
			continue
		
		if header_var in line_row:
			if idIdentifier in line_row:
				jsonVar[idIdentifier] = line_row[idIdentifier]
				jsonVar['Header'] = line_row[header_var]
			else:
				[header,id] = GrabAdditionalHeaderInfo(line_row[header_var])
				jsonVar[idIdentifier] = id			
				jsonVar['Header'] = header
			
		
		if seq_var not in line_row or line_row[seq_var]=='':		
			jsonVar['Sequence']=''					
			jsonVar['Notes'] = 'No sequence found'			
			writeSeqResult(outHandle,jsonVar,output_format)			
			continue
								
		#allow the user to monitor what percent of the sequences have been processed					
		startPer = useful.LoopStatus(counter,numSeq,10,startPer)
		
		bestScore = 0
		bestBarcode = -1
			
		jsonVar['Sequence'] = line_row[seq_var]
		jsonVar['Command'] = command_string
		counter+=1		
				
		seqFwd = jsonVar['Sequence']
		
		if strand_corrected:
			all_seqs = [seqFwd]
		else:
			all_seqs = [seqFwd,str(Seq(seqFwd).reverse_complement())]
		
		
		found_strand =''
		for pos,each_seq in enumerate(all_seqs):										
			#determine if we should take a substring of the sequence 
			#basically, only consider nucleotides AFTER the end of the ab field 
			if end_of_ab_field in line_row and line_row[end_of_ab_field]!='':
				try:
					end_of_ab = int(line_row[end_of_ab_field])							
				except:
					end_of_ab = 0
				#take substring
				if end_of_ab-overlap_len<len(each_seq) and end_of_ab-overlap_len>=0:
					each_seq = each_seq[end_of_ab:]																							
										
			isotypes_results = isotype_predictor.AlignToSeq(each_seq)
			if isotypes_results:
				found_strand = strand_orientation_list[pos]
				break
		
		
		if isotypes_results:
			found += 1 
			jsonVar = dict(jsonVar.items()+isotypes_results.items())
			
			jsonVar['Sequence strand'] = found_strand			
			
			
			if recombination_var in line_row and line_row[recombination_var]:
				#always trust the recombination type from input file IF provided
				jsonVar['Recombination type'] = line_row[recombination_var]
			else:
				#if there is no result, then attempt to guess it ourselves
				jsonVar['Recombination type'] = GuessRecombType(jsonVar['Isotype'][0])
			
			summary_data['top_isotype'][jsonVar['Isotype'][0]]+=1
			summary_data['average_num_isotype_found']+=len(jsonVar['Isotype'])
			summary_data['average_mismatch']+=jsonVar['Mismatches'][0]
		else:
			if recombination_var in line_row and line_row[recombination_var]:
				#always trust the recombination type from input file IF provided
				jsonVar['Recombination type'] = line_row[recombination_var]
		
		
			jsonVar['Isotype'] = ''
			jsonVar['Notes'] = 'Could not identify isotype with alignment score above threshold'
			summary_data['top_isotype']['NotFound']+=1
				
		writeSeqResult(outHandle,jsonVar,output_format)
				
		
	
	b = int(round(time.time()))
	
	summary_data['found'] = found
	if found:
		summary_data['average_mismatch'] = summary_data['average_mismatch']/float(found) 
		summary_data['average_num_isotype_found'] = summary_data['average_num_isotype_found']/float(found)
		
	totaltime=(b-a)			
	
	print "time: "
	print totaltime
	
	print "Summary of identified isotypes:"
	print summary_data
	
	#if total_isotype_found>0:
	#	print "\nAverage score for identified isotypes:"	
	#	print str(total_found_score/float(total_isotype_found))		
	
	#if numSeq-total_isotype_found>0:	
	#	print "\nAverage score for unidentified isotypes:"	
	#	print str(total_notfound_score/float(numSeq-total_isotype_found))
			
	outHandle.close()	
	#if output_file_format=="txt":
	#	JSON_to_TXT(output_file_location, output_file_location, True,{'Header':1,'Seq':2,'dir':3,'isotype':4,'algnPos':5,'maxscore':6,'bestscore':7})
	return output_file 
Example #8
def pandas_read_chunks(input_file, filetype, fields, chunks=10000):
	'''
		Generator for creating pandas dataframes from the provided input_file.
		We will read x lines from the file at a time, where x = chunks.
		If the input file is not a delimited file (not CSV, TAB), then we will have to read it using our read class and create a dataframe from those results
	'''
	# FLIPPED FIELDS -> the parameter fields is as follows: key => field name we use in program, value => field name in provided file.
	# We will flip this structure to make it easier for parsing the input files
	flipped_fields = {value: key for key, value in fields.iteritems()}
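	# e.g. fields = {'CDR3': 'AA. Seq. CDR3'}  ->  flipped_fields = {'AA. Seq. CDR3': 'CDR3'}
	# (illustrative mapping only; the actual keys depend on the caller's translator)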

	each_file = input_file
	if isinstance(each_file, list):
		print('Reading file: ', each_file[0])
	else:
		print('Reading file: ', each_file)
	# First Lets figure out how we will read the files. Will we be able to simply load the dataframe using a delimited file, or will we use our methods for reading the files
	if not filetype:
		# We need to guess the filetype
		temp = readfile.immunogrepFile(each_file)
		guessed_type = temp.getFiletype()
	else:
		guessed_type = filetype

	# Next lets figure out what fields from each file we want to load
	if guessed_type == 'IMGT':
		# we will need these fields from IMGT
		field_names = ['V-D-J-REGION_5', 'V-J-REGION_5', 'CDR1-IMGT_5', 'CDR2-IMGT_5', 'CDR3-IMGT_5', 'Functionality_1', 'V-GENE and allele_1', 'J-GENE and allele_1', 'V-REGION Nb of mutations_8']
	else:
		field_names = flipped_fields.keys()

	if guessed_type in ['TAB', 'CSV']:
		# this is easy to read into a pandas dataframe
		seps = {'TAB': '\t', 'CSV': ','}
		skip_lines = 0
		# We need to figure out if we have to skip any lines because we aren't using our immunogrep class reader
		for line in open(each_file):
			line = line.strip()
			if not line.startswith(comment_line):
				break
			skip_lines += 1
		tmp = readfile.immunogrepFile(each_file, 'TAB')
		header_names_in_file = tmp.getDescription()
		
		field_names = [f for f in field_names if f in header_names_in_file]
		tmp.IFclass.close()
		reader = pd.read_table(each_file, sep=seps[guessed_type], chunksize=chunks, usecols=field_names, dtype=object, skip_blank_lines=True, skiprows=skip_lines)
	else:
		# We need to use our class for reading files
		if guessed_type == 'IMGT':
			reader = iffile_to_pandas(readfile.immunogrepFile(each_file, 'IMGT', required_files=[1, 5, 8], field_names=field_names, chunk_size=chunks))
		else:
			reader = iffile_to_pandas(readfile.immunogrepFile(each_file, guessed_type, field_names=field_names, chunk_size=chunks))
	for table in reader:

		# Rename the columns names based on the translator provided (this df will have column names as we expect in program)
		table.rename(columns=flipped_fields, inplace=True)
		# append dataset name to column for input file

		if 'READS' not in table.columns:
			table['READS'] = 1
		else:
			table['READS'] = table['READS'].fillna(1).astype(int)
		if 'RECOMBINATIONTYPE' not in table.columns:
			table['RECOMBINATIONTYPE'] = ''
		table.fillna('', inplace=True)
		yield table
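# A minimal sketch of consuming the generator, assuming a tab-delimited
# annotation file; the field translator shown is hypothetical and must map
# in-program names to the column names actually present in the file.
field_map = {'CDR3': 'AA. Seq. CDR3', 'VGENE': 'All V hits', 'READS': 'Counts'}
for chunk in pandas_read_chunks('annotations.txt', 'TAB', field_map, chunks=5000):
	print(chunk.shape)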