import sys
import os.path
import os
import numpy as np
import csv

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from ngs_scripts.Aggregate import returnSampleDict

alldict = returnSampleDict()
varnumberntdict = {}

# Count high-confidence HA variants (coverage >= 200, binomial check passed)
# for each influenza B (FLUB) sample.
for key in alldict:
    if alldict[key].strain.upper() == 'FLUB':
        varcounter = 0
        # print alldict[key].smartid
        varlist = alldict[key].varlist
        for var in varlist:
            if var.segment == 'HA':
                if var.totalcount >= 200 and var.binocheck.upper() == 'TRUE':
                    # print var.ntpos,var.major,var.majorfreq,var.minor,var.minorfreq
                    varcounter += 1
        varnumberntdict[alldict[key].smartid] = (key, str(varcounter))

# Write the per-sample variant counts to a comma-separated text file.
thefile = open('flub_varcount.txt', 'w')
print>>thefile, 'smartid,sampleid,varcount'
for key in varnumberntdict:
    print>>thefile, key+','+varnumberntdict[key][0]+','+varnumberntdict[key][1]
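# A minimal sketch, not part of the original pipeline: the script above imports
# csv but never uses it, so this is one way the same table could be written with
# csv.writer. The helper name write_varcounts and its signature are assumptions.
def write_varcounts(varnumberntdict, outpath='flub_varcount.txt'):
    # varnumberntdict maps smartid -> (sampleid, varcount-as-string), as built above.
    with open(outpath, 'w') as handle:
        writer = csv.writer(handle)
        writer.writerow(['smartid', 'sampleid', 'varcount'])
        for smartid, (sampleid, varcount) in varnumberntdict.items():
            writer.writerow([smartid, sampleid, varcount])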
        # Tail of the read_fasta generator: collect sequence lines and yield the final record.
        else:
            seq.append(line)
    if name:
        yield (name, "".join(seq))

# Map each consensus sequence name (first token after '>') to its sequence.
condict = {}
with open(consensuspath) as fp:
    for name, seq in read_fasta(fp):
        # print(name, seq)
        condict[name[1:].split(" ")[0]] = seq
# for key in condict:
#     print key, condict[key]

# Collect the unique HA variant positions across all H3N2 samples that pass
# the binomial check and have coverage >= 100.
somedict = returnSampleDict()
ultvarlist = []
for key in somedict:
    if somedict[key].strain == "H3N2":
        # print key,somedict[key].smartid
        for eachvar in somedict[key].varlist:
            if eachvar.segment == "HA":
                if eachvar.binocheck == "TRUE":
                    if eachvar.totalcount >= 100:
                        ultvarlist.append(eachvar.ntpos)
                        # printrow1 = [somedict[key].smartid,eachvar.segment,eachvar.ntpos,eachvar.major,eachvar.majorfreq,'major']
                        # printrow2 = [somedict[key].smartid,eachvar.segment,eachvar.ntpos,eachvar.minor,eachvar.minorfreq,'minor']
                        # print ','.join(map(str, printrow1))
                        # print ','.join(map(str, printrow2))
ultvarlist = list(set(ultvarlist))
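# The fragment above begins mid-generator; the opening lines of read_fasta fall
# outside this excerpt. A self-contained sketch consistent with the tail shown
# (an assumption, not the original definition) would look like this:
def read_fasta(fp):
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            # Yield the previous record before starting a new one.
            if name:
                yield (name, "".join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    # Yield the final record once the file is exhausted.
    if name:
        yield (name, "".join(seq))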
def l1_norm(somestrain, CUTOFF=0.03, COVERCUTOFF=200):
    # Obtain the reference sequences for the requested strain
    if somestrain.upper() == 'FLUB':
        refdict = snplists.Reference.open_fasta('flub_reference.fa')
    elif somestrain.upper() == 'H3N2':
        refdict = snplists.Reference.open_fasta('flua_reference.fa')

    # Retrieve variant information for all samples
    alldict = returnSampleDict()

    # Keep only the samples whose strain matches the one specified
    sampdict = {}
    for key in alldict:
        if alldict[key].strain.upper() == somestrain.upper():
            sampdict[key] = alldict[key]
    keylist = list(sampdict.keys())
    keylist.sort()

    officialnamelist = []
    for key in keylist:
        officialnamelist.append(sampdict[key].smartid)

    # Perform the L1 norm for each segment
    for SEGMENT in refdict:
        # if SEGMENT == 'HA':
        dismatrix = np.zeros((len(keylist), len(keylist)))
        for aidx, asamp in enumerate(keylist):      # column
            for bidx, bsamp in enumerate(keylist):  # row
                if aidx == bidx:
                    dismatrix[aidx, bidx] = 0
                elif aidx > bidx:
                    # Because the matrix is symmetric we cut the work in half
                    dismatrix[aidx, bidx] = dismatrix[bidx, aidx]
                else:
                    # print aidx,bidx
                    avarlist = sampdict[asamp].varlist
                    bvarlist = sampdict[bsamp].varlist
                    asegntlist = []
                    bsegntlist = []
                    asegntdict = {}
                    bsegntdict = {}
                    # Filter for good-quality variants on this segment
                    for var in avarlist:
                        if var.totalcount > COVERCUTOFF and var.binocheck.upper() == 'TRUE' and var.minorfreq > CUTOFF and var.segment == SEGMENT:
                            asegntlist.append(var.segment+'_'+str(var.ntpos))
                            asegntdict[var.segment+'_'+str(var.ntpos)] = var
                    for var in bvarlist:
                        if var.totalcount > COVERCUTOFF and var.binocheck.upper() == 'TRUE' and var.minorfreq > CUTOFF and var.segment == SEGMENT:
                            bsegntlist.append(var.segment+'_'+str(var.ntpos))
                            bsegntdict[var.segment+'_'+str(var.ntpos)] = var
                    # Combine the two lists of variant positions
                    unionsegntlist = list(set(asegntlist).union(set(bsegntlist)))
                    unionsegntlist.sort()
                    sampindexlist = []
                    # Three cases when generating the L1 norm:
                    # Case 1 -> the variant is found in both samples
                    # Case 2 -> the variant is found only in sample A; B's frequencies come from the reference
                    # Case 3 -> the variant is found only in sample B; A's frequencies come from the reference
                    for segnt in unionsegntlist:
                        # Case 1
                        if segnt in asegntlist and segnt in bsegntlist:
                            a_afreq = asegntdict[segnt].afreq
                            a_cfreq = asegntdict[segnt].cfreq
                            a_gfreq = asegntdict[segnt].gfreq
                            a_tfreq = asegntdict[segnt].tfreq
                            b_afreq = bsegntdict[segnt].afreq
                            b_cfreq = bsegntdict[segnt].cfreq
                            b_gfreq = bsegntdict[segnt].gfreq
                            b_tfreq = bsegntdict[segnt].tfreq
                        # Case 2
                        elif segnt in asegntlist:
                            a_afreq = asegntdict[segnt].afreq
                            a_cfreq = asegntdict[segnt].cfreq
                            a_gfreq = asegntdict[segnt].gfreq
                            a_tfreq = asegntdict[segnt].tfreq
                            refseg, refpos = segnt.split('_')
                            ref_nt = refdict[refseg][int(refpos)-1]
                            ref_nt = ref_nt.upper()
                            b_afreq = 0.0
                            b_cfreq = 0.0
                            b_gfreq = 0.0
                            b_tfreq = 0.0
                            if ref_nt == 'A':
                                b_afreq = 1.0
                            elif ref_nt == 'C':
                                b_cfreq = 1.0
                            elif ref_nt == 'G':
                                b_gfreq = 1.0
                            elif ref_nt == 'T':
                                b_tfreq = 1.0
                            else:
                                print 'ERROR_ERROR'
                        # Case 3
                        elif segnt in bsegntlist:
                            b_afreq = bsegntdict[segnt].afreq
                            b_cfreq = bsegntdict[segnt].cfreq
                            b_gfreq = bsegntdict[segnt].gfreq
                            b_tfreq = bsegntdict[segnt].tfreq
                            refseg, refpos = segnt.split('_')
                            ref_nt = refdict[refseg][int(refpos)-1]
                            ref_nt = ref_nt.upper()
                            a_afreq = 0.0
                            a_cfreq = 0.0
                            a_gfreq = 0.0
                            a_tfreq = 0.0
                            if ref_nt == 'A':
                                a_afreq = 1.0
                            elif ref_nt == 'C':
                                a_cfreq = 1.0
                            elif ref_nt == 'G':
                                a_gfreq = 1.0
                            elif ref_nt == 'T':
                                a_tfreq = 1.0
                            else:
                                print 'ERROR_ERROR'
                        # Per-position L1 distance: sum of absolute A/C/G/T frequency differences
                        indexvalue = abs(a_afreq-b_afreq)+abs(a_cfreq-b_cfreq)+abs(a_gfreq-b_gfreq)+abs(a_tfreq-b_tfreq)
                        sampindexlist.append(indexvalue)
                    # Sum over sampindexlist (all variants in that segment)
                    inputvalue = sum(sampindexlist)  # /float(len(refdict[SEGMENT]))  # normalize
                    dismatrix[aidx, bidx] = inputvalue

        relpath = os.getcwd()
        filepath = relpath+'/../FILES/output/'
        # thefile = open(filepath+somestrain+'.namelist.csv','w')
        # for row in dismatrix:
        #     print row
        # df = pd.DataFrame(dismatrix)
        # print df
        df = pd.DataFrame(dismatrix, index=officialnamelist, columns=officialnamelist)
        df.to_csv(filepath+somestrain+'.'+SEGMENT+'.dissim_all.csv', index=True, header=True, sep=',')
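# A minimal numpy sketch, for illustration only, of the per-position L1 distance
# that l1_norm accumulates in indexvalue: the sum of absolute differences between
# the two samples' A/C/G/T frequency vectors. The numbers below are made up.
import numpy as np

freq_a = np.array([0.95, 0.05, 0.0, 0.0])  # sample A: A/C/G/T frequencies at one position
freq_b = np.array([1.00, 0.00, 0.0, 0.0])  # sample B, or the reference base in Case 2/3

indexvalue = np.abs(freq_a - freq_b).sum()
print(indexvalue)  # 0.1 -> a 5% minor variant private to sample A contributes 2 * 0.05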