def populatePatients( conn, patient_names ) :
    """Add every patient in ``patient_names`` that has no row in Patients yet.

    For each name, Patients is queried by name.  When no id comes back, the
    name is passed through ``broad.sanitizePatientName``, inserted under the
    next free id, and recorded in the ``patients_dbix`` mapping (a name that
    is resolved outside this function).

    conn          -- connection object providing getNextID, queryScalar
                     and insert.
    patient_names -- iterable of patient name strings.

    Returns None.
    """
    next_id = conn.getNextID("Patients")
    for raw_name in patient_names :
        # NOTE(review): the lookup uses the raw name while the insert below
        # stores the sanitized one -- confirm both forms always agree.
        # NOTE(review): the name is interpolated straight into the SQL text;
        # a name containing a single quote would break the statement.
        lookup_sql = "select id from Patients where name='%s'" % raw_name
        if conn.queryScalar(lookup_sql, int) :
            # already known, nothing to insert
            continue

        clean_name = broad.sanitizePatientName( raw_name )
        conn.insert( "Patients", [next_id, clean_name], ["id","name"] )
        patients_dbix[clean_name] = next_id
        next_id += 1
def populatePatients( conn, patient_names ) :
    """Insert unseen patients and tag already-known ones with this plate.

    For each name in ``patient_names``, Patients is queried by name:

    * no id found -- the name is passed through
      ``broad.sanitizePatientName``, inserted with the next free id and
      ``plate_id``, and recorded in ``patients_dbix``;
    * id found -- the row's ``plate`` value is read, and unless
      ``presentIn(src, plate_id)`` holds, ``plate_contrib`` is added to it.

    ``plate_id``, ``plate_contrib``, ``presentIn``, ``patients_dbix`` and
    ``broad`` are not parameters; they are resolved from the enclosing
    module scope.

    conn          -- connection object providing getNextID, queryScalar,
                     insert and a ``cur`` cursor with ``execute``.
    patient_names -- iterable of patient name strings.

    Returns None.
    """
    next_id = conn.getNextID("Patients")
    for raw_name in patient_names :
        # NOTE(review): the lookup uses the raw name while the insert stores
        # the sanitized one -- confirm both forms always agree.
        # NOTE(review): the name is interpolated straight into the SQL text;
        # a name containing a single quote would break the statement.
        query = "select id from Patients where name='%s'" % raw_name
        pid = conn.queryScalar(query, int)

        if not pid :
            name = broad.sanitizePatientName( raw_name )
            conn.insert( "Patients",
                         [next_id, name, plate_id],
                         ["id","name","plate"] )
            patients_dbix[name] = next_id
            # Advance the id only once it has actually been used, so ids
            # are neither skipped for known patients nor reused for the
            # next new one.
            next_id += 1
            continue

        src = conn.queryScalar(
            "select plate from Patients where id=%d" % pid, int)
        if not presentIn( src, plate_id ) :
            # update the plate column if this patient is seen on a novel
            # plate
            # NOTE(review): presumably `plate` is a bitmask and
            # plate_contrib is this plate's bit -- confirm.
            update = ("update Patients set plate = plate+%d where id=%d"
                      % (plate_contrib, pid))
            conn.cur.execute( update )
def familyReports() :
    """Write per-patient het/hom variant reports as TSV files.

    For every name in Patients, two files are created in ``globes.OUT_DIR``:
    ``<patient>_hets.tsv`` and ``<patient>_homs.tsv``, each starting with
    ``column_headers``.  The variant query is then walked; for each variant
    the het and hom carriers are fetched with ``getPatients`` and one row is
    buffered per carrier.  A row is the formatted query row followed by the
    call, the number of *other* carriers of the same genotype, their names,
    and the count and names of carriers of the other genotype.  Buffers are
    written out every 10000 variants and once more at the end.

    Relies on module-level names: ``globes``, ``db``, ``broad``, ``csv``,
    ``column_headers``, ``vcols_string``, ``icols_string``,
    ``gcols_string``, ``gvs``, ``COVERAGE``, ``getPatients`` and
    ``formatQueryRow``.

    Returns None.
    """
    outdir = globes.OUT_DIR
    conn = db.Conn("localhost")
    # second connection: used for the per-variant lookups issued while the
    # variant query on `conn` is being walked
    conn2 = db.Conn("localhost")
    print("connetions made")

    def tsv_writer( handle ) :
        # one place for the csv dialect used by every report file
        return csv.writer( handle,
                           delimiter='\t',
                           quoting=csv.QUOTE_MINIMAL )

    def others( names, ix ) :
        # names of every carrier except the one at position ix
        return '; '.join(names[:ix] + names[ix+1:])

    # patient -> genotype -> path of the report file
    fouts = {}
    # patient -> genotype -> rows accumulated since the last flush
    fbuffers = {}

    # create the files for each patient, one for hets, one for homs,
    # write the column headers and initialize the buffer space
    query = '''select name from Patients'''
    for r in conn.iterateQuery( query ) :
        patient = broad.sanitizePatientName( r[0] )
        fouts[patient] = {}
        fbuffers[patient] = {}
        for gt in ["hets","homs"] :
            filename = '%s/%s_%s.tsv' % (outdir,patient,gt)
            fouts[patient][gt] = filename
            # 'wb': the Python 2 csv module wants binary-mode files
            with open(filename, 'wb') as f :
                tsv_writer( f ).writerow( column_headers )
            fbuffers[patient][gt] = []

    # what are the interesting variants
    query = ("select %s, %s, %s from Variants as v "
             "inner join Isoforms as i on v.id = i.var_id "
             "inner join Genes as g on g.id = i.gene_id "
             "where (%s) and v.AF < 0.1 order by AF"
             % (vcols_string, icols_string, gcols_string, gvs))
    print(query)

    def flush() :
        # write the buffers out to the respective files
        for pat in fbuffers :
            for gt in ["homs","hets"] :
                rows = fbuffers[pat][gt]
                if not rows :
                    # nothing pending; do not reopen the file for nothing
                    continue
                # 'ab' keeps the same binary mode the header was written in
                with open( fouts[pat][gt], 'ab' ) as f :
                    tsv_writer( f ).writerows( rows )
                fbuffers[pat][gt] = []

    for varix,r in enumerate(conn.query( query )) :
        # periodic buffer flush plus progress output
        if varix % 10000 == 0 :
            flush()
            print(varix)

        var_id = r[0]
        # NOTE(review): column 6 == 2 is treated as "indel", going by the
        # original variable name -- confirm against the column list.
        isIndel = int(r[6]) == 2
        if isIndel :
            where = ""
        else :
            # only look at patients meeting the coverage requirement
            where = " and c.DP >= %d" % COVERAGE

        lookup = getPatients( conn2, var_id, where_clause=where )
        (_noinfs,hets,homs) = [lookup[gt] for gt in [0,1,2]]
        if len(hets) == len(homs) == 0 :
            continue

        hom_pats = [p[1] for p in homs]
        het_pats = [p[1] for p in hets]
        num_homs = len(hom_pats)
        num_hets = len(het_pats)
        hom_string = '; '.join(hom_pats)
        het_string = '; '.join(het_pats)

        output_row = formatQueryRow( r )

        for ix,(pat_id,pat,call) in enumerate(homs) :
            pat = broad.sanitizePatientName( pat )
            fbuffers[pat]["homs"].append(
                output_row +
                [call, num_homs-1, others( hom_pats, ix ),
                 num_hets, het_string] )

        for ix,(pat_id,pat,call) in enumerate(hets) :
            pat = broad.sanitizePatientName( pat )
            fbuffers[pat]["hets"].append(
                output_row +
                [call, num_homs, hom_string,
                 num_hets-1, others( het_pats, ix )] )

    # write whatever is left since the last periodic flush
    flush()
def familyReports() :
    """Write per-patient het/hom reports for rare variants absent from dbSNP.

    For every name in Patients, two TSV files are created in
    ``globes.OUT_DIR``: ``<patient>_hets.tsv`` and ``<patient>_homs.tsv``.
    The variant query selects Variants joined to Isoforms and Genes where
    ``dbSNP`` is NULL, ``AF < 0.1`` and ``ss_functionGVS`` is none of the
    excluded classes.  For each variant, carriers with ``DP >= 8`` are
    fetched with ``getPatients`` and one row is written per carrier: the
    call split on ':', the gene symbol, the variant and isoform columns,
    then the count and names of the *other* carriers of the same genotype
    and the count and names of the carriers of the other genotype.

    Relies on module-level names: ``globes``, ``db``, ``broad``, ``csv``
    and ``getPatients``.

    Returns None.
    """
    outdir = globes.OUT_DIR
    conn = db.Conn("localhost")
    conn2 = db.Conn("localhost")
    print("connetions made")

    # Variant columns are listed explicitly; their order fixes both the
    # SELECT list and the header layout below.
    vcols = ["id", "chrom","pos","ref","mut","type","ref_aa","mut_aa","qual",
             "filter","AF","granthamScore","scorePhastCons",
             "consScoreGERP","distanceToSplice","AfricanHapMapFreq",
             "EuropeanHapMapFreq", "AsianHapMapFreq","clinicalAssociation"]
    icols = conn2.getColumns('Isoforms')

    header = ["GT","DP","GQ","geneSymbol"] + vcols[1:] + icols + \
             [ "#HomShares", "Hom Shares", "#HetShares", "Het Shares"]

    # what are the interesting variants
    vcols_string = ', '.join(["v.%s" % c for c in vcols])
    dont_want = ["intron","near-gene-5","intergenic","near-gene-3",
                 "coding-synonymous","coding-notMod3"]
    gvs = ' and '.join(["ss_functionGVS <> '%s'" % dw for dw in dont_want])
    variant_query = ("select %s, i.*, g.geneSymbol from Variants as v "
                     "inner join Isoforms as i on v.id = i.var_id "
                     "inner join Genes as g on i.gene_id = g.id "
                     "where v.dbSNP is NULL and (%s) and v.AF < 0.1 "
                     "order by AF" % (vcols_string, gvs))

    def others( names, ix ) :
        # names of every carrier except the one at position ix
        return '; '.join(names[:ix] + names[ix+1:])

    # patient -> genotype -> csv writer
    fouts = {}
    # every file opened below, so all of them get closed on the way out
    handles = []
    try :
        # No patient restriction by default.  To limit the report to one
        # plate, replace this with an "id=.. or id=.." filter built from
        # the pat_ids found in Calls for that plate.
        patient_filter = '1 = 1'
        query = '''select name from Patients where %s''' % patient_filter
        for r in conn.iterateQuery( query ) :
            patient = broad.sanitizePatientName( r[0] )
            fouts[patient] = {}
            for gt in ["hets","homs"] :
                filename = '%s/%s_%s.tsv' % (outdir,patient,gt)
                # 'wb': the Python 2 csv module wants binary-mode files
                handle = open(filename, 'wb')
                handles.append( handle )
                fouts[patient][gt] = csv.writer( handle,
                                                 delimiter='\t',
                                                 quoting=csv.QUOTE_MINIMAL )
                fouts[patient][gt].writerow( header )

        print(variant_query)
        for varix,r in enumerate(conn.query( variant_query )) :
            if varix % 5000 == 0 :
                print(varix)

            var_id = r[0]
            # only look at patients meeting these call reqs
            where = " and c.DP >= 8"
            (noinfs,hets,homs) = getPatients( conn, var_id, where )
            if len(hets) == len(homs) == 0 :
                continue

            hom_pats = [p[1] for p in homs]
            het_pats = [p[1] for p in hets]
            num_homs = len(hom_pats)
            num_hets = len(het_pats)
            # Full carrier lists.  These must stay untouched inside the
            # loops below: het rows report *all* hom carriers and hom rows
            # report *all* het carriers.
            hom_string = '; '.join(hom_pats)
            het_string = '; '.join(het_pats)

            # gene symbol is the last selected column; r[0] (variant id)
            # is left out to match the header's vcols[1:]
            row_core = [r[-1]] + list(r[1:-1])

            for ix,(pat_id,pat,call) in enumerate(homs) :
                pat = broad.sanitizePatientName( pat )
                fouts[pat]["homs"].writerow(
                    call.split(':') + row_core +
                    [num_homs-1, others( hom_pats, ix ),
                     num_hets, het_string] )

            for ix,(pat_id,pat,call) in enumerate(hets) :
                pat = broad.sanitizePatientName( pat )
                fouts[pat]["hets"].writerow(
                    call.split(':') + row_core +
                    [num_homs, hom_string,
                     num_hets-1, others( het_pats, ix )] )
    finally :
        for handle in handles :
            handle.close()