def parse_gpdfile(tdir, gpdfile, smoothing_factor):
  # Go through the long reads and make a genepred
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  seennames = {}
  longreadnumber = 0
  of_gpd = open(tdir+'/longreads.gpd', 'w')
  while True:
    line = fr.readline()
    if not line: break
    if re.match('^#', line): # skip comments
      continue
    longreadnumber += 1
    entry = GenePredBasics.smooth_gaps(
      GenePredBasics.line_to_entry(line.rstrip()), smoothing_factor)
    readname = entry['name']
    if readname in seennames:
      sys.stderr.write("Warning: repeat name '"+readname+"'\n")
    # set our first name to our bin
    entry['name'] = str(longreadnumber)
    gline = GenePredBasics.entry_to_line(entry)
    of_gpd.write(gline+"\n")
  fr.close()
  of_gpd.close()
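# Usage sketch for parse_gpdfile above (the paths and smoothing size are hypothetical,
# not taken from the source):
#   parse_gpdfile('/tmp/mywork', 'reads.gpd', 68)
# reads reads.gpd (or STDIN when '-' is given), smooths gaps smaller than 68 bp,
# renames each read to its read number, and writes /tmp/mywork/longreads.gpd.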
def get_exons_from_seqs(seqs, d, spcf):
  sind = 0
  oline = ''
  for seq in seqs:
    sind += 1
    psec = 'P' # primary or secondary
    if sind > 1: psec = 'S'
    d1 = d.copy()
    d1['rname'] = seq[1]
    if seq[2] == '+':
      d1['flag'] = 0
    else:
      d1['flag'] = 16
    d1['pos'] = seq[3]
    d1['cigar'] = seq[4]
    d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
    skips = set(['H', 'D', 'N'])
    total_length = 0
    possible_matches = 0
    indels = 0
    qstart = 0
    if d1['cigar_array'][0]['op'] == 'S':
      qstart = d1['cigar_array'][0]['val']
    if d1['cigar_array'][0]['op'] == 'H':
      qstart = d1['cigar_array'][0]['val']
    for ce in d1['cigar_array']:
      if ce['op'] not in skips:
        total_length += ce['val']
      if ce['op'] == 'M':
        possible_matches += ce['val']
      elif ce['op'] == 'I':
        indels += ce['val']
      elif ce['op'] == 'D' and ce['val'] < 68:
        indels += ce['val']
    fakeseq = 'N'*total_length
    d1['seq'] = fakeseq
    nline = SamBasics.entry_to_line(d1)
    pline = spcf.convert_line(nline)
    pentry = PSLBasics.line_to_entry(pline)
    #mismatch_count = -1
    #if sind == 1 and args.reference_genome: # for primary alignments we can calculate the number of matches
    #  for i in range(0,len(pentry['blockSizes'])):
    #    tseq = spcf.genome[pentry['tName']][pentry['tStarts'][i]:pentry['tStarts'][i]+pentry['blockSizes'][i]]
    #    qseq = sequence[pentry['qStarts'][i]:pentry['qStarts'][i]+pentry['blockSizes'][i]]
    #    print pentry['blockSizes'][i]
    #    print tseq
    #    print qseq
    #    for j in range(0,len(tseq)):
    #      if tseq[j].upper() != qseq[j].upper(): mismatch_count += 1
    gline = PSLBasics.convert_entry_to_genepred_line(pentry)
    gentry = GenePredBasics.line_to_entry(gline)
    gsmooth = GenePredBasics.smooth_gaps(gentry, 68)
    for i in range(0, len(gsmooth['exonStarts'])):
      oline += gsmooth['chrom'] + "\t" + str(gsmooth['exonStarts'][i]) + "\t" \
             + str(gsmooth['exonEnds'][i]) + "\t" + gsmooth['strand'] + "\t" \
             + gsmooth['name'] + "\t" + str(possible_matches) + "\t" \
             + str(indels) + "\t" + psec + "\t" + str(qstart) + "\n"
  return oline
def main():
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('genepred', help="FILENAME or use - for STDIN")
  parser.add_argument('--smoothing_size', type=int, default=68, help="INT no gaps less than this size")
  args = parser.parse_args()
  inf = sys.stdin
  if args.genepred != '-':
    inf = open(args.genepred)
  for line in inf:
    e = GenePredBasics.line_to_entry(line)
    e2 = GenePredBasics.smooth_gaps(e, args.smoothing_size)
    print GenePredBasics.entry_to_line(e2)
def main():
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size', type=int, default=68, help="INT min intron size")
  parser.add_argument('--min_local_support', type=int, default=0, help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size', type=int, default=10, help="INT search space for reference")
  parser.add_argument('--output_fake_psl', help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred', help="GENEPREDFILENAME for reference genepred")
  args = parser.parse_args()
  cpus = multiprocessing.cpu_count()
  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)
  # read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  # lets sort entries by chromosome
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    if e['chrom'] not in ref:
      ref[e['chrom']] = {}
    for i in range(1, len(e['exonStarts'])):
      if e['exonEnds'][i-1] not in ref[e['chrom']]:
        ref[e['chrom']][e['exonEnds'][i-1]] = {}
      if e['exonStarts'][i]+1 not in ref[e['chrom']][e['exonEnds'][i-1]]:
        ref[e['chrom']][e['exonEnds'][i-1]][e['exonStarts'][i]+1] = e['strand']
  # Stored all junctions as 1-based coordinates
  read_info = {}
  pf = GenericFileReader(args.psl)
  fcount_total = 0
  while True:
    line = pf.readline()
    if not line: break
    if re.match('^#', line): continue
    line = line.rstrip()
    pe = PSLBasics.line_to_entry(line)
    if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
      sys.stderr.write("WARNING invalid psl\n")
      continue
    genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
    ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
    refjuns = {}
    if pe['tName'] in ref:
      refjuns = ref[pe['tName']]
    new_ge = nudge(pe, ge, refjuns, args)
    if args.output_fake_psl:
      new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge, genome)
      print new_psl_line
    else:
      print GenePredBasics.entry_to_line(new_ge)
def convert_directionless_gpd_alignment_to_reference(sam_filename, genepred_filename, out_map):
  conv = GenePredBasics.get_directionless_gpd_conversion(genepred_filename)
  ofile = open(out_map, 'w')
  with open(sam_filename) as samfile:
    for line in samfile:
      line = line.rstrip()
      if re.match('^@[A-Z][A-Z]\s', line): continue # skip header
      d = sam_line_to_dictionary(line)
      if d['rname'] == '*': continue # skip unmapped
      startposition = d['pos']-1
      readcoord = []
      z = 0
      for entry in d['cigar_array']:
        if re.match('[MISX=]', entry['op']): # all the entries that map to the read
          for i in range(0, entry['val']):
            if re.match('[M=X]', entry['op']): # all the entries that match the reference alignment
              readcoord.append(conv[d['rname']]['coordinates'][startposition+z])
              z += 1
            # lets ignore insertions for now
            #else:
            #  readcoord.append('*')
        if re.match('[DNH]', entry['op']):
          z += entry['val']
      abbrev = conv[d['rname']]['chrom']+':'+SequenceBasics.collapse_coordinate_array(readcoord)
      ofile.write(d['qname'] + "\t" + d['rname'] + "\t" + abbrev + "\n")
  ofile.close()
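# Usage sketch for convert_directionless_gpd_alignment_to_reference above
# (filenames are hypothetical):
#   convert_directionless_gpd_alignment_to_reference('aligned.sam', 'transcripts.gpd', 'read_map.txt')
# writes one line per mapped read: read name, transcript name, and the collapsed
# genome coordinate string for the reference-matching bases.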
def parse_refgpd(tdir, geneprednames, simplenames):
  # get the reference genepreds ready to use in work
  column_number = 0
  entry_number = 0
  of_entries = open(tdir+"/entries.txt", 'w')
  for file in geneprednames:
    column_number += 1
    of_ref = open(tdir+"/reference."+str(column_number)+".bed", 'w')
    gfr = FileBasics.GenericFileReader(file)
    while True:
      line = gfr.readline()
      if not line: break
      if re.match('^#', line): continue
      entry_number += 1
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      entry_length = 0
      for i in range(0, len(entry['exonStarts'])):
        entry_length += entry['exonEnds'][i]-entry['exonStarts'][i]
      of_entries.write(str(column_number) + "\t" + simplenames[column_number-1] + "\t" \
                     + str(entry_number) + "\t" + entry['gene_name'] + "\t" \
                     + entry['name'] + "\t" + str(entry_length) + "\n")
      exon_number = 0
      for i in range(0, len(entry['exonStarts'])):
        exon_number += 1
        of_ref.write(entry['chrom'] + "\t" + str(entry['exonStarts'][i]) + "\t" \
                   + str(entry['exonEnds'][i]) + "\t" + str(entry_number) + "\t" \
                   + entry['gene_name'] + "\t" \
                   + entry['name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                   + entry['strand'] + "\t" + str(exon_number) \
                   + "\n")
    gfr.close()
    of_ref.close()
  of_entries.close()
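# Usage sketch for parse_refgpd above (arguments are hypothetical): one genepred file
# per annotation column plus a matching short label for each file.
#   parse_refgpd('/tmp/mywork', ['refseq.gpd', 'gencode.gpd'], ['refseq', 'gencode'])
# writes /tmp/mywork/entries.txt along with reference.1.bed and reference.2.bed.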
def main(): parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.") parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.") parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.") args = parser.parse_args() pslfilehandle = sys.stdin if args.input_name != '-': pslfilehandle = open(args.input_name) with pslfilehandle as infile: for line in infile: psl_entry = PSLBasics.line_to_entry(line) genepred_line = PSLBasics.convert_entry_to_genepred_line(psl_entry) if args.fill_gaps > 0: genepred_entry = GenePredBasics.line_to_entry(genepred_line) genepred_entry2 = GenePredBasics.smooth_gaps(genepred_entry,args.fill_gaps) genepred_line = GenePredBasics.entry_to_line(genepred_entry2) print genepred_line
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('input', help="GENEPRED file input use - for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  for line in inf:
    e = GenePredBasics.GenePredEntry()
    e.line_to_entry(line.rstrip())
    print e.entry['gene_name'] + "\t" + e.entry['name'] + "\t" + str(e.length())
  inf.close()
def add_genepred_line(self, inline):
  if not self.ref_hash:
    sys.stderr.write("ERROR: Must assign a reference genome dictionary first\n")
    sys.exit()
  gpd = GenePredBasics.GenePredEntry(inline)
  if gpd.value('name') in self.transcripts:
    sys.stderr.write("WARNING: " + inline + " transcript was already set\n")
  seq = ''
  for i in range(0, gpd.value('exonCount')):
    seq += self.ref_hash[gpd.value('chrom')][gpd.value('exonStarts')[i]:gpd.value('exonEnds')[i]].upper()
  if gpd.value('strand') == '-':
    seq = SequenceBasics.rc(seq)
  self.transcripts[gpd.value('name')] = seq
  return
def read_from_fasta_and_genepred(self, genomefastafile, genepredfile):
  # read in our genome
  seen_names = {}
  seen_coords = {}
  genepred = {}
  with open(genepredfile) as inf:
    for line in inf:
      if re.match('^#', line): continue
      e = GenePredBasics.line_to_entry(line)
      hexcoord = hashlib.sha1(e['chrom']+"\t"+e['strand'] + "\t" + str(e['exonStarts'])+"\t" + str(e['exonEnds'])).hexdigest()
      #print hexcoord
      #print e['gene_name']
      #print e['name']
      dupname = 0
      dupcoord = 0
      if hexcoord in seen_coords:
        sys.stderr.write("Warning "+ e['name'] + " " + e['gene_name'] + " exists at identical coordinates as another entry\n")
        dupcoord = 1
      seen_coords[hexcoord] = 1
      currname = e['name']
      if e['name'] in seen_names:
        if dupcoord == 1:
          sys.stderr.write("skipping perfect duplicate of "+e['name']+"\n")
          continue
        newname = e['name'] + "."+str(len(seen_names[e['name']])+1)
        currname = newname
        seen_names[e['name']].append(newname)
        sys.stderr.write("Warning "+ e['name'] + " " + e['gene_name'] + " is a duplicate name.. renaming to "+newname+"\n")
        dupname = 1
      else:
        seen_names[e['name']] = []
        seen_names[e['name']].append(e['name'])
      genepred[currname] = e
  #print "reading names and locs"
  ref = read_fasta_into_hash(genomefastafile)
  #print "converting sequences"
  for transcript in genepred:
    e = genepred[transcript]
    if e['chrom'] in ref:
      seq = ''
      self.transcript_names[transcript] = genepred[transcript]['name']
      for i in range(0, e['exonCount']):
        seq += ref[e['chrom']][e['exonStarts'][i]:e['exonEnds'][i]]
      if e['strand'] == '-': seq = rc(seq)
      self.transcripts[transcript] = seq
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('input_file', help="use - for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input_file != '-':
    inf = open(args.input_file)
  for line in inf:
    e = GenePredBasics.line_to_entry(line.rstrip())
    matches = 0
    qstartslist = []
    for i in range(0, len(e['exonStarts'])):
      mylen = e['exonEnds'][i]-e['exonStarts'][i]
      matches += mylen
      qstartslist.append(matches-mylen)
    qstarts = ','.join([str(x) for x in qstartslist])+','
    oline = str(matches)+"\t"               # 1  matches
    oline += "0\t"                          # 2  misMatches
    oline += "0\t"                          # 3  repMatches
    oline += "0\t"                          # 4  nCount
    oline += "0\t"                          # 5  qNumInsert
    oline += "0\t"                          # 6  qBaseInsert
    oline += "0\t"                          # 7  tNumInsert
    oline += "0\t"                          # 8  tBaseInsert
    oline += e['strand']+"\t"               # 9  strand
    oline += e['name']+"\t"                 # 10 qName
    oline += str(matches)+"\t"              # 11 qSize
    oline += "0\t"                          # 12 qStart
    oline += str(matches)+"\t"              # 13 qEnd
    oline += str(e['chrom'])+"\t"           # 14 tName
    oline += str(e['exonEnds'][-1])+"\t"    # 15 tSize
    oline += str(e['exonStarts'][0])+"\t"   # 16 tStart
    oline += str(e['exonEnds'][-1])+"\t"    # 17 tEnd
    oline += str(len(e['exonStarts']))+"\t" # 18 blockCount
    oline += ','.join([str(e['exonEnds'][x]-e['exonStarts'][x]) for x in range(0, len(e['exonStarts']))])+','+"\t" # 19 blockSizes
    oline += qstarts + "\t"                 # 20 qStarts
    oline += ','.join([str(x) for x in e['exonStarts']])+',' # 21 tStarts
    print oline
  inf.close()
def main():
  parser = argparse.ArgumentParser(description='Create artificial reference sequences from a genepred')
  parser.add_argument('gpd_file')
  parser.add_argument('reference_fasta')
  parser.add_argument('-o', '--output', help="output file to write to or STDOUT if not set")
  args = parser.parse_args()
  of = sys.stdout
  if args.output:
    of = open(args.output, 'w')
  f = read_fasta_into_hash(args.reference_fasta)
  with open(args.gpd_file) as inf:
    for line in inf:
      gpd = GenePredBasics.GenePredEntry()
      gpd.line_to_entry(line.rstrip())
      ars = ARS()
      beds = []
      for i in range(0, gpd.value('exonCount')):
        b = Bed(gpd.value('chrom'), gpd.value('exonStarts')[i], gpd.value('exonEnds')[i], gpd.value('strand'))
        beds.append(b)
      ars.set_bounds(beds)
      ars.set_name(gpd.value('name'))
      ars.set_sequence_from_original_reference_hash(f)
      of.write(ars.get_fasta())
def main():
  # do our inputs
  args = do_inputs()
  global gout
  gout = args.output
  gls = GenePredBasics.GenePredLocusStream(args.input)
  fgs = GenePredFuzzyBasics.FuzzyGenePredSeparator()
  if args.threads > 1:
    p = Pool(processes=args.threads)
  while True:
    buffer = gls.read_locus()
    if not buffer: break
    if args.threads > 1:
      p.apply_async(process_buffer, args=(buffer, args), callback=out_gpds)
    else:
      v = process_buffer(buffer, args)
      out_gpds(v)
  if args.threads > 1:
    p.close()
    p.join()
  sys.stderr.write("\n")
def break_gpdfile(tdir, job_size):
  bfcr = BigFileBasics.BigFileChunkReader(tdir+'/longreads.gpd')
  bfcr.set_chunk_size_bytes(job_size)
  num_jobs = bfcr.chunk_count
  for i in range(0, bfcr.chunk_count):
    oc = bfcr.open_chunk(i)
    job = i+1
    of_bed = open(tdir+'/partreads.'+str(job)+'.bed', 'w')
    while True:
      line = oc.read_line()
      if not line: break
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      exon_number = 0
      for j in range(0, len(entry['exonStarts'])):
        exon_number += 1
        of_bed.write(entry['chrom'] + "\t" + str(entry['exonStarts'][j]) + "\t" \
                   + str(entry['exonEnds'][j]) + "\t" + entry['name'] + "\t" \
                   + entry['gene_name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                   + entry['strand'] + "\t" + str(exon_number) + "\n")
    oc.close()
    of_bed.close()
  return num_jobs
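# Usage sketch for break_gpdfile above (temporary directory and chunk size are
# hypothetical):
#   num_jobs = break_gpdfile('/tmp/mywork', 50000000)
# splits /tmp/mywork/longreads.gpd into roughly 50 MB chunks, writes one
# partreads.<job>.bed per chunk, and returns the number of chunks.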
def nudge(psl_entry, gpd_entry, refjun, args):
  junctions = []
  fcount = 0
  if len(gpd_entry['exonStarts']) == 1:
    #print "no intron 1"
    return gpd_entry
  bounds = []
  for i in range(1, len(gpd_entry['exonStarts'])):
    junc_start = gpd_entry['exonEnds'][i-1]
    junc_finish = gpd_entry['exonStarts'][i]+1
    bounds.append([junc_start, junc_finish, i-1])
  if len(bounds) < 1:
    #print "no intron 2"
    return gpd_entry
  bestbounds = []
  for bound in bounds:
    best_distance = [10000000, 10000000]
    best_result = None
    for z1 in range(bound[0]-args.search_size, bound[0]+args.search_size+1):
      d1 = abs(z1-bound[0])
      if z1 in refjun:
        for z2 in range(bound[1]-args.search_size, bound[1]+args.search_size+args.search_size+1):
          d2 = abs(z2-bound[1])
          if z2 in refjun[z1]:
            refstrand = refjun[z1][z2]
            if d1+d2 < best_distance[0]+best_distance[1]:
              best_distance = [d1, d2]
              best_result = [z1, z2, refstrand, bound[2]]+best_distance
    if best_result:
      bestbounds.append(best_result)
  if len(bestbounds) < 1: # nothing fixable
    #sys.stderr.write("nothing fixable\n")
    return gpd_entry
  # Now we have a list of nudgable bounds
  # Lets pick a strand
  plus_score = 0
  minus_score = 0
  #print '----'
  #print bestbounds
  for bound in bestbounds:
    if bound[2] == '+':
      plus_score += 1/(float(abs(bound[4]))+float(abs(bound[5]))+1)
    else:
      minus_score += 1/(float(abs(bound[4]))+float(abs(bound[5]))+1)
  use_strand = '+'
  #print [plus_score,minus_score]
  if plus_score < minus_score: use_strand = '-'
  #print use_strand
  choice_bounds = []
  for bound in bestbounds:
    if bound[2] == use_strand:
      choice_bounds.append(bound)
  #print '---'
  #print GenePredBasics.entry_to_line(gpd_entry)
  #print bestbounds
  #print choice_bounds
  if len(choice_bounds) < 1:
    print "ERROR should have choices"
    sys.exit()
  replacements = {}
  for bound in choice_bounds:
    replacements[bound[3]] = [bound[0], bound[1]]
  junctions = []
  #print "fixed "+str(len(replacements.keys()))
  for i in range(0, len(bounds)):
    val = bounds[i]
    if i in replacements:
      #sys.stderr.write("use replacement\n")
      val = replacements[i]
      fcount += 1
    junctions.append([val[0], val[1]])
  #print junctions
  #sys.stderr.write("replace\n")
  new_gpd_line = gpd_entry['gene_name'] + "\t"
  new_gpd_line += gpd_entry['name'] + "\t"
  new_gpd_line += gpd_entry['chrom'] + "\t"
  new_gpd_line += gpd_entry['strand'] + "\t"
  new_gpd_line += str(gpd_entry['txStart']) + "\t"
  new_gpd_line += str(gpd_entry['txEnd']) + "\t"
  new_gpd_line += str(gpd_entry['cdsStart']) + "\t"
  new_gpd_line += str(gpd_entry['cdsEnd']) + "\t"
  new_gpd_line += str(len(junctions)+1) + "\t"
  exon_starts = [gpd_entry['txStart']]
  exon_ends = [] #gpd_entry['txEnd']]
  for junc in junctions:
    exon_starts.append(junc[1]-1)
    exon_ends.append(junc[0])
  exon_ends.append(gpd_entry['txEnd'])
  new_gpd_line += ','.join([str(x) for x in exon_starts])+','+"\t"
  new_gpd_line += ','.join([str(x) for x in exon_ends])+','+"\t"
  #print new_gpd_line
  new_gpd_entry = GenePredBasics.line_to_entry(new_gpd_line)
  #print "got junctions"
  #print '.........'
  return new_gpd_entry
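# Illustration of the refjun structure that nudge() above expects; it is the
# per-chromosome junction dictionary built in main() as
# ref[chrom][exonEnds[i-1]][exonStarts[i]+1] = strand. The coordinates below are
# made up for the example:
#   refjun = {1500: {1701: '+'}, 2300: {2451: '+'}}
# A junction whose end/start fall within args.search_size of one of these keys is
# snapped to the reference coordinates, and the strands of the matched reference
# junctions vote on which strand's replacements are applied.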
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('a', nargs=1, help='FILENAME genepred file A')
  parser.add_argument('b', nargs=1, help='FILENAME genepred file B')
  #parser.add_argument('-p',nargs='?',help='INT the number of threads to run.')
  parser.add_argument('--minexoncount', nargs='?', help='INT the minimum number of exons required.')
  parser.add_argument('--minoverlap_internal', nargs='?', help='FLOAT the fraction (0-1) of the required reciprocal overlap of an internal exon to call an exon a match.')
  parser.add_argument('--minoverlap_first', nargs='?', help='FLOAT the fraction (0-1) of the required reciprocal overlap of the first exon to call an exon a match.')
  parser.add_argument('--minoverlap_last', nargs='?', help='FLOAT the fraction (0-1) of the required reciprocal overlap of the last exon to call an exon a match.')
  parser.add_argument('--minoverlap', nargs='?', help='FLOAT the fraction (0-1) of the required reciprocal overlap of any exon to call an exon a match.')
  parser.add_argument('--leftouterjoin', action='store_true', help='Output entry A regardless of whether a matching entry in B is found')
  parser.add_argument('--output_a_not_in_b', action='store_true', help='Output entries that occur in A but not B')
  parser.add_argument('--best_b_only', action='store_true', help='Output only one entry of B for each A and try to pick the best based on reciprocal overlap')
  parser.add_argument('--allow_a_subset_of_b_fragments', action='store_true', help='If A is just a subset of B, then call it as a match. This means all exons of A found a consecutive match, but B could have more exons on either end.')
  parser.add_argument('--allow_any_fragments', action='store_true', help='If set, allow any partial match, not just the best')
  args = parser.parse_args()
  #pcount = multiprocessing.cpu_count()
  #if args.p: pcount = int(args.p)
  # go through contingencies of overlap requirements and set them
  overlap = [0, 0, 0]
  if args.minoverlap:
    overlap = [float(args.minoverlap), float(args.minoverlap), float(args.minoverlap)]
  if args.minoverlap_first:
    overlap[0] = float(args.minoverlap_first)
  if args.minoverlap_last:
    overlap[2] = float(args.minoverlap_last)
  if args.minoverlap_internal:
    overlap[1] = float(args.minoverlap_internal)
  # read the genepred files
  gpdA = GenePredBasics.GenePredFile(args.a[0])
  gpdB = GenePredBasics.GenePredFile(args.b[0])
  #if pcount > 1:
  #  p = multiprocessing.Pool(processes=pcount)
  for eA in gpdA.entries:
    #if pcount > 1:
    #  p.apply_async(check_B_entries,[eA,overlap,args])
    #else:
    check_B_entries(eA, gpdB, overlap, args)
def check_B_entries(eA, gpdB, overlap, args):
  a_unique = True
  best_exon_count = 0
  best_overlap = 0
  best_line = ''
  best_frac = 0
  ostring = ''
  for eB in gpdB.entries:
    double_line = GenePredBasics.entry_to_line(eA.entry) + "\t" + GenePredBasics.entry_to_line(eB.entry) + "\n"
    gpd_comparison = GenePredBasics.GenePredComparison()
    gpd_comparison.set_overlap_requirement(overlap)
    if eA.entry['chrom'] != eB.entry['chrom']: continue
    # normal is to do full length matches
    if not (args.allow_a_subset_of_b_fragments or args.allow_any_fragments):
      # do some easy checks
      if eA.get_exon_count() != eB.get_exon_count(): continue
      gpd_comparison.set_require_all_exons_overlap(True)
      gpd_comparison.compare(eA, eB)
      if gpd_comparison.output['full_match']:
        a_unique = False
        if args.output_a_not_in_b:
          break # we can bust out of the inner loop if we are only printing stuff unique to a
        if not args.best_b_only: # if we aren't waiting for the best, print it
          ostring += double_line
        else: # only do the best
          if gpd_comparison.output['consecutive_exons'] > best_exon_count \
             or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
             and gpd_comparison.output['overlap_length'] > best_overlap) \
             or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
             and gpd_comparison.output['overlap_length'] == best_overlap \
             and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
            best_exon_count = gpd_comparison.output['consecutive_exons']
            best_overlap = gpd_comparison.output['overlap_length']
            best_line = double_line
            best_frac = harmonic_mean(gpd_comparison.output['overlap_fractions'])
    # Allow partial matches
    else:
      gpd_comparison.compare(eA, eB)
      if gpd_comparison.output['partial_match']:
        # if we require a to be subset of b
        if args.allow_a_subset_of_b_fragments \
           and not (eA.get_exon_count() < eB.get_exon_count() \
           and eA.get_exon_count() == gpd_comparison.output['consecutive_exons']):
          break
        a_unique = False
        if args.output_a_not_in_b:
          break
        if not args.best_b_only:
          ostring += double_line
        else: # only do the best
          if gpd_comparison.output['consecutive_exons'] > best_exon_count \
             or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
             and gpd_comparison.output['overlap_length'] > best_overlap) \
             or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
             and gpd_comparison.output['overlap_length'] == best_overlap \
             and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
            best_exon_count = gpd_comparison.output['consecutive_exons']
            best_overlap = gpd_comparison.output['overlap_length']
            best_line = double_line
            best_frac = harmonic_mean(gpd_comparison.output['overlap_fractions'])
  if best_exon_count > 0 and args.best_b_only:
    ostring += best_line
  if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
    ostring += GenePredBasics.entry_to_line(eA.entry)+"\n"
  sys.stdout.write(ostring)
  #oval.put(ostring)
  return
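# check_B_entries above calls a harmonic_mean helper that is not shown in this
# section. A minimal sketch consistent with how it is used (a list of per-exon
# overlap fractions); this is an assumption, not necessarily the original
# implementation:
def harmonic_mean(fractions):
  # Harmonic mean of a list of fractions; an empty list or any zero fraction
  # (an exon with no overlap) yields 0 so such comparisons never win the tie-break.
  if len(fractions) == 0: return 0
  if 0 in fractions: return 0
  return float(len(fractions))/sum([1.0/float(x) for x in fractions])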
def main():
  parser = argparse.ArgumentParser(description='report regions that lack annotations')
  parser.add_argument('--read_annotations', help="FILENAME either rawoutput or bestoutput from annotate_psl_with_gpd")
  parser.add_argument('--bam', help="FILENAME of sorted bam file", required=True)
  parser.add_argument('--tempdir', default="/tmp", help="DIRECTORY of where temporary files can be stored")
  parser.add_argument('--depth', type=int, help="INT Instead of checking many depths only check this depth")
  parser.add_argument('--minintron', default=68, type=int, help="INT minimum size of intron default 68")
  parser.add_argument('--maxintron', default=100000, type=int, help="INT maximum size of intron default 100000")
  parser.add_argument('--gpdoutput', help="FILENAME store the genepred file created")
  parser.add_argument('--output', '-o', help="FILENAME bed format output")
  group2 = parser.add_mutually_exclusive_group()
  group2.add_argument('--full', action='store_true', help="Exclude reads with full matches, retaining only partial and novel matches.")
  group2.add_argument('--partial', action='store_true', help="Exclude reads with partial matches, retaining only novel reads DEFAULT.")
  args = parser.parse_args()
  depth = {}
  if not os.path.exists(args.tempdir):
    sys.stderr.write("could not find temporary directory path\n")
    return
  if not os.path.exists(args.tempdir.rstrip("/")+"/weirathe"):
    os.makedirs(args.tempdir.rstrip("/")+"/weirathe")
  tdir = args.tempdir.rstrip("/") + "/weirathe/weirathe.orphan"+str(randint(1,10000000))
  sys.stderr.write("Using temporary directory: "+tdir+"\n")
  if not os.path.exists(tdir):
    os.makedirs(tdir)
  # iterate through read annotations
  annotated_reads = set()
  if args.read_annotations:
    with open(args.read_annotations) as inf:
      for line in inf:
        line = line.rstrip()
        if re.match('^psl_entry_id\s', line): continue
        if re.match('^$', line): continue
        f = line.split("\t")
        if args.full: # we only want the full matches
          if f[9] != 'Full': continue
        annotated_reads.add(f[1])
  if args.bam:
    # Later we will want to have chromosome lengths
    cmd0 = "samtools view -H "+args.bam
    ps0 = subprocess.Popen(cmd0.split(), stdout=subprocess.PIPE)
    of0 = open(tdir+"/lengths.txt", 'w')
    for line in ps0.stdout:
      line = line.rstrip()
      if re.match('^@SQ', line):
        m1 = re.search('\sSN:(\S+)', line)
        m2 = re.search('\sLN:(\S+)', line)
        if m1 and m2:
          of0.write(m1.group(1)+"\t"+m2.group(1)+"\n")
    of0.close()
    ps0.communicate()
    # first filter our bam
    cmd1 = "samtools view -h "+args.bam
    ps1 = subprocess.Popen(cmd1.split(), stdout=subprocess.PIPE)
    cmd2 = "samtools view -Sb -o "+tdir+"/temp.bam"+" -"
    ps2 = subprocess.Popen(cmd2.split(), stdin=subprocess.PIPE)
    for line in ps1.stdout:
      f = line.rstrip().split("\t")
      if len(f) < 9: # header lines pass straight through
        ps2.stdin.write(line)
        continue
      if f[0] not in annotated_reads:
        ps2.stdin.write(line)
    ps1.stdout.close()
    ps2.communicate()
    # Now sort the new bam file
    cmd3 = "samtools sort "+tdir+"/temp.bam"+" "+tdir+"/temp.sorted"
    subprocess.call(cmd3.split())
    # Now get the coverage information
    cmd4 = "bedtools genomecov -bg -split -ibam "+tdir+"/temp.sorted.bam"
    coverage_file = tdir+"/temp.bed"
    of4 = open(coverage_file, 'w')
    subprocess.call(cmd4.split(), stdout=of4)
    of4.close()
  # find our maxdepth
  maxdepth = 0
  with open(coverage_file) as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      cov = int(f[3])
      if cov > maxdepth: maxdepth = cov
  print maxdepth
  # for all our depths make a bed file to explore
  fhs = {}
  depths = []
  d = 1 # starting depth
  while d < maxdepth:
    depths.append(d)
    d *= 2
  depths.append(maxdepth)
  if args.depth:
    depths = [args.depth]
  sys.stderr.write(str(depths)+"\n")
  for i in depths:
    fhs[i] = open(tdir+"/depth."+str(i)+".bed", 'w')
  with open(coverage_file) as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      cov = int(f[3])
      for i in depths:
        if cov >= i:
          fhs[i].write(line)
        else:
          continue
  for i in fhs:
    fhs[i].close()
  # sort the bed files
  for i in depths:
    cmd5 = "bedtools sort -i "+tdir+"/depth."+str(i)+".bed"
    of5 = open(tdir+"/depth."+str(i)+".sorted.bed", 'w')
    subprocess.call(cmd5.split(), stdout=of5)
    of5.close()
  # for each of our depths get the merged bed
  z = 0
  if args.gpdoutput:
    ofgpd = open(args.gpdoutput, 'w')
  ofout = sys.stdout
  if args.output:
    ofout = open(args.output, 'w')
  for i in depths:
    #compress_depth(tdir,i,args.minintron)
    bfile = tdir + "/depth."+str(i)+".sorted.bed"
    gpd_entries = GenePredBasics.bed_to_genepred(args.minintron, args.maxintron, bfile)
    for e in gpd_entries:
      z += 1
      iter = e.entry['name']
      name = "depth-"+str(i)+"_"+str(iter)
      e.entry['gene_name'] = str(i)
      e.entry['name'] = name
      line = e.get_line()
      length = e.length()
      exons = e.get_exon_count()
      if args.gpdoutput:
        ofgpd.write(line+"\n")
      ofout.write(e.entry['chrom'] + "\t" + str(e.entry['txStart']) + "\t" + str(e.entry['txEnd']) + "\t" + str(i) + "\t" + str(exons) + "\t" + str(length) + "\t" + name + "\n")
  rmtree(tdir)
def main():
  parser = argparse.ArgumentParser(description="Make a universal genepred and key for comparing IDP results")
  parser.add_argument('--output_directory', default='IDP_output_merge', help='DIRECTORY to write output to. Will not overwrite existing')
  parser.add_argument('genepred_exp_name_sets', nargs='+', help="three items for each IDP run: 1) a genepred file, 2) an expression file, 3) a sample name.")
  args = parser.parse_args()
  mydir = args.output_directory.rstrip('/')
  if os.path.isdir(mydir):
    sys.stderr.write("ERROR: output directory " + mydir + " already exists\n")
    return
  os.makedirs(mydir)
  set_args = args.genepred_exp_name_sets
  if len(set_args) % 3 != 0:
    sys.stderr.write("Data must be in sets of three\n")
  setnum = 0
  resultnumber = 0
  numbers = {}
  byset = {}
  chromosomes = set()
  established_names = {}
  expression = {}
  sample_names = set()
  while len(set_args) > 0:
    setnum += 1
    gpd = set_args.pop(0)
    exp = set_args.pop(0)
    sample_name = set_args.pop(0)
    sample_names.add(sample_name)
    sys.stderr.write("Set: " + str(setnum) + "\n")
    sys.stderr.write(" GenePred: " + gpd + "\n")
    sys.stderr.write(" Expression: " + exp + "\n")
    sys.stderr.write(" Sample: " + sample_name + "\n")
    with open(gpd) as inf:
      for line in inf:
        if re.match('^#', line): continue
        e = GenePredBasics.GenePredEntry()
        e.line_to_entry(line)
        chromosomes.add(e.entry['chrom'])
        junctions = e.junctions
        resultnumber += 1
        junstring = ";".join(junctions)
        if junstring not in byset:
          byset[junstring] = set()
        byset[junstring].add(resultnumber)
        numbers[resultnumber] = [sample_name, e.entry['name'], e]
    with open(exp) as inf:
      for line in inf:
        f = line.rstrip().split("\t")
        if sample_name not in expression:
          expression[sample_name] = {}
        expression[sample_name][f[0]] = [float(f[1]), float(f[2])] # transcript and gene expression
  #bysample = {}
  gene_records = {}
  for junc in byset:
    lowest = False
    highest = False
    realnames = set()
    realgenenames = set()
    chromnames = set()
    chromgenenames = set()
    arbitrary_gpd = False
    sgpds = {}
    for i in byset[junc]:
      [sample, name, gpd] = numbers[i]
      gene_name = gpd.entry['gene_name']
      arbitrary_gpd = gpd
      sgpds[sample] = gpd
      # Figure out if it's a reference transcript name or an IDP manufactured name
      m = re.match('^([^:]+):\d+-\d+', name)
      if not m:
        realnames.add(name)
      else:
        chromnames.add(m.group(1))
      # Figure out if it's a reference gene name or an IDP manufactured gene name
      m = re.match('^([^:]+):\d+-\d+', gene_name)
      if not m:
        realgenenames.add(gene_name)
      else:
        chromgenenames.add(m.group(1))
      if not lowest or gpd.entry['txStart'] < lowest:
        lowest = gpd.entry['txStart']
      if not highest or gpd.entry['txEnd'] > highest:
        highest = gpd.entry['txEnd']
      #if sample not in bysample:
      #  bysample[sample] = {}
      #if name not in bysample[sample]:
      #  bysample[sample][name] = i
    usename = False
    basename = False
    if len(realnames) > 0:
      usename = next(iter(realnames))
      if len(realnames) > 1:
        sys.stderr.write("WARNING: multiple transcript names with the same junctions.\n" + str(realnames) + "\nUsing: " + str(usename) + "\n")
      if usename in established_names:
        sys.stderr.write("WARNING: reference transcript name " + usename + " refers to different transcripts with different junction compositions. Renaming the second instance to a unique name.\n")
        established_names[usename] += 1
        usename = usename + '.' + str(established_names[usename])
      else:
        established_names[usename] = 0
    else:
      usechrom = next(iter(chromnames))
      if len(chromnames) > 1:
        sys.stderr.write("ERROR: multiple chromosome names are not supported in a single transcript yet.\n" + str(chromnames) + "\n")
        sys.exit()
      basename = usechrom + ":" + str(lowest) + '-' + str(highest)
      if basename not in established_names:
        established_names[basename] = 0
      established_names[basename] += 1
      usename = basename + '.' + str(established_names[basename])
    # See if we have a real gene name for base name
    if len(realgenenames) > 0:
      basename = next(iter(realgenenames))
    #print basename + "\t" + usename
    if basename not in gene_records:
      gene_records[basename] = {}
    gene_records[basename][usename] = {}
    gene_records[basename][usename]['sample_gpd'] = {}
    gene_records[basename][usename]['sample_exp'] = {}
    gene_records[basename][usename]['gpd'] = GenePredBasics.GenePredEntry()
    # copy the old record
    gene_records[basename][usename]['gpd'].line_to_entry(arbitrary_gpd.get_line())
    if lowest < gene_records[basename][usename]['gpd'].entry['txStart']:
      sys.stderr.write("ADJUSTING NEW GPD TXSTART FOR " + basename + " " + usename + "\n")
      gene_records[basename][usename]['gpd'].entry['txStart'] = lowest
      gene_records[basename][usename]['gpd'].entry['cdsStart'] = lowest
      gene_records[basename][usename]['gpd'].entry['exonStarts'][0] = lowest
    if highest > gene_records[basename][usename]['gpd'].entry['txEnd']:
      sys.stderr.write("ADJUSTING NEW GPD TXEND FOR " + basename + " " + usename + "\n")
      gene_records[basename][usename]['gpd'].entry['txEnd'] = highest
      gene_records[basename][usename]['gpd'].entry['cdsEnd'] = highest
      gene_records[basename][usename]['gpd'].entry['exonEnds'][len(gene_records[basename][usename]['gpd'].entry['exonEnds'])-1] = highest
    # Now add the original sample information
    for sample in sgpds:
      gene_records[basename][usename]['sample_gpd'][sample] = sgpds[sample]
      gene_records[basename][usename]['sample_exp'][sample] = expression[sample][sgpds[sample].entry['name']][0]
  # Now all necessary data should be in gene_records
  sample_list = sorted(list(sample_names))
  ofgene = open(mydir + '/gene.exp', 'w')
  ofgene.write("gene")
  for sample in sample_list:
    ofgene.write("\t" + sample)
  ofgene.write("\n")
  geneexp = {}
  for gene in gene_records:
    total = {}
    for sample in sample_list:
      total[sample] = 0
    geneexp[gene] = {}
    for transcript in gene_records[gene]:
      for sample in gene_records[gene][transcript]['sample_exp']:
        total[sample] += gene_records[gene][transcript]['sample_exp'][sample]
    ofgene.write(gene)
    for sample in sample_list:
      geneexp[gene][sample] = total[sample]
      ofgene.write("\t" + str(total[sample]))
    ofgene.write("\n")
  ofgene.close()
  # Now we can do all the transcript writing
  ofgeneiso = open(mydir + '/gene_isoform.exp', 'w')
  ofgeneiso.write("gene\tisoform")
  for sample in sample_list:
    ofgeneiso.write("\t" + sample + ".gene" + "\t" + sample + ".isoform")
  ofgeneiso.write("\n")
  ofiso = open(mydir + '/isoform.exp', 'w')
  ofiso.write("isoform")
  for sample in sample_list:
    ofiso.write("\t" + sample)
  ofiso.write("\n")
  for gene in gene_records:
    for transcript in gene_records[gene]:
      ofiso.write(transcript)
      ofgeneiso.write(gene + "\t" + transcript)
      for sample in sample_list:
        if sample in gene_records[gene][transcript]['sample_exp']:
          ofgeneiso.write("\t" + str(geneexp[gene][sample]) + "\t" + str(gene_records[gene][transcript]['sample_exp'][sample]))
          ofiso.write("\t" + str(gene_records[gene][transcript]['sample_exp'][sample]))
        else:
          ofiso.write("\t0")
          if sample in geneexp[gene]:
            ofgeneiso.write("\t" + str(geneexp[gene][sample]) + "\t0")
          else:
            ofgeneiso.write("\t0\t0")
      ofiso.write("\n")
      ofgeneiso.write("\n")
  ofiso.close()
  ofgeneiso.close()
  # Maybe we can finish it all off by writing the new genepred
  ofgpd = open(mydir + '/isoform.gpd', 'w')
  for gene in gene_records:
    for transcript in gene_records[gene]:
      ofgpd.write(gene_records[gene][transcript]['gpd'].get_line() + "\n")
  ofgpd.close()