def parse_pslfile(tdir, pslfile, smoothing_factor):
    """Convert long-read PSL alignments into a gap-smoothed genepred file.

    Reads PSL lines from ``pslfile`` (or from standard input when
    ``pslfile`` is ``'-'``), converts each one to a genepred entry, smooths
    its gaps with ``smoothing_factor`` and writes it to
    ``<tdir>/longreads.gpd``.  Every written entry is renamed to the ordinal
    of its input line (comment lines are not counted), so output names are
    unique even when the input reuses a read name.

    Args:
        tdir: directory that receives ``longreads.gpd``.
        pslfile: path of the PSL input, or ``'-'`` for standard input.
        smoothing_factor: value forwarded to ``GenePredBasics.smooth_gaps``.

    Warnings about unconvertible lines and repeated read names are written
    to standard error; neither stops processing.
    """
    read_from_stdin = pslfile == '-'
    if read_from_stdin:
        fr = sys.stdin
    else:
        fr = FileBasics.GenericFileReader(pslfile)
    seennames = set()
    longreadnumber = 0
    of_gpd = open(tdir + '/longreads.gpd', 'w')
    try:
        while True:
            line = fr.readline()
            if not line:
                break
            if line.startswith('#'):  # skip comments
                continue
            longreadnumber += 1
            gpd_line = PSLBasics.convert_entry_to_genepred_line(
                PSLBasics.line_to_entry(line.rstrip()))
            if not gpd_line:
                # The read name only becomes available after a successful
                # conversion, so identify the bad record by its ordinal.
                sys.stderr.write("Warning: malformed psl for long read number "
                                 + str(longreadnumber) + "\n")
                continue
            entry = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(gpd_line), smoothing_factor)
            readname = entry['name']
            if readname in seennames:
                sys.stderr.write("Warning: repeat name '" + readname + "'\n")
            # Remember the name so a later duplicate can actually be reported.
            seennames.add(readname)
            # Rename the entry to its ordinal ("bin") in the input.
            entry['name'] = str(longreadnumber)
            of_gpd.write(GenePredBasics.entry_to_line(entry) + "\n")
    finally:
        of_gpd.close()
        # Only close what this function opened; leave stdin to the caller.
        if not read_from_stdin:
            fr.close()
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Return one tab-delimited row per exon for every alignment in ``seqs``.

    Each element of ``seqs`` is merged into a copy of the template SAM entry
    ``d``, rendered as a SAM line, converted with ``spcf.convert_line`` and
    parsed as PSL, then turned into a genepred entry whose gaps are smoothed
    with ``min_intron_size``.

    Args:
        seqs: sequence of alignment descriptions.  Index 1 is used as the
            reference name, 2 as the strand (``'+'`` gives flag 0, anything
            else flag 16), 3 as the position and 4 as the CIGAR string.
            The first element is labelled ``'P'``, all later ones ``'S'``.
        d: template SAM entry (a dict); it is copied, never modified.
        spcf: converter object providing ``convert_line(sam_line)``.
        min_intron_size: deletions shorter than this are counted as indel
            bases, and it is the gap size passed to ``smooth_gaps``.
            Defaults to 68, the value that was previously hard-coded.

    Returns:
        A string of newline-terminated rows with the columns chrom, exon
        start, exon end, strand, name, total ``M`` bases, indel bases,
        ``'P'``/``'S'`` and the length of a leading ``S``/``H`` clip.
        Returns ``''`` when ``seqs`` is empty.
    """
    # CIGAR operations that add nothing to the placeholder read sequence.
    skips = set(['H', 'D', 'N'])
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'S' if sind > 1 else 'P'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar_array = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar_array

        # A leading soft or hard clip gives the query offset; an empty
        # CIGAR simply has no clip instead of raising IndexError.
        qstart = 0
        if cigar_array and cigar_array[0]['op'] in ('S', 'H'):
            qstart = cigar_array[0]['val']

        total_length = 0
        possible_matches = 0
        indels = 0
        for ce in cigar_array:
            op, val = ce['op'], ce['val']
            if op not in skips:
                total_length += val
            if op == 'M':
                possible_matches += val
            elif op == 'I' or (op == 'D' and val < min_intron_size):
                indels += val

        # The real bases are not needed downstream, only their count.
        d1['seq'] = 'N' * total_length
        pentry = PSLBasics.line_to_entry(
            spcf.convert_line(SamBasics.entry_to_line(d1)))
        gentry = GenePredBasics.line_to_entry(
            PSLBasics.convert_entry_to_genepred_line(pentry))
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)

        tail = "\t".join([gsmooth['strand'], gsmooth['name'],
                          str(possible_matches), str(indels), psec,
                          str(qstart)])
        for i in range(len(gsmooth['exonStarts'])):
            rows.append("\t".join([gsmooth['chrom'],
                                   str(gsmooth['exonStarts'][i]),
                                   str(gsmooth['exonEnds'][i]),
                                   tail]) + "\n")
    return ''.join(rows)
def main():
    """Move long-read junctions onto nearby reference junctions.

    Loads the reference genepred and indexes its junctions per chromosome,
    then reads PSL alignments one line at a time.  Each alignment is
    converted to a genepred entry, gap-smoothed with ``--min_intron_size``,
    handed to ``nudge`` together with the junctions of its target sequence,
    and printed -- as a genepred line, or as a fake PSL line when
    ``--output_fake_psl`` names a genome FASTA.

    PSL lines whose block arrays have inconsistent lengths are reported on
    standard error and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # This positional is a genepred file, not a FASTA file.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and index its junctions as
    # ref[chrom][end of exon i-1][start of exon i, plus one] = strand.
    # The first transcript that defines a junction decides its strand.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1:
            continue  # single-exon transcripts carry no junctions
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            by_right_edge = chrom_junctions.setdefault(e['exonEnds'][i - 1], {})
            # Stored all junctions as 1-base
            by_right_edge.setdefault(e['exonStarts'][i] + 1, e['strand'])

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # Targets without reference annotation get an empty junction set.
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # The reader was previously left open.
        pf.close()
def get_exons_from_seqs(seqs,d,spcf):
    """Build a tab-delimited exon table for a list of candidate alignments.

    For every element of ``seqs`` a copy of the SAM template ``d`` is filled
    in (reference name from index 1, strand from index 2, position from
    index 3, CIGAR from index 4), rendered as a SAM line, converted through
    ``spcf.convert_line`` into PSL and then into a genepred entry that is
    smoothed with a gap size of 68.

    Returns a string with one newline-terminated row per exon: chrom, exon
    start, exon end, strand, name, total ``M`` bases, indel bases (``I``
    plus ``D`` shorter than 68), ``'P'`` for the first alignment or ``'S'``
    for any later one, and the length of a leading ``S``/``H`` clip.
    """
    non_sequence_ops = ('H', 'D', 'N')
    pieces = []
    for rank, hit in enumerate(seqs):
        label = 'P' if rank == 0 else 'S'  # primary or secondary

        record = d.copy()
        record['rname'] = hit[1]
        if hit[2] == '+':
            record['flag'] = 0
        else:
            record['flag'] = 16
        record['pos'] = hit[3]
        record['cigar'] = hit[4]
        ops = SamBasics.parse_cigar(hit[4])
        record['cigar_array'] = ops

        # Offset into the read when the alignment opens with a clip.
        clip = ops[0]['val'] if ops[0]['op'] in ('S', 'H') else 0

        read_length = sum(c['val'] for c in ops
                          if c['op'] not in non_sequence_ops)
        match_bases = sum(c['val'] for c in ops if c['op'] == 'M')
        gap_bases = sum(c['val'] for c in ops
                        if c['op'] == 'I'
                        or (c['op'] == 'D' and c['val'] < 68))

        # Placeholder bases: only the length matters for the conversion.
        record['seq'] = 'N' * read_length
        sam_line = SamBasics.entry_to_line(record)
        psl_entry = PSLBasics.line_to_entry(spcf.convert_line(sam_line))
        gpd_line = PSLBasics.convert_entry_to_genepred_line(psl_entry)
        smoothed = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(gpd_line), 68)

        for idx in range(0, len(smoothed['exonStarts'])):
            pieces.append(
                smoothed['chrom'] + "\t"
                + str(smoothed['exonStarts'][idx]) + "\t"
                + str(smoothed['exonEnds'][idx]) + "\t"
                + smoothed['strand'] + "\t"
                + smoothed['name'] + "\t"
                + str(match_bases) + "\t"
                + str(gap_bases) + "\t"
                + label + "\t"
                + str(clip) + "\n")
    return ''.join(pieces)
def set_mapping_counts(self, psl_filename):
    """Count the alignments of each query name in a PSL file.

    Stores a dict mapping ``qName`` to its number of parsed lines in
    ``self.mapping_counts`` and then sets ``self.mapping_counts_set`` to
    True.  Lines that ``PSLBasics.line_to_entry`` cannot parse are reported
    on standard error and skipped.

    Args:
        psl_filename: name handed to ``GenericFileReader``.
    """
    gfr0 = GenericFileReader(psl_filename)
    qcnts = {}
    try:
        while True:
            line = gfr0.readline()
            if not line:
                break
            try:
                psle = PSLBasics.line_to_entry(line.rstrip())
            except Exception:
                # Best-effort parsing, but do not swallow KeyboardInterrupt
                # or SystemExit the way a bare except would.
                sys.stderr.write("Problem parsing line:\n" + line.rstrip() + "\n")
                continue
            qname = psle['qName']
            qcnts[qname] = qcnts.get(qname, 0) + 1
    finally:
        # Release the reader even when a line raises unexpectedly.
        gfr0.close()
    self.mapping_counts = qcnts
    # Raise the flag only once the counts really are available.
    self.mapping_counts_set = True
def set_mapping_counts(self, psl_filename):
    """Tally how many lines of a PSL file belong to each query name.

    Sets ``self.mapping_counts_set`` to True, reads ``psl_filename`` through
    ``GenericFileReader`` and stores the resulting ``{qName: count}`` dict in
    ``self.mapping_counts``.  A line that fails to parse is echoed to
    standard error and ignored.
    """
    self.mapping_counts_set = True
    reader = GenericFileReader(psl_filename)
    tallies = {}
    raw = reader.readline()
    while raw:
        try:
            parsed = PSLBasics.line_to_entry(raw.rstrip())
        except:  # deliberately broad: any parse failure just skips the line
            sys.stderr.write("Problem parsing line:\n" + raw.rstrip() + "\n")
        else:
            query = parsed['qName']
            tallies[query] = tallies.get(query, 0) + 1
        raw = reader.readline()
    reader.close()
    self.mapping_counts = tallies
def main():
    """Command-line entry point: print a genepred line for every PSL line.

    Input comes from the named file, or from standard input when the name
    is ``-``.  With ``--fill_gaps N`` (N > 0) each genepred entry is passed
    through ``GenePredBasics.smooth_gaps`` with N before it is printed.
    """
    cli = argparse.ArgumentParser(
        description="Convert a psl file into a target formated genepred file.")
    cli.add_argument('--fill_gaps', type=int, default=0,
                     help="Close gaps this size or smaller.")
    cli.add_argument('input_name',
                     help="Input PSL file, use - to indicate STDIN.")
    args = cli.parse_args()

    if args.input_name == '-':
        pslfilehandle = sys.stdin
    else:
        pslfilehandle = open(args.input_name)

    with pslfilehandle as infile:
        for raw in infile:
            converted = PSLBasics.convert_entry_to_genepred_line(
                PSLBasics.line_to_entry(raw))
            if args.fill_gaps > 0:
                # Round-trip through an entry so the gaps can be closed.
                smoothed = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(converted), args.fill_gaps)
                converted = GenePredBasics.entry_to_line(smoothed)
            print(converted)
def main():
    """Analyze query-ordered PSL alignments one read at a time.

    Consecutive PSL lines that share a ``qName`` are collected in a
    ``PSLBasics.MultiplePSLAlignments`` buffer.  When the name changes, the
    buffer is handed to ``process_buffer`` and its result to
    ``print_result`` -- through a worker pool when ``--threads`` is greater
    than one, otherwise inline.  A query name that reappears after another
    name has been seen means the input is not ordered and aborts the run
    with a non-zero exit status.

    Results go to the module-level handle ``of``; this function rebinds it
    to ``--output`` or, with ``--tempbuffer``, to a scratch file in that
    directory that is copied to ``--output`` at the end and then removed.
    NOTE(review): when neither option is given ``of`` keeps whatever the
    module initialised it to (presumably standard output) -- confirm.
    """
    global of

    parser = argparse.ArgumentParser(description="Analyze ORDERED psl alignments of long reads.")
    parser.add_argument('psl_file', help="Alignment file. Must be ordered by query name. use - for stdin")
    # Output cannot default to an input stream; the help text used to say STDIN.
    parser.add_argument('-o', '--output', help="Write to output file, default is STDOUT")
    parser.add_argument('--noheader', action='store_true')
    parser.add_argument('--minimum_coverage', type=int, help="Only consider alignments with at least this many bp aligned")
    parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), help="INT default cpu_count")
    parser.add_argument('--tempbuffer', help="DIRECTORY store the results in a temporary file until they are ready to output. suggest using /tmp if you don't know what to use")
    args = parser.parse_args()

    inf = sys.stdin
    if args.psl_file != '-':
        inf = open(args.psl_file)

    tname = None
    if args.tempbuffer:
        if not args.output:
            sys.stderr.write("ERROR if you want to buffer outputs in a temp file you need to specify a final output file.\n")
            sys.exit(1)  # report the failure to the calling shell
        rnum = random.randint(1, 1000000000)
        tname = args.tempbuffer.rstrip('/') + '/weirathe.' + str(rnum) + '.meta'
        of = open(tname, 'w')
    elif args.output:
        of = open(args.output, 'w')

    use_pool = args.threads > 1
    pool = multiprocessing.Pool(args.threads) if use_pool else None
    # Without --minimum_coverage the option is None; never compare None to an int.
    apply_coverage = args.minimum_coverage is not None and args.minimum_coverage > 1

    def dispatch(alignments):
        """Process one finished read, in the pool or inline."""
        if use_pool:
            pool.apply_async(process_buffer, [alignments], callback=print_result)
        else:
            print_result(process_buffer(alignments))

    seen_names = set()
    last_name = ''
    current = PSLBasics.MultiplePSLAlignments()
    for line in inf:
        stripped = line.rstrip()
        e = PSLBasics.line_to_entry(stripped)
        if e['qName'] != last_name:  # we have a new name
            if e['qName'] in seen_names:
                sys.stderr.write("ERROR psl entries are not ordered by query name.\n")
                sys.exit(1)
            seen_names.add(e['qName'])
            if current.get_alignment_count() > 0:
                dispatch(current)
            current = PSLBasics.MultiplePSLAlignments()
            if apply_coverage:
                current.set_minimum_coverage(args.minimum_coverage)
        last_name = e['qName']
        current.add_entry(PSLBasics.PSL(stripped))
    inf.close()

    # Flush the final read, if any.
    if current.get_alignment_count() > 0:
        dispatch(current)
    if use_pool:
        pool.close()
        pool.join()
    of.close()

    if args.tempbuffer:
        # Copy the completed scratch file to its final destination.
        of = open(args.output, 'w')
        with open(tname) as scratch:
            for line in scratch:
                of.write(line)
        of.close()
        os.remove(tname)
def main():
    """Analyze query-ordered PSL alignments with a pool of workers.

    Writes a header line to the module-level handle ``of`` (unless
    ``--noheader`` is given), then groups consecutive PSL entries that share
    a ``qName`` into a ``PSLBasics.MultiplePSLAlignments`` buffer and
    submits each finished buffer to ``process_buffer`` in a
    ``multiprocessing.Pool``, with ``print_result`` as the callback.  A
    query name that reappears after another name has been seen aborts the
    run with a non-zero exit status.

    ``of`` is rebound to ``--output`` or, with ``--tempbuffer``, to a
    scratch file in that directory which is copied to ``--output`` at the
    end and removed.  NOTE(review): with neither option ``of`` keeps its
    module-level value (presumably standard output) -- confirm.
    """
    global of, lock

    parser = argparse.ArgumentParser(description="Analyze ORDERED psl alignments of long reads.")
    parser.add_argument('psl_file', help="Alignment file. Must be ordered by query name. use - for stdin")
    # Output cannot default to an input stream; the help text used to say STDIN.
    parser.add_argument('--output', help="Write to output file, default is STDOUT")
    parser.add_argument('--noheader', action='store_true')
    parser.add_argument('--minimum_coverage', type=int, help="Only consider alignments with at least this many bp aligned")
    parser.add_argument('--threads', type=int, default=multiprocessing.cpu_count(), help="INT default cpu_count")
    parser.add_argument('--tempbuffer', help="DIRECTORY store the results in a temporary file until they are ready to output. suggest using /tmp if you don't know what to use")
    args = parser.parse_args()

    inf = sys.stdin
    if args.psl_file != '-':
        inf = open(args.psl_file)

    tname = None
    if args.tempbuffer:
        if not args.output:
            sys.stderr.write("ERROR if you want to buffer outputs in a temp file you need to specify a final output file.\n")
            sys.exit(1)  # report the failure to the calling shell
        rnum = random.randint(1, 1000000000)
        tname = args.tempbuffer.rstrip('/') + '/weirathe.' + str(rnum) + '.meta'
        of = open(tname, 'w')
    elif args.output:
        of = open(args.output, 'w')

    if not args.noheader:
        lock.acquire()
        try:
            of.write("QueryName\tSegmentCount\tLocusCount\tHasOverlapped\tHasMultiplyMapped\n")
        finally:
            # Never leave the shared lock held if the write fails.
            lock.release()

    pool = multiprocessing.Pool(args.threads)
    # Without --minimum_coverage the option is None; never compare None to an int.
    apply_coverage = args.minimum_coverage is not None and args.minimum_coverage > 1

    seen_names = set()
    last_name = ''
    current = PSLBasics.MultiplePSLAlignments()
    for line in inf:
        e = PSLBasics.line_to_entry(line.rstrip())
        if e['qName'] != last_name:  # we have a new name
            if e['qName'] in seen_names:
                sys.stderr.write("ERROR psl entries are not ordered by query name.\n")
                sys.exit(1)
            seen_names.add(e['qName'])
            if current.get_alignment_count() > 0:
                pool.apply_async(process_buffer, [current], callback=print_result)
            current = PSLBasics.MultiplePSLAlignments()
            if apply_coverage:
                current.set_minimum_coverage(args.minimum_coverage)
        last_name = e['qName']
        current.add_entry(e)
    inf.close()

    # Submit the final read, if any, then wait for every worker to finish.
    if current.get_alignment_count() > 0:
        pool.apply_async(process_buffer, [current], callback=print_result)
    pool.close()
    pool.join()
    of.close()

    if args.tempbuffer:
        # Copy the completed scratch file to its final destination.
        of = open(args.output, 'w')
        with open(tname) as scratch:
            for line in scratch:
                of.write(line)
        of.close()
        os.remove(tname)
def do_combine_operation(best_option,left,right,read,seq,args):
    """Join two PSL alignments of one read into a single PSL entry.

    Blocks of ``left`` that end before the chosen junction are copied
    unchanged, the stretch between the last kept left block and the first
    kept right block is re-aligned with ``needleman_wunsch``, and the
    remaining blocks of ``right`` are appended.  The PSL statistics are then
    recomputed from the combined block list.

    Args:
        best_option: indexable; elements 0-3 are used as ``left_target``,
            ``right_target``, ``left_query`` and ``right_query``.
            NOTE(review): the +1/-1 arithmetic below suggests these are
            1-based while the PSL starts are 0-based -- confirm with caller.
        left: PSL entry (dict) for the part before the junction; supplies
            strand, ``qName`` and ``tName`` of the result.
        right: PSL entry (dict) for the part after the junction.
        read: the read sequence; reverse-complemented with ``rc`` when
            ``left['strand']`` is ``'-'``.
        seq: the target sequence that both alignments refer to.
        args: options object; only ``args.min_intron_size`` is read.

    Returns:
        The value of ``PSLBasics.line_to_entry`` for the assembled PSL line.
    """
    left_target = best_option[0]
    right_target = best_option[1]
    left_query = best_option[2]
    right_query = best_option[3]
    # store for output
    q_start_array = []
    t_start_array = []
    block_size_array = []
    left_query_start = left['qStarts'][0]
    left_target_start = left['tStarts'][0]
    # Keep the left blocks that end before left_query.  The block that
    # reaches the junction is not copied; its start becomes the start of
    # the region that gets re-aligned.
    for i in range(0,len(left['tStarts'])):
        tstart = left['tStarts'][i]
        tend = left['tStarts'][i]+left['blockSizes'][i]  # not used in this loop
        qstart = left['qStarts'][i]
        qend = left['qStarts'][i]+left['blockSizes'][i]
        if left_query <= qstart+1: break
        left_query_start = qstart
        left_target_start = tstart
        if left_query <= qend: break
        q_start_array.append(qstart)
        t_start_array.append(tstart)
        block_size_array.append(left['blockSizes'][i])
    right_query_end = right['qStarts'][0]+right['blockSizes'][0]
    right_target_end = right['tStarts'][0]+right['blockSizes'][0]
    right_outer_index = 0
    # Find where the re-aligned region ends on the right side;
    # right_outer_index ends up as the first right block copied verbatim.
    for j in range(0,len(right['tStarts'])):
        tstart = right['tStarts'][j]
        tend = right['tStarts'][j]+right['blockSizes'][j]
        qstart = right['qStarts'][j]
        qend = right['qStarts'][j]+right['blockSizes'][j]
        right_outer_index = j+1
        if right_query <= qstart+1: break
        right_query_end = qend
        right_target_end = tend
        if right_query < qend: break
    working_read = read.upper()
    if left['strand'] == '-': working_read = rc(read.upper())
    # Query slice to re-align, and the target with the gap between
    # left_target and right_target cut out.
    pread = working_read[left_query_start:right_query_end]
    tseq = seq[left_target_start:left_target].upper()+seq[right_target-1:right_target_end].upper()
    res = needleman_wunsch(pread,tseq)
    # Fun part of making the new portion of the alignment
    # res[0] is walked as the aligned query row and res[1] as the aligned
    # target row; each run of gap-free columns becomes one PSL block.
    qindex = left_query_start
    tindex = left_target_start
    in_alignment = 0
    alignment = None   # the two text rows of the block being built
    bynumbers = None   # [q start, t start, q end, t end] of that block
    for i in range(0,len(res[0])):
        if res[0][i] == '-': #insertion in target (gap in query)
            tindex += 1
            in_alignment = 0
        elif res[1][i] == '-': #insertion in query (gap in target)
            qindex += 1
            in_alignment = 0
        else: # we are in an alignment
            if in_alignment == 0: # output buffered result
                if alignment:
                    if len(alignment[0]) > 0:
                        q_start_array.append(bynumbers[0])
                        t_start_array.append(bynumbers[1])
                        block_size_array.append(len(alignment[0]))
                alignment = ['','']
                bynumbers = [qindex,tindex,qindex,tindex]
                in_alignment = 1
            alignment[0] += res[0][i]
            alignment[1] += res[1][i]
            bynumbers[2] += 1
            bynumbers[3] += 1
            qindex+=1
            tindex+=1
        # NOTE(review): the source arrived with its indentation stripped;
        # this check is placed at loop level -- confirm against the original.
        if qindex == right_query: # switch forward
            # Jump the target coordinate across the junction and force a
            # new block when it is not already at right_target.
            if not tindex == right_target:
                in_alignment = 0
                tindex = right_target
    # Flush the block that was still open when the alignment ended.
    if alignment:
        if len(alignment[0]) > 0:
            q_start_array.append(bynumbers[0])
            t_start_array.append(bynumbers[1])
            block_size_array.append(len(alignment[0]))
    for i in range(right_outer_index,len(right['blockSizes'])):
        q_start_array.append(right['qStarts'][i])
        t_start_array.append(right['tStarts'][i])
        block_size_array.append(right['blockSizes'][i])
    #now we can finally construct a psl line
    #we won't keep track of repeats for now
    matches = 0
    misMatches = 0
    repMatches = 0
    nCount = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    strand = left['strand']
    qName = left['qName']
    qSize = len(read)
    qStart = q_start_array[0]
    qEnd = q_start_array[len(q_start_array)-1]+block_size_array[len(block_size_array)-1]
    tName = left['tName']
    tSize = len(seq)
    tStart = t_start_array[0]
    tEnd = t_start_array[len(t_start_array)-1]+block_size_array[len(block_size_array)-1]
    blockCount = len(block_size_array)
    blockSizes = ','.join([str(x) for x in block_size_array])+','
    qStarts = ','.join([str(x) for x in q_start_array])+','
    tStarts = ','.join([str(x) for x in t_start_array])+','
    prev_q_end = None
    prev_t_end = None
    # Recount matches, mismatches, Ns and inserts over the final blocks.
    for i in range(0,len(block_size_array)):
        qseg = working_read[q_start_array[i]:q_start_array[i]+block_size_array[i]]
        tseg = seq[t_start_array[i]:t_start_array[i]+block_size_array[i]].upper()
        for j in range(0,len(qseg)):
            # An N is counted in nCount and also as a match or mismatch.
            if qseg[j] == 'N': nCount += 1
            if qseg[j] == tseg[j]: matches += 1
            else: misMatches += 1
        if prev_t_end:
            t_dist = t_start_array[i]-prev_t_end
            if t_dist > 0 and t_dist < args.min_intron_size:
                #we have an insert into the target and its not an intron
                tNumInsert += 1
                tBaseInsert += t_dist
        if prev_q_end:
            q_dist = q_start_array[i]-prev_q_end
            if q_dist > 0:
                qNumInsert += 1
                qBaseInsert += q_dist
        prev_q_end = q_start_array[i]+block_size_array[i]
        prev_t_end = t_start_array[i]+block_size_array[i]
    # now we have everything to make the line
    combo_line = str(matches) + "\t" + str(misMatches) + "\t" + str(repMatches) + "\t" \
        + str(nCount) + "\t" + str(qNumInsert) + "\t" + str(qBaseInsert) + "\t" \
        + str(tNumInsert) + "\t" + str(tBaseInsert) + "\t" \
        + strand + "\t" + qName + "\t" + str(qSize) + "\t" \
        + str(qStart) + "\t" + str(qEnd) + "\t" \
        + tName + "\t" + str(tSize) + "\t" \
        + str(tStart) + "\t" + str(tEnd) + "\t" + str(blockCount) + "\t" \
        + blockSizes + "\t" + qStarts + "\t" + tStarts
    return PSLBasics.line_to_entry(combo_line)
def main():
    """Splice together partial PSL alignments of the same read.

    Steps:
      1. Load the reference genome and the reads (the read check exits if
         any read name is repeated).
      2. Optionally load reference splices from ``--genepred``.
      3. Group the input alignments by read, target sequence and strand,
         then split each group wherever two alignments lie more than
         ``--max_intron_size`` apart.
      4. Run ``execute_locus`` on every resulting locus set and hand its
         result to ``do_locus_callback``.
      5. Write every line of the module-level ``combo_results`` to
         ``--output`` (standard output by default).  NOTE(review):
         ``combo_results`` is presumably filled by ``do_locus_callback`` --
         confirm.
    """
    parser = argparse.ArgumentParser(description="splice together partial alignments")
    group1 = parser.add_mutually_exclusive_group(required=True)
    group1.add_argument('--fastq_reads')
    group1.add_argument('--fasta_reads')
    parser.add_argument('--genome', help="FASTA reference genome", required=True)
    parser.add_argument('--genepred', help="Transcriptome genepred")
    parser.add_argument('--max_intron_size', type=int, default=100000, help="INT maximum intron size")
    parser.add_argument('--min_intron_size', type=int, default=68, help="INT minimum intron size")
    parser.add_argument('--max_gap_size', type=int, default=10, help="INT gap size in query to join")
    parser.add_argument('--max_search_expand', type=int, default=10, help="INT max search space to expand search for junction")
    parser.add_argument('--direction_specific', action='store_true', help="The direction of the transcript is known and properly oriented already")
    parser.add_argument('--threads', type=int, default=0, help="INT number of threads to use default cpu_count")
    parser.add_argument('-o', '--output', default='-', help="FILENAME output results to here rather than STDOUT which is default")
    parser.add_argument('input_alignment', help="FILENAME input .psl file or '-' for STDIN")
    args = parser.parse_args()

    # Read our reference genome
    sys.stderr.write("Reading reference\n")
    ref = read_fasta_into_hash(args.genome)

    # Make sure our reads are unique
    sys.stderr.write("Checking for unqiuely named reads\n")
    # does a hard exit and error if there are any names repeated
    reads = check_for_uniquely_named_reads(args)
    sys.stderr.write("Reads are uniquely named\n")

    # Set number of threads to use
    cpu_count = multiprocessing.cpu_count()
    if args.threads > 0:
        cpu_count = args.threads

    # Set reference splices (if any are available)
    reference_splices = {}
    if args.genepred:
        sys.stderr.write("Reading reference splices from genepred\n")
        reference_splices = get_reference_splices(args)

    sys.stderr.write("Reading alignments into loci\n")
    # Locus division, first stage: loci[qName][tName][strand][first tStart]
    # is the list of alignments starting there.  The strand is that of the
    # read; the true transcript direction is not known at this point.
    inf = sys.stdin
    if args.input_alignment != '-':
        inf = open(args.input_alignment, 'r')
    loci = {}
    try:
        for line in inf:
            line = line.rstrip()
            if line.startswith('#'):
                continue
            psl = PSLBasics.line_to_entry(line)
            by_start = loci.setdefault(psl['qName'], {}) \
                           .setdefault(psl['tName'], {}) \
                           .setdefault(psl['strand'], {})
            by_start.setdefault(psl['tStarts'][0], []).append(psl)
    finally:
        # The input file used to be left open.
        if inf is not sys.stdin:
            inf.close()

    sys.stderr.write("breaking loci by genomic distance\n")
    locus_total = 0
    for qname in loci:
        for chrom in loci[qname]:
            for strand in loci[qname][chrom]:
                by_start = loci[qname][chrom][strand]
                current_set = []
                locus_sets = []
                # Low enough that the first alignment always opens a set.
                last_end = -1 * (args.max_intron_size + 2)
                for first_start in sorted(by_start):
                    for e in by_start[first_start]:
                        start = e['tStarts'][0] + 1  # base-1 start of alignment
                        if start > last_end + args.max_intron_size:
                            # we have the start of a new set
                            if current_set:
                                locus_sets.append(current_set)
                            current_set = []
                        # NOTE(review): updated for every alignment; the
                        # nesting was reconstructed from collapsed source.
                        last_end = e['tStarts'][-1] + e['blockSizes'][len(e['tStarts']) - 1]
                        current_set.append(e)
                if current_set:
                    locus_sets.append(current_set)
                # replace what was there with these ordered sets
                loci[qname][chrom][strand] = locus_sets
                locus_total += len(locus_sets)

    sys.stderr.write("Work on each read in each locus with " + str(cpu_count) + " CPUs\n")
    # NOTE(review): the pool is created and joined but gets no tasks; the
    # loci below are processed serially in this process.
    p = multiprocessing.Pool(processes=cpu_count)
    locus_count = 0
    for qname in loci:
        for chrom in loci[qname]:
            for strand in loci[qname][chrom]:
                for locus_set in loci[qname][chrom][strand]:
                    locus_count += 1
                    # send blank reference splices unless we have some
                    rsplices = reference_splices.get(chrom, {})
                    r1 = execute_locus(locus_set, args, rsplices, ref[chrom],
                                       reads[qname], locus_total, locus_count)
                    do_locus_callback(r1)
    p.close()
    p.join()
    sys.stderr.write("\nfinished\n")

    if args.output == '-':
        for line in combo_results:
            sys.stdout.write(line)
    else:
        # Close the output file so everything is flushed to disk.
        with open(args.output, 'w') as ofh:
            for line in combo_results:
                ofh.write(line)
def do_combine_operation(best_option, left, right, read, seq, args):
    """Stitch a left and a right PSL alignment of one read into one entry.

    Left blocks that end before the junction are kept, the span between the
    last kept left block and the first kept right block is re-aligned with
    ``needleman_wunsch``, and the remaining right blocks are appended.  The
    PSL line is rebuilt from the combined blocks and returned through
    ``PSLBasics.line_to_entry``.

    ``best_option`` supplies, in order, the left target, right target, left
    query and right query junction coordinates.  Strand, query name and
    target name come from ``left``; only ``args.min_intron_size`` is read
    from ``args``.
    """
    left_target = best_option[0]
    right_target = best_option[1]
    left_query = best_option[2]
    right_query = best_option[3]

    # Output block list: parallel query starts, target starts and sizes.
    q_starts = []
    t_starts = []
    sizes = []

    # --- left side: keep blocks that end before the junction -------------
    realign_q_start = left['qStarts'][0]
    realign_t_start = left['tStarts'][0]
    for idx, t_begin in enumerate(left['tStarts']):
        size = left['blockSizes'][idx]
        q_begin = left['qStarts'][idx]
        if left_query <= q_begin + 1:
            break
        realign_q_start = q_begin
        realign_t_start = t_begin
        if left_query <= q_begin + size:
            break
        q_starts.append(q_begin)
        t_starts.append(t_begin)
        sizes.append(size)

    # --- right side: find where the re-aligned span ends ------------------
    realign_q_end = right['qStarts'][0] + right['blockSizes'][0]
    realign_t_end = right['tStarts'][0] + right['blockSizes'][0]
    first_untouched = 0
    for idx, t_begin in enumerate(right['tStarts']):
        size = right['blockSizes'][idx]
        q_begin = right['qStarts'][idx]
        first_untouched = idx + 1
        if right_query <= q_begin + 1:
            break
        realign_q_end = q_begin + size
        realign_t_end = t_begin + size
        if right_query < q_begin + size:
            break

    if left['strand'] == '-':
        working_read = rc(read.upper())
    else:
        working_read = read.upper()
    query_piece = working_read[realign_q_start:realign_q_end]
    target_piece = (seq[realign_t_start:left_target].upper()
                    + seq[right_target - 1:realign_t_end].upper())
    res = needleman_wunsch(query_piece, target_piece)
    query_row = res[0]
    target_row = res[1]

    # --- turn gap-free runs of the new alignment into blocks --------------
    q_pos = realign_q_start
    t_pos = realign_t_start
    inside_run = False
    run = None  # [query start, target start, length] of the open block

    def flush():
        # Emit the block collected so far, if there is one.
        if run is not None and run[2] > 0:
            q_starts.append(run[0])
            t_starts.append(run[1])
            sizes.append(run[2])

    for col in range(len(query_row)):
        if query_row[col] == '-':  # gap in query: only the target advances
            t_pos += 1
            inside_run = False
        elif target_row[col] == '-':  # gap in target: only the query advances
            q_pos += 1
            inside_run = False
        else:
            if not inside_run:
                flush()
                run = [q_pos, t_pos, 0]
                inside_run = True
            run[2] += 1
            q_pos += 1
            t_pos += 1
        # NOTE(review): source indentation was lost; this check is kept at
        # loop level -- confirm against the original file.
        if q_pos == right_query and t_pos != right_target:
            # Jump across the junction and force a new block.
            inside_run = False
            t_pos = right_target
    flush()

    # --- untouched tail of the right alignment ----------------------------
    for idx in range(first_untouched, len(right['blockSizes'])):
        q_starts.append(right['qStarts'][idx])
        t_starts.append(right['tStarts'][idx])
        sizes.append(right['blockSizes'][idx])

    # --- recompute the PSL statistics (repeats are not tracked) -----------
    q_end = q_starts[len(q_starts) - 1] + sizes[len(sizes) - 1]
    t_end = t_starts[len(t_starts) - 1] + sizes[len(sizes) - 1]
    matches = 0
    mismatches = 0
    n_count = 0
    q_inserts = 0
    q_insert_bases = 0
    t_inserts = 0
    t_insert_bases = 0
    prev_q_end = None
    prev_t_end = None
    for idx in range(len(sizes)):
        q_seg = working_read[q_starts[idx]:q_starts[idx] + sizes[idx]]
        t_seg = seq[t_starts[idx]:t_starts[idx] + sizes[idx]].upper()
        for pos, base in enumerate(q_seg):
            if base == 'N':
                n_count += 1
            if base == t_seg[pos]:
                matches += 1
            else:
                mismatches += 1
        if prev_t_end:
            t_gap = t_starts[idx] - prev_t_end
            # a target gap shorter than an intron counts as an insert
            if 0 < t_gap < args.min_intron_size:
                t_inserts += 1
                t_insert_bases += t_gap
        if prev_q_end:
            q_gap = q_starts[idx] - prev_q_end
            if q_gap > 0:
                q_inserts += 1
                q_insert_bases += q_gap
        prev_q_end = q_starts[idx] + sizes[idx]
        prev_t_end = t_starts[idx] + sizes[idx]

    def comma_list(values):
        # PSL list fields end with a trailing comma.
        return ','.join([str(v) for v in values]) + ','

    fields = [
        str(matches), str(mismatches), str(0), str(n_count),
        str(q_inserts), str(q_insert_bases),
        str(t_inserts), str(t_insert_bases),
        left['strand'], left['qName'], str(len(read)),
        str(q_starts[0]), str(q_end),
        left['tName'], str(len(seq)),
        str(t_starts[0]), str(t_end), str(len(sizes)),
        comma_list(sizes), comma_list(q_starts), comma_list(t_starts),
    ]
    return PSLBasics.line_to_entry("\t".join(fields))
def main():
    """Entry point: splice together partial alignments of each read.

    Loads the genome and reads, optionally loads reference splices, groups
    the PSL input by read / target / strand, splits every group wherever
    neighbouring alignments are further apart than ``--max_intron_size``,
    runs ``execute_locus`` + ``do_locus_callback`` on each locus set, and
    finally writes the lines of the module-level ``combo_results`` to the
    requested output.
    """
    cli = argparse.ArgumentParser(
        description="splice together partial alignments")
    read_source = cli.add_mutually_exclusive_group(required=True)
    for flag in ('--fastq_reads', '--fasta_reads'):
        read_source.add_argument(flag)
    cli.add_argument('--genome', help="FASTA reference genome", required=True)
    cli.add_argument('--genepred', help="Transcriptome genepred")
    int_options = (
        ('--max_intron_size', 100000, "INT maximum intron size"),
        ('--min_intron_size', 68, "INT minimum intron size"),
        ('--max_gap_size', 10, "INT gap size in query to join"),
        ('--max_search_expand', 10,
         "INT max search space to expand search for junction"),
    )
    for flag, default, text in int_options:
        cli.add_argument(flag, type=int, default=default, help=text)
    cli.add_argument(
        '--direction_specific', action='store_true',
        help="The direction of the transcript is known and properly oriented already")
    cli.add_argument('--threads', type=int, default=0,
                     help="INT number of threads to use default cpu_count")
    cli.add_argument(
        '-o', '--output', default='-',
        help="FILENAME output results to here rather than STDOUT which is default")
    cli.add_argument('input_alignment',
                     help="FILENAME input .psl file or '-' for STDIN")
    args = cli.parse_args()

    sys.stderr.write("Reading reference\n")
    ref = read_fasta_into_hash(args.genome)

    sys.stderr.write("Checking for unqiuely named reads\n")
    # exits hard with an error if any read name is repeated
    reads = check_for_uniquely_named_reads(args)
    sys.stderr.write("Reads are uniquely named\n")

    cpu_count = multiprocessing.cpu_count()
    if args.threads > 0:
        cpu_count = args.threads

    reference_splices = {}
    if args.genepred:
        sys.stderr.write("Reading reference splices from genepred\n")
        reference_splices = get_reference_splices(args)

    sys.stderr.write("Reading alignments into loci\n")
    # First stage: loci[qName][tName][strand][first tStart] -> alignments.
    # The strand is the read's; the transcript direction is still unknown.
    if args.input_alignment == '-':
        inf = sys.stdin
    else:
        inf = open(args.input_alignment, 'r')
    loci = {}
    for raw in inf:
        text = raw.rstrip()
        if not re.match('^#', text):
            psl = PSLBasics.line_to_entry(text)
            per_read = loci.setdefault(psl['qName'], {})
            per_target = per_read.setdefault(psl['tName'], {})
            per_strand = per_target.setdefault(psl['strand'], {})
            per_strand.setdefault(psl['tStarts'][0], []).append(psl)

    def split_by_distance(by_start):
        """Cut start-sorted alignments into sets separated by big gaps."""
        groups = []
        open_group = []
        # NOTE(review): previous_end is refreshed for every alignment; the
        # nesting was reconstructed from source whose indentation was lost.
        previous_end = -1 * (args.max_intron_size + 2)
        for key in sorted(by_start.keys()):
            for aln in by_start[key]:
                one_based_start = aln['tStarts'][0] + 1
                if one_based_start > previous_end + args.max_intron_size:
                    if len(open_group) > 0:
                        groups.append(open_group)
                    open_group = []
                last = len(aln['tStarts']) - 1
                previous_end = aln['tStarts'][last] + aln['blockSizes'][last]
                open_group.append(aln)
        if len(open_group) > 0:
            groups.append(open_group)
        return groups

    sys.stderr.write("breaking loci by genomic distance\n")
    for qname in loci:
        for target in loci[qname]:
            for strand in loci[qname][target]:
                # replace the start-indexed dict with the ordered sets
                loci[qname][target][strand] = split_by_distance(
                    loci[qname][target][strand])

    locus_total = sum(len(loci[q][t][s])
                      for q in loci for t in loci[q] for s in loci[q][t])

    sys.stderr.write("Work on each read in each locus with " + str(cpu_count) + " CPUs\n")
    p = multiprocessing.Pool(processes=cpu_count)
    locus_count = 0
    for qname in loci:
        for target in loci[qname]:
            for strand in loci[qname][target]:
                for locus_set in loci[qname][target][strand]:
                    locus_count += 1
                    # blank reference splices unless this target has some
                    if target in reference_splices:
                        rsplices = reference_splices[target]
                    else:
                        rsplices = {}
                    do_locus_callback(
                        execute_locus(locus_set, args, rsplices, ref[target],
                                      reads[qname], locus_total, locus_count))
    p.close()
    p.join()
    sys.stderr.write("\nfinished\n")

    ofh = sys.stdout if args.output == '-' else open(args.output, 'w')
    for out_line in combo_results:
        ofh.write(out_line)
def main():
    """Nudge long-read splice junctions toward nearby reference junctions.

    Reads a reference genePred file and collects every intron it contains,
    then streams PSL alignments (from a file or STDIN), converts each to a
    genePred entry with small gaps smoothed away, passes it through
    ``nudge`` together with the reference junctions of its chromosome, and
    prints the result to STDOUT -- as a genePred line by default, or as a
    fake PSL line when ``--output_fake_psl`` supplies a reference genome.

    Invalid PSL entries (block lists of unequal length) are reported on
    STDERR and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support', type=int, default=0,
        help="INT min number of junctions within search_size of a junction "
             "in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # The positional is a genePred file, not a FASTA file.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    # The genome is only needed when fake PSL output was requested.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and index its junctions as
    # ref[chrom][intron_left][intron_right] = strand.
    # The right coordinate is stored 1-based (exonStart + 1); the first
    # strand seen for a junction wins.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1:
            continue  # single-exon transcripts contribute no junctions
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            rights = chrom_junctions.setdefault(e['exonEnds'][i - 1], {})
            rights.setdefault(e['exonStarts'][i] + 1, e['strand'])

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):  # skip comments
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            n_blocks = len(pe['blockSizes'])
            if len(pe['tStarts']) != n_blocks or len(pe['qStarts']) != n_blocks:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # Send an empty junction set when the chromosome has no reference.
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            # Single-argument print(...) behaves identically on Python 2 and 3.
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Release the input handle even if a line fails to convert.
        pf.close()
def convert_line(self,psl_line,query_sequence=None,quality_sequence=None):
    """Convert one PSL alignment line into a SAM alignment line.

    Args:
        psl_line: a single PSL-format line.
        query_sequence: optional read sequence.  When omitted and
            ``self.reads_set`` is true, the sequence is looked up in
            ``self.reads`` by the entry's qName.
        quality_sequence: optional quality string.  When omitted and
            ``self.qualities_set`` is true, it is looked up in
            ``self.qualities`` by the entry's qName.

    Returns:
        The SAM line (tab separated, no trailing newline), or ``False``
        when the line cannot be parsed, when its block lists are empty or
        of unequal length, or when ``self.skip_directionless_splice`` is
        set and a spliced alignment's direction could not be determined.
    """
    try:
        pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        sys.stderr.write("Problem parsing line:\n"+psl_line.rstrip()+"\n")
        return False
    blocks = len(pe['blockSizes'])
    # Every block needs both a target and a query start; reject anything
    # else up front instead of failing later with an IndexError.
    if blocks == 0 or len(pe['tStarts']) != blocks or len(pe['qStarts']) != blocks:
        sys.stderr.write("Warning invalid psl entry: "+pe['qName']+"\n")
        return False

    starts = pe['qStarts']
    q_coord_start = starts[0]+1  # base-1 converted starting position
    q_coord_end = starts[blocks-1]+pe['blockSizes'][blocks-1]  # base-1 position
    t_coord_start = pe['tStarts'][0]+1  # base-1 converted starting position

    if pe['qName'] not in self.reads and self.reads_set is True:
        sys.stderr.write("Warning: qName "+pe['qName']+" was not found in reads\n")

    # 1. Get the query sequence / qualities to output, clipped to the
    #    aligned region.  Minus-strand entries are flipped first so the
    #    slice is taken in the same orientation as the qStarts.
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
        q_seq_trimmed = query_sequence
        if not query_sequence:
            # get it from the archive we loaded if we didn't give it
            q_seq_trimmed = self.reads[pe['qName']]
        if pe['strand'] == '-':
            q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
        q_seq_trimmed = q_seq_trimmed[q_coord_start-1:q_coord_end]
    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
        qual_trimmed = quality_sequence
        if not quality_sequence:
            qual_trimmed = self.qualities[pe['qName']]
        if pe['strand'] == '-':
            qual_trimmed = qual_trimmed[::-1]
        qual_trimmed = qual_trimmed[q_coord_start-1:q_coord_end]

    # 2. Get the cigar string to output.  Work in coordinates relative to
    #    the first block; a gap on the query side is an insertion, a gap on
    #    the target side is an intron (N) when longer than
    #    self.min_intron_size and a deletion (D) otherwise.
    qstarts = [x-pe['qStarts'][0] for x in pe['qStarts']]
    tstarts = [x-pe['tStarts'][0] for x in pe['tStarts']]
    query_index = 0
    target_index = 0
    cigar_ops = []
    junctions = []  # indices of the blocks that directly follow an intron
    for i in range(blocks):
        qdif = qstarts[i] - query_index
        tdif = tstarts[i] - target_index
        if qdif > 0:  # we have to insert
            cigar_ops.append(str(qdif) + 'I')
        if tdif > self.min_intron_size:  # we have an intron
            cigar_ops.append(str(tdif) + 'N')
            junctions.append(i)
        elif tdif > 0:  # we have to delete
            cigar_ops.append(str(tdif) + 'D')
        cigar_ops.append(str(pe['blockSizes'][i]) + 'M')
        query_index = qstarts[i]+pe['blockSizes'][i]
        target_index = tstarts[i]+pe['blockSizes'][i]
    cigar = ''.join(cigar_ops)

    # 3. Inspect the junctions if we have a reference genome, and vote on
    #    the transcript direction from the intron-boundary dinucleotides.
    spliceflag_set = False
    spliceflag = None
    if self.ref_genome_set:
        canon = 0
        revcanon = 0
        for i in junctions:
            chrom_seq = self.ref_genome[pe['tName']]
            # First two intron bases: right after the end of block i-1.
            left_num = pe['tStarts'][i-1]+pe['blockSizes'][i-1]
            left_val = chrom_seq[left_num:left_num+2].upper()
            # Last two intron bases: right before the start of block i
            # (the block that follows the junction), not block i-1.
            right_num = pe['tStarts'][i]-2
            right_val = chrom_seq[right_num:right_num+2].upper()
            junc = left_val + '-' + right_val
            if junc in self.canonical:
                canon += 1
            if junc in self.revcanonical:
                revcanon += 1
        if canon > revcanon:
            spliceflag_set = True
            spliceflag = '+'
        elif revcanon > canon:
            spliceflag_set = True
            spliceflag = '-'

    # if we have junctions, and we should be setting direction but
    # we can't figure out the direction skip ambiguous direction
    if junctions and self.skip_directionless_splice and not spliceflag_set:
        return False

    fields = [
        pe['qName'],                             # 1. QNAME
        '16' if pe['strand'] == '-' else '0',    # 2. FLAG
        pe['tName'],                             # 3. RNAME
        str(t_coord_start),                      # 4. POS
        '0',                                     # 5. MAPQ
        cigar,                                   # 6. CIGAR
        '*',                                     # 7. RNEXT
        '0',                                     # 8. PNEXT
        '0',                                     # 9. TLEN
        q_seq_trimmed,                           # 10. SEQ
        qual_trimmed,                            # 11. QUAL
    ]
    if spliceflag_set:
        fields.append('XS:A:'+spliceflag)
    if self.ref_genome_set:
        fields.append('NH:i:'+str(self.mapping_counts[pe['qName']]))
    fields.append('XC:i:'+str(len(junctions)))
    fields.append('NM:i:0')
    return "\t".join(fields)
def convert_line(self, psl_line, query_sequence=None, quality_sequence=None):
    """Translate a single PSL alignment line into a SAM record line.

    Args:
        psl_line: one line of PSL text.
        query_sequence: read sequence to emit; if not given and
            ``self.reads_set`` is true the read is fetched from
            ``self.reads`` using the entry's qName.
        quality_sequence: quality string to emit; if not given and
            ``self.qualities_set`` is true it is fetched from
            ``self.qualities`` using the entry's qName.

    Returns:
        A tab-separated SAM line without a trailing newline, or ``False``
        if the line fails to parse, if its block lists are empty or differ
        in length, or if ``self.skip_directionless_splice`` is set and the
        direction of a spliced alignment is ambiguous.
    """
    try:
        pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
        # Was a bare except; keep KeyboardInterrupt/SystemExit propagating.
        sys.stderr.write("Problem parsing line:\n" + psl_line.rstrip() + "\n")
        return False

    block_sizes = pe['blockSizes']
    blocks = len(block_sizes)
    # Guard against empty or ragged block lists, which would otherwise
    # surface below as an IndexError.
    lengths_ok = (len(pe['tStarts']) == blocks and len(pe['qStarts']) == blocks)
    if blocks == 0 or not lengths_ok:
        sys.stderr.write("Warning invalid psl entry: " + pe['qName'] + "\n")
        return False

    # base-1 coordinates of the aligned region on query and target
    q_coord_start = pe['qStarts'][0] + 1
    q_coord_end = pe['qStarts'][blocks - 1] + block_sizes[blocks - 1]
    t_coord_start = pe['tStarts'][0] + 1

    if pe['qName'] not in self.reads and self.reads_set is True:
        sys.stderr.write("Warning: qName " + pe['qName'] +
                         " was not found in reads\n")

    # 1. Query sequence and qualities, clipped to the aligned region.
    #    Minus-strand reads are reverse-complemented (qualities reversed)
    #    before slicing.
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
        q_seq_trimmed = query_sequence
        if not query_sequence:
            # fall back to the reads loaded on this object
            q_seq_trimmed = self.reads[pe['qName']]
        if pe['strand'] == '-':
            q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
        q_seq_trimmed = q_seq_trimmed[q_coord_start - 1:q_coord_end]

    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
        qual_trimmed = quality_sequence
        if not quality_sequence:
            qual_trimmed = self.qualities[pe['qName']]
        if pe['strand'] == '-':
            qual_trimmed = qual_trimmed[::-1]
        qual_trimmed = qual_trimmed[q_coord_start - 1:q_coord_end]

    # 2. CIGAR string.  Block starts are taken relative to the first block;
    #    unaligned query bases become I, target gaps become N when larger
    #    than self.min_intron_size and D otherwise.
    q_rel = [x - pe['qStarts'][0] for x in pe['qStarts']]
    t_rel = [x - pe['tStarts'][0] for x in pe['tStarts']]
    q_pos = 0
    t_pos = 0
    pieces = []
    junctions = []  # index of each block that immediately follows an intron
    for i in range(blocks):
        q_gap = q_rel[i] - q_pos
        t_gap = t_rel[i] - t_pos
        if q_gap > 0:
            pieces.append(str(q_gap) + 'I')
        if t_gap > self.min_intron_size:
            pieces.append(str(t_gap) + 'N')
            junctions.append(i)
        elif t_gap > 0:
            pieces.append(str(t_gap) + 'D')
        pieces.append(str(block_sizes[i]) + 'M')
        q_pos = q_rel[i] + block_sizes[i]
        t_pos = t_rel[i] + block_sizes[i]
    cigar = ''.join(pieces)

    # 3. With a reference genome, count junctions whose boundary
    #    dinucleotides match the forward or reverse canonical set.
    spliceflag_set = False
    spliceflag = None
    if self.ref_genome_set:
        canon = 0
        revcanon = 0
        for i in junctions:
            target_seq = self.ref_genome[pe['tName']]
            # two bases just after block i-1 (start of the intron)
            left_num = pe['tStarts'][i - 1] + block_sizes[i - 1]
            left_val = target_seq[left_num:left_num + 2].upper()
            # two bases just before block i (end of the intron); indexing
            # block i-1 here would read the wrong side of the previous exon
            right_num = pe['tStarts'][i] - 2
            right_val = target_seq[right_num:right_num + 2].upper()
            junc = left_val + '-' + right_val
            if junc in self.canonical:
                canon += 1
            if junc in self.revcanonical:
                revcanon += 1
        if canon > revcanon:
            spliceflag_set = True
            spliceflag = '+'
        elif revcanon > canon:
            spliceflag_set = True
            spliceflag = '-'

    # Spliced but direction could not be decided: optionally drop the read.
    if junctions and self.skip_directionless_splice and not spliceflag_set:
        return False

    flag = '16' if pe['strand'] == '-' else '0'
    columns = [
        pe['qName'],          # 1. QNAME
        flag,                 # 2. FLAG
        pe['tName'],          # 3. RNAME
        str(t_coord_start),   # 4. POS
        '0',                  # 5. MAPQ
        cigar,                # 6. CIGAR
        '*',                  # 7. RNEXT
        '0',                  # 8. PNEXT
        '0',                  # 9. TLEN
        q_seq_trimmed,        # 10. SEQ
        qual_trimmed,         # 11. QUAL
    ]
    if spliceflag_set:
        columns.append('XS:A:' + spliceflag)
    if self.ref_genome_set:
        columns.append('NH:i:' + str(self.mapping_counts[pe['qName']]))
    columns.append('XC:i:' + str(len(junctions)))
    columns.append('NM:i:0')
    return "\t".join(columns)