def InitializeObjects( bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict ):
    """Create a contig object and a one-contig scaffold for each BAM reference.

    References whose name is not a key of C_dict are skipped.  A reference at
    least param.contig_threshold long is stored in Contigs / Scaffolds; a
    shorter one with positive length is stored in small_contigs /
    small_scaffolds and counted as singled out.  Zero-length references are
    ignored.  The sequence of every created contig is moved out of C_dict.

    Side effects on param: tot_assembly_length, current_N50 and current_L50
    are set, and scaffold_indexer is incremented once per created scaffold.
    A progress line is written to Information every 100000 references and a
    summary line at the end.  G_prime is accepted but not used.

    Returns an empty tuple.
    """
    singeled_out = 0
    contig_threshold = param.contig_threshold
    # pysam may hand back long integers; normalise to int
    cont_lengths = [int(nr) for nr in bam_file.lengths]
    cont_names = bam_file.references

    # Assembly statistics (NG50 / LG50)
    param.tot_assembly_length = sum(cont_lengths)
    N50, L50 = CalculateStats(sorted(cont_lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    start = time()
    for position, name in enumerate(cont_names):
        if (position + 1) % 100000 == 0:
            print >> Information, "Time adding 100k keys", time() - start
            start = time()

        if name not in C_dict:
            continue

        length = cont_lengths[position]
        if length >= contig_threshold:
            is_small = False
        elif length > 0:
            is_small = True
        else:
            # zero-length entry (e.g. an error in the fasta file): nothing to create
            continue

        if is_small:
            contig_store, scaffold_store = small_contigs, small_scaffolds
        else:
            contig_store, scaffold_store = Contigs, Scaffolds

        new_contig = Contig.contig(name)
        new_contig.length = length
        new_contig.sequence = C_dict[name]
        del C_dict[name]
        scaf_length = new_contig.length  # the scaffold initially holds only this contig
        new_contig.direction = True  # forward orientation; False means reversed
        new_contig.position = 0
        contig_store[new_contig.name] = new_contig

        new_scaffold = Scaffold.scaffold(param.scaffold_indexer, [new_contig], scaf_length)
        scaffold_store[new_scaffold.name] = new_scaffold
        new_contig.scaffold = new_scaffold.name
        param.scaffold_indexer += 1

        if is_small:
            singeled_out += 1

    del C_dict
    print >> Information, "Nr of contigs that was singeled out due to length constraints " + str(singeled_out)
    return ()
def make_contigs(SAG_fasta_file, contig_name_tsv, names_map):
    """Load contigs from a SAG FASTA file and apply two renaming passes.

    The FASTA file is read as consecutive (header, sequence) line pairs.  The
    contig ID is the header without its first and last character; the
    sequence line is stored as read.  IDs are then updated twice through
    Contig.update_ID: first with the pairs in contig_name_tsv (anvi-script
    names -> SAG.fasta names), then with the pairs in names_map (SAG.fasta
    names -> JGI contig names).  For each pair, the scan over the contigs
    stops at the first contig whose update_ID returns a true value.

    The files are only processed when SAG_fasta_file[7:10] equals
    contig_name_tsv[:3]; otherwise a message is printed and an empty list is
    returned.

    Parameters:
        SAG_fasta_file  -- path of the FASTA file (two lines per record)
        contig_name_tsv -- path of the whitespace-separated old/new ID table
        names_map       -- path of the second old/new ID table

    Returns:
        list of Contig objects (empty when the file names do not match).
    """
    # Bound before the filename check so the mismatch path has a value to return.
    contig_list = []

    # `with` guarantees all three handles are closed, also when parsing raises.
    with open(SAG_fasta_file) as SAG_fa_file, \
            open(contig_name_tsv) as contig_names, \
            open(names_map) as names_map_file:

        SAG = contig_name_tsv[:3]
        if SAG_fasta_file[7:10] != SAG:
            print("The SAG FASTA file and contig name .tsv file do not match!")
            return contig_list
        print('filenames checked out')

        # make contigs from fasta file: header line followed by sequence line
        SAG_fa_lines = SAG_fa_file.readlines()
        for i in range(0, len(SAG_fa_lines), 2):
            ID = SAG_fa_lines[i][1:-1]  # drop the leading marker and the trailing newline
            sequence = SAG_fa_lines[i + 1]
            contig_list.append(Contig(ID, sequence, SAG))

        # pass 1: anvi-script -> SAG.fasta names
        # pass 2: SAG.fasta names -> JGI contig names
        for mapping in (contig_names, names_map_file):
            for line in mapping:
                fields = line.split()
                if len(fields) < 2:
                    # blank or incomplete line: nothing to rename
                    continue
                old_ID, new_ID = fields[0], fields[1]
                for contig in contig_list:
                    if contig.update_ID(old_ID, new_ID):
                        break

    return contig_list
def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
    """Populate the scaffold graph with contig nodes and read-pair link edges.

    Every reference of at least 300 bases becomes a contig wrapped in its own
    scaffold named 's' + str(scaffold_indexer); the scaffold gets an 'L' and
    an 'R' node joined by an edge with nr_links=None.  Reads are then scanned:

    * pairs flagged unmapped on read 1 whose contigs lie in different
      scaffolds are counted in fishy_edges (both directions);
    * mapped second-in-pair reads with mapq > 20 linking two different
      contigs add a link edge, or extend an existing one, when the observed
      distance is below mean + 6*std_dev.

    Finally RemoveBugEdges(G, fishy_edges) is called.  Nothing is returned.

    NOTE(review): `bam_file` and `G` are used here but are neither parameters
    nor locals (the parameter is spelled `bamfile`) -- presumably module-level
    names; confirm against the rest of the file.
    """
    singeled_out = 0
    # pysam may hand back long integers; normalise to int
    ref_lengths = [int(nr) for nr in bam_file.lengths]
    ref_names = bam_file.references

    # One contig + one single-contig scaffold per sufficiently long reference
    for idx, ref_name in enumerate(ref_names):
        ref_len = ref_lengths[idx]
        if ref_len < 300:
            continue
        contig_obj = Contig.contig(ref_name)
        contig_obj.length = ref_len
        contig_obj.scaf_length = contig_obj.length  # scaffold initially holds only this contig
        contig_obj.direction = True  # forward orientation; False means reversed
        contig_obj.position = 0
        contig_obj.links = {}
        Contigs[contig_obj.name] = contig_obj

        scaffold_obj = Scaffold.scaffold('s' + str(scaffold_indexer), [contig_obj], contig_obj.length)
        Scaffolds[scaffold_obj.name] = scaffold_obj
        contig_obj.scaffold = scaffold_obj.name
        for side in ('L', 'R'):
            G.add_node((scaffold_obj.name, side), length=ref_len)
        scaffold_indexer += 1

    # Join the two ends of every scaffold (single contig or real scaffold)
    for scaffold_name in Scaffolds:
        G.add_edge((scaffold_name, 'L'), (scaffold_name, 'R'), nr_links=None)

    # Create the link edges from the alignments
    fishy_edges = defaultdict(int)
    for alignedread in bam_file:
        try:
            # Name lookup doubles as the "is aligned" test; is_unmapped is
            # not trusted for this (e.g. BWA).
            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
        except ValueError:
            continue

        if not (contig1 in Contigs and contig2 in Contigs):
            continue

        first = Contigs[contig1]
        second = Contigs[contig2]

        # TODO: ad hoc handling of BWA's buggy SAM-flag reporting; remove once
        # BWA is fixed.  Edges with at least as many fishy links as real
        # links are removed afterwards.
        if alignedread.is_unmapped and alignedread.is_read1:
            if Scaffolds[second.scaffold].name != Scaffolds[first.scaffold].name:
                (side1, side2) = CheckDir(first, second, alignedread)
                end1 = (first.scaffold, side1)
                end2 = (second.scaffold, side2)
                fishy_edges[(end1, end2)] += 1
                fishy_edges[(end2, end1)] += 1

        usable_link = (contig1 != contig2 and alignedread.is_read2
                       and not alignedread.is_unmapped and alignedread.mapq > 20)
        if not usable_link:
            # Placeholder: pairs inside one contig could be used to validate
            # scaffolds from a previous step.
            continue

        read_dir = not alignedread.is_reverse
        mate_dir = not alignedread.mate_is_reverse
        scaf1 = first.scaffold
        scaf2 = second.scaffold

        # Translate contig coordinates into positions on the scaffolds
        (obs, scaf_side1, scaf_side2) = PosDirCalculatorPE(
            first.direction, read_dir, first.position, alignedread.pos,
            Scaffolds[scaf1].s_length, first.length,
            second.direction, mate_dir, second.position, alignedread.mpos,
            Scaffolds[scaf2].s_length, second.length, read_len)

        if obs < mean + 6*std_dev:
            node1 = (scaf1, scaf_side1)
            node2 = (scaf2, scaf_side2)
            if node2 not in G[node1]:
                G.add_edge(node2, node1, nr_links=1, gap_dist=[obs])
            else:
                link = G.edge[node1][node2]
                link['nr_links'] += 1
                link['gap_dist'].append(obs)

    RemoveBugEdges(G, fishy_edges)
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param): G = nx.Graph() print 'Parsing BAM file...' #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)} #I switched to look at mates instead since BWA can give false flag combinations for # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse #does not happen. #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False), with pysam.Samfile( param.bamfile, 'rb' ) as bam_file: #once real data, change to 'rb', simulated files are on SAM format #Get parameters -r, -m, -s, -T, -t for library print 'Computing parameters not set by user...' GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs) #Clean contig_library singeled_out = 0 if param.first_lib: cont_lengths = bam_file.lengths cont_lengths = [int(nr) for nr in cont_lengths ] #convert long to int object cont_names = bam_file.references #Calculate NG50 and LG 50 param.tot_assembly_length = sum(cont_lengths) sorted_lengths = sorted(cont_lengths, reverse=True) N50, L50 = CalculateStats(sorted_lengths, param) param.current_L50 = L50 param.current_N50 = N50 ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE for i in range(0, len(cont_names)): if cont_lengths[i] >= param.contig_threshold: C = Contig.contig(cont_names[i]) # Create object contig C.length = cont_lengths[i] scaf_length = C.length # Initially, scaffold consists of only this contig C.direction = True # always in same direction first, False=reverse C.position = 0 #position always 0 C.links = {} Contigs[ C. 
name] = C # Create a dict with name as key and the object container as value S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {}) # Create object scaffold Scaffolds[S.name] = S C.scaffold = S.name param.scaffold_indexer += 1 else: singeled_out += 1 F.append([ (cont_names[i], True, 0, cont_lengths[i], {}) ]) #list of (contig_name, pos_direction, position,length) print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str( singeled_out) else: #Clean contig_library/scaffold_library scaf_lengths = [ Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys() ] sorted_lengths = sorted(scaf_lengths, reverse=True) N50, L50 = CalculateStats(sorted_lengths, param) param.current_L50 = L50 param.current_N50 = N50 for scaffold_ in Scaffolds.keys( ): #iterate over keys in hash, so that we can remove keys while iterating over it if Scaffolds[scaffold_].s_length < param.contig_threshold: ### Go to function and print to F ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs S_obj = Scaffolds[scaffold_] list_of_contigs = S_obj.contigs #list of contig objects contained in scaffold object Contigs, F = GO.WriteToF( F, Contigs, list_of_contigs ) #Don't worry, the contig objects are removed in WriteTOF function del Scaffolds[scaffold_] singeled_out += 1 print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str( singeled_out) #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node print 'Nr of contigs/scaffolds included in scaffolding: ' + str( len(Scaffolds)) #,Scaffolds.keys() if len(Scaffolds) == 0: return (None, Contigs, Scaffolds, F, param) cnt = 0 tot_start = time() start1 = time() for scaffold_ in Scaffolds: G.add_edge( (scaffold_, 'L'), (scaffold_, 'R'), nr_links=None ) #this is a scaffold object but can be both a single contig or a scaffold. 
Scaffolds[scaffold_].scaffold_left_nbrs = {} Scaffolds[scaffold_].scaffold_right_nbrs = {} if cnt % 100000 == 0 and cnt > 0: elapsed = time() - start1 print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed start1 = time() cnt += 1 print 'Total time elapsed: ', time() - tot_start # Create the link edges in the graph by fetching info from bam file cont_aligned_len = {} for contig in Contigs: cont_aligned_len[contig] = [0, Contigs[contig].length] count = 0 non_unique = 0 non_unique_for_scaf = 0 nr_of_duplicates = 0 prev_obs1 = -1 prev_obs2 = -1 reads_with_too_long_insert = 0 #fishy_reads = {} for alignedread in bam_file: try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA contig1 = bam_file.getrname(alignedread.rname) contig2 = bam_file.getrname(alignedread.mrnm) except ValueError: continue #contig1=bam_file.getrname(alignedread.rname) ## add to coverage computation if contig is still in the list of considered contigs try: cont_aligned_len[contig1][0] += alignedread.rlen except KeyError: pass ########## CREATE EDGES IN SCAFFOLD GRAPH ########## if contig1 != contig2 and alignedread.is_read2: #check how many non unique reads out of the useful ones (mapping to two different contigs) #This only works for BWA!! 
implement for other aligners as well if alignedread.mapq == 0: non_unique += 1 #print contig1,contig2 if contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[ contig1].scaffold and alignedread.mapq > param.map_quality: # and alignedread.tags[0][1] == 'U': #if alignedread.tags[0][1] != 'U': # non_unique_for_scaf += 1 if alignedread.mapq == 0: non_unique_for_scaf += 1 count += 1 #(read_dir,mate_dir)=informative_pair[flag_type] (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse) scaf1 = Contigs[contig1].scaffold scaf2 = Contigs[contig2].scaffold #Calculate actual position on scaffold here #position1 cont/scaf1 cont_dir1 = Contigs[ contig1].direction #if pos : L if neg: R cont1_pos = Contigs[contig1].position readpos = alignedread.pos cont1_len = Contigs[contig1].length s1len = Scaffolds[scaf1].s_length #position1 cont1/scaf1 cont_dir2 = Contigs[contig2].direction cont2_pos = Contigs[contig2].position matepos = alignedread.mpos cont2_len = Contigs[contig2].length s2len = Scaffolds[scaf2].s_length (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE( cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, param.read_len) if obs1 == prev_obs1 and obs2 == prev_obs2: nr_of_duplicates += 1 if param.detect_duplicate: continue if obs1 + obs2 < param.ins_size_threshold: # if obs1 == 3 or obs2 ==3: # print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len if scaf_side1 == 'R': if (scaf2, scaf_side2 ) in Scaffolds[scaf1].right_nbrs_obs: if obs1 < Scaffolds[scaf1].right_nbrs_obs[( scaf2, scaf_side2)]: Scaffolds[scaf1].right_nbrs_obs[( scaf2, scaf_side2)] = obs1 else: Scaffolds[scaf1].right_nbrs_obs[( scaf2, scaf_side2)] = obs1 if scaf_side1 == 'L': if (scaf2, scaf_side2 ) in Scaffolds[scaf1].left_nbrs_obs: if obs1 < Scaffolds[scaf1].left_nbrs_obs[( scaf2, scaf_side2)]: Scaffolds[scaf1].left_nbrs_obs[( scaf2, scaf_side2)] = 
obs1 else: Scaffolds[scaf1].left_nbrs_obs[( scaf2, scaf_side2)] = obs1 if scaf_side2 == 'R': if (scaf1, scaf_side1 ) in Scaffolds[scaf2].right_nbrs_obs: if obs2 < Scaffolds[scaf2].right_nbrs_obs[( scaf1, scaf_side1)]: Scaffolds[scaf2].right_nbrs_obs[( scaf1, scaf_side1)] = obs2 else: Scaffolds[scaf2].right_nbrs_obs[( scaf1, scaf_side1)] = obs2 if scaf_side2 == 'L': if (scaf1, scaf_side1 ) in Scaffolds[scaf2].left_nbrs_obs: if obs2 < Scaffolds[scaf2].left_nbrs_obs[( scaf1, scaf_side1)]: Scaffolds[scaf2].left_nbrs_obs[( scaf1, scaf_side1)] = obs2 else: Scaffolds[scaf2].left_nbrs_obs[( scaf1, scaf_side1)] = obs2 if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]: G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2) else: G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1 G.edge[(scaf1, scaf_side1)][( scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2 else: reads_with_too_long_insert += 1 #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2] ## add to haplotype graph here!! 
prev_obs1 = obs1 prev_obs2 = obs2 elif contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[contig1].scaffold: ########################Use to validate scaffold in previous step here ############ pass # print 'NR OF FISHY EDGES: ', len(fishy_reads) print 'USEFUL READS (reads mapping to different contigs): ', count #print 'Non unique portion out of "USEFUL READS" (filtered out from scaffolding): ', non_unique #print 'Non unique used for scaf: ', non_unique_for_scaf print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert if param.detect_duplicate: print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates ##### Calc coverage for all contigs with current lib here ##### sum_x = 0 sum_x_sq = 0 n = 0 for contig in cont_aligned_len: cont_coverage = cont_aligned_len[contig][0] / float( cont_aligned_len[contig][1]) #print key, cont_aligned_len[key]/float(cont_lengths[i]) try: Contigs[contig].coverage = cont_coverage except KeyError: pass sum_x += cont_coverage sum_x_sq += cont_coverage**2 n += 1 mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile) param.mean_coverage = mean_cov param.std_dev_coverage = std_dev_cov return (G, Contigs, Scaffolds, F, param)
def InitializeObjects(bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict):
    """Create a contig object and a one-contig scaffold for each BAM reference.

    References whose name is not a key of C_dict are skipped.  A reference at
    least param.contig_threshold long is stored in Contigs / Scaffolds; a
    shorter one with positive length is stored in small_contigs /
    small_scaffolds and counted as singled out.  Zero-length references are
    ignored.  The sequence of every created contig is moved out of C_dict.

    Side effects on param: tot_assembly_length, current_N50 and current_L50
    are set, and scaffold_indexer is incremented once per created scaffold.
    A progress line is written to Information every 100000 references and a
    summary line at the end.  G_prime is accepted but not used.

    Returns an empty tuple.
    """
    singeled_out = 0
    contig_threshold = param.contig_threshold
    # pysam may hand back long integers; normalise to int
    cont_lengths = [int(nr) for nr in bam_file.lengths]
    cont_names = bam_file.references

    # Assembly statistics (NG50 / LG50)
    param.tot_assembly_length = sum(cont_lengths)
    N50, L50 = CalculateStats(sorted(cont_lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    start = time()
    for position, name in enumerate(cont_names):
        if (position + 1) % 100000 == 0:
            print >> Information, 'Time adding 100k keys', time() - start
            start = time()

        if name not in C_dict:
            continue

        length = cont_lengths[position]
        if length >= contig_threshold:
            is_small = False
        elif length > 0:
            is_small = True
        else:
            # zero-length entry (e.g. an error in the fasta file): nothing to create
            continue

        if is_small:
            contig_store, scaffold_store = small_contigs, small_scaffolds
        else:
            contig_store, scaffold_store = Contigs, Scaffolds

        new_contig = Contig.contig(name)
        new_contig.length = length
        new_contig.sequence = C_dict[name]
        del C_dict[name]
        scaf_length = new_contig.length  # the scaffold initially holds only this contig
        new_contig.direction = True  # forward orientation; False means reversed
        new_contig.position = 0
        contig_store[new_contig.name] = new_contig

        new_scaffold = Scaffold.scaffold(param.scaffold_indexer, [new_contig], scaf_length)
        scaffold_store[new_scaffold.name] = new_scaffold
        new_contig.scaffold = new_scaffold.name
        param.scaffold_indexer += 1

        if is_small:
            singeled_out += 1

    del C_dict
    print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out)
    return ()
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len): G = nx.Graph() print 'Parsing BAM file...' #read_len=50 #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)} #I switched to look at mates instead since BWA can give false flag combinations for # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse #does not happen. informative_pair = { 161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False) } #threshold=800 with pysam.Samfile( bamfile, 'r' ) as bam_file: #once real data, change to 'rb', simulated files are on SAM format #Clean contig_library singeled_out = 0 cont_lengths = bam_file.lengths cont_lengths = [int(nr) for nr in cont_lengths] #convert long to int object #print cont_lengths cont_names = bam_file.references ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE for i in range(0, len(cont_names)): C = Contig.contig(cont_names[i]) # Create object contig C.length = cont_lengths[i] C.scaf_length = C.length # Initially, scaffold consists of only this contig C.direction = True # always in same direction first, False=reverse C.position = 0 #position always 0 C.links = {} Contigs[ C. name] = C # Create a dict with name as key and the object container as value S = Scaffold.scaffold('s' + str(scaffold_indexer), [C], C.length) # Create object scaffold Scaffolds[S.name] = S C.scaffold = S.name G.add_node((S.name, 'L'), length=cont_lengths[i]) G.add_node((S.name, 'R'), length=cont_lengths[i]) scaffold_indexer += 1 #Create "node graph" of contigs (that passed the length criteria). 
Each having a left and right node print 'Nr of contigs/scaffolds included in scaffolding: ' + str( len(Scaffolds)) #,Scaffolds.keys() for scaffold_ in Scaffolds: G.add_edge( (scaffold_, 'L'), (scaffold_, 'R'), nr_links=None ) #this is a scaffold object but can be both a single contig or a scaffold. # Create the link edges in the graph by fetching info from bam file for alignedread in bam_file: flag_type = alignedread.flag if flag_type in informative_pair: contig1 = bam_file.getrname(alignedread.rname) contig2 = bam_file.getrname(alignedread.mrnm) if contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[contig1].scaffold: (read_dir, mate_dir) = informative_pair[flag_type] scaf1 = Contigs[contig1].scaffold scaf2 = Contigs[contig2].scaffold #Calculate actual position on scaffold here #position1 cont/scaf1 cont_dir1 = Contigs[ contig1].direction #if pos : L if neg: R cont1_pos = Contigs[contig1].position readpos = alignedread.pos cont1_len = Contigs[contig1].length s1len = Scaffolds[scaf1].s_length #position1 cont1/scaf1 cont_dir2 = Contigs[contig2].direction cont2_pos = Contigs[contig2].position matepos = alignedread.mpos cont2_len = Contigs[contig2].length s2len = Scaffolds[scaf2].s_length (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE( cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, read_len) if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]: G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=[gap]) #print 'Added edge' else: G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1 #print 'edge' G.edge[(scaf1, scaf_side1)][( scaf2, scaf_side2)]['gap_dist'].append(gap) elif contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[contig1].scaffold: ########################Use to validate scaffold herein previous step here pass #for edge in G.edges(): # if G[edge[0]][edge[1]]['nr_reads']: # print 
G[edge[0]][edge[1]]['gap_dist'] #print G.edges(data=True) return (G, Contigs, Scaffolds, F, scaffold_indexer)
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param): G = nx.Graph() print 'Parsing BAM file...' #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)} #I switched to look at mates instead since BWA can give false flag combinations for # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse #does not happen. #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False), with pysam.Samfile(param.bamfile, 'rb') as bam_file: #once real data, change to 'rb', simulated files are on SAM format #Get parameters -r, -m, -s, -T, -t for library print 'Computing parameters not set by user...' GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs) #Clean contig_library singeled_out = 0 if param.first_lib: cont_lengths = bam_file.lengths cont_lengths = [int(nr) for nr in cont_lengths] #convert long to int object cont_names = bam_file.references #Calculate NG50 and LG 50 param.tot_assembly_length = sum(cont_lengths) sorted_lengths = sorted(cont_lengths, reverse=True) N50, L50 = CalculateStats(sorted_lengths, param) param.current_L50 = L50 param.current_N50 = N50 ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE for i in range(0, len(cont_names)): if cont_lengths[i] >= param.contig_threshold: C = Contig.contig(cont_names[i]) # Create object contig C.length = cont_lengths[i] scaf_length = C.length # Initially, scaffold consists of only this contig C.direction = True # always in same direction first, False=reverse C.position = 0 #position always 0 C.links = {} Contigs[C.name] = C # Create a dict with name as key and the object container as value S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {}) # Create object scaffold Scaffolds[S.name] = S C.scaffold = S.name param.scaffold_indexer += 
1 else: singeled_out += 1 F.append([(cont_names[i], True, 0, cont_lengths[i], {})]) #list of (contig_name, pos_direction, position,length) print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out) else: #Clean contig_library/scaffold_library scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()] sorted_lengths = sorted(scaf_lengths, reverse=True) N50, L50 = CalculateStats(sorted_lengths, param) param.current_L50 = L50 param.current_N50 = N50 for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it if Scaffolds[scaffold_].s_length < param.contig_threshold: ### Go to function and print to F ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs S_obj = Scaffolds[scaffold_] list_of_contigs = S_obj.contigs #list of contig objects contained in scaffold object Contigs, F = GO.WriteToF(F, Contigs, list_of_contigs) #Don't worry, the contig objects are removed in WriteTOF function del Scaffolds[scaffold_] singeled_out += 1 print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(singeled_out) #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node print 'Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds))#,Scaffolds.keys() if len(Scaffolds) == 0: return(None, Contigs, Scaffolds, F, param) cnt = 0 tot_start = time() start1 = time() for scaffold_ in Scaffolds: G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None) #this is a scaffold object but can be both a single contig or a scaffold. 
Scaffolds[ scaffold_ ].scaffold_left_nbrs = {} Scaffolds[ scaffold_ ].scaffold_right_nbrs = {} if cnt % 100000 == 0 and cnt > 0: elapsed = time() - start1 print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed start1 = time() cnt += 1 print 'Total time elapsed: ', time() - tot_start # Create the link edges in the graph by fetching info from bam file cont_aligned_len = {} for contig in Contigs: cont_aligned_len[contig] = [0, Contigs[contig].length] count = 0 non_unique = 0 non_unique_for_scaf = 0 nr_of_duplicates = 0 prev_obs1 = -1 prev_obs2 = -1 reads_with_too_long_insert = 0 #fishy_reads = {} for alignedread in bam_file: try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA contig1 = bam_file.getrname(alignedread.rname) contig2 = bam_file.getrname(alignedread.mrnm) except ValueError: continue #contig1=bam_file.getrname(alignedread.rname) ## add to coverage computation if contig is still in the list of considered contigs try: cont_aligned_len[contig1][0] += alignedread.rlen except KeyError: pass ########## CREATE EDGES IN SCAFFOLD GRAPH ########## if contig1 != contig2 and alignedread.is_read2: #check how many non unique reads out of the useful ones (mapping to two different contigs) #This only works for BWA!! 
implement for other aligners as well if alignedread.mapq == 0: non_unique += 1 #print contig1,contig2 if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold and alignedread.mapq > param.map_quality: # and alignedread.tags[0][1] == 'U': #if alignedread.tags[0][1] != 'U': # non_unique_for_scaf += 1 if alignedread.mapq == 0: non_unique_for_scaf += 1 count += 1 #(read_dir,mate_dir)=informative_pair[flag_type] (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse) scaf1 = Contigs[contig1].scaffold scaf2 = Contigs[contig2].scaffold #Calculate actual position on scaffold here #position1 cont/scaf1 cont_dir1 = Contigs[contig1].direction #if pos : L if neg: R cont1_pos = Contigs[contig1].position readpos = alignedread.pos cont1_len = Contigs[contig1].length s1len = Scaffolds[scaf1].s_length #position1 cont1/scaf1 cont_dir2 = Contigs[contig2].direction cont2_pos = Contigs[contig2].position matepos = alignedread.mpos cont2_len = Contigs[contig2].length s2len = Scaffolds[scaf2].s_length (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, param.read_len) if obs1 == prev_obs1 and obs2 == prev_obs2: nr_of_duplicates += 1 if param.detect_duplicate: continue if obs1 + obs2 < param.ins_size_threshold: # if obs1 == 3 or obs2 ==3: # print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len if scaf_side1 == 'R': if (scaf2, scaf_side2) in Scaffolds[scaf1].right_nbrs_obs: if obs1 < Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)]: Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1 else: Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1 if scaf_side1 == 'L': if (scaf2, scaf_side2) in Scaffolds[scaf1].left_nbrs_obs: if obs1 < Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)]: Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1 else: 
Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1 if scaf_side2 == 'R': if (scaf1, scaf_side1) in Scaffolds[scaf2].right_nbrs_obs: if obs2 < Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)]: Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2 else: Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2 if scaf_side2 == 'L': if (scaf1, scaf_side1) in Scaffolds[scaf2].left_nbrs_obs: if obs2 < Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)]: Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2 else: Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2 if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]: G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2) else: G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1 G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2 else: reads_with_too_long_insert += 1 #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2] ## add to haplotype graph here!! 
prev_obs1 = obs1 prev_obs2 = obs2 elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold: ########################Use to validate scaffold in previous step here ############ pass # print 'NR OF FISHY EDGES: ', len(fishy_reads) print 'USEFUL READS (reads mapping to different contigs): ', count #print 'Non unique portion out of "USEFUL READS" (filtered out from scaffolding): ', non_unique #print 'Non unique used for scaf: ', non_unique_for_scaf print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert if param.detect_duplicate: print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates ##### Calc coverage for all contigs with current lib here ##### sum_x = 0 sum_x_sq = 0 n = 0 for contig in cont_aligned_len: cont_coverage = cont_aligned_len[contig][0] / float(cont_aligned_len[contig][1]) #print key, cont_aligned_len[key]/float(cont_lengths[i]) try: Contigs[contig].coverage = cont_coverage except KeyError: pass sum_x += cont_coverage sum_x_sq += cont_coverage ** 2 n += 1 mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile) param.mean_coverage = mean_cov param.std_dev_coverage = std_dev_cov return(G, Contigs, Scaffolds, F, param)
def addContig(self, contigId):
    """Build a contig object for *contigId* and register it on this instance.

    The new object is constructed as ``Contig.Contig(contigId, self)`` and
    appended to ``self.contigs``, the list of contigs involved with this
    phenotype.
    """
    # Resolve the list's append first, then construct, mirroring the
    # evaluation order of a single nested call.
    register = self.contigs.append
    register(Contig.Contig(contigId, self))
def AddEdges(Contigs, Scaffolds, bamfile, mean, std_dev, scaffold_indexer, F, read_len,
             min_contig_length=300):
    """Initialise contig/scaffold objects from a BAM header and add link edges to G.

    Steps, in order:

    1. For every reference in the BAM header whose length is at least
       ``min_contig_length``, create a contig object and a one-contig scaffold
       named ``'s' + str(scaffold_indexer)``, store them in ``Contigs`` /
       ``Scaffolds`` and add the nodes ``(name, 'L')`` and ``(name, 'R')`` to G.
    2. For every key in ``Scaffolds``, connect its 'L' and 'R' nodes with an
       edge carrying ``nr_links=None``.
    3. For every read pair yielded by
       ``bam_object.unique_reads_on_different_references()`` whose two contigs
       are both in ``Contigs`` and whose observation ``obs`` is below
       ``mean + 4 * std_dev``, create or update the edge between the two
       scaffold ends (``nr_links``, ``gap_dist``, ``obs_pos``).

    NOTE(review): ``G`` is never assigned inside this function, so it has to be
    provided by an enclosing/module scope -- confirm that such a graph exists.

    Parameters:
        Contigs, Scaffolds -- dicts that are filled in place (name -> object).
        bamfile            -- passed to ``BamParser``.
        mean, std_dev      -- used only for the ``mean + 4 * std_dev`` cutoff.
        scaffold_indexer   -- first index used for scaffold names; the
                              incremented value is NOT returned to the caller.
        F                  -- unused here.
        read_len           -- forwarded to ``PosDirCalculatorPE``.
        min_contig_length  -- minimum reference length to include
                              (default 300, the previously hard-coded value).

    Returns:
        The largest soft-clip length (CIGAR operation 4) found in any read of
        the iterated pairs; 0 if there was none.
    """
    bam_object = BamParser(bamfile)
    bam_header = bam_object.bam_file
    cont_lengths = [int(nr) for nr in bam_header.lengths]  # convert long to int object
    cont_names = bam_header.references

    ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
    ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
    for cont_name, cont_length in zip(cont_names, cont_lengths):
        if not cont_length >= min_contig_length:
            continue
        C = Contig.contig(cont_name)  # Create object contig
        C.length = cont_length
        C.scaf_length = C.length  # Initially, scaffold consists of only this contig
        C.direction = True  # always in same direction first, False=reverse
        C.position = 0  # position always 0
        C.links = {}
        Contigs[C.name] = C  # name -> contig object
        S = Scaffold.scaffold('s' + str(scaffold_indexer), [C], C.length)  # Create object scaffold
        Scaffolds[S.name] = S
        C.scaffold = S.name
        # Each scaffold is represented by a left and a right node.
        G.add_node((S.name, 'L'), length=cont_length)
        G.add_node((S.name, 'R'), length=cont_length)
        scaffold_indexer += 1

    # The L-R edge of a scaffold carries no link count (nr_links=None); the
    # entry can be a single contig or a scaffold.
    for scaffold_ in Scaffolds:
        G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)

    def nr_softclipps(read):
        # Longest soft-clipped stretch (CIGAR operation code 4) in the read, 0 if none.
        soft_lengths = [length for type_, length in read.cigar if type_ == 4]
        return max(soft_lengths) if soft_lengths else 0

    global_max_softclipps = 0
    global_min_obs = 100000
    max_obs = mean + 4 * std_dev  # loop-invariant cutoff for accepted observations

    # Create the link edges in the graph by fetching info from bam file
    for read1, read2 in bam_object.unique_reads_on_different_references():
        contig1 = bam_header.getrname(read1.rname)
        contig2 = bam_header.getrname(read2.rname)
        # Soft clips are tracked for every pair, also those skipped below.
        global_max_softclipps = max(global_max_softclipps,
                                    nr_softclipps(read1), nr_softclipps(read2))

        if contig1 not in Contigs or contig2 not in Contigs:
            continue

        cont1 = Contigs[contig1]
        cont2 = Contigs[contig2]
        scaf1 = cont1.scaffold
        scaf2 = cont2.scaffold

        # Calculate actual position on scaffold here
        (obs, scaf_side1, scaf_side2, (o1, o2)) = PosDirCalculatorPE(
            cont1.direction, not read1.is_reverse, cont1.position, read1.pos,
            Scaffolds[scaf1].s_length, cont1.length,
            cont2.direction, not read2.is_reverse, cont2.position, read2.pos,
            Scaffolds[scaf2].s_length, cont2.length,
            read_len)

        if not obs < max_obs:
            continue

        node1 = (scaf1, scaf_side1)
        node2 = (scaf2, scaf_side2)
        if node2 not in G[node1]:
            # Seed obs_pos with both orientations, exactly as is done when an
            # existing edge is updated, so that a mirrored duplicate of the
            # very first pair on this edge is recognised as well.
            G.add_edge(node2, node1, nr_links=1, gap_dist=[obs],
                       obs_pos=set([(o1, o2), (o2, o1)]))
        else:
            # G[u][v] is the edge attribute dict (same object as the
            # networkx 1.x G.edge[u][v], but also available in later versions).
            edge_data = G[node1][node2]
            seen_positions = edge_data.get('obs_pos')
            # Edges without 'obs_pos' (e.g. the L-R edge of a scaffold) are
            # skipped; an already seen (o1, o2) is treated as a duplicate.
            if seen_positions is None or (o1, o2) in seen_positions:
                continue
            edge_data['nr_links'] += 1
            edge_data['gap_dist'].append(obs)
            seen_positions.add((o1, o2))
            seen_positions.add((o2, o1))

        global_min_obs = min(global_min_obs, o1, o2)

    # Parenthesised single-argument form: same output under the Python 2 print
    # statement, and still parses if the file is ever run under Python 3.
    print('Max softclipps:' + ' ' + str(global_max_softclipps))
    print('Min obs:' + ' ' + str(global_min_obs))
    return global_max_softclipps
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len):
    """Build the scaffold graph from a paired-end SAM file.

    A contig object and a one-contig scaffold (named ``"s" + str(index)``) are
    created for every reference in the file header and stored in ``Contigs`` /
    ``Scaffolds``. Each scaffold contributes two nodes, ``(name, "L")`` and
    ``(name, "R")``, joined by an edge with ``nr_links=None``. Every alignment
    whose flag is one of the ``informative_pair`` keys, and whose read and mate
    lie on contigs belonging to different scaffolds, adds one link (and one
    gap estimate) to the edge between the corresponding scaffold ends.

    Parameters:
        Contigs, Scaffolds -- dicts that are filled in place (name -> object).
        bamfile            -- path opened with ``pysam.Samfile(bamfile, "r")``.
        mean               -- unused here.
        scaffold_indexer   -- first index used for scaffold names.
        F                  -- returned unchanged.
        read_len           -- forwarded to ``PosDirCalculatorPE``.

    Returns:
        ``(G, Contigs, Scaffolds, F, scaffold_indexer)`` where ``G`` is the new
        ``nx.Graph`` and ``scaffold_indexer`` is the next unused index.
    """
    G = nx.Graph()
    print("Parsing BAM file...")

    # Flag -> (read_dir, mate_dir), True meaning forward strand. Every key is
    # 1 (paired) + 128 (second in pair), plus 16 if the read and/or 32 if the
    # mate is on the reverse strand. The match is exact, so records with any
    # other flag bit set are ignored.
    # I switched to look at mates instead since BWA can give false flag
    # combinations for read-mate when read is mapped but not mate
    # eg 97-149 81-165. But the reverse does not happen.
    informative_pair = {161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False)}

    # once real data, change to 'rb', simulated files are on SAM format
    with pysam.Samfile(bamfile, "r") as bam_file:
        cont_lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object
        cont_names = bam_file.references

        ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
        ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
        for cont_name, cont_length in zip(cont_names, cont_lengths):
            C = Contig.contig(cont_name)  # Create object contig
            C.length = cont_length
            C.scaf_length = C.length  # Initially, scaffold consists of only this contig
            C.direction = True  # always in same direction first, False=reverse
            C.position = 0  # position always 0
            C.links = {}
            Contigs[C.name] = C  # name -> contig object
            S = Scaffold.scaffold("s" + str(scaffold_indexer), [C], C.length)  # Create object scaffold
            Scaffolds[S.name] = S
            C.scaffold = S.name
            # Each scaffold is represented by a left and a right node.
            G.add_node((S.name, "L"), length=cont_length)
            G.add_node((S.name, "R"), length=cont_length)
            scaffold_indexer += 1

        print("Nr of contigs/scaffolds included in scaffolding: " + str(len(Scaffolds)))

        # The L-R edge of a scaffold carries no link count (nr_links=None); the
        # entry can be a single contig or a scaffold.
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, "L"), (scaffold_, "R"), nr_links=None)

        # Create the link edges in the graph by fetching info from bam file
        for alignedread in bam_file:
            directions = informative_pair.get(alignedread.flag)
            if directions is None:
                continue
            (read_dir, mate_dir) = directions

            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
            if contig1 not in Contigs or contig2 not in Contigs:
                continue

            cont1 = Contigs[contig1]
            cont2 = Contigs[contig2]
            scaf1 = cont1.scaffold
            scaf2 = cont2.scaffold
            # Only links between different scaffolds are used. The earlier
            # version had an elif repeating this exact condition (a placeholder
            # for validating scaffolds from a previous step); it could never
            # run and has been dropped.
            if scaf1 == scaf2:
                continue

            # Calculate actual position on scaffold here
            placement = PosDirCalculatorPE(
                cont1.direction, read_dir, cont1.position, alignedread.pos,
                Scaffolds[scaf1].s_length, cont1.length,
                cont2.direction, mate_dir, cont2.position, alignedread.mpos,
                Scaffolds[scaf2].s_length, cont2.length,
                read_len,
            )
            # AddEdges in this file unpacks a fourth element from the same
            # helper; take the first three so either return shape is accepted.
            (gap, scaf_side1, scaf_side2) = placement[:3]

            node1 = (scaf1, scaf_side1)
            node2 = (scaf2, scaf_side2)
            if node2 not in G[node1]:
                G.add_edge(node2, node1, nr_links=1, gap_dist=[gap])
            else:
                # G[u][v] is the edge attribute dict (same object as the
                # networkx 1.x G.edge[u][v], but also available later).
                edge_data = G[node1][node2]
                edge_data["nr_links"] += 1
                edge_data["gap_dist"].append(gap)

    return (G, Contigs, Scaffolds, F, scaffold_indexer)