def InitializeObjects(bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict):
    """Build a contig object and a one-contig scaffold for every BAM reference.

    Reference names and lengths are read from ``bam_file.references`` and
    ``bam_file.lengths``. References that are missing from ``C_dict`` are
    ignored. A reference at least ``param.contig_threshold`` long is stored
    in ``Contigs``/``Scaffolds``; a shorter one with positive length is
    stored in ``small_contigs``/``small_scaffolds`` and counted as singled
    out; zero-length references are dropped. Every sequence that is consumed
    is removed from ``C_dict``.

    Side effects on ``param``: ``tot_assembly_length``, ``current_N50``,
    ``current_L50`` are set and ``scaffold_indexer`` is advanced once per
    created scaffold. Progress and a summary line are written to
    ``Information``. ``G_prime`` is accepted but not used here.

    Always returns an empty tuple.
    """
    singeled_out = 0
    contig_threshold = param.contig_threshold
    # pysam may hand back ``long`` values; normalise them to plain ints.
    cont_lengths = [int(nr) for nr in bam_file.lengths]
    cont_names = bam_file.references

    # Assembly statistics for the input contigs.
    param.tot_assembly_length = sum(cont_lengths)
    sorted_lengths = sorted(cont_lengths, reverse=True)
    N50, L50 = CalculateStats(sorted_lengths, [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    start = time()
    for index, name in enumerate(cont_names):
        # Report the elapsed time once per 100 000 references visited.
        if (index + 1) % 100000 == 0:
            print >> Information, "Time adding 100k keys", time() - start
            start = time()

        if name not in C_dict:
            continue

        length = cont_lengths[index]
        if length >= contig_threshold:
            is_small = False
            contig_store, scaffold_store = Contigs, Scaffolds
        elif length > 0:
            is_small = True
            contig_store, scaffold_store = small_contigs, small_scaffolds
        else:
            # Zero-length entry (e.g. a broken record in the fasta file).
            continue

        C = Contig.contig(name)
        C.length = length
        C.sequence = C_dict[name]
        del C_dict[name]
        scaf_length = C.length  # the scaffold initially holds only this contig
        C.direction = True  # forward orientation; False would mean reversed
        C.position = 0
        contig_store[C.name] = C

        S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length)
        scaffold_store[S.name] = S
        C.scaffold = S.name
        param.scaffold_indexer += 1
        if is_small:
            singeled_out += 1

    del C_dict
    print >> Information, "Nr of contigs that was singeled out due to length constraints " + str(singeled_out)
    return ()
def NewContigsScaffolds(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, Information, dValuesTable, param, already_visited):
    """Turn every connected component of ``G`` into a new scaffold object.

    For each component the scaffold index in ``param`` is advanced,
    ``PROWithinScaf`` is run when ``param.extend_paths`` is set, the two
    degree-one nodes of the component are located, and ``UpdateInfo`` is
    asked to walk the component from the first of them. The resulting
    scaffold is stored in ``Scaffolds``.

    When ``param.extend_paths`` is set, ``G_prime`` is also updated: the new
    scaffold gets an 'L' and an 'R' node, the link edges of the old end
    nodes are copied onto them, and the nodes of the old component are
    removed. A ``NetworkXError`` raised during that copy/removal is ignored.

    Returns the tuple ``(Contigs, Scaffolds, param)``.
    """

    def _inherit_links(old_end, new_end):
        # Re-attach every edge of old_end that carries links onto new_end.
        for nbr in G_prime.neighbors(old_end):
            link_count = G_prime[old_end][nbr]['nr_links']
            if link_count:
                observations = G_prime[old_end][nbr]['obs']
                G_prime.add_edge(new_end, nbr, nr_links=link_count, obs=observations)

    components = nx.connected_component_subgraphs(G)
    print >> Information, 'Nr of new scaffolds created in this step: ' + str(len(components))

    for component in components:
        param.scaffold_indexer += 1
        scaffold_length = 0
        contig_list = []

        # Path extension inside the scaffold, only when PRO is activated.
        if param.extend_paths:
            PROWithinScaf(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, param, component, dValuesTable, already_visited)

        # First node with a single neighbour in G is the start of the walk.
        for node in component:
            if len(G.neighbors(node)) == 1:
                start = node
                break
        # The last other node with a single neighbour is the opposite end.
        for node in component:
            if len(G.neighbors(node)) == 1 and node != start:
                end = node

        # Collect the contig objects and total length of the new scaffold.
        (G, contig_list, scaffold_length) = UpdateInfo(G, Contigs, small_contigs, Scaffolds, small_scaffolds, start, ('', ''), 0, contig_list, scaffold_length, dValuesTable, param)
        S = Scaffold.scaffold(param.scaffold_indexer, contig_list, scaffold_length, defaultdict(constant_large), defaultdict(constant_large), defaultdict(constant_small), defaultdict(constant_small))
        Scaffolds[S.name] = S

        if param.extend_paths:
            # Represent the new scaffold in G_prime by its two sides and let
            # them take over the links of the old end nodes.
            left_side = (S.name, 'L')
            right_side = (S.name, 'R')
            G_prime.add_node(left_side)
            G_prime.add_node(right_side)
            G_prime.add_edge(left_side, right_side, nr_links=None)
            try:
                _inherit_links(start, left_side)
                _inherit_links(end, right_side)
                # Drop the nodes of the scaffolds that were merged.
                G_prime.remove_nodes_from(component)
            except nx.exception.NetworkXError:
                pass

    return (Contigs, Scaffolds, param)
def NewContigsScaffolds(G, Contigs, Scaffolds, F, Information, C_dict, dValuesTable, param): ### Remaining scaffolds are true sensible scaffolds, we must now update both the library of scaffold objects and the library of contig objects new_scaffolds_ = list(nx.connected_component_subgraphs(G)) print 'Nr of new scaffolds created: ' + str(len(new_scaffolds_)) print >> Information, 'Nr of new scaffolds created in this step: ' + str( len(new_scaffolds_)) for new_scaffold_ in new_scaffolds_: param.scaffold_indexer += 1 #scaf_size=len(new_scaffold_) scaffold_length = 0 contig_list = [] #Store nr_of links between contigs before "destroying" the graph for edge in new_scaffold_.edges_iter(): nr_links = G[edge[0]][edge[1]]['nr_links'] side1 = edge[0][1] side2 = edge[1][1] if nr_links: contig_objects1 = Scaffolds[edge[0][0]].contigs contig_objects2 = Scaffolds[edge[1][0]].contigs GiveLinkConnection(Contigs, contig_objects1, contig_objects2, side1, side2, nr_links) for node in new_scaffold_: if len(G.neighbors(node)) == 1: break #Create info to new scaffold object such as total length and the contig objects included prev_node = ('', '') pos = 0 (G, contig_list, scaffold_length) = UpdateInfo(G, Contigs, Scaffolds, node, prev_node, pos, contig_list, scaffold_length, C_dict, dValuesTable, param) S = Scaffold.scaffold(param.scaffold_indexer, contig_list, scaffold_length, {}, {}) #Create the new scaffold object Scaffolds[S.name] = S #include in scaffold library return (Contigs, Scaffolds, F, param)
def NewContigsScaffolds(G, Contigs, Scaffolds, F, Information, C_dict, dValuesTable, param): ### Remaining scaffolds are true sensible scaffolds, we must now update both the library of scaffold objects and the library of contig objects new_scaffolds_ = list(nx.connected_component_subgraphs(G)) print "Nr of new scaffolds created: " + str(len(new_scaffolds_)) print >> Information, "Nr of new scaffolds created in this step: " + str(len(new_scaffolds_)) for new_scaffold_ in new_scaffolds_: param.scaffold_indexer += 1 # scaf_size=len(new_scaffold_) scaffold_length = 0 contig_list = [] # Store nr_of links between contigs before "destroying" the graph for edge in new_scaffold_.edges_iter(): nr_links = G[edge[0]][edge[1]]["nr_links"] side1 = edge[0][1] side2 = edge[1][1] if nr_links: contig_objects1 = Scaffolds[edge[0][0]].contigs contig_objects2 = Scaffolds[edge[1][0]].contigs GiveLinkConnection(Contigs, contig_objects1, contig_objects2, side1, side2, nr_links) for node in new_scaffold_: if len(G.neighbors(node)) == 1: break # Create info to new scaffold object such as total length and the contig objects included prev_node = ("", "") pos = 0 (G, contig_list, scaffold_length) = UpdateInfo( G, Contigs, Scaffolds, node, prev_node, pos, contig_list, scaffold_length, C_dict, dValuesTable, param ) S = Scaffold.scaffold( param.scaffold_indexer, contig_list, scaffold_length, {}, {} ) # Create the new scaffold object Scaffolds[S.name] = S # include in scaffold library return (Contigs, Scaffolds, F, param)
if len(paths) > 1: ScorePaths(G_prime, nodes_present_in_path, paths, all_paths_sorted_wrt_score,param) # for path in all_paths_sorted_wrt_score: # print path if len(all_paths_sorted_wrt_score) > 0: #all_paths_sorted_wrt_score = ExtendScaffolds(all_paths_sorted_wrt_score) return all_paths_sorted_wrt_score #return(all_paths_sorted_wrt_score[-1][2], all_paths_sorted_wrt_score[-1][1], all_paths_sorted_wrt_score[-1][0], all_paths_sorted_wrt_score[-1][3]) #return(all_paths_sorted_wrt_score) # return [] #return([], 0, 0, 0) if __name__ == '__main__': import Scaffold small_scaffolds_test = {} for i in range(1, 7): S = Scaffold.scaffold(i, 0, 0, {}, {}) small_scaffolds_test[S.name] = S start = time() G_prime = nx.Graph() #G.add_nodes_from([(1, 'L'), (1, 'R'), (2, 'L'), (2, 'R'), (3, 'L'), (3, 'R'), (4, 'L'), (4, 'R'), (5, 'L'), (5, 'R')]) for i in range(1, 7): G_prime.add_edge((i, 'L'), (i, 'R'), {'nr_links':0}) G_prime.add_edges_from([((1, 'R'), (2, 'R'), {'nr_links':1}), ((3, 'L'), (4, 'L'), {'nr_links':1}), ((2, 'L'), (3, 'R'), {'nr_links':1}), ((1, 'R'), (5, 'L'), {'nr_links':2}), ((5, 'R'), (4, 'L'), {'nr_links':3}), ((2, 'L'), (5, 'L'), {'nr_links':2}), ((1, 'R'), (4, 'L'), {'nr_links':8}), ((2, 'L'), (6, 'L'), {'nr_links':3}), ((1, 'L'), (4, 'R'), {'nr_links':1}), ((1, 'L'), (4, 'L'), {'nr_links':1}), ((3, 'L'), (4, 'R'), {'nr_links':1}), ((1, 'R'), (2, 'L'), {'nr_links':1}), ((1, 'R'), (5, 'R'), {'nr_links':1}), ((2, 'L'), (5, 'R'), {'nr_links':1})]) G = nx.Graph() G.add_nodes_from([(1, 'L'), (1, 'R'), (4, 'L'), (4, 'R'), (6, 'L'), (6, 'R')]) contigs = [1, 2, 3, 4, 5, 6] print 'Between'
def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
    """Initialise contigs/scaffolds from a BAM file and add link edges to ``G``.

    NOTE: ``G`` is not a parameter; this function reads and mutates a graph
    named ``G`` that must exist in the enclosing module scope.

    Steps:
      1. Every reference of at least 300 bases gets a contig object (stored
         in ``Contigs``), a single-contig scaffold named ``'s' + index``
         (stored in ``Scaffolds``) and an 'L' and 'R' node in ``G``.
      2. The two sides of every scaffold are joined by an edge with
         ``nr_links=None``.
      3. Every read whose contig and mate contig are both in ``Contigs`` is
         inspected: reads flagged unmapped on read1 that span two scaffolds
         are counted in ``fishy_edges``; mapped read2 alignments with
         ``mapq > 20`` on two different contigs add or strengthen a link
         edge when the observation is below ``mean + 6*std_dev``.
      4. ``RemoveBugEdges(G, fishy_edges)`` is called.

    Parameters:
      Contigs, Scaffolds -- dicts that are filled in place.
      bamfile            -- opened alignment file (provides ``lengths``,
                            ``references``, ``getrname`` and iteration).
      mean, std_dev      -- library insert-size statistics used for the
                            observation cut-off.
      scaffold_indexer   -- first index used for scaffold names; the
                            incremented value is NOT returned to the caller.
      F                  -- unused.
      read_len           -- forwarded to ``PosDirCalculatorPE``.

    Returns ``None``.
    """
    # Bug fix: the body used the undefined name ``bam_file`` while the
    # parameter is called ``bamfile``; bind the argument explicitly.
    bam_file = bamfile

    min_contig_length = 300
    min_mapping_quality = 20

    cont_lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int
    cont_names = bam_file.references

    # Shorter contigs are currently not given any objects at all.
    for i in range(0, len(cont_names)):
        if cont_lengths[i] >= min_contig_length:
            C = Contig.contig(cont_names[i])
            C.length = cont_lengths[i]
            C.scaf_length = C.length  # the scaffold initially is this contig only
            C.direction = True  # forward orientation; False would mean reversed
            C.position = 0
            C.links = {}
            Contigs[C.name] = C
            S = Scaffold.scaffold('s' + str(scaffold_indexer), [C], C.length)
            Scaffolds[S.name] = S
            C.scaffold = S.name
            G.add_node((S.name, 'L'), length=cont_lengths[i])
            G.add_node((S.name, 'R'), length=cont_lengths[i])
            scaffold_indexer += 1

    # Each scaffold (single contig or not) is an L node joined to an R node.
    for scaffold_ in Scaffolds:
        G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)

    # Create the link edges from the read alignments.
    fishy_edges = defaultdict(int)
    for alignedread in bam_file:
        try:
            # Resolving the reference names is used as the "is aligned"
            # check, because is_unmapped is unreliable for e.g. BWA.
            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
        except ValueError:
            continue

        if not (contig1 in Contigs and contig2 in Contigs):
            continue

        # TODO: ad hoc handling of BWA's buggy SAM-flag reporting; remove
        # once BWA is fixed. Edges whose fishy count reaches the link count
        # in the graph are meant to be removed by RemoveBugEdges.
        if alignedread.is_unmapped and alignedread.is_read1:
            cont_obj1 = Contigs[contig1]
            scaf_obj1 = Scaffolds[cont_obj1.scaffold]
            cont_obj2 = Contigs[contig2]
            scaf_obj2 = Scaffolds[cont_obj2.scaffold]
            if scaf_obj2.name != scaf_obj1.name:
                (side1, side2) = CheckDir(cont_obj1, cont_obj2, alignedread)
                s1 = Contigs[contig1].scaffold
                s2 = Contigs[contig2].scaffold
                # Count the edge in both orientations.
                fishy_edges[((s1, side1), (s2, side2))] += 1
                fishy_edges[((s2, side2), (s1, side1))] += 1

        # The former ``elif`` branch here only contained ``pass`` (a
        # placeholder for validating scaffolds from a previous step).
        if contig1 != contig2 and alignedread.is_read2 and not alignedread.is_unmapped and alignedread.mapq > min_mapping_quality:
            (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse)
            scaf1 = Contigs[contig1].scaffold
            scaf2 = Contigs[contig2].scaffold

            # Placement of the read's contig within its scaffold.
            cont_dir1 = Contigs[contig1].direction
            cont1_pos = Contigs[contig1].position
            readpos = alignedread.pos
            cont1_len = Contigs[contig1].length
            s1len = Scaffolds[scaf1].s_length

            # Placement of the mate's contig within its scaffold.
            cont_dir2 = Contigs[contig2].direction
            cont2_pos = Contigs[contig2].position
            matepos = alignedread.mpos
            cont2_len = Contigs[contig2].length
            s2len = Scaffolds[scaf2].s_length

            (obs, scaf_side1, scaf_side2) = PosDirCalculatorPE(cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, read_len)

            if obs < mean + 6*std_dev:
                if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                    G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=[obs])
                else:
                    G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1
                    G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'].append(obs)

    RemoveBugEdges(G, fishy_edges)
def PROBetweenScaf(G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, param, dValuesTable, Information):
    """Join large scaffolds through paths of small scaffolds found in ``G_prime``.

    A working graph is built from the ``G_prime`` nodes whose scaffold is in
    ``Scaffolds``; pairs of nodes that only connect to each other are
    removed. Paths are then searched either in parallel worker processes
    (``param.multiprocess`` and more than one core) or with a single call to
    ``ELS.BetweenScaffolds``. Each entry of the resulting list is indexed as
    ``[0]`` score, ``[1]`` number of bad links, ``[2]`` path (list of
    ``(scaffold, side)`` nodes) and ``[3]`` path length.

    The paths are processed from the end of the list to the beginning. A
    path is skipped when one of its interior scaffolds is no longer in
    ``small_scaffolds`` or when one of its end scaffolds has been consumed
    and no forwarding entry exists for that end. For an accepted path the
    interior objects are moved from ``small_contigs``/``small_scaffolds``
    to ``Contigs``/``Scaffolds``, a new scaffold is built with
    ``UpdateInfo`` and registered in ``Scaffolds`` and ``G_prime``, and the
    old path nodes are removed from ``G_prime``.

    Progress is written to ``Information``. ``param.scaffold_indexer`` is
    advanced once per created scaffold.

    Returns the value ``param.scaffold_indexer`` had on entry.
    """
    start_scaf_index = param.scaffold_indexer
    G = nx.Graph()
    for node in G_prime:
        if node[0] in Scaffolds:  # meets the length criteria
            G.add_node(node)

    # Filtering and heuristics to reduce computation; the path finder is
    # O(n^2) in the number of contigs.

    # Remove all isolated contigs: a node whose only neighbour in G_prime is
    # a node that in turn only has this node as neighbour.
    for node in G.nodes():
        if node in G:
            nbr = G_prime.neighbors(node)[0]
            if len(G_prime.neighbors(node)) == 1 and len(G_prime.neighbors(nbr)) == 1:
                G.remove_nodes_from([node, nbr])

    if len(G.nodes()) / 2.0 > 10000:
        # With more than 10 000 large scaffolds and a small-to-large ratio
        # below 0.1 the pay-off is considered too low to search for paths.
        if len(small_scaffolds) / float(len(Scaffolds)) < 0.1:
            print >> Information, "Did not enter path seartching algorithm between scaffolds due to too small fraction of small scaffolds, fraction were: ", len(small_scaffolds) / float(len(Scaffolds))
            return(start_scaf_index)

    ########### Find paths between scaffolds here ###############

    # Multi processing (if available), check nr of available cores.
    num_cores = multiprocessing.cpu_count()
    # TODO: if too many paths come back and memory becomes an issue, only
    # paths with score over 0 could be stored in the ELS module.
    if param.multiprocess and num_cores > 1:
        import workerprocess
        import heapq
        print >> Information, 'Entering ELS.BetweenScaffolds parallelized with ', num_cores, ' cores.'
        start = time.time()
        # Load up the work queue.
        work_queue = multiprocessing.Queue()
        end = set()
        for node in G:
            end.add(node)
        nodes = G.nodes()
        nr_jobs = len(nodes)
        # Bug fix: with fewer nodes than cores the integer quotient is 0,
        # which made the partition loop below spin forever. Use at least
        # one node per job (and explicit floor division).
        chunk = max(1, nr_jobs // num_cores)
        counter = 0
        nr_processes = 0
        # Partition (roughly) equally many nodes of G to each job.
        while counter < nr_jobs:
            work_queue.put((set(nodes[counter:counter + chunk]), G_prime, end, param))
            nr_processes += 1
            print >> Information, 'node nr', counter, 'to', counter + chunk - 1, 'added'
            counter += chunk

        # Queue on which the workers store their results.
        result_queue = multiprocessing.Queue()

        # Spawn one worker per queued job.
        while not work_queue.empty():
            worker = workerprocess.Worker(work_queue.get(), result_queue)
            worker.start()

        # Collect exactly one result per started job.
        results = []
        for i in range(nr_processes):
            res = result_queue.get()
            results.append(res)

        def wrapper(func, args):
            return(func(*args))
        # Merge the per-worker result lists into one sequence.
        all_paths_sorted_wrt_score_itr = wrapper(heapq.merge, results)
        all_paths_sorted_wrt_score = [i for i in all_paths_sorted_wrt_score_itr]
        elapsed = time.time() - start
        print >> Information, "Elapsed time multiprocessing: ", elapsed

    else:
        start = time.time()
        end = set()
        for node in G:
            end.add(node)
        iter_nodes = end.copy()
        print >> Information, 'Entering ELS.BetweenScaffolds single core'
        all_paths_sorted_wrt_score = ELS.BetweenScaffolds(G_prime, end, iter_nodes, param)
        elapsed = time.time() - start
        print >> Information, "Elapsed time single core pathfinder: ", elapsed

    ################################################################

    # Maps an old end node to [node on the newest scaffold, node at the
    # other end]; entries are set to 0 for the synthetic outer path nodes.
    start_end_node_update_storage = {}
    print >> Information, 'Total number of paths between scaffolds detected:', len(all_paths_sorted_wrt_score)
    for sublist in reversed(all_paths_sorted_wrt_score):
        path = sublist[2]
        bad_links = sublist[1]
        score = sublist[0]
        path_len = sublist[3]
        print >> Information, 'Path: path length: {0}, nr bad links: {1}, score: {2} '.format((path_len - 2) / 2.0, bad_links, score)

        # A contig may only be present once over all accepted paths.
        # Either a small contig/scaffold has been included in a path earlier
        # and has thus moved its object to Scaffolds (and changed index) ...
        small_scaf_is_already_in = 0
        for scaf_ in path[1:-1]:
            if scaf_[0] not in small_scaffolds:
                small_scaf_is_already_in = 1
                break
        if small_scaf_is_already_in:
            continue

        # A very special corner case (circular paths): both ends have been
        # consumed and forward to the same new scaffold.
        if path[0][0] not in Scaffolds and path[-1][0] not in Scaffolds:
            try:
                strt = start_end_node_update_storage[path[0]][0]
                nd = start_end_node_update_storage[path[-1]][0]
                if strt[0] == nd[0]:
                    print >> Information, 'Rare case (circular paths) detected and treated. '
                    continue
            except KeyError:
                pass

        # ... or a large scaffold/contig has changed scaffold index because
        # one of its sides is present in another path (paths from the other
        # side should still be allowed).
        case1 = 0
        case2 = 0
        if path[0][0] not in Scaffolds:
            if path[0] in start_end_node_update_storage:
                case1 = 1
            else:
                print >> Information, 'Beginning is already in path'
                continue

        if path[-1][0] not in Scaffolds:
            if path[-1] in start_end_node_update_storage:
                case2 = 1
            else:
                print >> Information, 'End is already in path'
                continue

        original_start_node = path[0]

        if path[0][0] not in Scaffolds:
            # The large start scaffold has changed index before; this path
            # approaches it from its other side.
            node_to_remove1 = path[0]
            path[0] = start_end_node_update_storage[node_to_remove1][0]
            # The node at the other end must point at the newest index.
            node_to_refresh1 = start_end_node_update_storage[node_to_remove1][1]
            try:
                node_ptr = start_end_node_update_storage[path[-1]][1]
            except KeyError:
                other_side = 'L' if path[-1][1] == 'R' else 'R'
                node_ptr = (path[-1][0], other_side)
            start_end_node_update_storage[node_to_refresh1] = [(param.scaffold_indexer + 1, 'L'), node_ptr]
            # A path pointer may be used only once; destroy it afterwards.
            del start_end_node_update_storage[node_to_remove1]

        if path[-1][0] not in Scaffolds:
            # Same situation for the end scaffold of the path.
            node_to_remove2 = path[-1]
            path[-1] = start_end_node_update_storage[node_to_remove2][0]
            # The node at the other end must point at the newest index.
            node_to_refresh2 = start_end_node_update_storage[node_to_remove2][1]
            try:
                node_ptr = start_end_node_update_storage[original_start_node][1]
            except KeyError:
                other_side = 'L' if original_start_node[1] == 'R' else 'R'
                node_ptr = (original_start_node[0], other_side)
            start_end_node_update_storage[node_to_refresh2] = [(param.scaffold_indexer + 1, 'R'), node_ptr]
            # A path pointer may be used only once; destroy it afterwards.
            del start_end_node_update_storage[node_to_remove2]

        # Move the contig and scaffold objects of the interior scaffolds
        # (one node per scaffold: the odd path positions) from the "small"
        # containers to the large ones, as UpdateInfo expects. This has to
        # happen before the scaffold index is updated below.
        small_scafs = map(lambda i: path[i], filter(lambda i: i % 2 == 1, range(len(path) - 1)))
        for item in small_scafs:
            scaf_obj = small_scaffolds[item[0]]
            Scaffolds[item[0]] = scaf_obj
            cont_objects = scaf_obj.contigs
            for obj_ in cont_objects:
                ctg_name = obj_.name
                Contigs[ctg_name] = obj_
                del small_contigs[ctg_name]
            del small_scaffolds[item[0]]

        # Join the scaffolds along the path. "Scaffolds" is updated as we go
        # so that the duplicate checks above work for later paths.

        # Make the path a small linear graph, first extending it with the
        # opposite side of both end scaffolds.
        G_ = nx.Graph()
        path.insert(0, (path[0][0], 'R')) if path[0][1] == 'L' else path.insert(0, (path[0][0], 'L'))
        path.insert(len(path), (path[-1][0], 'R')) if path[-1][1] == 'L' else path.insert(len(path), (path[-1][0], 'L'))
        start_end_node_update_storage[path[0]] = 0
        start_end_node_update_storage[path[-1]] = 0
        G_.add_edges_from(zip(path[::1], path[1::]))

        for edge in G_.edges():
            try:
                G_[edge[0]][edge[1]]['nr_links'] = G_prime[edge[0]][edge[1]]['nr_links']
            except KeyError:
                # Inconsistent state: dump diagnostics about both end points
                # of the edge before giving up.
                print >> Information, path
                try:
                    Scaffolds[edge[0][0]]
                    print >> Information, edge[0][0], 'is in Scaffolds'
                except KeyError:
                    print >> Information, edge[0][0], 'is not in Scaffolds'
                try:
                    Scaffolds[edge[1][0]]
                    print >> Information, edge[1][0], 'is in Scaffolds'
                except KeyError:
                    print >> Information, edge[1][0], 'is not in Scaffolds'

                try:
                    small_scaffolds[edge[0][0]]
                    print >> Information, edge[0][0], 'is in small_scaffolds'
                except KeyError:
                    print >> Information, edge[0][0], 'is not in small_scaffolds'
                try:
                    small_scaffolds[edge[1][0]]
                    print >> Information, edge[1][0], 'is in small_scaffolds'
                except KeyError:
                    print >> Information, edge[1][0], 'is not in small_scaffolds'

                try:
                    G_prime[edge[0]]
                    print >> Information, edge[0], 'is in G_prime'
                    print >> Information, G_prime[edge[0]]
                except KeyError:
                    print >> Information, edge[0], 'is not in G_prime'
                try:
                    G_prime[edge[1]]
                    print >> Information, edge[1], 'is in G_prime'
                    print >> Information, G_prime[edge[1]]
                except KeyError:
                    print >> Information, edge[1], 'is not in G_prime'
                # Repeating the failed lookup re-raises the error with a
                # traceback; sys.exit() is only reached if it succeeds.
                G_[edge[0]][edge[1]]['nr_links'] = G_prime[edge[0]][edge[1]]['nr_links']
                sys.exit()

            try:
                G_[edge[0]][edge[1]]['obs'] = G_prime[edge[0]][edge[1]]['obs']
            except KeyError:
                # May be the two sides of one contig (has no gap distance).
                pass

        start = path[0]
        end = path[-1]
        prev_node = ('', '')
        pos = 0
        scaffold_length = 0
        contig_list = []
        param.scaffold_indexer += 1
        (G, contig_list, scaffold_length) = UpdateInfo(G_, Contigs, small_contigs, Scaffolds, small_scaffolds, start, prev_node, pos, contig_list, scaffold_length, dValuesTable, param)
        S = Scaffold.scaffold(param.scaffold_indexer, contig_list, scaffold_length, defaultdict(constant_large), defaultdict(constant_large), defaultdict(constant_small), defaultdict(constant_small))
        Scaffolds[S.name] = S  # include in scaffold library

        # Add the new scaffold object to G_prime.
        G_prime.add_node((S.name, 'L'))  # start node
        G_prime.add_node((S.name, 'R'))  # end node
        G_prime.add_edge((S.name, 'L'), (S.name, 'R'), nr_links=None)
        # Copy the link edges of the old outer nodes onto the new sides.
        for nbr in G_prime.neighbors(start):
            nr_links_ = G_prime[start][nbr]['nr_links']
            if nr_links_:
                obs_ = G_prime[start][nbr]['obs']
                G_prime.add_edge((S.name, 'L'), nbr, nr_links=nr_links_, obs=obs_)

        for nbr in G_prime.neighbors(end):
            nr_links_ = G_prime[end][nbr]['nr_links']
            if nr_links_:
                obs_ = G_prime[end][nbr]['obs']
                G_prime.add_edge((S.name, 'R'), nbr, nr_links=nr_links_, obs=obs_)

        # Remove the old scaffold objects from G_prime.
        G_prime.remove_nodes_from(path)

        # Record how the old end nodes map onto the new scaffold's sides.
        if case1 and not case2:
            start_end_node_update_storage[node_to_refresh1] = [(S.name, 'L'), path[-1]]
            start_end_node_update_storage[path[-1]] = [(S.name, 'R'), node_to_refresh1]
        elif case2 and not case1:
            start_end_node_update_storage[path[0]] = [(S.name, 'L'), node_to_refresh2]
            start_end_node_update_storage[node_to_refresh2] = [(S.name, 'R'), path[0]]
        elif case1 and case2:
            start_end_node_update_storage[node_to_refresh1] = [(S.name, 'L'), node_to_refresh2]
            start_end_node_update_storage[node_to_refresh2] = [(S.name, 'R'), node_to_refresh1]
        else:
            start_end_node_update_storage[path[0]] = [(S.name, 'L'), path[-1]]
            start_end_node_update_storage[path[-1]] = [(S.name, 'R'), path[0]]

    return(start_scaf_index)
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    """Build the scaffold graph for one paired-end library from a BAM file.

    The BAM file at ``param.bamfile`` is opened, ``GetParams`` is called to
    fill in library parameters, and then:

    * first library (``param.first_lib``): a contig and a single-contig
      scaffold are created for every reference of at least
      ``param.contig_threshold`` bases; shorter references are appended
      to ``F``;
    * later libraries: scaffolds shorter than ``param.contig_threshold`` are
      written out through ``GO.WriteToF`` and removed from ``Scaffolds``.

    Every remaining scaffold becomes an 'L' and an 'R' node in the graph.
    Read pairs (seen from read2) that map to two different scaffolds with
    ``mapq > param.map_quality`` add link edges carrying ``nr_links`` and an
    accumulated ``gap_dist``, provided ``obs1 + obs2`` is below
    ``param.ins_size_threshold``. Per-contig coverage is stored on the
    contig objects, and ``param.mean_coverage`` / ``param.std_dev_coverage``
    are set from ``CalculateMeanCoverage``.

    Returns ``(G, Contigs, Scaffolds, F, param)``; ``G`` is ``None`` when no
    scaffold passed the length filter.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source. In particular, where the ``with`` block ends after the read
    loop could not be determined -- confirm against the original file.
    """
    G = nx.Graph()
    print 'Parsing BAM file...'

    # Mates are inspected instead of reads, since BWA can give false flag
    # combinations for read/mate when the read is mapped but not the mate
    # (e.g. 97-149, 81-165); the reverse does not happen.

    with pysam.Samfile(param.bamfile, 'rb') as bam_file:
        # Get parameters -r, -m, -s, -T, -t for the library.
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        # Clean the contig library.
        singeled_out = 0
        if param.first_lib:
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths]  # convert long to int object
            cont_names = bam_file.references

            # Calculate N50 and L50 of the input contigs.
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50

            # Shorter contigs do not get objects here; they are only
            # recorded in F.
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    C = Contig.contig(cont_names[i])  # create the contig object
                    C.length = cont_lengths[i]
                    scaf_length = C.length  # initially, the scaffold consists of only this contig
                    C.direction = True  # always forward at first, False = reverse
                    C.position = 0  # position is always 0
                    C.links = {}
                    Contigs[C.name] = C  # name -> contig object
                    S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {})  # create the scaffold object
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    singeled_out += 1
                    # One-element list of (contig_name, pos_direction, position, length, {}).
                    F.append([(cont_names[i], True, 0, cont_lengths[i], {})])
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out)
        else:
            # Clean the contig/scaffold library produced by earlier libraries.
            scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            # Iterate over .keys() so that entries can be deleted from the
            # dict inside the loop (relies on Python 2 returning a list).
            for scaffold_ in Scaffolds.keys():
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    # Write the scaffold to F, then drop its objects.
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs  # contig objects contained in the scaffold object
                    # Per the original author's note, WriteToF removes the
                    # contig objects from Contigs.
                    Contigs, F = GO.WriteToF(F, Contigs, list_of_contigs)
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(singeled_out)

        # Create the "node graph" of contigs/scaffolds that passed the
        # length criterion; each has a left and a right node.
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds))
        if len(Scaffolds) == 0:
            return (None, Contigs, Scaffolds, F, param)

        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            # A scaffold object can be a single contig or a real scaffold.
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)
            Scaffolds[scaffold_].scaffold_left_nbrs = {}
            Scaffolds[scaffold_].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start

        # Create the link edges in the graph by fetching info from the BAM file.
        # cont_aligned_len[contig] = [aligned bases so far, contig length].
        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        # Observations of the previously accepted pair, used to spot
        # consecutive pairs with identical observations (duplicates).
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        for alignedread in bam_file:
            try:
                # Check that the read is aligned. Not done with is_unmapped,
                # since that flag is unreliable for e.g. BWA.
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue

            # Add to the coverage computation if the contig is still among
            # the considered contigs.
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                pass

            ########## CREATE EDGES IN SCAFFOLD GRAPH ##########
            if contig1 != contig2 and alignedread.is_read2:
                # Count non-unique reads among the useful ones (mapping to
                # two different contigs). mapq == 0 as "non unique" only
                # holds for BWA; other aligners are not handled.
                if alignedread.mapq == 0:
                    non_unique += 1
                if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold and alignedread.mapq > param.map_quality:
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold

                    # Placement of contig1 within scaffold 1.
                    cont_dir1 = Contigs[contig1].direction
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    # Placement of contig2 within scaffold 2.
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                        cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len,
                        cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len,
                        param.read_len)

                    # Same observations as the previous pair: counted as a
                    # duplicate, and skipped when duplicate detection is on.
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
                        # Keep, per scaffold side and neighbour node, the
                        # smallest observation seen so far.
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2

                        # First link creates the edge; later links increase
                        # the count and accumulate the summed observation.
                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1

                    # Remember this pair for the duplicate check above.
                    prev_obs1 = obs1
                    prev_obs2 = obs2

                elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
                    # Placeholder: could be used to validate scaffolds from
                    # a previous step.
                    pass

    # NOTE(review): the statements below are placed after the ``with`` block
    # here; the collapsed source does not show whether they were inside it.
    print 'USEFUL READS (reads mapping to different contigs): ', count
    print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
    if param.detect_duplicate:
        print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

    ##### Calc coverage for all contigs with current lib here #####
    # sum_x, sum_x_sq and n are accumulated but not read again in this
    # function.
    sum_x = 0
    sum_x_sq = 0
    n = 0
    for contig in cont_aligned_len:
        # Coverage = aligned bases / contig length.
        cont_coverage = cont_aligned_len[contig][0] / float(cont_aligned_len[contig][1])
        try:
            Contigs[contig].coverage = cont_coverage
        except KeyError:
            pass
        sum_x += cont_coverage
        sum_x_sq += cont_coverage**2
        n += 1

    mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile)
    param.mean_coverage = mean_cov
    param.std_dev_coverage = std_dev_cov

    return (G, Contigs, Scaffolds, F, param)
#print paths if len(paths) > 1: ScorePaths(G_prime, paths, all_paths, param) all_paths.sort(key=lambda list_: list_[0]) if len(all_paths) > 0: return all_paths return [] if __name__ == '__main__': import Scaffold small_scaffolds_test = {} for i in range(1, 7): S = Scaffold.scaffold(i, 0, 0, {}, {}) small_scaffolds_test[S.name] = S start = time() G_prime = nx.Graph() #G.add_nodes_from([(1, 'L'), (1, 'R'), (2, 'L'), (2, 'R'), (3, 'L'), (3, 'R'), (4, 'L'), (4, 'R'), (5, 'L'), (5, 'R')]) for i in range(1, 7): G_prime.add_edge((i, 'L'), (i, 'R'), {'nr_links': 0}) G_prime.add_edges_from([((1, 'R'), (2, 'R'), { 'nr_links': 1 }), ((3, 'L'), (4, 'L'), { 'nr_links': 1 }), ((2, 'L'), (3, 'R'), { 'nr_links': 1 }), ((1, 'R'), (5, 'L'), { 'nr_links': 2 }), ((5, 'R'), (4, 'L'), {
def InitializeObjects(bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict):
    """Wrap every reference of the BAM header in a Contig and a one-contig Scaffold.

    References whose name is missing from C_dict are ignored.  A reference
    that is at least param.contig_threshold long is stored in Contigs and
    Scaffolds; a shorter one with positive length is stored in small_contigs
    and small_scaffolds and counted as singled out.  The sequence of every
    stored contig is moved out of C_dict (its entry is deleted).

    Side effects on param: tot_assembly_length, current_N50 and current_L50
    are set from the reference lengths, and scaffold_indexer advances by one
    per created scaffold.  Progress timings and the final count are printed
    to Information.  G_prime is accepted but not used here.

    Returns an empty tuple.
    """
    threshold = param.contig_threshold
    cont_lengths = [int(raw_length) for raw_length in bam_file.lengths]  # convert long to int
    cont_names = bam_file.references

    # Assembly statistics over all references in the header.
    param.tot_assembly_length = sum(cont_lengths)
    N50, L50 = CalculateStats(sorted(cont_lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    nr_singled_out = 0
    start = time()
    for index, name in enumerate(cont_names):
        # Report the time spent on every batch of 100k references.
        if (index + 1) % 100000 == 0:
            print >> Information, 'Time adding 100k keys', time() - start
            start = time()

        # References without a sequence in C_dict are silently skipped.
        if name not in C_dict:
            continue

        length = cont_lengths[index]
        is_large = length >= threshold
        if not is_large and not length > 0:
            # Contigs of size 0 (due to some error in the fasta file) get no objects.
            continue

        contig = Contig.contig(name)
        contig.length = length
        contig.sequence = C_dict[name]
        del C_dict[name]
        contig.direction = True  # always forward initially, False = reverse
        contig.position = 0      # a lone contig starts its scaffold

        if is_large:
            contig_store, scaffold_store = Contigs, Scaffolds
        else:
            contig_store, scaffold_store = small_contigs, small_scaffolds
            nr_singled_out += 1

        contig_store[contig.name] = contig
        # Initially the scaffold consists of only this contig.
        scaffold = Scaffold.scaffold(param.scaffold_indexer, [contig], contig.length)
        scaffold_store[scaffold.name] = scaffold
        contig.scaffold = scaffold.name
        param.scaffold_indexer += 1

    del C_dict  # drops only the local reference; the caller's dict object is unaffected

    print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(nr_singled_out)
    return ()
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len): G = nx.Graph() print 'Parsing BAM file...' #read_len=50 #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)} #I switched to look at mates instead since BWA can give false flag combinations for # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse #does not happen. informative_pair = { 161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False) } #threshold=800 with pysam.Samfile( bamfile, 'r' ) as bam_file: #once real data, change to 'rb', simulated files are on SAM format #Clean contig_library singeled_out = 0 cont_lengths = bam_file.lengths cont_lengths = [int(nr) for nr in cont_lengths] #convert long to int object #print cont_lengths cont_names = bam_file.references ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE for i in range(0, len(cont_names)): C = Contig.contig(cont_names[i]) # Create object contig C.length = cont_lengths[i] C.scaf_length = C.length # Initially, scaffold consists of only this contig C.direction = True # always in same direction first, False=reverse C.position = 0 #position always 0 C.links = {} Contigs[ C. name] = C # Create a dict with name as key and the object container as value S = Scaffold.scaffold('s' + str(scaffold_indexer), [C], C.length) # Create object scaffold Scaffolds[S.name] = S C.scaffold = S.name G.add_node((S.name, 'L'), length=cont_lengths[i]) G.add_node((S.name, 'R'), length=cont_lengths[i]) scaffold_indexer += 1 #Create "node graph" of contigs (that passed the length criteria). 
Each having a left and right node print 'Nr of contigs/scaffolds included in scaffolding: ' + str( len(Scaffolds)) #,Scaffolds.keys() for scaffold_ in Scaffolds: G.add_edge( (scaffold_, 'L'), (scaffold_, 'R'), nr_links=None ) #this is a scaffold object but can be both a single contig or a scaffold. # Create the link edges in the graph by fetching info from bam file for alignedread in bam_file: flag_type = alignedread.flag if flag_type in informative_pair: contig1 = bam_file.getrname(alignedread.rname) contig2 = bam_file.getrname(alignedread.mrnm) if contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[contig1].scaffold: (read_dir, mate_dir) = informative_pair[flag_type] scaf1 = Contigs[contig1].scaffold scaf2 = Contigs[contig2].scaffold #Calculate actual position on scaffold here #position1 cont/scaf1 cont_dir1 = Contigs[ contig1].direction #if pos : L if neg: R cont1_pos = Contigs[contig1].position readpos = alignedread.pos cont1_len = Contigs[contig1].length s1len = Scaffolds[scaf1].s_length #position1 cont1/scaf1 cont_dir2 = Contigs[contig2].direction cont2_pos = Contigs[contig2].position matepos = alignedread.mpos cont2_len = Contigs[contig2].length s2len = Scaffolds[scaf2].s_length (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE( cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, read_len) if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]: G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=[gap]) #print 'Added edge' else: G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1 #print 'edge' G.edge[(scaf1, scaf_side1)][( scaf2, scaf_side2)]['gap_dist'].append(gap) elif contig1 in Contigs and contig2 in Contigs and Contigs[ contig2].scaffold != Contigs[contig1].scaffold: ########################Use to validate scaffold herein previous step here pass #for edge in G.edges(): # if G[edge[0]][edge[1]]['nr_reads']: # print 
G[edge[0]][edge[1]]['gap_dist'] #print G.edges(data=True) return (G, Contigs, Scaffolds, F, scaffold_indexer)
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    """Build the scaffold link graph for the library stored in param.bamfile.

    Steps, in order:
      1. Open the BAM file and call GetParams to fill in library parameters.
      2. If param.first_lib is true, create a Contig and a one-contig
         Scaffold for every reference at least param.contig_threshold long
         and append every shorter reference to F.  Otherwise hand every
         scaffold shorter than the threshold to GO.WriteToF and delete it
         from Scaffolds.  Either way param.current_N50 / current_L50 are set.
      3. Return early with None as graph when no scaffold is left.
      4. Give each scaffold an (name, 'L') -- (name, 'R') edge with
         nr_links=None and empty scaffold_left_nbrs / scaffold_right_nbrs.
      5. For every read: add its rlen to the aligned-length tally of its
         contig; for second-in-pair reads whose mate is on a contig of
         another scaffold and whose mapq exceeds param.map_quality, compute
         (obs1, obs2) and the scaffold sides, and, when obs1 + obs2 is below
         param.ins_size_threshold, record the smallest observation per
         neighbour and add or update the link edge.
      6. Store aligned length / contig length as Contigs[..].coverage and put
         the result of CalculateMeanCoverage on param.

    Edge attributes: nr_links counts the contributing reads and gap_dist
    holds the running SUM of obs1 + obs2 (not a list).

    Returns:
        (G, Contigs, Scaffolds, F, param); G is None on the early return.
    """
    G = nx.Graph()
    print 'Parsing BAM file...'

    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False),

    with pysam.Samfile(param.bamfile, 'rb') as bam_file:    # opened as binary BAM ('rb')

        #Get parameters -r, -m, -s, -T, -t for library
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        #Clean contig_library
        singeled_out = 0
        if param.first_lib:
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths]  #convert long to int object
            cont_names = bam_file.references

            #Calculate NG50 and LG 50
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50

            ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
            ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    C = Contig.contig(cont_names[i])   # Create object contig
                    C.length = cont_lengths[i]
                    scaf_length = C.length        # Initially, scaffold consists of only this contig
                    C.direction = True              # always in same direction first, False=reverse
                    C.position = 0                  #position always 0
                    C.links = {}
                    Contigs[C.name] = C              # Create a dict with name as key and the object container as value
                    S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {})  # Create object scaffold
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    # Too short for scaffolding: goes straight to the output list F.
                    singeled_out += 1
                    F.append([(cont_names[i], True, 0, cont_lengths[i], {})])   #list of (contig_name, pos_direction, position,length)
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out)
        else:
            #Clean contig_library/scaffold_library
            scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            # Python 2: keys() returns a list copy, so entries can be deleted while looping.
            for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    ### Go to function and print to F
                    ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs   #list of contig objects contained in scaffold object
                    Contigs, F = GO.WriteToF(F, Contigs, list_of_contigs)  #Don't worry, the contig objects are removed in WriteTOF function
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(singeled_out)

        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds))#,Scaffolds.keys()
        if len(Scaffolds) == 0:
            # Nothing left to scaffold: callers receive None instead of a graph.
            return(None, Contigs, Scaffolds, F, param)

        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
            Scaffolds[ scaffold_ ].scaffold_left_nbrs = {}
            Scaffolds[ scaffold_ ].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start

        # Create the link edges in the graph by fetching info from bam file
        # Per contig: [number of aligned bases seen so far, contig length].
        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        # Observations of the previously processed linking read; -1 matches nothing.
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        #fishy_reads = {}
        for alignedread in bam_file:
            try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue
            #contig1=bam_file.getrname(alignedread.rname)
            ## add to coverage computation if contig is still in the list of considered contigs
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                pass

            ########## CREATE EDGES IN SCAFFOLD GRAPH ##########

            # Only the second read of a pair is used, so each pair is handled once.
            if contig1 != contig2 and alignedread.is_read2:
                #check how many non unique reads out of the useful ones (mapping to two different contigs)
                #This only works for BWA!! implement for other aligners as well
                if alignedread.mapq == 0:
                    non_unique += 1
                #print contig1,contig2
                if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold and alignedread.mapq > param.map_quality: # and alignedread.tags[0][1] == 'U':
                    #if alignedread.tags[0][1] != 'U':
                    #    non_unique_for_scaf += 1
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    #(read_dir,mate_dir)=informative_pair[flag_type]
                    (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold
                    #Calculate actual position on scaffold here
                    #position1 cont/scaf1
                    cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    #position1 cont1/scaf1
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, param.read_len)

                    # A pair with the same observations as the previous linking
                    # pair is counted as a duplicate; it is skipped (and
                    # prev_obs* left untouched) only if duplicate detection is on.
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
#                        if obs1 == 3 or obs2 ==3:
#                            print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len
                        # Keep, per neighbouring scaffold end, the smallest
                        # observation seen on each side of each scaffold.
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2

                        # First link creates the edge; later links bump the
                        # counter and add to the summed gap_dist.
                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1
                        #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2]
                        ## add to haplotype graph here!!
                    prev_obs1 = obs1
                    prev_obs2 = obs2

            elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
########################Use to validate scaffold in previous step here ############
                # Placeholder: nothing is done for these reads yet.
                pass
        # print 'NR OF FISHY EDGES: ', len(fishy_reads)
        print 'USEFUL READS (reads mapping to different contigs): ', count
        #print 'Non unique portion out of "USEFUL READS" (filtered out from scaffolding): ', non_unique
        #print 'Non unique used for scaf: ', non_unique_for_scaf
        print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
        if param.detect_duplicate:
            print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

    ##### Calc coverage for all contigs with current lib here #####
    # NOTE(review): sum_x, sum_x_sq and n are accumulated but never read in
    # this function; the mean and standard deviation come from
    # CalculateMeanCoverage below.
    sum_x = 0
    sum_x_sq = 0
    n = 0
    for contig in cont_aligned_len:
        # aligned bases / contig length
        cont_coverage = cont_aligned_len[contig][0] / float(cont_aligned_len[contig][1])
        #print key, cont_aligned_len[key]/float(cont_lengths[i])
        try:
            Contigs[contig].coverage = cont_coverage
        except KeyError:
            # The contig is no longer in Contigs; skip storing its coverage.
            pass
        sum_x += cont_coverage
        sum_x_sq += cont_coverage ** 2
        n += 1

    mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile)
    param.mean_coverage = mean_cov
    param.std_dev_coverage = std_dev_cov

    return(G, Contigs, Scaffolds, F, param)
def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
    """Add contig nodes and read-pair link edges to graph G; report soft clipping.

    Every reference of at least 300 bp becomes a Contig inside its own
    Scaffold, with nodes (name, 'L') and (name, 'R') in G.  Read pairs
    yielded by bam_object.unique_reads_on_different_references() whose
    observation obs is below mean + 4*std_dev create or reinforce a link
    edge; a pair whose (o1, o2) is already stored in the edge's 'obs_pos'
    set is skipped.

    NOTE(review): G is neither a parameter nor a local of this function;
    presumably it is a module-level graph -- confirm against the rest of
    the file.

    Contigs and Scaffolds are filled in place.  F is not used, and the
    incremented scaffold_indexer is not returned.

    Returns:
        The largest soft-clip length found over all yielded read pairs.
    """
    #Clean contig_library
    bam_object = BamParser(bamfile)
    singeled_out=0
    cont_lengths= bam_object.bam_file.lengths
    cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
    #print cont_lengths
    cont_names = bam_object.bam_file.references
    ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
    ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
    for i in range(0,len(cont_names)):
        # Hard-coded minimum contig length of 300.
        if cont_lengths[i] >= 300:
            C=Contig.contig(cont_names[i])   # Create object contig
            C.length = cont_lengths[i]
            C.scaf_length = C.length        # Initially, scaffold consists of only this contig
            C.direction = True              # always in same direction first, False=reverse
            C.position = 0                  #position always 0
            C.links = {}
            Contigs[C.name] = C              # Create a dict with name as key and the object container as value
            S=Scaffold.scaffold('s'+str(scaffold_indexer),[C],C.length)  # Create object scaffold
            Scaffolds[S.name]=S
            C.scaffold=S.name
            G.add_node((S.name,'L'),length=cont_lengths[i])
            G.add_node((S.name,'R'),length=cont_lengths[i])
            scaffold_indexer+=1

    #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
    #print 'Nr of contigs/scaffolds included in scaffolding: '+ str(len(Scaffolds))#,Scaffolds.keys()
    for scaffold_ in Scaffolds:
        G.add_edge((scaffold_,'L'),(scaffold_,'R'),nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.

    # Create the link edges in the graph by fetching info from bam file
    def nr_softclipps(read):
        # Largest length among CIGAR entries with operation code 4
        # (soft clip in the BAM CIGAR encoding); 0 when there is none.
        max_soft = 0
        for type_,length in read.cigar:
            if type_ == 4 and length >= max_soft:
                max_soft = length
        return max_soft

    global_max_softclipps = 0
    global_min_obs = 100000
    links_used = 0
    #r_len = float(read_len)
    for read1,read2 in bam_object.unique_reads_on_different_references():
        contig1=bam_object.bam_file.getrname(read1.rname)
        contig2=bam_object.bam_file.getrname(read2.rname)
        # Soft clipping is tracked for every pair, even if no edge is added.
        max_soft_readpair = max(nr_softclipps(read1),nr_softclipps(read2))
        if max_soft_readpair > global_max_softclipps:
            global_max_softclipps = max_soft_readpair
        # print read1.cigar
        #if read1.qlen/r_len < 0.7 or read2.qlen/r_len < 0.7:
        #    continue
        #    print 'midddle1',o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
        # if read2.qlen < 50:
        #    print 'midddle2',o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
        if contig1 in Contigs and contig2 in Contigs:
            (read_dir,mate_dir) = (not read1.is_reverse,not read2.is_reverse )
            scaf1=Contigs[contig1].scaffold
            scaf2=Contigs[contig2].scaffold
            #Calculate actual position on scaffold here
            #position1 cont/scaf1
            cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
            cont1_pos = Contigs[contig1].position
            readpos = read1.pos
            cont1_len = Contigs[contig1].length
            s1len = Scaffolds[scaf1].s_length
            #position1 cont1/scaf1
            cont_dir2 = Contigs[contig2].direction
            cont2_pos = Contigs[contig2].position
            matepos = read2.pos
            cont2_len = Contigs[contig2].length
            s2len = Scaffolds[scaf2].s_length
            (obs,scaf_side1,scaf_side2, (o1,o2))=PosDirCalculatorPE(cont_dir1,read_dir,cont1_pos,readpos,s1len,cont1_len,cont_dir2,mate_dir,cont2_pos,matepos,s2len,cont2_len,read_len)
            if obs < mean+ 4*std_dev:
                links_used += 1
                if (scaf2,scaf_side2) not in G[(scaf1,scaf_side1)]:
                    # First link between these two scaffold ends.  Only
                    # (o1,o2) is stored here; the branch below stores both
                    # orientations.
                    G.add_edge((scaf2,scaf_side2),(scaf1,scaf_side1),nr_links=1,gap_dist=[obs],obs_pos=set() )
                    G[(scaf2,scaf_side2)][(scaf1,scaf_side1)]['obs_pos'].add((o1,o2))
                    if o1 < global_min_obs:
                        global_min_obs = o1
                    if o2 < global_min_obs:
                        global_min_obs = o2
                    #print 'Added edge'
                else:
                    try:
                        # Same (o1,o2) already recorded: skip this pair.
                        if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                            continue
                    except KeyError:
                        # The existing edge has no 'obs_pos' attribute; skip the pair.
                        #print G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]
                        continue
                    # if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                    #     #print 'detected duplicate'
                    #     continue
                    else:
                        # Runs only when the lookup succeeded and the pair is new.
                        G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['nr_links'] += 1
                        G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['gap_dist'].append(obs)
                        G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o1,o2))
                        G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o2,o1))
                        if o1 < global_min_obs:
                            global_min_obs = o1
                        if o2 < global_min_obs:
                            global_min_obs = o2
                    # if o1 < 50:
                    #     print o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
                    #     #print fancy_str(read1)
                    # if o2 < 50:
                    #     print o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
                    #     #print fancy_str(read2)
    print 'Max softclipps:', global_max_softclipps
    print 'Min obs:', global_min_obs
    # sys.exit()
    #print 'Nr links used:', links_used
    return global_max_softclipps
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len): G = nx.Graph() print "Parsing BAM file..." # read_len=50 # informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)} # I switched to look at mates instead since BWA can give false flag combinations for # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse # does not happen. informative_pair = {161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False)} # threshold=800 with pysam.Samfile(bamfile, "r") as bam_file: # once real data, change to 'rb', simulated files are on SAM format # Clean contig_library singeled_out = 0 cont_lengths = bam_file.lengths cont_lengths = [int(nr) for nr in cont_lengths] # convert long to int object # print cont_lengths cont_names = bam_file.references ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE for i in range(0, len(cont_names)): C = Contig.contig(cont_names[i]) # Create object contig C.length = cont_lengths[i] C.scaf_length = C.length # Initially, scaffold consists of only this contig C.direction = True # always in same direction first, False=reverse C.position = 0 # position always 0 C.links = {} Contigs[C.name] = C # Create a dict with name as key and the object container as value S = Scaffold.scaffold("s" + str(scaffold_indexer), [C], C.length) # Create object scaffold Scaffolds[S.name] = S C.scaffold = S.name G.add_node((S.name, "L"), length=cont_lengths[i]) G.add_node((S.name, "R"), length=cont_lengths[i]) scaffold_indexer += 1 # Create "node graph" of contigs (that passed the length criteria). 
Each having a left and right node print "Nr of contigs/scaffolds included in scaffolding: " + str(len(Scaffolds)) # ,Scaffolds.keys() for scaffold_ in Scaffolds: G.add_edge( (scaffold_, "L"), (scaffold_, "R"), nr_links=None ) # this is a scaffold object but can be both a single contig or a scaffold. # Create the link edges in the graph by fetching info from bam file for alignedread in bam_file: flag_type = alignedread.flag if flag_type in informative_pair: contig1 = bam_file.getrname(alignedread.rname) contig2 = bam_file.getrname(alignedread.mrnm) if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold: (read_dir, mate_dir) = informative_pair[flag_type] scaf1 = Contigs[contig1].scaffold scaf2 = Contigs[contig2].scaffold # Calculate actual position on scaffold here # position1 cont/scaf1 cont_dir1 = Contigs[contig1].direction # if pos : L if neg: R cont1_pos = Contigs[contig1].position readpos = alignedread.pos cont1_len = Contigs[contig1].length s1len = Scaffolds[scaf1].s_length # position1 cont1/scaf1 cont_dir2 = Contigs[contig2].direction cont2_pos = Contigs[contig2].position matepos = alignedread.mpos cont2_len = Contigs[contig2].length s2len = Scaffolds[scaf2].s_length (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE( cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, read_len, ) if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]: G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=[gap]) # print 'Added edge' else: G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]["nr_links"] += 1 # print 'edge' G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]["gap_dist"].append(gap) elif ( contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold ): ########################Use to validate scaffold herein previous step here pass # for edge in G.edges(): # if G[edge[0]][edge[1]]['nr_reads']: # print 
G[edge[0]][edge[1]]['gap_dist'] # print G.edges(data=True) return (G, Contigs, Scaffolds, F, scaffold_indexer)