def run(self):
    """Process replays from ``self.replay_queue`` until the worker stops.

    For every replay taken from the queue, the sampled action frames are
    loaded from the matching ``SampledActions`` file and the replay is
    processed once per player; each player's observations are written to a
    protobuf stream under ``FLAGS.save_path``.  A fresh game controller is
    started for every batch of ``FLAGS.batch_size`` replays, and also after
    any replay that raised (the ``break`` leaves the batch loop).

    The method returns when the queue raises ``Queue.Empty`` or when the
    sampled-action file of a replay is missing.
    """
    signal.signal(signal.SIGTERM, lambda a, b: sys.exit())  # Exit quietly.

    while True:
        with self.run_config.start() as controller:
            for _ in range(FLAGS.batch_size):
                try:
                    # NOTE(review): a blocking get() without a timeout does not
                    # normally raise Empty -- confirm which Queue type this is.
                    replay_path = self.replay_queue.get()
                except Queue.Empty:
                    return

                # Track only the output that is currently being written.  These
                # used to leak from the previous replay/player, so a failure
                # early in this replay could delete already finished files.
                ostream = None
                observation_path = None
                global_info_path = None
                try:
                    with self.counter.get_lock():
                        self.counter.value += 1
                        print('Processing {}/{} ...'.format(self.counter.value, self.total_num))

                    sampled_action_path = os.path.join(
                        FLAGS.save_path.replace('SampledObservations', 'SampledActions'),
                        os.path.basename(replay_path))
                    if not os.path.isfile(sampled_action_path):
                        return

                    with open(sampled_action_path) as f:
                        actions = json.load(f)
                    actions.insert(0, 0)

                    replay_data = self.run_config.replay_data(replay_path)
                    info = controller.replay_info(replay_data)
                    map_data = None
                    if info.local_map_path:
                        map_data = self.run_config.map_data(info.local_map_path)

                    for player_info in info.player_info:
                        race = sc_common.Race.Name(player_info.player_info.race_actual)
                        player_id = player_info.player_info.player_id

                        player_observation_path = os.path.join(
                            FLAGS.save_path, race,
                            '{}@{}'.format(player_id, os.path.basename(replay_path)))
                        player_global_info_path = player_observation_path.replace(
                            'SampledObservations', 'GlobalInfos')

                        # Both outputs exist already: nothing to do for this player.
                        if (os.path.isfile(player_observation_path)
                                and os.path.isfile(player_global_info_path)):
                            continue

                        observation_path = player_observation_path
                        global_info_path = player_global_info_path
                        ostream = stream.open(observation_path, 'wb', buffer_size=1000)
                        self.process_replay(controller, replay_data, map_data, player_id,
                                            actions, ostream, global_info_path)
                        ostream.close()

                        # This player's output is complete; it must survive a
                        # failure that happens later in the same replay.
                        ostream = None
                        observation_path = None
                        global_info_path = None
                except Exception as e:
                    # Best-effort removal of the partially written output.
                    # ``except Exception`` (not a bare except) keeps SystemExit
                    # from the SIGTERM handler from being swallowed here.
                    try:
                        if ostream is not None:
                            ostream.close()
                    except Exception as cleanup_error:
                        print(cleanup_error)
                    for partial_path in (observation_path, global_info_path):
                        try:
                            if partial_path is not None and os.path.isfile(partial_path):
                                os.remove(partial_path)
                        except Exception as cleanup_error:
                            print(cleanup_error)
                    print(e)
                    break
                finally:
                    self.replay_queue.task_done()
def close_old_and_begin_new_stream(protobuf_out_stream, RASPI_IMAGE_GIT_HASH):
    """Close the current protobuf output stream, if any, and open a new one.

    The new file is created under ``PROTOBUF_DATA_FOLDER`` and named after the
    current UTC time followed by ``GH`` and the hexadecimal form of
    ``RASPI_IMAGE_GIT_HASH``, with a ``.proto.gz`` suffix.  It is opened in
    append mode (``'ab'``).

    Args:
        protobuf_out_stream: the stream to close, or ``None`` when no stream
            is open yet.
        RASPI_IMAGE_GIT_HASH: integer passed to ``hex()`` to build the name.

    Returns:
        The newly opened stream.
    """
    # Identity test: "!= None" would dispatch to the object's __ne__.
    if protobuf_out_stream is not None:
        protobuf_out_stream.close()

    timestamp = datetime.utcnow().strftime('%Y-%m-%dT%H_%M_%S')
    file_name = timestamp + 'GH' + hex(RASPI_IMAGE_GIT_HASH) + ".proto.gz"
    return stream.open(PROTOBUF_DATA_FOLDER + file_name, 'ab')
def build_reads_dict(nodes, gam_file_path, min_cutoff):
    """Collect, per read name, the node list of its mappings from a GAM file.

    Alignments whose sequence is shorter than ``min_cutoff`` are skipped.
    Mappings are accumulated in one ``ReadMappings`` object per read; when a
    new read name shows up, the finished read is stored in the result as
    ``read_mappings.list_nodes()``.

    NOTE(review): an alignment whose name is already known is added to the
    read currently being accumulated, so this presumably expects all
    alignments of a read to be adjacent in the file -- confirm.

    Args:
        nodes: passed through to ``Mapping.fill_mapping``.
        gam_file_path: path of the GAM file; converted with ``str()``.
        min_cutoff: minimum ``len(align.sequence)`` for an alignment to count.

    Returns:
        dict mapping read name to the value of ``list_nodes()`` for that read.
    """
    all_reads = dict()
    # reading gam file
    with stream.open(str(gam_file_path), "rb") as in_stream:
        counter = 0
        # Placeholder object; it is renamed when the first usable alignment
        # arrives.  A flag (rather than comparing the name to "first") keeps a
        # read that is literally called "first" from being mistaken for it.
        read_mappings = ReadMappings(name="first")
        has_current_read = False

        for data in in_stream:
            counter += 1
            if (counter % 10000000) == 0:
                logging.info("{} mappings processed".format(counter))

            align = Alignment()
            align.ParseFromString(data)
            # skipping alignments with less than minimum cutoff
            if len(align.sequence) < min_cutoff:
                continue

            # mapping now has the nodes and the length
            mapping = Mapping()
            mapping.fill_mapping(nodes, align, len(align.sequence))

            if align.name not in all_reads:
                all_reads[align.name] = []
                if not has_current_read:
                    # only once, for the first read
                    read_mappings.name = align.name
                    has_current_read = True
                else:
                    # new read: store the finished one and start a fresh
                    # ReadMappings object to fill
                    all_reads[read_mappings.name] = read_mappings.list_nodes()
                    read_mappings = ReadMappings(name=align.name)

            # add_mapping decides whether this is a new chain or one already
            # seen for the read (per the original author's comment it keeps
            # the longer one).
            read_mappings.add_mapping(mapping)

        # Flush the read that was still being accumulated when the file
        # ended; without this its entry would remain the empty list.
        if has_current_read:
            all_reads[read_mappings.name] = read_mappings.list_nodes()

    return all_reads
def load_stream(self, stream_path):
    """Read every ``Trial`` message from ``stream_path``.

    Each message must have its ``damage_class`` field set (checked with an
    assertion).  Returns a list of ``(self.to_vec(trial), trial.damage_class)``
    tuples in file order.
    """
    samples = []
    with stream.open(stream_path, 'rb') as proto_stream:
        for raw_message in proto_stream:
            parsed = Trial()
            parsed.ParseFromString(raw_message)
            assert parsed.HasField("damage_class")
            features = self.to_vec(parsed)
            samples.append((features, parsed.damage_class))
    return samples
def vg_graph_reader(inp):
    """Build a ``Graph`` from the edges found in a vg protobuf stream.

    A new ``Graph`` sized by the chunk's node count is created for every
    message in the stream, so the returned object reflects the last message
    only.
    """
    with stream.open(str(inp), "rb") as istream:
        for chunk in istream:
            vg_graph = vg_pb2.Graph()
            vg_graph.ParseFromString(chunk)
            g = Graph(len(vg_graph.node))
            for edge in vg_graph.edge:
                # "from" is a Python keyword, hence getattr for that field.
                source_node = getattr(edge, "from")
                g.addEdge(source_node, edge.to)
    return g
def vg_graph_reader(vg_file):
    """Read node sequences and edge connections from a vg protobuf stream.

    Returns a pair ``(node_seq_list, edge_connections)``: the first maps a
    node id to its sequence, the second maps an edge's "from" node id to the
    list of "to" node ids, accumulated over all messages in the file.
    """
    sequence_by_node = defaultdict()
    targets_by_source = defaultdict(list)

    with stream.open(str(vg_file), "rb") as istream:
        for chunk in istream:
            vg_graph = vg_pb2.Graph()
            vg_graph.ParseFromString(chunk)

            for node in vg_graph.node:
                sequence_by_node[node.id] = node.sequence

            for edge in vg_graph.edge:
                # "from" is a Python keyword, hence getattr for that field.
                source_node = getattr(edge, "from")
                targets_by_source[source_node].append(edge.to)

    return sequence_by_node, targets_by_source
def main():
    """Schedule a reader for every MIDI device, then start the HTTP server.

    Every entry of ``devices_dir`` whose name starts with ``midi`` is opened
    and handed to ``read_midi`` through the scheduler.  If there is none, a
    message is printed and the process exits with status 1.
    """
    found_midi = False
    for entry in os.listdir(devices_dir):
        if not entry.startswith('midi'):
            continue
        device = stream.open(os.path.join(devices_dir, entry))
        schedule(read_midi, device)
        found_midi = True

    if not found_midi:
        print("no midi devices found")
        sys.exit(1)

    log = Logger(sys.stdout, sys.stdout)
    server = http.Server(http_application, log, port=9898)
    schedule(server.run)
    schedule.run()
def run(self):
    """Process replays from ``self.replay_queue`` until the worker stops.

    For every replay taken from the queue, the sampled action frames are
    loaded from the matching ``SampledActions`` file and the replay is
    processed once per player; each player's observations are written to a
    protobuf stream under ``FLAGS.save_path``.  A fresh game controller is
    started for every batch of ``FLAGS.batch_size`` replays, and also after
    any replay that raised (the ``break`` leaves the batch loop).

    The method returns when the queue raises ``Queue.Empty`` or when the
    sampled-action file of a replay is missing.
    """
    signal.signal(
        signal.SIGTERM,
        lambda a, b: sys.exit())  # Kill thread upon termination signal

    while True:
        with self.run_config.start() as controller:
            for _ in range(FLAGS.batch_size):
                try:
                    # NOTE(review): a blocking get() without a timeout does not
                    # normally raise Empty -- confirm which Queue type this is.
                    replay_path = self.replay_queue.get()
                except Queue.Empty:
                    return

                # Track only the output that is currently being written.  These
                # used to leak from the previous replay/player, so a failure
                # early in this replay could delete already finished files.
                ostream = None
                observation_path = None
                global_info_path = None
                try:
                    with self.counter.get_lock():
                        self.counter.value += 1
                        print('Processing {}/{} ...'.format(
                            self.counter.value, self.total_num))

                    sampled_action_path = os.path.join(
                        FLAGS.save_path.replace('SampledObservations',
                                                'SampledActions'),
                        os.path.basename(replay_path))
                    if not os.path.isfile(sampled_action_path):
                        # Unable to find the sampled actions of this replay
                        print('Unable to locate', sampled_action_path)
                        return

                    with open(sampled_action_path) as f:
                        # Get all macro action frames
                        actions = json.load(f)
                    actions.insert(0, 0)  # Add 0th frame to the start

                    replay_data = self.run_config.replay_data(replay_path)
                    info = controller.replay_info(replay_data)
                    map_data = None
                    if info.local_map_path:  # Special handling for custom maps
                        map_data = self.run_config.map_data(info.local_map_path)

                    # Parse replay from each player's point of view
                    for player_info in info.player_info:
                        race = common_pb.Race.Name(
                            player_info.player_info.race_actual)
                        player_id = player_info.player_info.player_id

                        player_observation_path = os.path.join(
                            FLAGS.save_path, race,
                            '{}@{}'.format(player_id,
                                           os.path.basename(replay_path)))
                        player_global_info_path = player_observation_path.replace(
                            'SampledObservations', 'GlobalInfos')

                        # Skip replay if it has already been processed
                        if (os.path.isfile(player_observation_path)
                                and os.path.isfile(player_global_info_path)):
                            continue

                        observation_path = player_observation_path
                        global_info_path = player_global_info_path
                        ostream = stream.open(observation_path, 'wb',
                                              buffer_size=1000)
                        self.process_replay(controller, replay_data, map_data,
                                            player_id, actions, ostream,
                                            global_info_path)
                        ostream.close()

                        # This player's output is complete; it must survive a
                        # failure that happens later in the same replay.
                        ostream = None
                        observation_path = None
                        global_info_path = None
                except Exception as e:
                    # Best-effort removal of the partially written output.
                    # ``except Exception`` (not a bare except) keeps SystemExit
                    # from the SIGTERM handler from being swallowed here.
                    try:
                        if ostream is not None:
                            ostream.close()
                    except Exception as cleanup_error:
                        print(cleanup_error)
                    for partial_path in (observation_path, global_info_path):
                        try:
                            if partial_path is not None and os.path.isfile(partial_path):
                                os.remove(partial_path)
                        except Exception as cleanup_error:
                            print(cleanup_error)
                    print(e)
                    break
                finally:
                    self.replay_queue.task_done()
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict

# Script fragment: collects the start node of every snarl traversal, then
# begins reading alignments.
# NOTE(review): `stream` is used below but is not imported in the visible part.
# assumption ... all S' and before L's
#filename = sys.argv[1]
#out = sys.argv[2]
d = {}
count = 1
bubbles_start = set()
#with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans' ,"rb") as istream:
with stream.open('chrXIII.filtered.ordered.trans', "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        # For a snarl whose start is flagged backward, the end node is used
        # as the bubble's start node.
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)
multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
with stream.open('../out.new.gam', "rb") as istream:
    #with stream.open('true_haps.chrXIII.gam', "rb") as istream:
    for data in istream:
        # NOTE(review): the visible source stops here; the rest of this loop
        # body is not shown.
        g = vg_pb2.Alignment()
#@jit def to_cigar(edit, extended): if edit.from_length == edit.to_length: if extended and not edit.sequence: return (7, edit.from_length) elif extended: return (8, edit.from_length) else: return (0, edit.from_length) elif edit.from_length: return (2, edit.from_length) # DEL elif edit.to_length: return (1, edit.to_length) #INS with stream.open(gamfile, 'rb') as istream: with pysam.AlignmentFile(bamfile, "wb", template=samheader, threads=4) as outf: for data in istream: m = vg_pb2.Alignment() m.ParseFromString(data) read_pos = 0 logger.debug(m) for s in m.path.mapping: cigar = tuple([to_cigar(i, False) for i in s.edit]) read_end = read_pos + consume_cigar(cigar) a = pysam.AlignedSegment(header=samheader.header) a.query_sequence = m.sequence[ read_pos: read_end] # e.g. "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG" if s.position.is_reverse:
import sys
import stream
import logging
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict
import networkx as nx

# Gather, from a snarl-traversal file, the node ids visited inside bubbles and
# a start -> end lookup for every bubble, then report how many distinct
# interior nodes were seen.
bubble_nodes = set()
bubble_start = defaultdict()
bubble_end = set()

with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans',
                 "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)

        start = l.snarl.start.node_id
        end = l.snarl.end.node_id

        # A backward start means the bubble is keyed by its end node instead.
        if l.snarl.start.backward:
            bubble_start[end] = start
        else:
            bubble_start[start] = end

        bubble_nodes.update(visit.node_id for visit in l.visits)

print(len(bubble_nodes))
def vg_reader(locus_file, gam_file, sample):
    """Build a sorted ReadSet from a snarl-traversal file and a GAM file.

    input: sorted locus and sorted GAM file output from vg.
    output: sorted readset for core DP.

    Assumptions stated by the original author:
    1. locus file consists of linear ordering of simple bubbles only and hence
       sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. add variant only when it identifies the branch uniquely.

    Args:
        locus_file: path of the SnarlTraversal stream (converted with str()).
        gam_file: path of the Alignment stream (converted with str()).
        sample: passed as the fourth argument to ``Read``.

    Returns:
        A tuple ``(readset1, alleles_per_pos, locus_branch_mapping)``:
        the filtered and sorted ReadSet, a mapping locus -> number of
        branches, and a mapping locus -> list of branches, where each branch
        is a list of (node, node) edge tuples.
    """
    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_startsnarl = 0
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1
    # Pass 1: group consecutive traversals that share the same snarl
    # boundaries (ids and orientations) into one locus.
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            #TODO: make ordered dictionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            # The traversal expressed as a list of (node, node) edges.
            path_in_bubble = []

            if len(l.visits) == 0:
                # No interior node: the branch is the single boundary edge.
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    # Backward snarl: edges are emitted with swapped endpoints.
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i + 1].node_id, l.visits[i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id, l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                # Same snarl as the previous traversal: another branch.
                per_locus.append(path_in_bubble)
            else:
                locus_count = locus_count + 1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            # NOTE(review): the mapping stores a reference to per_locus, so a
            # third branch appended later still shows up in the stored list
            # even though this assignment is skipped -- confirm the intent.
            if len(per_locus) < 3:
                locus_branch_mapping[locus_count] = per_locus

    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)

    # both simple and complex bubbles: key is an edge from
    # locus_branch_mapping and value is a list of triplets
    # [locus, branch, alleles]
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, an edge can map to multiple branches.

    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset = ReadSet()
    count = 0
    duplicated = 0
    #TODO: consider reads with only positive score.
    # Pass 2: walk every alignment edge by edge and record a variant whenever
    # the edges seen so far pin down exactly one branch of a locus.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            # (val1, val2, count1 and count2 are not used below.)
            val1 = True
            val2 = False
            count1 = 0
            count2 = 0
            #score = g.score/len(g.sequence)
            #if score > 0.2:
            #    continue
            read = Read(g.name, 0, 0, sample)  # create read for each read alignment
            prev_tmp = []
            prev_locus = -1
            locus = -1
            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                # The edge between consecutive mappings, in both directions.
                edge1 = tuple((int(g.path.mapping[i].position.node_id),
                               int(g.path.mapping[i + 1].position.node_id)
                               ))  # go over nodes in a mapping
                edge2 = tuple((int(g.path.mapping[i + 1].position.node_id),
                               int(g.path.mapping[i].position.node_id)
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    # NOTE(review): the comprehension variable below is also
                    # named i; that is harmless on Python 3 only.
                    if edge1 in reverse_mapping:
                        #qualities = [10]* reverse_mapping[edge1][0][2]
                        qualities = 1
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge1]
                        ]  # consider (locus, branch)
                    else:
                        # qualities = [10]* reverse_mapping[edge2][0][2]
                        qualities = 1
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge2]
                        ]
                    tmp = [x for x in node_inf]
                    # Entering a different locus: restart the candidate set.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp
                        prev_locus = tmp[0][0]
                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    if len(prev_tmp) > 0 and len(
                            set(tmp).intersection(set(prev_tmp))
                    ) == 1:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        #qualities[interset_tmp[0][1]] = 0
                        qualities = 1
                        if i == len(g.path.mapping) - 2:
                            # Last edge of the read; only branches 0 and 1
                            # are recorded.
                            if interset_tmp[0][1] == 0 or interset_tmp[0][
                                    1] == 1:
                                read.add_variant(interset_tmp[0][0],
                                                 interset_tmp[0][1], qualities)
                        else:
                            # Record the variant only if the next edge no
                            # longer belongs to any bubble.
                            next_edge1 = tuple(
                                (int(g.path.mapping[i + 1].position.node_id),
                                 int(g.path.mapping[i + 2].position.node_id)))
                            next_edge2 = tuple(
                                (int(g.path.mapping[i + 2].position.node_id),
                                 int(g.path.mapping[i + 1].position.node_id)))
                            if next_edge1 not in reverse_mapping and next_edge2 not in reverse_mapping:
                                if interset_tmp[0][1] == 0 or interset_tmp[0][
                                        1] == 1:
                                    read.add_variant(interset_tmp[0][0],
                                                     interset_tmp[0][1],
                                                     qualities)
                        locus = interset_tmp[0][0]
            readset.add(read)

    # Pass 3: drop reads with repeated variant positions, reads with fewer
    # than 4 variants, and reads with a gap of more than 20 between
    # consecutive variant positions.
    readset1 = ReadSet()
    tmp_duplicated = set()
    for read in readset:
        # NOTE(review): a return value of 1 from read.sort() presumably
        # signals duplicated variant positions -- confirm against Read.sort.
        if read.sort() == 1:
            duplicated = duplicated + 1
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            #print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            for a in x:
                tmp_duplicated.add(a)
            continue
        else:
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            #print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            if len(x) > 0:
                continue
            if len(read) >= 4:
                tmp = []
                for variant in read:
                    tmp.append(variant.position)
                flag = 0
                for i, x in enumerate(tmp):
                    if i > 0:
                        #print(int(x - tmp[i - 1]))
                        if int(x - tmp[i - 1]) > 20:
                            flag = 1
                            break
                if flag == 0:
                    #print(read)
                    readset1.add(read)
    #if len(read) >=5:
    #    readset1.add(read)
    #print("length of duplicated bubbles")
    #print(tmp_duplicated)
    #print(len(list(tmp_duplicated)))
    readset1.sort()
    #print("duplicated")
    #print(duplicated)
    print("reads considered before read-selection")
    print(len(readset1))
    #print(readset1)
    return readset1, alleles_per_pos, locus_branch_mapping
def vg_reader(locus_file, gam_file):
    """Build a sorted ReadSet from a snarl-traversal file and a GAM file.

    input: sorted locus and sorted GAM file output from vg.
    output: sorted readset for core DP.

    Assumptions stated by the original author:
    1. locus file consists of linear ordering of simple bubbles only and hence
       sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. add variant only when it identifies the branch uniquely.

    This variant prints a large amount of debugging output to stdout.

    Args:
        locus_file: path of the SnarlTraversal stream (converted with str()).
        gam_file: path of the Alignment stream (converted with str()).

    Returns:
        A tuple ``(readset1, alleles_per_pos, locus_branch_mapping)``:
        the filtered and sorted ReadSet, a mapping locus -> number of
        branches, and a mapping locus -> list of branches, where each branch
        is a list of (node, node) edge tuples.
    """
    # create a dictionary of branches for each locus based on locus file.
    # NOTE(review): the OrderedDict below is replaced by a defaultdict a few
    # lines further down.
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = defaultdict()
    locus_count = 0
    prev_startsnarl = 0
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1
    # Pass 1: group consecutive traversals that share the same snarl
    # boundaries (ids and orientations) into one locus.
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            #TODO: make ordered dictionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            # The traversal expressed as a list of (node, node) edges.
            path_in_bubble = []

            # Hard-coded node ids; any traversal whose snarl starts or ends
            # at one of them is skipped.  (The list is rebuilt for every
            # record.)
            cyclic_bubbles = [
                102838, 102840, 102846, 102850, 52424, 52430, 52708, 52711,
                54914, 54917, 60635, 60638, 60965, 60968, 61857, 61861, 61906,
                61909, 65760, 65762, 67841, 67844, 67858, 67862, 70509, 70513,
                73378, 73380, 83218, 83220, 83224, 83231, 83676, 83678, 86581,
                86586, 92007, 92012, 92467, 92474, 97403, 97405, 99187, 99190
            ]
            if l.snarl.end.node_id in cyclic_bubbles or l.snarl.start.node_id in cyclic_bubbles:
                continue

            if len(l.visits) == 0:
                # No interior node: the branch is the single boundary edge.
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    # Backward snarl: edges are emitted with swapped endpoints.
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i + 1].node_id, l.visits[i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id, l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                # Same snarl as the previous traversal: another branch.
                per_locus.append(path_in_bubble)
            else:
                locus_count = locus_count + 1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            locus_branch_mapping[locus_count] = per_locus

    # Several commented-out loops used to delete hard-coded lists of locus
    # ids from locus_branch_mapping at this point, e.g.
    #for i in [395, 396]:
    #    del locus_branch_mapping[i]
    #print(locus_branch_mapping)
    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)

    # both simple and complex bubbles: key is an edge from
    # locus_branch_mapping and value is a list of triplets
    # [locus, branch, alleles]
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, an edge can map to multiple branches.
    #print(reverse_mapping)
    print(locus_branch_mapping)

    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset = ReadSet()
    count = 0
    duplicated = 0
    #TODO: consider reads with only positive score.
    # Pass 2: walk every alignment edge by edge and record a variant whenever
    # the edges seen so far pin down exactly one branch of a locus.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            # (val1, val2, count1 and count2 are not used below.)
            val1 = True
            val2 = False
            count1 = 0
            count2 = 0
            # Skip alignments whose score per base is below 0.75.
            # NOTE(review): an alignment with an empty sequence would raise
            # ZeroDivisionError here.
            score = g.score / len(g.sequence)
            if score < 0.75:
                continue
            read = Read(g.name, 0, 0, 0)  # create read for each read alignment
            # A commented-out filter on a hard-coded list of read names and a
            # commented-out strand-consistency check used to live here.
            print(g.name)
            prev_tmp = []
            prev_locus = -1
            locus = -1
            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                # The edge between consecutive mappings, in both directions.
                edge1 = tuple((g.path.mapping[i].position.node_id,
                               g.path.mapping[i + 1].position.node_id
                               ))  # go over nodes in a mapping
                edge2 = tuple((g.path.mapping[i + 1].position.node_id,
                               g.path.mapping[i].position.node_id
                               ))  # go over nodes in a mapping
                print("edge")
                print(edge1)
                print(edge2)
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    # One quality entry (10) per allele of the locus.
                    # NOTE(review): the comprehension variable below is also
                    # named i; that is harmless on Python 3 only.
                    if edge1 in reverse_mapping:
                        qualities = [10] * reverse_mapping[edge1][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge1]
                        ]  # consider (locus, branch)
                    else:
                        qualities = [10] * reverse_mapping[edge2][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge2]
                        ]
                    tmp = [x for x in node_inf]
                    # Entering a different locus: restart the candidate set.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp
                        prev_locus = tmp[0][0]
                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    print("I am outside if")
                    # TODO: handle case with prev_tmp =0
                    print("prev_tmp")
                    print(prev_tmp)
                    print("tmp")
                    print(tmp)
                    if len(prev_tmp) > 0 and len(
                            set(tmp).intersection(set(prev_tmp))
                    ) == 1:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        print("I am inside if")
                        # The quality of the identified branch is set to 0.
                        qualities[interset_tmp[0][1]] = 0
                        if i == len(g.path.mapping) - 2:
                            # Last edge of the read.
                            read.add_variant(interset_tmp[0][0],
                                             interset_tmp[0][1], qualities)
                        else:
                            print("i am in else")
                            # Record the variant only if the next edge no
                            # longer belongs to any bubble.
                            next_edge1 = tuple(
                                (g.path.mapping[i + 1].position.node_id,
                                 g.path.mapping[i + 2].position.node_id))
                            next_edge2 = tuple(
                                (g.path.mapping[i + 2].position.node_id,
                                 g.path.mapping[i + 1].position.node_id))
                            if next_edge1 not in reverse_mapping and next_edge2 not in reverse_mapping:
                                read.add_variant(interset_tmp[0][0],
                                                 interset_tmp[0][1], qualities)
                        locus = interset_tmp[0][0]
                #if prev_locus!=locus:
                #    prev_tmp = []
                #else:
                #    for i in tmp:
                #        prev_tmp.append(i)
                #prev_locus = locus
            print(read)
            # Keep only reads that cover at least two variants.
            if len(read) >= 2:
                readset.add(read)
    print("non-shattered")
    print(count)
    #print(readset)

    # Pass 3: drop reads for which read.sort() returns 1.
    readset1 = ReadSet()
    tmp_duplicated = set()
    for read in readset:
        # NOTE(review): a return value of 1 from read.sort() presumably
        # signals duplicated variant positions -- confirm against Read.sort.
        if read.sort() == 1:
            duplicated = duplicated + 1
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            for a in x:
                tmp_duplicated.add(a)
            continue
        else:
            readset1.add(read)
    print("length of duplicated bubbles")
    print(tmp_duplicated)
    print(len(list(tmp_duplicated)))
    readset1.sort()

    # Dump every kept variant as "<read index> <position> <allele> 10".
    print("******")
    for i, read in enumerate(readset1):
        for j, variant in enumerate(read):
            print(
                str(i) + " " + str(variant.position) + " " +
                str(variant.allele) + " " + "10")
    print("******")
    print("duplicated")
    print(duplicated)
    print("reads considered before read-selection")
    print(len(readset1))
    return readset1, alleles_per_pos, locus_branch_mapping
#trans = sys.argv[3] count = 0 locus_count = 0 prev_startsnarl = 0 prev_endsnarl = 0 locus_branch_mapping = OrderedDict() locus_count = 0 prev_startsnarl = 0 prev_startsnarl_orientation = -1 prev_endsnarl = 0 prev_endsnarl_orientation = -1 reads_dict = defaultdict(list) path_in_bubble = [] with stream.open(str(locus_file), "rb") as istream: for data in istream: l = vg_pb2.SnarlTraversal() l.ParseFromString(data) #TODO: make ordered doctionary locus_branch_mapping # handle forward and backward case of nodes current_startsnarl = l.snarl.start.node_id current_startsnarl_orientation = l.snarl.start.backward current_endsnarl = l.snarl.end.node_id current_endsnarl_orientation = l.snarl.end.backward if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl: path_in_bubble.append(l) else: locus_branch_mapping[locus_count] = path_in_bubble
# Filler structures feature_filler = np.ndarray(shape=(batch_size, n_input // 3, 3), dtype=np.float32) label_filler = np.ndarray(shape=(batch_size, n_classes), dtype=np.float32) # Add ops to save and restore all the variables. saver = tf.train.Saver() # Initializing the variables sess = tf.Session() sess.run(tf.global_variables_initializer()) # Read Data print("Reading train data ...") istream = stream.open("output/train.gam", "rb") dataarray = [] for data in istream: dataarray.append(data) istream.close() print("... read %d events" % len(dataarray)) np.random.shuffle(dataarray) np.set_printoptions(linewidth=200) # Train event = neuland.Event() for iteration in range(1, 20, 1): for n, batch in enumerate(chunks(dataarray, batch_size)): feature_filler.fill(0.) label_filler.fill(0.) for m, data in enumerate(batch):
# tandom and interspersed repeat from both aligned pacbio reads and true_haps trans_filename = sys.argv[1] gam_filename = sys.argv[2] true_haps_filename = sys.argv[3] parameter_interspersed = sys.argv[4] out_filename = sys.argv[5] out_file = open(out_filename, 'w') d = {} count = 1 bubbles_start = set() #with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans' ,"rb") as istream: with stream.open(str(trans_filename), "rb") as istream: for data in istream: l = vg_pb2.SnarlTraversal() l.ParseFromString(data) if l.snarl.start.backward == True: start_node = l.snarl.end.node_id else: start_node = l.snarl.start.node_id bubbles_start.add(start_node) multiplicity_bubbles = defaultdict(list) read_details = defaultdict(list) #with stream.open('../out.new.gam', "rb") as istream: with stream.open(str(gam_filename), "rb") as istream: for data in istream: g = vg_pb2.Alignment()
def vg_reader(locus_file, gam_file):
    """
    Map read alignments onto bubbles (loci) and record consecutive items.

    input: sorted locus and sorted GAM file output from vg.
    output: tuple ``(reads_dict, consec_pairs)``:
        * ``reads_dict`` maps ``"<read name>_<query_position>"`` to the list
          of locus ids and plain node ids recorded for that alignment;
        * ``consec_pairs`` is updated with the names of the reads that
          contain each consecutive pair ``"<a>_<b>"`` of recorded items.
    assumptions:
    1. locus file consists of linear ordering of simple bubbles only and
       hence sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. a locus is recorded only when the branch is identified uniquely.

    NOTE(review): ``consec_pairs`` is never assigned in this function; it is
    presumably a module-level ``defaultdict(set)`` -- TODO confirm.
    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost -- confirm against the original file.
    """
    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_startsnarl = 0
    # -1 never equals a bool orientation, so the first traversal always
    # opens a new locus.
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1
    reads_dict = defaultdict(list)
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            # TODO: make ordered dictionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            # The traversal is stored as a list of (node, node) edges.
            path_in_bubble = []
            if len(l.visits) == 0:
                # TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                # TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    # Backward snarl: emit the edges from end to start.
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i + 1].node_id, l.visits[i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id, l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))
            # Consecutive traversals with identical boundary nodes and
            # orientations are branches of the same locus.
            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                per_locus.append(path_in_bubble)
            else:
                locus_count = locus_count + 1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            locus_branch_mapping[locus_count] = per_locus
    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    # (alleles_per_pos is filled here but is not part of the return value.)
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)
    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, a node can map to multiple branches.
    count = 0
    duplicated = 0
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            val1 = True
            val2 = False
            count1 = 0
            count2 = 0
            # Only the disabled filter below would use this value; the
            # division raises ZeroDivisionError when g.sequence is empty.
            score = g.score / len(g.sequence)
            #if score > 0.2:
            #    continue
            read = []  # create read for each read alignment
            prev_tmp = []
            prev_locus = -1
            locus = -1
            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                # Node ids are taken from position.name here, converted to int.
                edge1 = tuple((int(g.path.mapping[i].position.name),
                               int(g.path.mapping[i + 1].position.name)
                               ))  # go over nodes in a mapping
                edge2 = tuple((int(g.path.mapping[i + 1].position.name),
                               int(g.path.mapping[i].position.name)
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    if edge1 in reverse_mapping:
                        qualities = [10] * reverse_mapping[edge1][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge1]
                        ]  # consider (locus, branch)
                    else:
                        qualities = [10] * reverse_mapping[edge2][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge2]
                        ]
                    tmp = [x for x in node_inf]
                    # Entering a different locus resets the candidate set.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp
                        prev_locus = tmp[0][0]
                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    if len(prev_tmp) > 0 and len(
                            set(tmp).intersection(set(prev_tmp))
                    ) == 1:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        qualities[interset_tmp[0][1]] = 0
                        if i == len(g.path.mapping) - 2:
                            # Last edge of the read: record the locus directly.
                            #read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
                            reads_dict[g.name + "_" + str(g.query_position)].append(
                                interset_tmp[0][0])
                            read.append(interset_tmp[0][0])
                        else:
                            # Otherwise record the locus only when the next
                            # edge is not a bubble edge.
                            next_edge1 = tuple(
                                (int(g.path.mapping[i + 1].position.name),
                                 int(g.path.mapping[i + 2].position.name)))
                            next_edge2 = tuple(
                                (int(g.path.mapping[i + 2].position.name),
                                 int(g.path.mapping[i + 1].position.name)))
                            if next_edge1 not in reverse_mapping and next_edge2 not in reverse_mapping:
                                #read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
                                reads_dict[g.name + "_" + str(g.query_position)].append(
                                    interset_tmp[0][0])
                                read.append(interset_tmp[0][0])
                        locus = interset_tmp[0][0]
                else:
                    # Edge outside every multi-branch bubble: record both
                    # node ids as they are.
                    read.append(int(g.path.mapping[i].position.name))
                    read.append(int(g.path.mapping[i + 1].position.name))
                    reads_dict[g.name + "_" + str(g.query_position)].append(
                        int(g.path.mapping[i].position.name))
                    reads_dict[g.name + "_" + str(g.query_position)].append(
                        int(g.path.mapping[i + 1].position.name))
            # for every pair of bubbles or bubble-node
            for k in range(0, len(read) - 1):
                pair1 = str(read[k]) + "_" + str(
                    read[k + 1])  # not taking care of reverse direction now
                pair2 = str(read[k + 1]) + "_" + str(read[k])
                # should take care of direction, not adding pairs reverse of each other
                if pair2 in consec_pairs:
                    consec_pairs[pair2].add(g.name)
                else:
                    consec_pairs[pair1].add(g.name)
    return reads_dict, consec_pairs
# Collect the segment ids and the link adjacency of the assembly graph.
nodes = set()
edge_connections = defaultdict(set)
# Text mode: the records are split on a str tab separator, which fails on
# the bytes lines a binary-mode file yields under Python 3. The context
# manager also closes the handle, which was previously left open.
with open('assembly_graph.P.int.remn2n.X_100.view.gfa', "r") as gfafile:
    for line in gfafile:
        var = line.split('\t')
        if var[0] == 'S':
            nodes.add(int(var[1]))
        if var[0] == 'L':
            edge_connections[int(var[1])].add(int(var[3]))

# For every graph node, the reads mapped to it; for every read, its nodes
# tagged with the strand they were traversed on.
multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
with stream.open('out.new.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        for i, mapping in enumerate(g.path.mapping):
            node = mapping.position.node_id
            if node in nodes:
                multiplicity_bubbles[node].append(g.name)
            # is_reverse is a bool; comparing it with the string 'True' is
            # never equal, so test its truth value to get the '-' strand.
            tmp = '-' if mapping.position.is_reverse else '+'
            node_tmp = str(node) + "_" + str(tmp)
            # NOTE(review): the source indentation was lost; this append is
            # assumed to run for every mapping, not only for nodes present
            # in the GFA -- confirm.
            read_details[g.name].append(node_tmp)

count = 0
repeaticity = defaultdict()
for neutron in rootevent.NeulandPrimaryNeutronInteractionPixels: n = protoevent.neutrons.add() n.x = neutron.GetX() n.y = neutron.GetY() n.z = neutron.GetZ() n.t = neutron.GetT() for digi in rootevent.NeulandDigis: b = protoevent.digis.add() b.id = digi.GetPaddle() b.tl = digi.GetTdcL() b.tr = digi.GetTdcR() b.e = digi.GetE() ostream.write(protoevent) print("Writing to protobuf stream %s" % sys.argv[1]) ostream = stream.open(sys.argv[1], 'wb') for digifilename in sys.argv[2:]: simufilename = digifilename.replace('.digi.', '.sim.') print("Reading ROOT file %s" % digifilename) tfile = ROOT.TFile.Open(digifilename) ttree = tfile.Get("evt") print("... and ROOT file %s" % simufilename) ttree.AddFriend("simtree = evt", simufilename) read_and_append(ttree, ostream) ostream.close()
def vg_reader(locus_file, gam_file):
    """
    Build a ReadSet of bubble alleles from a vg locus file and a GAM file.

    input: sorted locus and sorted GAM file output from vg.
    output: tuple ``(readset1, alleles_per_pos, locus_branch_mapping)``:
        * ``readset1`` -- sorted ReadSet of the reads kept after dropping
          those for which ``read.sort()`` returned 1;
        * ``alleles_per_pos`` -- locus id -> number of branches;
        * ``locus_branch_mapping`` -- locus id -> list of branches, each a
          list of (node, node) edges.
    assumptions:
    1. locus file consists of linear ordering of simple bubbles only and
       hence sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. add variant only when it identifies the branch uniquely.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost -- confirm against the original file, especially
    the placement of the prev_locus bookkeeping and of the prints at the end
    of the alignment loop.
    """
    # create a dictionary of branches for each locus based on locus file.
    locus_branch_mapping=defaultdict()
    locus_count=0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping=defaultdict()
    locus_count=0
    prev_startsnarl = 0
    # -1 never equals a bool orientation, so the first traversal always
    # opens a new locus.
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            path_in_bubble =[]
            # Disabled earlier variants handled only traversals with exactly
            # one visit ("consider only hets"), optionally split by snarl
            # orientation.
            # TODO: fix this properly
            # Store the traversal as edges start -> visits... -> end.
            if len(l.visits) ==0:
                path_in_bubble.append(tuple ((l.snarl.start.node_id,l.snarl.end.node_id)))
            else:
                path_in_bubble.append(tuple ((l.snarl.start.node_id,l.visits[0].node_id)))
                for i in range(0,len(l.visits)-1):
                    path_in_bubble.append(tuple((l.visits[i].node_id, l.visits[i+1].node_id)))
                path_in_bubble.append(tuple ((l.visits[-1].node_id, l.snarl.end.node_id)))
            # Consecutive traversals with identical boundary nodes and
            # orientations are branches of the same locus.
            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                per_locus.append(path_in_bubble)
            else:
                locus_count=locus_count+1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            locus_branch_mapping[locus_count]=per_locus
    #print(locus_branch_mapping)
    print('The number of hets:')
    het_count= 0
    for k,v in locus_branch_mapping.items():
        if len(v) >1:
            het_count = het_count +1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos= defaultdict()
    for k,v in locus_branch_mapping.items():
        alleles_per_pos[k]=len(v)
    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping= defaultdict(list)
    for k,v in locus_branch_mapping.items():
        if len(v) > 1: # more than one branch
            for i,b in enumerate(v):
                if len(b) > 0:
                    for p,j in enumerate(b):
                        reverse_mapping[j].append([k,i, len(v)]) # in complex bubbles, a node can map to multiple branches.
    #print(reverse_mapping)
    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset=ReadSet()
    count =0
    duplicated = 0
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            val1 = True
            val2 = False
            count1 =0
            count2=0
            #score = g.score/len(g.sequence)
            #if score < 0.75:
                #continue
            read=Read(g.name, 0, 0, 0) # create read for each read alignment
            prev_tmp=[]
            prev_locus= -1
            locus = -1
            # val1 stays True only when every mapping is reverse.
            for i in range(0,len(g.path.mapping)):
                if g.path.mapping[i].position.is_reverse != val1:
                    val1 = False
                    break
                else:
                    count1 = count1 +1
            if count1 == len(g.path.mapping):
                count = count+1
                #print(g.name)
            # val2 becomes True as soon as one mapping is reverse.
            for i in range(0,len(g.path.mapping)):
                if g.path.mapping[i].position.is_reverse != val2:
                    val2 = True
                    break
                else:
                    count2 = count2 +1
            if count2 == len(g.path.mapping):
                count = count+1
                #print(g.name)
            # Equal flags mean the read maps entirely forward or entirely
            # reverse; mixed-orientation reads are not processed.
            if val1 ==val2:
                for i in range(0,len(g.path.mapping)-1):
                #for i in g.path.mapping: # go over the mapping in a read
                    # TODO: check for forward or reverse strand, we may not need it for DAG.
                    edge1 = tuple((g.path.mapping[i].position.node_id, g.path.mapping[i+1].position.node_id)) # go over nodes in a mapping
                    edge2 = tuple((g.path.mapping[i+1].position.node_id, g.path.mapping[i].position.node_id)) # go over nodes in a mapping
                    #print(edge)
                    if edge1 in reverse_mapping or edge2 in reverse_mapping: # handle start and sink node.
                        if edge1 in reverse_mapping:
                            qualities = [10]* reverse_mapping[edge1][0][2]
                            node_inf= [tuple(i[0:2]) for i in reverse_mapping[edge1]] # consider (locus, branch)
                        else:
                            qualities = [10]* reverse_mapping[edge2][0][2]
                            node_inf= [tuple(i[0:2]) for i in reverse_mapping[edge2]]
                        tmp = [x for x in node_inf]
                        interset_tmp= list(set(tmp).intersection(set(prev_tmp)))
                        if len(prev_tmp) > 0 and len(set(tmp).intersection(set(prev_tmp)))==1: # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                            # Quality 0 for the supported branch, 10 for the others.
                            qualities[interset_tmp[0][1]] = 0
                            read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
                            locus= interset_tmp[0][0]
                        # Reset the candidate set once a locus was resolved;
                        # otherwise keep accumulating candidates.
                        if prev_locus!=locus:
                            prev_tmp = []
                        else:
                            for i in tmp:
                                prev_tmp.append(i)
                        prev_locus = locus
            print(len(read))
            print(g.name)
            if len(read) >= 2:
                readset.add(read)
    print("non-shattered")
    print(count)
    #print(readset)
    readset1=ReadSet()
    for read in readset:
        # Reads whose sort() returns 1 are counted as duplicated and dropped.
        if read.sort() ==1:
            duplicated = duplicated +1
            continue
        else:
            readset1.add(read)
    readset1.sort()
    print("duplicated")
    print(duplicated)
    print("reads considered before read-selection")
    print(len(readset1))
    return readset1, alleles_per_pos, locus_branch_mapping
def vg_reader(locus_file, gam_file):
    """
    Build ReadSets of bubble alleles, merging paths of nested bubbles.

    input: sorted locus and sorted GAM file output from vg.
    output: tuple ``(readset1, alleles_per_pos, locus_branch_mapping,
    readset)``:
        * ``readset1`` -- sorted ReadSet of the reads for which
          ``read.sort()`` did not return 1 and that have >= 2 variants;
        * ``alleles_per_pos`` -- locus id -> number of branches;
        * ``locus_branch_mapping`` -- locus id -> list of branches, each a
          list of (node, node) edges;
        * ``readset`` -- the ReadSet before that filtering.
    assumptions:
    1. locus file consists of linear ordering of simple bubbles only and
       hence sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. add variant only when it identifies the branch uniquely.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost -- confirm against the original file.
    NOTE(review): a locus is stored in locus_branch_mapping only when the
    next locus starts, so the final per_locus appears never to be stored --
    confirm whether that is intended.
    """
    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    # -1 never equals a bool orientation, so the first traversal is always
    # treated as a new locus.
    prev_startsnarl_orientation = -1
    prev_endsnarl_orientation = -1
    start_end_bubblenods = set()
    # 0: no nested bubble pending; 1: a traversal with a nested snarl was
    # seen; 2: the first merged path for it was added.
    insidebubble = 0
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            # TODO: make ordered dictionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            path_in_bubble = []
            start_end_bubblenods.add(l.snarl.end.node_id)
            start_end_bubblenods.add(l.snarl.start.node_id)
            hasInBubble = False
            if len(l.visits) == 0:
                # TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                # TODO: for now, assumed, all nodes in path are either forward or backward
                # Exactly one of the two boundaries is backward: emit the
                # edges from end to start.
                if (l.snarl.start.backward == True and l.snarl.end.backward != True) or (l.snarl.start.backward != True and l.snarl.end.backward == True):
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    local_path_back = -1
                    for i in range(len(l.visits)):
                        # A visit whose snarl start id is non-zero is treated
                        # as a nested bubble.
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = True
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        # The last visit has no successor edge to add.
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[-1 - i].node_id,
                                   l.visits[-2 - i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    local_path_back = 1
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(len(l.visits)):
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = False
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id, l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))
            if hasInBubble:
                # Keep the outer path; it is merged with the nested
                # traversals that follow.
                tempPath = path_in_bubble.copy()
                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    pass
                else:
                    # per_locus does not exist before the first locus, hence
                    # the NameError guard.
                    try:
                        locus_branch_mapping[locus_count] = per_locus
                    except NameError:
                        pass
                    locus_count += 1
                    per_locus = []
                    trans_raw = []
                # NOTE(review): placement after the if/else is assumed;
                # trans_raw is not read anywhere in this function.
                trans_raw.append(l)
            else:
                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    if insidebubble == 2:
                        # Second nested traversal: merge it and reset the
                        # nested-bubble state.
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                        insidebubble = 0
                        insideBack = False
                        pathBack = False
                    else:
                        per_locus.append(path_in_bubble)
                else:
                    if insidebubble == 1:
                        # First traversal of the nested snarl: merge it into
                        # the saved outer path.
                        insidebubble = 2
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                    else:
                        try:
                            locus_branch_mapping[locus_count] = per_locus
                        except NameError:
                            pass
                        locus_count += 1
                        per_locus = []
                        per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)
    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, a node can map to multiple branches.
    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset = ReadSet()
    count = 0
    duplicated = 0
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            val1 = True
            val2 = False
            count1 = 0
            count2 = 0
            #score = g.score/len(g.sequence)
            #if score > 0.2:
            #    continue
            read = Read(g.name, 0, 0, 0)  # create read for each read alignment
            prev_tmp = []
            prev_locus = -1
            locus = -1
            n_variant = 0
            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                edge1 = tuple((g.path.mapping[i].position.node_id,
                               g.path.mapping[i + 1].position.node_id
                               ))  # go over nodes in a mapping
                edge2 = tuple((g.path.mapping[i + 1].position.node_id,
                               g.path.mapping[i].position.node_id
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    if edge1 in reverse_mapping:
                        qualities = [10] * reverse_mapping[edge1][0][2]
                        #qualitie = 1
                        node_inf = [
                            tuple(i[0:3]) for i in reverse_mapping[edge1]
                        ]  # (locus, branch, number of branches)
                    else:
                        qualities = [10] * reverse_mapping[edge2][0][2]
                        #qualities = 1
                        node_inf = [
                            tuple(i[0:3]) for i in reverse_mapping[edge2]
                        ]
                    tmp = node_inf.copy()
                    # len_in_path counts the consecutive bubble edges seen
                    # for the current locus.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp.copy()
                        prev_locus = tmp[0][0]
                        len_in_path = 1
                    else:
                        len_in_path += 1
                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    if len(interset_tmp) == 1 and interset_tmp[0][
                            2] == len_in_path:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        # Quality 0 for the supported branch, 10 for the others.
                        qualities[interset_tmp[0][1]] = 0
                        #qualities= 1
                        read.add_variant(interset_tmp[0][0],
                                         interset_tmp[0][1], qualities)
                        n_variant += 1
            if len(read) >= 2:
                readset.add(read)
    readset1 = ReadSet()
    tmp_duplicated = set()
    for read in readset:
        # Reads whose sort() returns 1 are counted as duplicated; their
        # repeated variant positions are collected and the read is dropped.
        if read.sort() == 1:
            duplicated = duplicated + 1
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            #print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            for a in x:
                tmp_duplicated.add(a)
            continue
        else:
            if len(read) >= 2:
                readset1.add(read)
    readset1.sort()
    return readset1, alleles_per_pos, locus_branch_mapping, readset
#!/usr/bin/env python import sys, io import packet_pb2 as packet import job_pb2 as job import stream # Example from https://developers.google.com/protocol-buffers/docs/pythontutorial if len(sys.argv) != 2: print "Usage:", sys.argv[0], "jobs.pb.gz" sys.exit(-1) # Open the file and discard the header istream = stream.open(sys.argv[1], "rb") for msg in istream: hdr = packet.PacketHeader() hdr.ParseFromString(msg) print hdr #break for msg in istream: jobinfo = job.JobInfo() jobinfo.ParseFromString(msg) print jobinfo # Close the file istream.close()
import sys import stream import logging import vg_pb2 from collections import Counter from collections import defaultdict import collections from collections import OrderedDict, namedtuple from collections import defaultdict # assumption ... all S' and before L's d = defaultdict(list) count = 1 with stream.open('true_hap1.gam', "rb") as istream: for data in istream: g = vg_pb2.Alignment() g.ParseFromString(data) #if g.name != "S1_SK1_110": # continue tmp = [] for i in range(0, len(g.path.mapping)): node = g.path.mapping[i].position.node_id d[node].append(g.name) #print(tmp) count = 0 max_val = 0 for k, v in d.items(): for item, count in collections.Counter(v).items(): if len(v) > 1:
feature_filler = np.ndarray( shape=(batch_size, 60*50, 3), dtype=np.float32) label_filler = np.ndarray( shape=(batch_size, n_classes), dtype=np.float32) # Add ops to save and restore all the variables. saver = tf.train.Saver() # Initializing the variables sess = tf.Session() sess.run(tf.global_variables_initializer()) # Read Data print("Reading train data ...") istream = stream.open("output/train.gam", "rb") dataarray = [] for data in istream: dataarray.append(data) istream.close() print("... read %d events" % len(dataarray)) np.random.shuffle(dataarray) np.set_printoptions(linewidth=200) # Train event = neuland.Event() for n, batch in enumerate(chunks(dataarray, batch_size)): feature_filler.fill(0.) label_filler.fill(0.) for m, data in enumerate(batch):
from collections import OrderedDict, namedtuple from collections import defaultdict # assumption ... all S' and before L's filename = sys.argv[1] #out = sys.argv[2] d = {} count = 1 with open(filename) as fp: for line in fp: var = line.rstrip() edge = var + "_" + next(fp).rstrip() #print(edge) d[edge] = defaultdict() # periodicity in a read, read support with stream.open('ouralns.SK1.gam', "rb") as istream: for data in istream: g = vg_pb2.Alignment() g.ParseFromString(data) #if g.name != "S1_SK1_110": # continue tmp = [] for i in range(0, len(g.path.mapping) - 1): edge1 = str(g.path.mapping[i].position.node_id) + "_" + str( g.path.mapping[ i + 1].position.node_id) # go over nodes in a mapping edge2 = str(g.path.mapping[i + 1].position.node_id) + "_" + str( g.path.mapping[i].position.node_id ) # go over nodes in a mapping #if edge1 in d or edge2 in d: #print(edge1)
from collections import OrderedDict, namedtuple from collections import defaultdict # assumption ... all S' and before L's #filename = sys.argv[1] #out = sys.argv[2] d={} count=1 nodes = set() with stream.open('canu_new.contigs.gam', "rb") as istream: for data in istream: g = vg_pb2.Alignment() g.ParseFromString(data) for i in range(0,len(g.path.mapping)): node = g.path.mapping[i].position.node_id nodes.add(node) bubbles_start = set() covered_by_canu =set() with stream.open('assembly_graph.P.int.remn2n.X_100.trans' ,"rb") as istream: for data in istream: l = vg_pb2.SnarlTraversal() l.ParseFromString(data) if l.snarl.start.backward == True:
def _reverse_traversal(trav):
    """Return the traversal of the same node in the opposite orientation."""
    return (trav[0], not trav[1])


def _reverse_path(to_reverse):
    """Return ``to_reverse`` walked backwards, with every traversal flipped."""
    return [_reverse_traversal(t) for t in reversed(to_reverse)]


def _read_traversals_after(vg_file):
    """Read the vg graph and return the adjacency between traversals.

    The result maps a (node ID, orientation) pair, where True is reverse
    (leftward) and False is forward (rightward), to the set of
    (node ID, orientation) pairs reached when reading off of that node in
    that orientation. These pairs are called "traversals".
    """
    traversals_after = defaultdict(set)
    with stream.open(str(vg_file), "rb") as istream:
        for data in istream:
            graph = vg_pb2.Graph()
            graph.ParseFromString(data)
            for edge in graph.edge:
                # "from" is a Python keyword, hence getattr.
                from_traversal = (getattr(edge, "from"), edge.from_start)
                to_traversal = (edge.to, edge.to_end)
                # Put the edge in the way it was read
                traversals_after[from_traversal].add(to_traversal)
                # Also store it in the other orientation, so it can be
                # followed backward
                traversals_after[_reverse_traversal(to_traversal)].add(
                    _reverse_traversal(from_traversal))
    return traversals_after


def _dfs_path(traversals_after, start, goal, bubble_nodes):
    """Search a walk from ``start`` to ``goal`` through ``bubble_nodes``.

    Returns a list of traversals with exactly ``len(bubble_nodes)`` entries,
    or ``[]`` when no such walk is found.
    """
    stack = [((start, True), [(start, True)]),
             ((start, False), [(start, False)])]
    # NOTE(review): the bare start node id is added here while every later
    # membership test uses (node, orientation) tuples, so this entry never
    # matches; kept as in the original behaviour.
    visited = set()
    visited.add(start)
    count = 0
    while stack:
        traversal, path = stack.pop()
        for nxt in traversals_after[traversal]:
            # Stop expanding this traversal once 5000 nodes were pushed.
            if count > 5000:
                break
            if nxt[0] in bubble_nodes and nxt not in visited:
                if nxt[0] == goal:
                    # Accept the goal only when all bubble nodes were used.
                    if len(path) == len(bubble_nodes) - 1:
                        return path + [nxt]
                else:
                    count += 1
                    visited.add(nxt)
                    stack.append((nxt, path + [nxt]))
    return []


def _haplotype_bubble_paths(sample_superreads, locus_branch_mapping,
                            traversals_after, haptype):
    """Collect the chosen branch of every phased bubble for one haplotype.

    Returns ``(haplotype_over_bubbles, start_node_to_bubble)``. Both are
    keyed by the first traversal of a bubble path, in either reading
    direction; the values are the path itself and the bubble position.
    """
    haplotype_over_bubbles = {}
    start_node_to_bubble = {}
    for superreads in sample_superreads.values():
        for v1, v2 in zip(*superreads):
            v = v1 if haptype == 0 else v2
            b = locus_branch_mapping[v.position][v.allele]
            # tmp stores the nodes over the haplotype path in a bubble
            tmp = [b[0][0]]
            tmp.extend(edge[-1] for edge in b)

            path = _dfs_path(traversals_after, tmp[0], tmp[-1], tmp)
            if len(path) != len(tmp):
                path = _dfs_path(traversals_after, tmp[-1], tmp[0], tmp)

            # store the haplotype path with start or end as key
            if len(path) == len(tmp):
                backward_start = _reverse_traversal(path[-1])
                haplotype_over_bubbles[path[0]] = path  # from start
                haplotype_over_bubbles[backward_start] = _reverse_path(path)
                start_node_to_bubble[path[0]] = v.position
                start_node_to_bubble[backward_start] = v.position
    return haplotype_over_bubbles, start_node_to_bubble


def _contig_blocks(save_nodes, components, haplotype_over_bubbles,
                   start_node_to_bubble):
    """Split one contig alignment into blocks of traversals.

    ``save_nodes`` is the list of (node ID, is_reverse) traversals of the
    alignment. A new block is started whenever the contig enters a bubble
    that belongs to a different component than the previous bubble. Inside
    bubbles the stored haplotype path replaces the contig's own path.
    """
    contig_nodes = []
    current_block = []
    # What component was the last bubble in, if there was a last bubble
    prev_component = None
    it_val = 0
    already_done = set()
    for i, key in enumerate(save_nodes):
        if i < it_val:
            # Don't do this visit, it's part of a bubble we already did.
            continue
        index1, orientation_canu = key

        # to take care of components, break when the bubble id of previous
        # and current is not equal
        if key in start_node_to_bubble:
            component = components[start_node_to_bubble[key]]
            if prev_component is not None and component != prev_component:
                # We have moved to a new component of bubbles
                contig_nodes.append(current_block)
                current_block = []
                prev_component = component
            elif prev_component is None:
                # Remember the first component
                prev_component = component

        if key not in haplotype_over_bubbles:
            # Outside bubbles, follow the contig itself.
            already_done.add(index1)
            current_block.append((index1, bool(orientation_canu)))
            continue

        bubble_path = haplotype_over_bubbles[key]
        if bubble_path[-1] not in save_nodes:
            continue
        # Put each traversal that appears in the bubble in the block, except
        # for the last one, which will be in the next bubble or in the
        # contig again.
        for traversal in bubble_path[:-1]:
            if traversal[0] not in already_done:
                current_block.append((traversal[0], bool(traversal[1])))
                already_done.add(traversal[0])
        end_index = save_nodes.index(bubble_path[-1])
        if end_index > save_nodes.index(bubble_path[0]):
            # Skip to the last traversal in the bubble; it is also shared
            # by the contig, so the end node is not repeated.
            it_val = end_index
    contig_nodes.append(current_block)  # for the last one.
    return contig_nodes


def generate_haplotigs(sample_superreads, components, node_seq_list,
                       locus_branch_mapping, canu_alignments, vg_file,
                       pred_haplotigs, locus_file):
    """Write the predicted haplotigs of both haplotypes as FASTA records.

    The underlying graph is considered bidirected. Contig alignments are
    broken into blocks at component boundaries; inside bubbles the phased
    haplotype path is used, elsewhere the contig's own path.

    Parameters:
        sample_superreads: mapping whose values are pairs of superreads;
            the two are iterated in lockstep and each variant provides
            ``position`` and ``allele``.
        components: maps a bubble position to its component.
        node_seq_list: indexable by node ID, gives the node sequence.
        locus_branch_mapping: ``[position][allele]`` gives the branch as a
            list of (node, node) edges.
        canu_alignments: path of the GAM file with the contig alignments.
        vg_file: path of the vg graph file.
        pred_haplotigs: path of the output file (overwritten).
        locus_file: only used as part of the record names.

    Records are named ``>seq<j>_<locus_file>_<haptype>`` where ``j`` is the
    block index within one alignment.
    """
    # The context manager guarantees the output is flushed and closed, also
    # when an exception interrupts the run.
    with open(pred_haplotigs, 'w') as pred_haplotigs_file:
        traversals_after = _read_traversals_after(vg_file)

        for haptype in range(2):
            haplotype_over_bubbles, start_node_to_bubble = (
                _haplotype_bubble_paths(sample_superreads,
                                        locus_branch_mapping,
                                        traversals_after, haptype))

            with stream.open(str(canu_alignments), "rb") as istream:
                for data in istream:
                    g = vg_pb2.Alignment()
                    g.ParseFromString(data)
                    save_nodes = [(m.position.node_id, m.position.is_reverse)
                                  for m in g.path.mapping]
                    contig_nodes = _contig_blocks(save_nodes, components,
                                                  haplotype_over_bubbles,
                                                  start_node_to_bubble)

                    # build the contig sequence taking care of reverse
                    # complements for every contig block
                    for j, contig_blocks in enumerate(contig_nodes):
                        contig_nodes_seq = "".join(
                            reverse_complement(str(node_seq_list[node]))
                            if is_reverse else str(node_seq_list[node])
                            for node, is_reverse in contig_blocks)
                        pred_haplotigs_file.write(
                            ">seq" + str(j) + "_" + str(locus_file) + "_" +
                            str(haptype + 1) + "\n")
                        pred_haplotigs_file.write(contig_nodes_seq + '\n')
import collections
import logging
from collections import Counter, OrderedDict, defaultdict, namedtuple

import vg_pb2

# assumption ... all S' and before L's
# NOTE(review): `stream` is used below but no import for it is visible in
# this fragment -- confirm it is imported earlier in the file.

d = {}
count = 1

# Collect one start node id per snarl traversal record.
bubbles_start = set()
with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans',
                 "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        # When the snarl start is flagged backward, the end node id is used
        # as the start instead.
        start_node = (l.snarl.end.node_id
                      if l.snarl.start.backward else l.snarl.start.node_id)
        bubbles_start.add(start_node)

multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
with stream.open('../out.new.chrXIII.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
def reverse_map(locus_file):
    """Group snarl traversals by locus and build node/edge lookups for them.

    ``locus_file`` is opened with ``stream.open`` and every record is parsed
    as a ``vg_pb2.SnarlTraversal``.  Each traversal becomes a list of
    ``(node_id, node_id)`` edges.  Consecutive traversals that share the same
    snarl (start id, end id and both orientations) are collected under one
    locus id.  Locus ids start at 0 and count downwards; because the first
    record never matches the initial sentinel, id 0 ends up holding an empty
    list.

    A traversal containing a visit whose ``snarl.start.node_id`` is non-zero
    is not stored directly.  It is held back and later combined with a
    following traversal through ``mergePath`` (defined elsewhere in this
    file; its exact semantics are not visible here).

    Args:
        locus_file: path of the traversal file; converted with ``str``.

    Returns:
        A 4-tuple ``(reverse_mapping, allele_reverse_mapping,
        alleles_per_pos, locus_branch_mapping)``:

        * ``reverse_mapping``: node id -> set of locus ids whose paths touch
          the node.
        * ``allele_reverse_mapping``: edge -> list of
          ``[locus_id, path_index, len(path), len(bubble)]`` entries.
        * ``alleles_per_pos``: locus id -> number of paths at that locus.
        * ``locus_branch_mapping``: ``OrderedDict`` of locus id -> list of
          paths.
    """
    print('Start to read locus_file')
    locus_count = 0
    per_locus = []
    locus_branch_mapping = OrderedDict()
    # (start id, end id, start backward, end backward) of the previous
    # record.  The -1 orientations can never equal a real boolean flag.
    prev_key = (0, 0, -1, -1)
    # 0: no nested snarl pending, 1: nested traversal seen, 2: merged once.
    insidebubble = 0

    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            start, end = l.snarl.start, l.snarl.end
            snarl_key = (start.node_id, end.node_id,
                         start.backward, end.backward)
            same_snarl = snarl_key == prev_key

            visit_ids = [visit.node_id for visit in l.visits]
            has_nested = False
            if not visit_ids:
                # Direct start/end edge; only the start flag decides its
                # direction.
                if start.backward:
                    chain = [end.node_id, start.node_id]
                else:
                    chain = [start.node_id, end.node_id]
            else:
                # Walk the visits in reverse when exactly one boundary is
                # flagged backward.
                walk_reversed = start.backward != end.backward
                local_path_back = -1 if walk_reversed else 1
                for visit in l.visits:
                    if visit.snarl.start.node_id != 0:
                        # Nested snarl visit: record how it was reached.
                        path_back = walk_reversed
                        inside_back = bool(visit.backward)
                        insidebubble = 1
                        has_nested = True
                if walk_reversed:
                    chain = [end.node_id] + visit_ids[::-1] + [start.node_id]
                else:
                    chain = [start.node_id] + visit_ids + [end.node_id]
            # Consecutive node pairs along the chain are the path's edges.
            path_in_bubble = list(zip(chain, chain[1:]))

            if has_nested:
                # Hold this path back; it is merged with a later traversal.
                nested_path = path_in_bubble
                if not same_snarl:
                    locus_branch_mapping[locus_count] = per_locus
                    locus_count -= 1
                    per_locus = []
            else:
                if same_snarl:
                    if insidebubble == 2:
                        path_in_bubble = mergePath(nested_path, path_in_bubble,
                                                   inside_back, path_back,
                                                   local_path_back)
                        insidebubble = 0
                        inside_back = False
                        path_back = False
                elif insidebubble == 1:
                    insidebubble = 2
                    path_in_bubble = mergePath(nested_path, path_in_bubble,
                                               inside_back, path_back,
                                               local_path_back)
                else:
                    # New snarl: close the current locus and open the next.
                    locus_branch_mapping[locus_count] = per_locus
                    locus_count -= 1
                    per_locus = []
                per_locus.append(path_in_bubble)

            prev_key = snarl_key

    # Store whatever was collected for the last locus.
    locus_branch_mapping[locus_count] = per_locus

    alleles_per_pos = {
        k: len(bubble) for k, bubble in locus_branch_mapping.items()
    }
    het_count = sum(
        1 for bubble in locus_branch_mapping.values() if len(bubble) > 1)
    print('The number of hets:', het_count)

    reverse_mapping = defaultdict(set)
    allele_reverse_mapping = defaultdict(list)
    for k, bubble in locus_branch_mapping.items():
        for i, path in enumerate(bubble):
            for edge in path:
                for node in edge:
                    reverse_mapping[node].add(k)
                allele_reverse_mapping[edge].append(
                    [k, i, len(path), len(bubble)])
    return reverse_mapping, allele_reverse_mapping, alleles_per_pos, locus_branch_mapping
import sys import stream import logging import vg_pb2 from collections import Counter from collections import defaultdict import collections from collections import OrderedDict, namedtuple from collections import defaultdict file_input = sys.argv[1] #file_out = argv[2] out = open(file_input + '.gfa', 'w') nodes_list = set() with stream.open(str(file_input), "rb") as istream: for data in istream: g = vg_pb2.Alignment() g.ParseFromString(data) for i in range(0, len(g.path.mapping) - 1): node1 = g.path.mapping[i].position.node_id node2 = g.path.mapping[i + 1].position.node_id nodes_list.add(node1) nodes_list.add(node2) if g.path.mapping[i].position.is_reverse == True and g.path.mapping[ i + 1].position.is_reverse == True: out.write("L" + "\t" + str(node1) + "\t" + '-' + "\t" + str(node2) + "\t" + '-' + "\t" + "0M" + "\n") if g.path.mapping[ i].position.is_reverse == False and g.path.mapping[ i + 1].position.is_reverse == True:
'C': 'G', 'G': 'C', 'T': 'A', 'a': 'T', 'c': 'G', 'g': 'C', 't': 'A' } return "".join([seq_dict[base] for base in reversed(seq)]) nodes_list = set() dummy_list = ['0'] * 100 orderalignment = defaultdict(list) orderalignment = defaultdict(lambda: [-1] * 100, orderalignment) with stream.open(str(file_input), "rb") as istream: for data in istream: g = vg_pb2.Alignment() g.ParseFromString(data) #read_info = g.name canu_name = g.name canu_chunk_num = int(g.query_position) orderalignment[canu_name].insert(canu_chunk_num, g) new_orderalignment = defaultdict(list) for k, v in orderalignment.items(): new_orderalignment[k] = [x for x in v if x != -1] print('hello') ostream = stream.open(sys.argv[2], 'wb')
import collections
import sys
from collections import Counter, OrderedDict, defaultdict, namedtuple

import stream
import vg_pb2

# assumption ... all S' and before L's

# The output path is the first command-line argument.
out = sys.argv[1]
f = open(out, 'w')

bubble_to_remove = set()

# Start node id (as a string) -> number of alignment mappings that hit it.
bubbles_dict_trans = defaultdict(int)
with stream.open('component54.trans', "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        tmp = str(l.snarl.start.node_id)
        # Register every snarl start node with a zero count.
        bubbles_dict_trans[tmp] = 0

with stream.open('out.new.chrI.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        for i, mapping in enumerate(g.path.mapping):
            node = str(mapping.position.node_id)
            # Only registered start nodes are counted; the membership test
            # keeps the defaultdict from gaining new keys.
            if node in bubbles_dict_trans:
                bubbles_dict_trans[node] += 1