Exemplo n.º 1
0
def reverse_map(locus_file):
    """Group snarl traversals into loci and build node/edge -> locus lookups.

    Every record of ``locus_file`` is parsed as a ``vg_pb2.SnarlTraversal``
    and turned into a path, i.e. a list of ``(node_id, node_id)`` edge
    tuples. Consecutive records whose snarl has the same start/end node ids
    and orientations as the previous record are collected into the same
    locus. Locus keys start at 0 and are *decremented* for each new locus
    (0, -1, -2, ...).

    A traversal containing a visit whose ``snarl.start.node_id`` is non-zero
    (presumably a visit to a nested snarl -- TODO confirm against the vg
    schema) is not stored directly: it is kept in ``tempPath`` and later
    combined with following traversals through ``mergePath`` (defined
    elsewhere in the file).

    Args:
        locus_file: path of the traversal file; opened with
            ``stream.open(str(locus_file), "rb")``.

    Returns:
        A 4-tuple ``(reverse_mapping, allele_reverse_mapping,
        alleles_per_pos, locus_branch_mapping)`` where

        * ``reverse_mapping`` maps a node id to the set of locus keys whose
          paths contain that node,
        * ``allele_reverse_mapping`` maps an edge tuple to a list of
          ``[locus key, path index, path length, paths in locus]`` entries,
        * ``alleles_per_pos`` maps a locus key to its number of paths,
        * ``locus_branch_mapping`` is the ``OrderedDict`` locus key -> list
          of paths.
    """

    print('Start to read locus_file')
    locus_count = 0
    per_locus = []
    #trans_raw = []
    # Identity of the previously read snarl. The orientation sentinels are
    # -1 while the parsed orientations are protobuf booleans, so the very
    # first record never compares equal to the "previous" snarl.
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    prev_startsnarl_orientation = -1
    prev_endsnarl_orientation = -1
    # Merge state: 0 = nothing pending, 1 = a traversal with a nested-snarl
    # visit was just stored in tempPath, 2 = a first merge was performed when
    # the following (different) snarl was read.
    insidebubble = 0
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            path_in_bubble = []
            hasInBubble = False

            if len(l.visits) == 0:
                # No inner visits: the path is the single start/end edge,
                # flipped when the start node is traversed backward.
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                # Exactly one of the two boundary nodes is backward: walk the
                # visits from the end node towards the start node.
                if (l.snarl.start.backward == True and l.snarl.end.backward !=
                        True) or (l.snarl.start.backward != True
                                  and l.snarl.end.backward == True):
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    local_path_back = -1
                    for i in range(len(l.visits)):
                        # NOTE: the nested-snarl check reads l.visits[i]
                        # (front to back) while the edges below are built
                        # from the back of the list (l.visits[-1 - i]).
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = True
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        # The last visit has no successor to pair with.
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[-1 - i].node_id,
                                   l.visits[-2 - i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    # Both boundary nodes share the same orientation flag:
                    # walk the visits from the start node to the end node.
                    local_path_back = 1
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(len(l.visits)):
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = False
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        # The last visit has no successor to pair with.
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id,
                                   l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            if hasInBubble:
                # Keep the path aside for a later mergePath call; it is not
                # appended to per_locus here.
                tempPath = path_in_bubble.copy()

                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    pass
                else:
                    # New snarl: store the locus collected so far and start
                    # an empty one under the next (smaller) key.
                    # NOTE(review): all names used in the try body are bound
                    # at the top of the function, so the NameError handler
                    # appears to be unreachable.
                    try:
                        locus_branch_mapping[locus_count] = per_locus
                    except NameError:
                        pass
                    locus_count -= 1
                    per_locus = []
            else:
                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    # Same snarl as the previous record.
                    if insidebubble == 2:
                        # A merge is in progress: combine with the saved path
                        # once more, then reset the merge state.
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                        insidebubble = 0
                        insideBack = False
                        pathBack = False
                    else:
                        per_locus.append(path_in_bubble)
                else:
                    # Different snarl than the previous record.
                    if insidebubble == 1:
                        # The previous record carried a nested-snarl visit:
                        # merge it with this path and stay in the current
                        # locus.
                        insidebubble = 2
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                    else:
                        # Plain new snarl: store the finished locus and open
                        # a new one that starts with this path.
                        # NOTE(review): the NameError handler appears to be
                        # unreachable here as well.
                        try:
                            locus_branch_mapping[locus_count] = per_locus
                        except NameError:
                            pass
                        locus_count -= 1
                        per_locus = []
                        per_locus.append(path_in_bubble)

            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation

    # Store the locus that was still being collected when the file ended.
    locus_branch_mapping[locus_count] = per_locus
    # Count loci with more than one path and record the path count per locus.
    het_count = 0
    alleles_per_pos = dict()
    for k, bubble in locus_branch_mapping.items():
        alleles_per_pos[k] = len(bubble)
        if len(bubble) > 1:
            het_count = het_count + 1
    print('The number of hets:', het_count)
    reverse_mapping = defaultdict(set)
    allele_reverse_mapping = defaultdict(list)
    for k, bubble in locus_branch_mapping.items():
        # Loci without any path contribute nothing to the lookups.
        if bubble == []:
            continue
        # node id -> set of locus keys containing that node
        for path in bubble:
            for edge in path:
                for node in edge:
                    reverse_mapping[node].add(k)
        # edge -> [locus key, path index, path length, paths in locus]
        for i, path in enumerate(bubble):
            if len(path) > 0:
                for edge in path:
                    allele_reverse_mapping[edge].append(
                        [k, i, len(path), len(bubble)])
    return reverse_mapping, allele_reverse_mapping, alleles_per_pos, locus_branch_mapping
Exemplo n.º 2
0
# Command-line arguments, in order: snarl traversal file, GAM alignment file,
# true haplotypes file, an "interspersed" parameter and the output file name.
# Only the traversal file, GAM file and output file name are used in the part
# of the script shown here.
trans_filename = sys.argv[1]
gam_filename = sys.argv[2]
true_haps_filename = sys.argv[3]
parameter_interspersed = sys.argv[4]
out_filename = sys.argv[5]

# NOTE(review): opened without a context manager; no close() is visible in
# this part of the script.
out_file = open(out_filename, 'w')

d = {}
count = 1

# Collect one "start" node id per snarl traversal: the end node id when the
# snarl's start node is flagged backward, the start node id otherwise.
bubbles_start = set()
# Example input used during development:
#with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans' ,"rb") as istream:
with stream.open(str(trans_filename), "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)

multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
# Example input used during development:
#with stream.open('../out.new.gam', "rb") as istream:
# Parse every record of the GAM file as a vg Alignment message.
with stream.open(str(gam_filename), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        tmp = []
Exemplo n.º 3
0
def _traversal_edges(traversal):
    """Return one snarl traversal as a list of ``(node_id, node_id)`` edges.

    When the snarl's start node is flagged backward the walk is emitted from
    the end node towards the start node; otherwise from start to end. All
    nodes of a path are assumed to share one orientation (limitation carried
    over from the original implementation).
    """
    snarl = traversal.snarl
    start_id = snarl.start.node_id
    end_id = snarl.end.node_id
    backward = snarl.start.backward
    visit_ids = [visit.node_id for visit in traversal.visits]

    if not visit_ids:
        # No inner nodes: the path is the single boundary-to-boundary edge.
        return [(end_id, start_id)] if backward else [(start_id, end_id)]

    if backward:
        edges = [(end_id, visit_ids[-1])]
        # Inner edges keep the visit order but each edge is flipped.
        edges.extend((nxt, cur) for cur, nxt in zip(visit_ids, visit_ids[1:]))
        edges.append((visit_ids[0], start_id))
    else:
        edges = [(start_id, visit_ids[0])]
        edges.extend(zip(visit_ids, visit_ids[1:]))
        edges.append((visit_ids[-1], end_id))
    return edges


def vg_reader(locus_file, gam_file):
    """Convert vg snarl traversals and GAM alignments into per-read walks.

    The locus file is read as a stream of ``vg_pb2.SnarlTraversal`` records.
    Consecutive records describing the same snarl (same start/end node ids
    and orientations) form one locus, numbered from 1, and each record
    contributes one branch (a list of edges) to it. Edges of loci with more
    than one branch are indexed so that alignments can be mapped onto them.

    The GAM file is read as a stream of ``vg_pb2.Alignment`` records. Each
    alignment path is walked edge by edge: an edge that identifies exactly
    one ``(locus, branch)`` pair contributes the locus number once the read
    leaves the indexed edges (or ends); an edge outside every indexed locus
    contributes both of its node ids.

    Assumptions stated by the original author:
      1. The locus file is a sorted, linear ordering of simple bubbles and
         does not contain the start and end vertex.
      2. Every path in a locus is covered by at least one PacBio read.
      3. The GAM file is sorted and restricted to the locus file.
      4. The files consist of DAG connected components only.
      5. A variant is added only when it identifies its branch uniquely.

    Args:
        locus_file: path of the snarl traversal file.
        gam_file: path of the GAM alignment file.

    Returns:
        ``(reads_dict, consec_pairs)``. ``reads_dict`` maps
        ``"<read name>_<query position>"`` to the list of locus numbers and
        node ids seen along that alignment. ``consec_pairs`` maps an
        ``"<a>_<b>"`` string of two consecutive walk items to the set of
        read names containing that pair.
    """
    # ---- 1. group traversals into loci -------------------------------
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_snarl = None  # never equal to a real snarl key
    per_locus = []
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            traversal = vg_pb2.SnarlTraversal()
            traversal.ParseFromString(data)
            snarl = traversal.snarl
            current_snarl = (snarl.start.node_id, snarl.start.backward,
                             snarl.end.node_id, snarl.end.backward)
            if current_snarl != prev_snarl:
                # A different snarl starts a new locus.
                locus_count += 1
                per_locus = []
                locus_branch_mapping[locus_count] = per_locus
            per_locus.append(_traversal_edges(traversal))
            prev_snarl = current_snarl

    print('The number of hets:')
    het_count = sum(
        1 for branches in locus_branch_mapping.values() if len(branches) > 1)
    print(het_count)

    # ---- 2. index edges of multi-branch loci --------------------------
    # edge -> list of [locus, branch index, number of branches]; in complex
    # bubbles one edge can belong to several branches.
    reverse_mapping = defaultdict(list)
    for locus_id, branches in locus_branch_mapping.items():
        if len(branches) > 1:
            for branch_idx, branch in enumerate(branches):
                for edge in branch:
                    reverse_mapping[edge].append(
                        [locus_id, branch_idx, len(branches)])

    # ---- 3. walk every alignment --------------------------------------
    reads_dict = defaultdict(list)
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # The unused per-read score (g.score / len(g.sequence)) is no
            # longer computed: it only served to crash on empty sequences.
            read_key = g.name + "_" + str(g.query_position)
            read = []  # walk of this alignment: locus numbers and node ids

            prev_tmp = []
            prev_locus = -1
            mappings = g.path.mapping
            last_edge_idx = len(mappings) - 2

            for i in range(len(mappings) - 1):
                # TODO: check for forward or reverse strand, we may not need
                # it for DAG.
                src = int(mappings[i].position.name)
                dst = int(mappings[i + 1].position.name)
                edge = (src, dst)
                if edge not in reverse_mapping:
                    edge = (dst, src)  # try the opposite direction

                if edge not in reverse_mapping:
                    # Edge outside every indexed locus: keep both node ids.
                    read.append(src)
                    read.append(dst)
                    reads_dict[read_key].append(src)
                    reads_dict[read_key].append(dst)
                    continue

                # All (locus, branch) pairs this edge belongs to.
                tmp = [tuple(entry[0:2]) for entry in reverse_mapping[edge]]
                if prev_locus != tmp[0][0]:
                    # First edge seen in this locus: remember its candidates.
                    prev_tmp = tmp
                    prev_locus = tmp[0][0]

                # For complicated bubbles (with top-k paths) a combination
                # of edges can determine the branch uniquely.
                interset_tmp = list(set(tmp).intersection(prev_tmp))
                if len(prev_tmp) > 0 and len(interset_tmp) == 1:
                    locus_id = interset_tmp[0][0]
                    if i == last_edge_idx:
                        leaves_locus = True
                    else:
                        after = int(mappings[i + 2].position.name)
                        leaves_locus = (
                            (dst, after) not in reverse_mapping
                            and (after, dst) not in reverse_mapping)
                    # Record the locus once, on the last indexed edge.
                    if leaves_locus:
                        reads_dict[read_key].append(locus_id)
                        read.append(locus_id)

            # NOTE(review): `consec_pairs` is not defined in this function;
            # it is expected to be a module-level mapping that creates an
            # empty set for missing keys (e.g. defaultdict(set)) -- confirm.
            # For every pair of consecutive bubbles / nodes, reuse the
            # reversed key if it already exists so that a pair and its
            # reverse are not stored separately.
            for k in range(len(read) - 1):
                pair1 = str(read[k]) + "_" + str(read[k + 1])
                pair2 = str(read[k + 1]) + "_" + str(read[k])
                if pair2 in consec_pairs:
                    consec_pairs[pair2].add(g.name)
                else:
                    consec_pairs[pair1].add(g.name)

    return reads_dict, consec_pairs