Пример #1
0
 def check(self):
   """Assert the conjugate and length invariants of every vertex and edge.

   The checks run in this order, each over the whole collection before the
   next one starts:

   1. every vertex in ``self.vs`` has a truthy ``conj``;
   2. every edge in ``self.es`` has a truthy ``conj``;
   3. ``len(e.seq) - e.len`` equals ``self.K`` for every edge;
   4. every edge sequence equals ``utils.rc`` of its conjugate's sequence.

   Raises:
     AssertionError: on the first violated invariant.
   """
   # values() rather than the Python 2-only itervalues(): it yields the same
   # elements and is also available on Python 3 mappings.
   for v in self.vs.values():
     assert v.conj, "Some vertex have no conjugate"
   # One snapshot is enough: the three passes below only read the edges.
   edges = list(self.es.values())
   for e in edges:
     assert e.conj, "Some edge have no conjugate"
   for e in edges:
     assert self.K == len(e.seq) - e.len, "Inconsistent K"
   for e in edges:
     assert e.seq == utils.rc(e.conj.seq), (e.seq, utils.rc(e.conj.seq))
Пример #2
0
 def check(self):
     """Assert the conjugate and length invariants of every vertex and edge.

     Four passes are made, each completed before the next one starts:
     vertices of ``self.vs`` must have a truthy ``conj``; edges of
     ``self.es`` must have a truthy ``conj``; ``len(e.seq) - e.len`` must
     equal ``self.K``; and each edge sequence must equal ``utils.rc`` of its
     conjugate's sequence.

     Raises:
       AssertionError: on the first violated invariant.
     """
     # values() replaces the Python 2-only itervalues(); the elements are the
     # same and the call also works on Python 3 mappings.
     for v in self.vs.values():
         assert v.conj, "Some vertex have no conjugate"
     # The edge passes only read, so a single snapshot serves all three.
     edges = list(self.es.values())
     for e in edges:
         assert e.conj, "Some edge have no conjugate"
     for e in edges:
         assert self.K == len(e.seq) - e.len, "Inconsistent K"
     for e in edges:
         assert e.seq == utils.rc(e.conj.seq), (e.seq, utils.rc(e.conj.seq))
Пример #3
0
 def __get_kmers_pos(self, genome, k):
     """Index every k-mer of ``genome`` and its reverse complement by position.

     For the k-mer starting at 0-based offset ``i`` the k-mer itself gets the
     1-based position ``i + 1`` and ``utils.rc`` of it gets the negative
     position ``-(len(genome) - i - k + 1)``.

     Returns:
       dict mapping each k-mer string to its list of signed positions, in
       order of occurrence.  Empty when ``k`` is larger than ``len(genome)``.
     """
     kmers = dict()
     genome_len = len(genome)
     for i in range(genome_len - k + 1):
         kmer = genome[i:i + k]
         # setdefault + append instead of plain assignment: when a k-mer is
         # its own reverse complement both keys are the same entry, and the
         # assignment form overwrote the forward position of the first
         # occurrence with the negative one.
         kmers.setdefault(kmer, []).append(i + 1)
         kmers.setdefault(utils.rc(kmer), []).append(
             -(genome_len - i - k + 1))
     return kmers
Пример #4
0
 def __get_kmers_pos(self, genome, k):
   """Index every k-mer of ``genome`` and its reverse complement by position.

   The k-mer at 0-based offset ``i`` is recorded under the 1-based position
   ``i + 1``; ``utils.rc`` of it is recorded under the negative position
   ``-(len(genome) - i - k + 1)``.

   Returns:
     dict mapping each k-mer string to its list of signed positions, in
     order of occurrence.  Empty when ``k`` is larger than ``len(genome)``.
   """
   kmers = dict()
   genome_len = len(genome)
   for i in range(genome_len - k + 1):
     kmer = genome[i:i + k]
     # setdefault + append instead of plain assignment: a k-mer equal to its
     # own reverse complement shares one entry for both keys, and the
     # assignment form dropped the forward position of its first occurrence.
     kmers.setdefault(kmer, []).append(i + 1)
     kmers.setdefault(utils.rc(kmer), []).append(-(genome_len - i - k + 1))
   return kmers
Пример #5
0
def extract_path(G, path, type="str"):
    """Build the concatenated sequence of the oriented nodes in ``path``.

    Each element of ``path`` is a string: an integer node id followed by one
    orientation character, ``+`` or ``-``.  A ``+`` node contributes
    ``G.node[nid]['seq']`` unchanged; a ``-`` node contributes ``utils.rc``
    of that sequence.

    NOTE(review): in the visible code ``type`` is never read and ``seq`` is
    never returned -- presumably the function continues past this excerpt;
    confirm against the full source.
    """
    logging.debug("Extracting path of length: %d" % len(path))

    seq = ""
    for n in path:
        # Split e.g. "12+" into the numeric id and the trailing orientation.
        nid, o = int(n[:-1]), n[-1:]
        assert (o == '+' or o == '-')

        if o == "+":
            seq += G.node[nid]['seq']
        else:
            # Reverse strand: append the reverse complement instead.
            seq += utils.rc(G.node[nid]['seq'])
Пример #6
0
def comp(G):
    """Reverse-complement the node data of ``G`` and return ``G``.

    Every node sequence is replaced by ``utils.rc`` of itself.  For every
    sample in ``G.graph['paths']`` the sample length is taken as the largest
    ``offset + len(seq)`` over the nodes carrying that sample, and each such
    node's offset is then mirrored to ``length - (offset + len(seq))``.
    Finally ``G.reverse(copy=False)`` is called and its result discarded.

    NOTE(review): whether ``reverse(copy=False)`` flips the edges of ``G``
    itself depends on the graph library version -- confirm.
    """
    for name in G.node:
        attrs = G.node[name]
        attrs['seq'] = utils.rc(attrs['seq'])

    # Sample length = furthest end coordinate of any node on that sample
    # (0 when no node carries the sample).
    genome2length = {}
    for sample in G.graph['paths']:
        ends = [data['offsets'][sample] + len(data['seq'])
                for _, data in G.nodes(data=True)
                if sample in data['offsets']]
        genome2length[sample] = max([0] + ends)

    # Mirror every offset against the length of its sample.
    for sample in G.graph['paths']:
        total = genome2length[sample]
        for name, data in G.nodes(data=True):
            if sample not in data['offsets']:
                continue
            offsets = G.node[name]['offsets']
            offsets[sample] = total - (offsets[sample] + len(data['seq']))

    G.reverse(copy=False)
    return G
Пример #7
0
def comp(G):
    """Reverse-complement the node data of ``G`` and return ``G``.

    Node sequences are replaced by ``utils.rc`` of themselves.  Per sample in
    ``G.graph['paths']`` the sample length is the largest
    ``offset + len(seq)`` among the nodes that carry the sample; each of
    those offsets becomes ``length - (offset + len(seq))``.  The function
    then calls ``G.reverse(copy=False)``, ignoring what it returns.

    NOTE(review): whether ``reverse(copy=False)`` flips the edges of ``G``
    itself depends on the graph library version -- confirm.
    """
    for name in G.node:
        attrs = G.node[name]
        attrs['seq'] = utils.rc(attrs['seq'])

    # Sample length = furthest end coordinate of any node on that sample
    # (0 when no node carries the sample).
    genome2length = {}
    for sample in G.graph['paths']:
        ends = [data['offsets'][sample] + len(data['seq'])
                for _, data in G.nodes(data=True)
                if sample in data['offsets']]
        genome2length[sample] = max([0] + ends)

    # Mirror every offset against the length of its sample.
    for sample in G.graph['paths']:
        total = genome2length[sample]
        for name, data in G.nodes(data=True):
            if sample not in data['offsets']:
                continue
            offsets = G.node[name]['offsets']
            offsets[sample] = total - (offsets[sample] + len(data['seq']))

    G.reverse(copy=False)
    return G
Пример #8
0
 def use_scaffold_paired_info(self, L, additional_prd):
   """Connect pairs of long edges for which ``additional_prd`` has an entry.

   Every ordered pair ``(e1, e2)`` of edges from ``self.es`` whose
   ``length()`` exceeds ``L`` is examined.  The pair is looked up in
   ``additional_prd`` under ``(e12.eid, e21.eid)``, where ``e12`` is the
   second edge of the last rectangle of ``e1`` and ``e21`` the first edge of
   the first rectangle of ``e2``.  When the entry exists, ``e12`` has no
   outgoing and ``e21`` no incoming edges, and ``D - e12.len`` lies strictly
   between 0 and 100, a connecting edge (``K`` bases, ``"NNN"``, ``K``
   bases) and its conjugate are added to ``self.graph``.

   Args:
     L: length threshold; only edges with ``length() > L`` are considered.
     additional_prd: mapping from ``(eid, eid)`` to a sequence whose first
       element is a ``(D, weight, delta)`` triple; only ``D`` is used.

   Returns:
     set of ``(e1.eid, e2.eid)`` pairs that were connected.

   Raises:
     AssertionError: if the connecting sequence built from the conjugate
       rectangles is not ``utils.rc`` of the forward one.
   """
   used_paires = set()
   connect_edges = set()
   count_correct_scaffolds = 0
   count_incorrect_scaffolds = 0
   long_edges = set()
   for edge in self.es.values():
     if edge.length() > L:
       long_edges.add(edge)
   for e1 in long_edges:
     for e2 in long_edges:
       first_rectangle = e1.diagonals[-1].rectangle
       second_rectangle = e2.diagonals[0].rectangle
       e11 = first_rectangle.e1
       e12 = first_rectangle.e2
       e21 = second_rectangle.e1
       e22 = second_rectangle.e2
       pair = (e12.eid, e21.eid)
       if pair not in additional_prd:
         continue
       D, _weight, _delta = additional_prd[pair][0]
       # NOTE(review): the second argument is an edge while the first is a
       # vertex -- confirm this is what is_connected expects.
       if not self.graph.is_connected(e12.v2, e21, 10):
         count_correct_scaffolds += 1
       # Only join a dead end to a dead start.
       if len(e12.v2.out) != 0 or len(e21.v1.inn) != 0:
         continue
       used_paires.add(pair)
       count_incorrect_scaffolds += 1
       gap = D - e12.len
       if not 0 < gap < 100:
         continue
       # Single pre-formatted argument: prints the same text as the former
       # multi-item Python 2 print statement and is valid Python 3 as well.
       print("SHOULD CONNECT %s %s %s \n%s \n%s" % (
           (e1.eid, e2.eid), pair, gap, e12.seq[-55:], e21.seq[:55]))
       connect_edges.add((e1.eid, e2.eid))
       K = self.graph.K
       max_eid = self.graph.max_eid
       self.graph.add_edge(max_eid, e12.v2.vid, e21.v1.vid, K + 3, max_eid + 1)
       self.graph.add_edge(max_eid + 1, e21.conj.v2.vid, e12.conj.v1.vid, K + 3, max_eid)
       seq = e12.seq[-K:] + "NNN" + e21.seq[:K]
       self.graph.add_seq(max_eid, seq)
       self.graph.add_seq(max_eid + 1, utils.rc(seq))
       seq2 = second_rectangle.conj.e2.seq[-K:] + "NNN" + first_rectangle.conj.e1.seq[:K]
       assert seq2 == utils.rc(seq), "\n" + seq2 + "\n" + utils.rc(seq)

       # path_1: distinct second edges of e1's diagonals, starting at the
       # first diagonal whose second edge equals e11, then the new edge.
       path_1 = []
       used = set()
       begin_path = False
       for diag in e1.diagonals:
         if e11 == diag.rectangle.e2:
           begin_path = True
         if begin_path and diag.rectangle.e2 not in used:
           path_1.append(diag.rectangle.e2)
           used.add(diag.rectangle.e2)
       path_1.append(self.graph.es[max_eid])
       last_diag = e1.diagonals[-1]
       if last_diag.rectangle.e2.len <= last_diag.offsetc:
         path_1 = path_1[1:]
         start_offset = 0
       else:
         start_offset = last_diag.offsetc

       # path_2: the new edge, then distinct first edges of e2's diagonals
       # up to (not including) the first one equal to e22.
       path_2 = [self.graph.es[max_eid], e2.diagonals[0].rectangle.e1]
       used = set()
       for diag in e2.diagonals:
         if e22 == diag.rectangle.e1:
           break
         if diag.rectangle.e1 not in used:
           # Was "diag.rectange.e1", which raised AttributeError.
           path_2.append(diag.rectangle.e1)
           used.add(diag.rectangle.e1)
       print("path1 %s path2 %s" % ([e.eid for e in path_1], [e.eid for e in path_2]))
       # start_offset is only consumed by the call below, which is disabled.
       #self.add_rectangles_by_path(path_1, path_2, start_offset)

   self.test_utils.logger.info("count_correct_scaffolds " + str(count_correct_scaffolds) + " " + str(count_incorrect_scaffolds) + " " + str(len(used_paires)) + "\n")
   return connect_edges
Пример #9
0
    def use_scaffold_paired_info(self, L, additional_prd):
        """Connect pairs of long edges for which ``additional_prd`` has an entry.

        Every ordered pair ``(e1, e2)`` of edges from ``self.es`` whose
        ``length()`` exceeds ``L`` is examined.  The pair is looked up in
        ``additional_prd`` under ``(e12.eid, e21.eid)``, where ``e12`` is the
        second edge of the last rectangle of ``e1`` and ``e21`` the first
        edge of the first rectangle of ``e2``.  When the entry exists,
        ``e12`` has no outgoing and ``e21`` no incoming edges, and
        ``D - e12.len`` lies strictly between 0 and 100, a connecting edge
        (``K`` bases, ``"NNN"``, ``K`` bases) and its conjugate are added to
        ``self.graph``.

        Args:
          L: length threshold; only edges with ``length() > L`` are used.
          additional_prd: mapping from ``(eid, eid)`` to a sequence whose
            first element is a ``(D, weight, delta)`` triple; only ``D`` is
            used.

        Returns:
          set of ``(e1.eid, e2.eid)`` pairs that were connected.
        """
        used_paires = set()
        connect_edges = set()
        count_correct_scaffolds = 0
        count_incorrect_scaffolds = 0
        long_edges = set()
        for edge in self.es.values():
            if edge.length() > L:
                long_edges.add(edge)
        for e1 in long_edges:
            for e2 in long_edges:
                first_rectangle = e1.diagonals[-1].rectangle
                second_rectangle = e2.diagonals[0].rectangle
                e11 = first_rectangle.e1
                e12 = first_rectangle.e2
                e21 = second_rectangle.e1
                e22 = second_rectangle.e2
                pair = (e12.eid, e21.eid)
                if pair not in additional_prd:
                    continue
                D, _weight, _delta = additional_prd[pair][0]
                # NOTE(review): the second argument is an edge while the
                # first is a vertex -- confirm is_connected expects that.
                if not self.graph.is_connected(e12.v2, e21, 10):
                    count_correct_scaffolds += 1
                # Only join a dead end to a dead start.
                if len(e12.v2.out) != 0 or len(e21.v1.inn) != 0:
                    continue
                used_paires.add(pair)
                count_incorrect_scaffolds += 1
                if not 0 < D - e12.len < 100:
                    continue
                # A bare tuple expression (leftover of a removed print) used
                # to sit here; it had no effect and was dropped.
                connect_edges.add((e1.eid, e2.eid))
                K = self.graph.K
                max_eid = self.graph.max_eid
                self.graph.add_edge(max_eid, e12.v2.vid, e21.v1.vid, K + 3,
                                    max_eid + 1)
                self.graph.add_edge(max_eid + 1, e21.conj.v2.vid,
                                    e12.conj.v1.vid, K + 3, max_eid)
                seq = e12.seq[-K:] + "NNN" + e21.seq[:K]
                self.graph.add_seq(max_eid, seq)
                self.graph.add_seq(max_eid + 1, utils.rc(seq))

                # NOTE(review): path_1, path_2 and start_offset are built but
                # not consumed anywhere in this method -- confirm whether a
                # follow-up call is missing.
                path_1 = []
                used = set()
                begin_path = False
                for diag in e1.diagonals:
                    if e11 == diag.rectangle.e2:
                        begin_path = True
                    if begin_path and diag.rectangle.e2 not in used:
                        path_1.append(diag.rectangle.e2)
                        used.add(diag.rectangle.e2)
                path_1.append(self.graph.es[max_eid])
                last_diag = e1.diagonals[-1]
                if last_diag.rectangle.e2.len <= last_diag.offsetc:
                    path_1 = path_1[1:]
                    start_offset = 0
                else:
                    start_offset = last_diag.offsetc

                path_2 = [self.graph.es[max_eid],
                          e2.diagonals[0].rectangle.e1]
                used = set()
                for diag in e2.diagonals:
                    if e22 == diag.rectangle.e1:
                        break
                    if diag.rectangle.e1 not in used:
                        path_2.append(diag.rectangle.e1)
                        used.add(diag.rectangle.e1)

        self.test_utils.logger.info("count_correct_scaffolds " +
                                    str(count_correct_scaffolds) + " " +
                                    str(count_incorrect_scaffolds) + " " +
                                    str(len(used_paires)) + "\n")
        return connect_edges
Пример #10
0
def extract(G,sample):
    """Return the sequence spelled by path ``sample`` through graph ``G``.

    Three cases are handled:

    * ``sample == "_longest_"``: every edge gets a ``weight`` equal to the
      number of non-``N`` characters of its target node's sequence (0 when
      the node has no ``seq``), and the sequences of the nodes returned by
      ``dag_longest_path_custom`` are concatenated.
    * ``sample`` missing from ``G.graph['path2id']``: the problem is logged
      and the process exits with status 1.
    * otherwise: the edges whose ``paths`` contain the sample id are
      topologically sorted and the node sequences concatenated, each one
      reverse-complemented when its orientation is not ``+`` and trimmed at
      the front by the overlap encoded in the incoming edge's ``cigar``.
      When no edge carries the sample, the sequence of the first node whose
      ``offsets`` contain the sample id is returned (``""`` if none does).

    Raises:
      SystemExit: when ``sample`` is unknown.
      AssertionError: when a cigar overlap is longer than the node sequence.
    """
    logging.info("Extracting path: %s from graph (%s) of size: (%d,%d)"%(sample,type(G),G.number_of_nodes(),G.number_of_edges()))

    if sample == "_longest_":
        #shortcut to extract the "longest" path in terms of sequence

        def informative_length(node):
            # Sequence length without the N characters.
            if 'seq' not in G.node[node]:
                return 0
            return len(G.node[node]['seq'])-G.node[node]['seq'].count("N")

        if isinstance(G, nx.MultiDiGraph):
            # NOTE(review): the result of this call was never used; the call
            # itself is kept in case it has side effects -- confirm.
            utils.MultiGraphToDiGraph(G)
            for v,t,k in G.edges:
                G[v][t][k]['weight']=informative_length(t)
        else:
            for v,t in G.edges:
                G[v][t]['weight']=informative_length(t)

        return "".join(G.node[n]['seq'] for n in dag_longest_path_custom(G, weight='weight'))

    if sample not in G.graph['path2id']:
        logging.fatal("Unknown path: %s, graph contains: %s"%(sample, G.graph['path2id'].keys()))
        sys.exit(1)

    sid=G.graph['path2id'][sample]

    sg=[(n1,n2,d) for n1,n2,d in G.edges(data=True) if sid in d['paths']]

    if not sg: #has to be a single node
        for n in G:
            if sid in G.node[n]['offsets']:
                return G.node[n]['seq']
        return ""

    #G can be a MultiDiGraph, but subgraph should be single edge!
    subgraph=nx.DiGraph(sg)
    path=list(nx.topological_sort(subgraph))

    if isinstance(G, nx.MultiDiGraph):
        inito=G[path[0]][path[1]][0]['ofrom']
    else:
        inito=G[path[0]][path[1]]['ofrom']

    parts=[]
    pnode=None

    for node in path:
        offset=0
        if pnode is None:
            o=inito
        else:
            edge=subgraph[pnode][node]
            o=edge['oto']
            if 'cigar' in edge and edge['cigar']!='0M':
                #determine offset within the segment to allow for overlapping segments
                for l,t in re.findall(r'(\d+)(\w)', edge['cigar']):
                    if t in ('M','I','S','P'): #source of the edge (pnode) is considered the reference
                        offset+=int(l)

        if o=="+":
            s=G.node[node]['seq']
        else:
            s=utils.rc(G.node[node]['seq'])

        assert(len(s)>=offset)

        # Collect the pieces and join once instead of growing a string.
        parts.append(s[offset:])
        pnode=node

    return "".join(parts)
Пример #11
0
 def make_graph(self, genome, k):
   """Build the vertices and edges of the graph from the k-mers of ``genome``.

   ``self.K`` is set to ``k`` and the k-mer position index is obtained from
   ``self.__get_kmers_pos``.  Each k-mer not yet marked as visited seeds an
   edge: it is extended with ``extend_forward`` and ``extend_backward`` until
   they return ``None``, and every k-mer met during extension is marked as
   visited together with its reverse complement.  The two end (k-1)-mers
   become vertices (created in conjugate pairs via ``add_vertex``), and the
   edge is registered with ``add_edge`` / ``add_seq`` together with its
   conjugate -- or as its own conjugate when both its endpoints and its
   sequence coincide with their reverse complements.  ``self.etalon_dist``
   receives the indexed positions of the first k-mer of each edge.
   """
   self.K = k
   kmers = self.__get_kmers_pos(genome, k)
   visit = set()
   vid = 0
   eid = 0
   edges = set()
   verts = dict()
   for key in kmers:
     if key in visit:
       continue

     # Grow to the right: one new character per forward extension.
     suffix = [key[-1]]
     end_vertex = key[1:]
     while True:
       next_kmer = extend_forward(end_vertex, kmers)
       if next_kmer is None:
         break
       suffix.append(next_kmer[-1])
       end_vertex = next_kmer[1:]
       visit.add(next_kmer)
       visit.add(utils.rc(next_kmer))

     # Grow to the left.  Characters are appended and reversed once at the
     # end instead of being inserted at index 0 on every step.
     prefix_reversed = []
     begin_vertex = key[:-1]
     while True:
       next_kmer = extend_backward(begin_vertex, kmers)
       if next_kmer is None:
         break
       prefix_reversed.append(next_kmer[-1])
       begin_vertex = next_kmer[0:-1]
       visit.add(next_kmer)
       visit.add(utils.rc(next_kmer))

     body = begin_vertex + ''.join(reversed(prefix_reversed)) + ''.join(suffix)
     rc_begin = utils.rc(begin_vertex)
     rc_end = utils.rc(end_vertex)

     # Vertices always come in conjugate pairs with consecutive ids.
     if begin_vertex not in verts:
       begin_ref = self.add_vertex(vid, vid + 1)
       r_end_ref = self.add_vertex(vid + 1, vid)
       verts[begin_vertex] = begin_ref.vid
       verts[rc_begin] = r_end_ref.vid
       vid += 2
     if end_vertex not in verts:
       end_ref = self.add_vertex(vid, vid + 1)
       r_begin_ref = self.add_vertex(vid + 1, vid)
       verts[end_vertex] = end_ref.vid
       verts[rc_end] = r_begin_ref.vid
       vid += 2

     bv = verts[begin_vertex]
     ev = verts[end_vertex]
     rbv = verts[rc_end]
     rev = verts[rc_begin]
     if (bv, ev) not in edges:
       edge_len = len(body) - k + 1
       rc_body = utils.rc(body)
       if (bv, ev) == (rbv, rev) and body == rc_body:
         # Self-conjugate edge: a single edge that is its own conjugate.
         self.add_edge(eid, bv, ev, edge_len, eid)
         edges.add((bv, ev))
         self.add_seq(eid, body)
         self.etalon_dist[eid] = kmers[body[:k]] + kmers[rc_body[:k]]
         eid += 1
       else:
         self.add_edge(eid, bv, ev, edge_len, eid + 1)
         self.add_edge(eid + 1, rbv, rev, edge_len, eid)
         edges.add((bv, ev))
         edges.add((rbv, rev))
         self.add_seq(eid, body)
         self.add_seq(eid + 1, rc_body)
         self.etalon_dist[eid] = kmers[body[:k]]
         self.etalon_dist[eid + 1] = kmers[rc_body[:k]]
         eid += 2
Пример #12
0
def check(reference, bgraph, K, log, test_util):
    """Compare the diagonals of ``bgraph`` with a reference genome and log it.

    The reference is read from the FASTA-like file ``reference`` (one header
    line starting with ``>``, sequence lines joined after stripping).  Every
    15-mer of the genome and of its reverse complement is indexed by
    position.  For each diagonal of each element of ``bgraph.bes`` the two
    sequence fragments it spans are located through that index; the diagonal
    counts as true when some pair of located positions is within 5 of
    ``diag.d`` apart.  Per-diagonal details and summary counts (all true /
    partly true / false, both for this test and for
    ``test_util.is_true_diagonal``) are written to ``log``, followed by the
    number of gaps between consecutive covered intervals.

    Args:
      reference: path of the reference file.
      bgraph: object whose ``bes`` elements expose ``diagonals``.
      K: overwritten with 15 on entry, see the note below.
      log: object with a ``write`` method.
      test_util: object providing ``is_true_diagonal(diag)``.

    Raises:
      AssertionError: when the first line does not start with ``>``.
    """
    # NOTE(review): the K argument is overwritten, so the caller's value is
    # ignored -- confirm this is intended.
    K = 15

    max_mismatch = 2 # for match()

    # read reference genome; the with-block closes the file even when the
    # header assertion fails
    with open(reference) as ref:
        header = ref.readline()
        assert header[0] == '>'
        genome = ''.join(line.strip() for line in ref)
    rcgenome = utils.rc(genome)

    def get_index(genome, not_rc):
        """Map each K-mer of the circularised genome to its set of positions.

        Positions are ``i + 1`` when ``not_rc`` is true and
        ``-(len(genome) - i - K + 1)`` otherwise.
        """
        # Append the first K characters so that K-mers wrap around the end.
        s = genome + genome[:K]
        d = {}
        # enumerate instead of the Python 2-only xrange: still lazy there.
        for i, _ in enumerate(genome):
            kmer = s[i:i + K]
            if not_rc:
                position = i + 1
            else:
                position = -(len(genome) - i - K + 1)
            d.setdefault(kmer, set()).add(position)
        return d

    igenome = get_index(genome, True)
    ircgenome = get_index(rcgenome, False)

    def find_index_begin(seq):
        """Return every indexed position at which ``seq`` could start."""
        ref_indexs = set()
        for i in range(len(seq) - K + 1):
            kmer = seq[i:i + K]
            for index in (igenome, ircgenome):
                for ind in index.get(kmer, ()):
                    # Shift the K-mer position back to the start of seq.
                    ref_indexs.add(ind - i)
        return ref_indexs

    count_true_be = 0
    count_true_etalon_dist_be = 0
    count_part_true_be = 0
    count_part_true_etalon_dist_be = 0
    count_be = 0
    covered = []
    for be in bgraph.bes:
        count_be += 1
        is_true_be = True
        is_part_true = False
        is_true_etalon_dist = True
        is_part_true_etalon_dist = False
        for diag in be.diagonals:
            is_true_diag = test_util.is_true_diagonal(diag)
            if is_true_diag:
                is_part_true_etalon_dist = True
            else:
                is_true_etalon_dist = False
            seq1 = diag.rectangle.e1.seq[diag.offseta:diag.offsetc + 55]
            seq2 = diag.rectangle.e2.seq[diag.offsetb:diag.offsetd + 55]
            indexs1 = find_index_begin(seq1)
            indexs2 = find_index_begin(seq2)
            is_true = False
            log.write(str(indexs1) + "\n")
            log.write(str(indexs2) + "\n")
            log.write(str(diag.d) + "\n")
            for i1 in indexs1:
                for i2 in indexs2:
                    if diag.d - 5 <= abs(i1 - i2) <= diag.d + 5:
                        is_true = True
                        covered.append((i1, i1 + len(seq1)))
            if is_true:
                is_part_true = True
            else:
                is_true_be = False
            log.write("diag is true " + str(is_true) + "\n")
        if is_part_true and not is_true_be:
            count_part_true_be += 1
        if is_true_be:
            count_true_be += 1
        if is_part_true_etalon_dist and not is_true_etalon_dist:
            count_part_true_etalon_dist_be += 1
        if is_true_etalon_dist:
            count_true_etalon_dist_be += 1

    log.write("true " + str(count_true_be) + " part_true " + str(count_part_true_be) + " false " + str(
        count_be - count_true_be - count_part_true_be) + "\n")
    log.write("etalon true " + str(count_true_etalon_dist_be) + " part_true " + str(
        count_part_true_etalon_dist_be) + " false " + str(
        count_be - count_true_etalon_dist_be - count_part_true_etalon_dist_be) + "\n")

    covered.sort()
    missed = 0
    # Guard: with no matching diagonal at all, covered is empty and indexing
    # covered[0] used to raise IndexError.
    if covered:
        end = covered[0][1]
        for cov in covered:
            if cov[0] > end:
                missed += 1
            # NOTE(review): end follows the latest interval, not the furthest
            # end seen so far; nested intervals may be counted as gaps.
            end = cov[1]
    log.write("missing" + str(missed) + "\n")

    """def match(genome, seq, pos):
Пример #13
0
def check(reference, bgraph, K, log, test_util):
    """Compare the diagonals of ``bgraph`` with a reference genome and log it.

    The reference is read from the FASTA-like file ``reference`` (one header
    line starting with ``>``, sequence lines joined after stripping).  Every
    15-mer of the genome and of its reverse complement is indexed by
    position.  For each diagonal of each element of ``bgraph.bes`` the two
    sequence fragments it spans are located through that index; the diagonal
    counts as true when some pair of located positions is within 5 of
    ``diag.d`` apart.  Per-diagonal details and summary counts (all true /
    partly true / false, both for this test and for
    ``test_util.is_true_diagonal``) are written to ``log``, followed by the
    number of gaps between consecutive covered intervals.

    Args:
      reference: path of the reference file.
      bgraph: object whose ``bes`` elements expose ``diagonals``.
      K: overwritten with 15 on entry, see the note below.
      log: object with a ``write`` method.
      test_util: object providing ``is_true_diagonal(diag)``.

    Raises:
      AssertionError: when the first line does not start with ``>``.
    """
    # NOTE(review): the K argument is overwritten, so the caller's value is
    # ignored -- confirm this is intended.
    K = 15

    max_mismatch = 2  # for match()

    # read reference genome; the with-block closes the file even when the
    # header assertion fails
    with open(reference) as ref:
        header = ref.readline()
        assert header[0] == '>'
        genome = ''.join(line.strip() for line in ref)
    rcgenome = utils.rc(genome)

    def get_index(genome, not_rc):
        """Map each K-mer of the circularised genome to its set of positions.

        Positions are ``i + 1`` when ``not_rc`` is true and
        ``-(len(genome) - i - K + 1)`` otherwise.
        """
        # Append the first K characters so that K-mers wrap around the end.
        s = genome + genome[:K]
        d = {}
        # enumerate instead of the Python 2-only xrange: still lazy there.
        for i, _ in enumerate(genome):
            kmer = s[i:i + K]
            if not_rc:
                position = i + 1
            else:
                position = -(len(genome) - i - K + 1)
            d.setdefault(kmer, set()).add(position)
        return d

    igenome = get_index(genome, True)
    ircgenome = get_index(rcgenome, False)

    def find_index_begin(seq):
        """Return every indexed position at which ``seq`` could start."""
        ref_indexs = set()
        for i in range(len(seq) - K + 1):
            kmer = seq[i:i + K]
            for index in (igenome, ircgenome):
                for ind in index.get(kmer, ()):
                    # Shift the K-mer position back to the start of seq.
                    ref_indexs.add(ind - i)
        return ref_indexs

    count_true_be = 0
    count_true_etalon_dist_be = 0
    count_part_true_be = 0
    count_part_true_etalon_dist_be = 0
    count_be = 0
    covered = []
    for be in bgraph.bes:
        count_be += 1
        is_true_be = True
        is_part_true = False
        is_true_etalon_dist = True
        is_part_true_etalon_dist = False
        for diag in be.diagonals:
            is_true_diag = test_util.is_true_diagonal(diag)
            if is_true_diag:
                is_part_true_etalon_dist = True
            else:
                is_true_etalon_dist = False
            seq1 = diag.rectangle.e1.seq[diag.offseta:diag.offsetc + 55]
            seq2 = diag.rectangle.e2.seq[diag.offsetb:diag.offsetd + 55]
            indexs1 = find_index_begin(seq1)
            indexs2 = find_index_begin(seq2)
            is_true = False
            log.write(str(indexs1) + "\n")
            log.write(str(indexs2) + "\n")
            log.write(str(diag.d) + "\n")
            for i1 in indexs1:
                for i2 in indexs2:
                    if diag.d - 5 <= abs(i1 - i2) <= diag.d + 5:
                        is_true = True
                        covered.append((i1, i1 + len(seq1)))
            if is_true:
                is_part_true = True
            else:
                is_true_be = False
            log.write("diag is true " + str(is_true) + "\n")
        if is_part_true and not is_true_be:
            count_part_true_be += 1
        if is_true_be:
            count_true_be += 1
        if is_part_true_etalon_dist and not is_true_etalon_dist:
            count_part_true_etalon_dist_be += 1
        if is_true_etalon_dist:
            count_true_etalon_dist_be += 1

    log.write("true " + str(count_true_be) + " part_true " +
              str(count_part_true_be) + " false " +
              str(count_be - count_true_be - count_part_true_be) + "\n")
    log.write("etalon true " + str(count_true_etalon_dist_be) +
              " part_true " + str(count_part_true_etalon_dist_be) +
              " false " + str(count_be - count_true_etalon_dist_be -
                              count_part_true_etalon_dist_be) + "\n")

    covered.sort()
    missed = 0
    # Guard: with no matching diagonal at all, covered is empty and indexing
    # covered[0] used to raise IndexError.
    if covered:
        end = covered[0][1]
        for cov in covered:
            if cov[0] > end:
                missed += 1
            # NOTE(review): end follows the latest interval, not the furthest
            # end seen so far; nested intervals may be counted as gaps.
            end = cov[1]
    log.write("missing" + str(missed) + "\n")
    """def match(genome, seq, pos):