Example #1
def main():
    assert os.path.exists('in.fa')
    assert os.path.exists('in.weights')

    sanity_checks.sanity_check_fasta('in.fa')

    num_size = int(os.popen("grep -c \">\" in.fa").read().strip())

    if num_size <= cc_settings.MAX_SPLIT_IN_SIZE:
        run_Cogent_on_input()
    else:
        dirs = split_files(input_filename='in.fa',
                           split_size=cc_settings.MAX_SPLIT_IN_SIZE)
        run_Cogent_on_split_files(dirs, depth=0)

    # align input to the cogent2 GMAP db so we can use it for evaluation later
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')

    # clean up GMAP db files
    if os.path.exists('cogent') and os.path.isdir('cogent'):
        cleanup_gmap('cogent')
    if os.path.exists('cogent2') and os.path.isdir('cogent2'):
        cleanup_gmap('cogent2')

    # rewrite cogent2.fa, prepending the output prefix to each sequence ID
    with open('cogent2.renamed.fasta', 'w') as f:
        for r in SeqIO.parse(open('cogent2.fa'), 'fasta'):
            f.write(">{0}|{1}\n{2}\n".format(cc_settings.OUTPUT_PREFIX, r.id,
                                             r.seq))
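
The record count above shells out to grep via os.popen. A dependency-free pure-Python equivalent, assuming a well-formed FASTA in which '>' appears only at the start of header lines (count_fasta_records is a hypothetical helper, not part of Cogent):

def count_fasta_records(filename):
    """Count the sequences in a FASTA file by counting header lines."""
    with open(filename) as f:
        return sum(1 for line in f if line.startswith('>'))

num_size = count_fasta_records('in.fa')  # would replace the grep call above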
Example #2
def run_Cogent_on_split_files(split_dirs):
    """
    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT", run Cogent on it

    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        run_Cogent_on_input()
        os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    f = open('combined/in.fa', 'w')
    f2 = open('combined/in.weights', 'w')
    i = 0
    for d in split_dirs:
        for r in SeqIO.parse(open(os.path.join(d, 'cogent2.fa')), 'fasta'):
            f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
            f2.write("fake_input_path{0}\t1\n".format(i))
            i += 1
    f.close()
    f2.close()

    os.chdir('combined')
    run_Cogent_on_input()
    os.chdir('../')

    # now take the output from combined and run LP against it,
    # using the real input this time

    with open('in.trimmed.fa', 'w') as f:
        for r in SeqIO.parse(open('in.fa'), 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    run_gmap()
    post_gmap_processing(
        seqrecs=[r for r in SeqIO.parse(open('in.trimmed.fa'), 'fasta')])
    os.chdir('../')

    # the result we want is now in post_combined/cogent2.fa; do postprocessing on it with the full in.fa

    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')
    #post_gmap_processing()

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
Example #3
def run_Cogent_on_split_files(split_dirs, depth):
    """
    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT", run Cogent on it

    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        if os.path.exists('cogent2.fa'):
            print >> sys.stderr, "skipping {0} because it is already done".format(d)
            os.chdir(olddir)
            continue
        run_Cogent_on_input()
        # clean up cogent in the split dir
        if os.path.exists('cogent') and os.path.isdir('cogent'):
            cleanup_gmap('cogent')
        if os.path.exists('cogent2') and os.path.isdir('cogent2'):
            cleanup_gmap('cogent2')
        os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    f = open('combined/in.fa', 'w')
    f2 = open('combined/in.weights', 'w')
    i = 0
    for d in split_dirs:
        for r in SeqIO.parse(open(os.path.join(d, 'cogent2.fa')), 'fasta'):
            f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
            f2.write("fake_input_path{0}\t1\n".format(i))
            i += 1
    f.close()
    f2.close()

    os.chdir('combined')
    if i > cc_settings.MAX_POST_SPLIT_IN_SIZE and depth < cc_settings.MAX_RECUR_DEPTH:
        dirs = split_files(input_filename='in.fa', split_size=cc_settings.MAX_POST_SPLIT_IN_SIZE)
        run_Cogent_on_split_files(dirs, depth+1)
    run_Cogent_on_input()
    os.chdir('../')

    # now take the output from combined and run LP against it,
    # using the real input this time

    with open('in.trimmed.fa', 'w') as f:
        for r in SeqIO.parse(open('in.fa'), 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    run_gmap()
    post_gmap_processing(seqrecs=[r for r in SeqIO.parse(open('in.trimmed.fa'), 'fasta')])
    os.chdir('../')

    # the result we want is now in post_combined/cogent2.fa, do postprocessing on it with the full in.fa

    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')
    #post_gmap_processing()

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4-time1))
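
split_files is referenced in main() and in the recursive call above but is not among these examples. A hypothetical sketch, assuming it partitions the input FASTA into numbered subdirectories of at most split_size records each, writes the matching in.weights entries alongside, and returns the list of directories created (names and layout are guesses, not necessarily Cogent's actual ones):

import os
from Bio import SeqIO

def split_files(input_filename, split_size):
    # read the weights once so each split gets its matching entries;
    # assumes in.weights sits next to the input file in the current directory
    weights = {}
    with open('in.weights') as f:
        for line in f:
            seqid, w = line.strip().split('\t')
            weights[seqid] = w

    records = list(SeqIO.parse(open(input_filename), 'fasta'))
    dirs = []
    for n, start in enumerate(range(0, len(records), split_size)):
        d = 'split_{0}'.format(n)
        os.makedirs(d)  # assumes the split directory does not already exist
        dirs.append(d)
        fa = open(os.path.join(d, 'in.fa'), 'w')
        wt = open(os.path.join(d, 'in.weights'), 'w')
        for r in records[start:start + split_size]:
            fa.write(">{0}\n{1}\n".format(r.id, r.seq))
            wt.write("{0}\t{1}\n".format(r.id, weights[r.id]))
        fa.close()
        wt.close()
    return dirs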
Example #4
def run_Cogent_on_input():
    """
    The main reconstruction function.

    Homopolymers and repeated nodes in path must be resolved first.
    (however, it's possible the graph contains cycles not manifested in path,
     this is a bug that will result in failure to *explain* the sequences later,
     right now I catch the bug by using the sequence pth itself but this should be fixed eventually)

    Graph reduction is iteratively done until cannot be further reduced

    Two points of failure:
    (1) graph is not reduced to small enough, too many paths, mem explosion
        cur soln: fall back to using own paths
    (2) cycle in graph
        cur soln: fall back to using own paths (still wrong)
    """
    time1 = time.time()
    # first trim all lower-case (soft-masked) bases from the ends of in.fa
    f = open('in.trimmed.fa', 'w')
    for r in SeqIO.parse(open('in.fa'), 'fasta'):
        f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))
    f.close()

    seqweights = {}
    # read in the weights for each sequence
    with open('in.weights') as f:
        for line in f:
            seqid, weight = line.strip().split('\t')
            seqweights[seqid] = int(weight)

    adjusted_kmer = splice_cycle.precycle_kmer_adjustment(cc_settings.KMER_SIZE)
    if adjusted_kmer != cc_settings.KMER_SIZE:
        log.info("Adjusting k-mer size to: {0}".format(adjusted_kmer))
        cc_settings.KMER_SIZE = adjusted_kmer

    # setting up the DiGraph
    G = nx.DiGraph()
    node_d = {None: -1}  # this is just used to initialize the graph, delete it later
    path_d = {}
    reader = SeqIO.parse(open('in.trimmed.fa'), 'fasta')
    seqrecs = []
    for r in reader:
        sp.add_seq_to_graph(G, node_d, path_d, str(r.seq), r.id, seqweights[r.id])
        seqrecs.append(r)
    del node_d[None]
    mermap = dict((v, k) for k, v in node_d.iteritems())

    # resolve all homopolymers
    homo_nodes = filter(lambda n: G.has_edge(n, n), G.nodes_iter())
    for n in homo_nodes:
        sp.untangle_homopolymer_helper(G, path_d, mermap, seqweights, n)

    splice_cycle.detect_and_replace_cycle(G, path_d, seqweights, mermap, max(G.nodes()), cc_settings.KMER_SIZE)

    visited = {}
    sp.reachability(G, mermap, visited, path_d)

    # cycle detection and abort if detected
    # (this should not happen with splice_cycle.detect_and_replace_cycle run)
    for k, v in path_d.iteritems():
        if len(set(v)) != len(v):  # a repeated node means the path revisits a node, i.e. a cycle
            log.info("CYCLE detected through path analysis! Raise CycleDetectedException!")
            os.system("touch CYCLE_DETECTED")
            raise CycleDetectedException

    if cc_settings.NX_CYCLE_DETECTION:
        log.info("Doing nx.cycle_detection....")
        # abort as soon as any cycle is found
        for _ in nx.simple_cycles(G):
            print >> sys.stderr, "CYCLE detected through simple_cycles! Raise CycleDetectedException!"
            os.system("touch CYCLE_DETECTED")
            raise CycleDetectedException

    nx.write_graphml(G, 'in.0.graphml')

    log.info("Initial Graph Size: {0} nodes, {1} edges".format(G.number_of_nodes(), G.number_of_edges()))

    ## sanity check: confirm that all sequences can be reconstructed via the collapsed graph
    ## also check that all nodes are visited
    #for n in G.nodes_iter(): assert n in visited
    #for k,v in path_d.iteritems():
    #    s = sp.stitch_string_from_path(v, mermap)
    #    s2 = seqdict[k].seq.tostring().upper()
    #    assert s.find(s2) >= 0

    while True:
        cur_num_nodes = G.number_of_nodes()
        sp.find_source_bubbles(G, path_d, mermap)
        sp.reachability(G, mermap, {}, path_d)
        sp.find_bubbles(G, path_d, mermap)
        sp.reachability(G, mermap, {}, path_d)
        sp.contract_sinks(G, path_d, mermap)
        sp.find_dangling_sinks(G, path_d, mermap)
        sp.reachability(G, mermap, {}, path_d)
        #assert sanity_check_path_all_valid(path_d, G)
        if G.number_of_nodes() == cur_num_nodes:
            break

    nx.write_graphml(G, 'in.1.graphml')

    log.info("Post-Reduction Graph Size: {0} nodes, {1} edges".format(G.number_of_nodes(), G.number_of_edges()))

    time2 = time.time()

    keys = sorted(path_d.keys())
    good_for, paths = find_minimal_path_needed_to_explain_pathd(G, path_d, keys)
    solve_with_lp_and_reduce(good_for, paths, mermap)

    time3 = time.time()

    run_gmap()
    post_gmap_processing(seqrecs=seqrecs)

    time4 = time.time()

    log.info("[RUNTIME] for graph construction and reduction: {0}".format(time2-time1))
    log.info("[RUNTIME] for path finding and LP solving: {0}".format(time3-time2))
    log.info("[RUNTIME] for GMAP and post-processing: {0}".format(time4-time3))
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4-time1))
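
For reference, the in.weights file parsed at the top of this function is a plain two-column, tab-separated mapping from FASTA record ID to an integer weight, e.g. (IDs are illustrative):

seq1	3
seq2	1
seq3	12

Every record in in.fa needs an entry here; otherwise the seqweights[r.id] lookup in the graph-construction loop raises a KeyError.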