Пример #1
0
def main():
    """Command-line entry point: compute a path over alignments and write it.

    Usage: [-h] [-l OVERLAP_LEN] FASTG_FILE BLAST_RESULT_FILE OUTPUT_FILE

    Options:
        -h  Print the help message and exit.
        -l  Overlap length (int) passed to ``fastg_file.build_assembly_graph``;
            ``None`` is passed when the option is absent.

    Steps: build the assembly graph from FASTG_FILE, read alignments from
    BLAST_RESULT_FILE with ``read_file`` keeping only those whose ``is_valid``
    and ``is_forward`` attributes are truthy, connect them to the graph nodes,
    sort them by ``start``, compute a path with ``Alignment.get_path`` and
    write it with ``Alignment.write_path_to_dot_file``.

    Exits with status 2 (after printing the help message) when the options
    cannot be parsed or the number of positional arguments is not three.
    """
    try:
        options, args = getopt.getopt(sys.argv[1:], 'hl:')
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: report it instead of
        # letting the traceback reach the user.
        print(err, file=sys.stderr)
        print_help_message()
        sys.exit(2)

    overlap_len = None
    for option, value in options:
        if option == '-h':
            print_help_message()
            sys.exit()
        if option == '-l':
            overlap_len = int(value)

    # Exactly three positional arguments are required; unpacking anything
    # else would raise an opaque ValueError.
    if len(args) != 3:
        print_help_message()
        sys.exit(2)
    fastg_file_name, blast_result_file, output_file = args

    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap_len)
    alignments = [
        alignment for alignment in read_file(blast_result_file)
        if alignment.is_valid and alignment.is_forward
    ]
    Alignment.add_connection(alignments, nodes)
    alignments.sort(key=lambda x: x.start)
    values, actions = Alignment.get_path(alignments)
    Alignment.write_path_to_dot_file(actions, values, output_file)
Пример #2
0
def _test_read_file():
    """Manual smoke test for ``read_file``.

    Builds the assembly graph from 'assembly_graph.fastg' (overlap 127),
    reads the site graph from 'assembly_graph.siteGraph' against those nodes,
    then prints every site's id followed by the first two fields of each of
    its children entries.
    """
    import fastg_file
    graph_path = 'assembly_graph.fastg'
    site_graph_path = 'assembly_graph.siteGraph'
    assembly_nodes = fastg_file.build_assembly_graph(graph_path, 127)
    loaded_sites = read_file(site_graph_path, assembly_nodes)
    for current in loaded_sites.values():
        print(current.id)
        leading_fields = [(child[0], child[1]) for child in current.children]
        print(leading_fields)
def main():
    """Command-line entry point: build a site graph and write it to a file.

    Options:
        -i FILE  Input file passed to ``fastg_file.build_assembly_graph``.
        -o FILE  Output file passed to ``site_graph.write_file``.
        -l INT   Overlap length for the assembly graph (default ``None``).
        -k INT   ``shift`` value passed to ``build_site_graph`` (default 0).
        -m INT   Max interval length; in this function it is only recorded
                 in the output header.
        -s       Simplify the site graph with
                 ``site_graph.simplify_site_graph`` before writing.
        -h       Print the help message and exit.

    The output header records the command line, graph sizes and the time
    spent building and simplifying the graph.
    """
    input_file = ''
    output_file = ''
    overlap_len = None
    # Must be bound before the option loop: the value is read below even
    # when -k is not supplied.
    shift = 0
    to_simplify = False
    max_interval_len = None
    options, args = getopt.getopt(sys.argv[1:], 'm:k:i:l:o:hs')
    for option, value in options:
        if option == '-i':
            input_file = value
        elif option == '-o':
            output_file = value
        elif option == '-l':
            overlap_len = int(value)
        elif option == '-k':
            shift = int(value)
        elif option == '-m':
            max_interval_len = int(value)
        elif option == '-s':
            to_simplify = True
        elif option == '-h':
            printHelpMessage()
            sys.exit()
    print('overlap_len:', overlap_len)
    nodes = fastg_file.build_assembly_graph(input_file, overlap=overlap_len)
    tick = time.time()
    sites, site_position_index = build_site_graph(nodes, shift=shift)
    tock = time.time()

    if to_simplify:
        print('Simplifying site graph...')
        sites = site_graph.simplify_site_graph(sites)
    # Taken unconditionally so the header always reports a simplification
    # time (close to zero when -s is not given).
    tock2 = time.time()

    # NOTE: the 'Numner' spelling below is kept as-is because the header text
    # is program output that other tooling may match on.
    site_graph.write_file(output_file, sites, [
        ' '.join(sys.argv), 'Number of sites: {}'.format(
            len(sites)), 'Number of nodes: {}'.format(len(nodes)),
        'Numner of nodes contain site: {}'.format(len(site_position_index)),
        'Max interval len: {}'.format(max_interval_len),
        'Allowed repeat num: {}'.format(ALLOWED_REPEAT_NUM),
        'Time used to build graph: {} seconds'.format(round(tock - tick)),
        'Time used to simplify graph: {} seconds'.format(round(tock2 - tock))
    ])
Пример #4
0
def main():
    """Command-line entry point: convert an assembly graph with write_last_graph.

    Usage: [-h] [-k OVERLAP_SIZE] INPUT_FILE OUTPUT_FILE

    ``-k`` sets the overlap size (int, ``None`` when absent) that is handed
    both to ``fastg_file.build_assembly_graph`` and to ``write_last_graph``.
    ``-h`` prints the help text and exits.
    """
    overlap_size = None
    parsed_options, positional = getopt.getopt(sys.argv[1:], 'hk:')
    for flag, raw_value in parsed_options:
        if flag == '-k':
            overlap_size = int(raw_value)
        elif flag == '-h':
            print_help()
            sys.exit()

    input_file, output_file = positional
    graph_nodes = fastg_file.build_assembly_graph(input_file, overlap_size)
    write_last_graph(graph_nodes, output_file, overlap_size)
def _test_build_site_graph():
    """Manual check of ``build_site_graph`` on 'assembly_graph.fastg'.

    Builds the assembly graph (overlap 127) and its site graph, then derives
    several summaries: total indexed positions, sorted forward site ids,
    node-sorted position entries, and a reverse index from each site to the
    ``(node uid, position)`` pairs where it occurs. For every site it counts
    how many positions it occupies and prints '!!!' whenever a site lists
    itself among its own children. Nothing is returned; most summaries exist
    only for inspection in a debugger.
    """
    fastg_path = 'assembly_graph.fastg'
    overlap_len = 127
    nodes = fastg_file.build_assembly_graph(fastg_path, overlap_len)
    sites, site_position_index = build_site_graph(nodes)

    # Inspection-only summaries (kept because evaluating them also exercises
    # the id/uid formats).
    num_position_in_index = sum(map(len, site_position_index.values()))
    site_ids = sorted(
        int(site.id) for site in sites.values() if not site.id.endswith('r'))
    site_positions = sorted(
        site_position_index.items(),
        key=lambda item: int(item[0].uid.rstrip('r')))

    # Reverse index: site -> list of (node uid, position) occurrences.
    site_index = {}
    for node, positions in site_position_index.items():
        for position, site in positions.items():
            site_index.setdefault(site, []).append((node.uid, position))
    for occurrences in site_index.values():
        occurrences.sort(key=lambda pair: int(pair[0].rstrip('r')))
    site_index_items = sorted(site_index.items(), key=lambda item: item[1][0])

    # count[k] = number of sites occurring at exactly k positions.
    count = [0] * 10
    num_site_contain_self_as_child = 0
    for site, positions in site_index_items:
        if site.children:
            child_sites, intervals, node_paths = list(zip(*site.children))
        else:
            child_sites, intervals, node_paths = [], [], []
        child_site_ids = [child.id for child in child_sites]
        node_ids = [[member.uid for member in path] for path in node_paths]
        child_site_id_intervals_ = sorted(zip(child_site_ids, intervals))
        if site.id in child_site_ids:
            num_site_contain_self_as_child += 1
            print('!!!')
        count[len(positions)] += 1
Пример #6
0
def main():
    """Command-line entry point: transform gap endpoints and rewrite the gap file.

    Usage: [-h] [-p PATHS_FILE] [-g FASTG_FILE] GAP_INFO_FILE OUTPUT_FILE

    Reads paths, gaps and the assembly graph, builds the site position index
    with ``sitegraph_builder.build_site_graph(nodes, mode=1)``, and replaces
    the start and end (node id, site index, site position) of every gap with
    the result of ``transform_position``. The transformed gaps are written to
    OUTPUT_FILE with the invoking command line as a comment.

    Gaps whose start node id begins with 'NODE_2_' additionally print DEBUG
    lines before and after the transformation.
    """
    fastg_file_name = ''
    paths_file_name = ''
    parsed, positional = getopt.getopt(sys.argv[1:], 'hp:g:')
    for flag, arg in parsed:
        if flag == '-p':
            paths_file_name = arg
        elif flag == '-g':
            fastg_file_name = arg
        elif flag == '-h':
            print_help()
            sys.exit()
    gap_info_file_name, output_file_name = positional

    paths = paths_file.read_file(paths_file_name)
    gaps = gap_info_file.read_file(gap_info_file_name)
    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap=OVERLAP)
    _, site_position_indexs = sitegraph_builder.build_site_graph(nodes, mode=1)

    # Transform both endpoints of every gap in place.
    for gap in gaps:
        # The flag is decided from the *untransformed* start node id.
        debug = 1 if gap.start_node_id.startswith('NODE_2_') else 0
        if debug == 1:
            print('DEBUG:', gap.start_node_id, gap.start_site_index)
        (gap.start_node_id,
         gap.start_site_index,
         gap.start_site_position) = transform_position(
             gap.start_node_id, gap.start_site_position, nodes, paths,
             site_position_indexs, OVERLAP, debug=debug)
        (gap.end_node_id,
         gap.end_site_index,
         gap.end_site_position) = transform_position(
             gap.end_node_id, gap.end_site_position, nodes, paths,
             site_position_indexs, OVERLAP, debug=debug)
        if debug == 1:
            print('DEBUG:', gap.start_node_id, gap.start_site_index)

    # Write the gaps, recording the command line that produced them.
    gap_info_file.write_file(
        gaps, output_file_name, comments=[' '.join(sys.argv)])
Пример #7
0
def main():
    """Command-line entry point: run the path finder for every gap.

    Usage: [options] GAP_INFO_FILE [[WORK_DIR] TASK_NAME]

    Options:
        -h         Print the help message and exit.
        -f FILE    Assembly graph file passed to
                   ``fastg_file.build_assembly_graph``.
        -s FILE    Existing site graph file; when omitted the site graph is
                   built from the assembly graph, simplified and written
                   next to the assembly graph file with a '.sitegraph'
                   suffix.
        -x SCRIPT  Stored in the module-level ``find_path_script``
                   (default 'find_path_dp.py').
        -n INT     Size of the worker ``Pool`` (default 1).
        -o INT     Overlap length for the assembly graph (default ``None``).
        -m INT     Non-zero when the node ids in the gap file can be used
                   directly; otherwise they are converted with
                   ``get_node_id_from_long_name``.

    Positional arguments:
        GAP_INFO_FILE  Gap description file (required).
        WORK_DIR       Directory for intermediate files (default '.').
        TASK_NAME      Prefix for intermediate files; defaults to the
                       assembly graph file name without its last extension.

    Side effects: sets the module-level ``find_path_script``, ``log_dir``,
    ``seq_fa_file`` and ``graph_pickle_file``; recreates ``log_dir`` from
    scratch; pickles the nodes and sites to ``graph_pickle_file``.
    """
    global find_path_script, log_dir, seq_fa_file, graph_pickle_file
    fastg_file_name = None
    site_graph_file_name = None
    find_path_script = 'find_path_dp.py'
    work_dir = '.'
    task_name = None
    n_thread = 1
    # Bound up front so omitting -o passes None instead of hitting an
    # unbound local.
    overlap_len = None
    is_node_id_processed = 0
    interface = 'hf:o:n:x:s:m:'
    options, args = getopt.getopt(sys.argv[1:], interface)
    for option, value in options:
        if option == '-h':
            print_help()
            sys.exit()
        elif option == '-f':
            fastg_file_name = value
        elif option == '-s':
            site_graph_file_name = value
        elif option == '-x':
            find_path_script = value
        elif option == '-n':
            n_thread = int(value)
        elif option == '-o':
            overlap_len = int(value)
        elif option == '-m':
            is_node_id_processed = int(value)

    # One to three positional arguments are accepted.
    if not 1 <= len(args) <= 3:
        print_help()
        sys.exit()
    gap_info_file_name = args[0]
    if len(args) == 2:
        task_name = args[1]
    elif len(args) == 3:
        work_dir, task_name = args[1:3]
    if task_name is None:
        # The default task name is derived from the assembly graph file, so
        # that file must have been given.
        if fastg_file_name is None:
            print_help()
            sys.exit()
        task_name = fastg_file_name.rsplit('.', 1)[0]

    log_dir = work_dir + '/' + task_name + '_log.dir'
    seq_fa_file = work_dir + '/' + task_name + '.fa'
    graph_pickle_file = work_dir + '/' + task_name + '_graph.pickle'
    # Start from an empty log directory on every run.
    subprocess.run(('rm', '-rf', log_dir))
    subprocess.run(('mkdir', log_dir))

    # Read assembly graph.
    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap_len)

    if site_graph_file_name:
        # Read the site graph from file, then rebuild the position index and
        # point each entry at the site object loaded from the file (matched
        # by site id).
        sites = site_graph.read_file(site_graph_file_name, nodes)
        _, site_position_index = sitegraph_builder.build_site_graph(nodes,
                                                                    mode=1)
        for position_index in site_position_index.values():
            for position in position_index:
                original_site = position_index[position]
                position_index[position] = sites[original_site.id]
    else:
        # Build the site graph from the assembly graph and write it out.
        sites, site_position_index = \
            sitegraph_builder.build_site_graph(nodes)
        sites = site_graph.simplify_site_graph(sites)
        # Strip only the last extension so dots elsewhere in the path
        # (e.g. './data/x.fastg') do not truncate the name.
        site_graph_file_name = fastg_file_name.rsplit('.', 1)[0] +\
             '.sitegraph'
        site_graph.write_file(site_graph_file_name, sites)

    # Write nodes and sites to the pickle file (highest protocol).
    with open(graph_pickle_file, 'wb') as fout:
        pickle.dump(nodes, fout, -1)
        pickle.dump(sites, fout, -1)

    # Find the start site and end site of every gap.
    gaps = gap_info_file.read_file(gap_info_file_name)

    exe_tuples = []
    for gap in gaps:
        if is_node_id_processed:
            start_node_id = gap.start_node_id
            end_node_id = gap.end_node_id
        else:
            start_node_id = get_node_id_from_long_name(gap.start_node_id)
            end_node_id = get_node_id_from_long_name(gap.end_node_id)
        start_site_position, start_site = get_site_by_index(
            gap.start_site_index, site_position_index[nodes[start_node_id]])
        end_site_position, end_site = get_site_by_index(
            gap.end_site_index, site_position_index[nodes[end_node_id]])
        print(start_site.id, end_site.id)

        gap_seq_id = '-'.join((start_node_id, end_node_id))
        exe_tuples.append(
            (gap_seq_id,
             construct_cmd(start_site.id, end_site.id, start_site_position,
                           end_site_position, graph_pickle_file, gap.intervals,
                           gap_seq_id)))

    # Run one path-finding job per gap across the worker pool.
    with Pool(n_thread) as p:
        p.map(find_path_dp_process, exe_tuples)