Exemplo n.º 1
0
def scaffold(args):
  """Run the scaffolding pipeline and hand the results to the save helpers.

  Steps, in order:
    1. Load the graph with load_from_fasta_tsv.
    2. If ``args.min_ctg_len`` is truthy, remove every vertex whose ``seq``
       is shorter than that length.
    3. If the graph has edges: contract, optionally cut tips, prune with
       the paired-end thresholds and contract again.
    4. Remove all remaining edges and create new ones with
       make_wellscaff_edges.
    5. Auto-save the intermediate graph under the ``<out>.wellscaff`` prefix.
    6. Prune with prune_via_wells, contract, then save ``<out>.fasta`` and
       ``<out>.ordering``.

  Args:
    args: Object exposing the attributes read below: ``fasta``, ``edges``,
      ``containment``, ``min_ctg_len``, ``cut_tip_len``, ``pe_abs_thr``,
      ``pe_rel_thr``, ``pe_rc_rel_thr``, ``rc_abs_thr``,
      ``rc_rel_edge_thr``, ``rc_rel_prun_thr`` and ``out``.

  Returns:
    None. Results are passed to save_fasta, save_to_fasta_tsv and
    save_ordering.
  """
  logging.info('Creating the scaffold graph')
  g = load_from_fasta_tsv(args.fasta, args.edges, args.containment)
  print_stats(g)

  # delete small vertices
  if args.min_ctg_len:
    logging.info('Removing vertices smaller than %d bp', args.min_ctg_len)
    n_removed = 0
    # Iterate over a snapshot: vertices are removed from the graph inside
    # the loop, and mutating a collection while iterating it is unsafe.
    for v in list(g.vertices):
      if len(v.seq) < args.min_ctg_len:
        g.remove_vertex(v)
        n_removed += 1
    # Report once, after the whole pass, rather than once per removal.
    logging.info('Removed %d vertices', n_removed)
    print_stats(g)

  # prune scaffold edges
  if g.edges:
    logging.info('Simplifying the graph using paired-end reads')
    logging.info('Contracting unambigous paths')
    contract_edges(g, store_ordering=True)
    print_stats(g)
    # NOTE(review): fixed relative file name, independent of args.out.
    save_fasta(g, 'contracted.fasta')

    if args.cut_tip_len:
      n_cut = cut_tips(g, d=args.cut_tip_len)
      logging.info('Cut %d tips shorter than %d bp', n_cut, args.cut_tip_len)

    logging.info('Pruning edges with low support')
    n_pruned1 = prune_scaffold_edges(g, abs_support_thr=args.pe_abs_thr,
                                        rel_support_thr=args.pe_rel_thr)
    n_pruned2 = prune_scaffold_edges_via_wells(g, thr=args.pe_rc_rel_thr)
    logging.info('%d edges pruned', n_pruned1 + n_pruned2)

    logging.info('Contracting unambigous paths')
    contract_edges(g)
    print_stats(g)

  # delete all existing edges from the graph (snapshot first, since the
  # edge collection shrinks while we walk it)
  for e in list(g.edges):
    g.remove_edge(e)

  # create new edges whenever vertices have similar well profiles
  logging.info('Creating edges from read clouds')
  n_edges = make_wellscaff_edges(g, min_common=args.rc_abs_thr,
                                    min_thr=args.rc_rel_edge_thr)
  logging.info('%d scaffold edges from read clouds', n_edges)

  logging.info('Auto-saving graph with prefix %s.wellscaff', args.out)
  save_to_fasta_tsv(g, '%s.wellscaff.fasta' % args.out,
                       '%s.wellscaff.tsv' % args.out,
                       '%s.wellscaff.containment' % args.out)

  logging.info('Pruning edges with low support')
  n_pruned = prune_via_wells(g, min_common=args.rc_abs_thr,
                                min_thr=args.rc_rel_prun_thr)
  logging.info('%d edges pruned', n_pruned)

  logging.info('Contracting unambigous paths')
  contract_edges(g, store_ordering=True)
  print_stats(g)

  logging.info('Saving scaffolding results')
  save_fasta(g, '%s.fasta' % args.out)
  save_ordering(g, '%s.ordering' % args.out)
Exemplo n.º 2
0
def scaffold(args):
    """Run the scaffolding pipeline and hand the results to the save helpers.

    Steps, in order:
      1. Load the graph with load_from_fasta_tsv.
      2. If ``args.min_ctg_len`` is truthy, remove every vertex whose
         ``seq`` is shorter than that length.
      3. If the graph has edges: contract, optionally cut tips, prune with
         the paired-end thresholds and contract again.
      4. Remove all remaining edges and create new ones with
         make_wellscaff_edges.
      5. Auto-save the intermediate graph under the ``<out>.wellscaff``
         prefix.
      6. Prune with prune_via_wells, contract, then save ``<out>.fasta``
         and ``<out>.ordering``.

    Args:
        args: Object exposing the attributes read below: ``fasta``,
            ``edges``, ``containment``, ``min_ctg_len``, ``cut_tip_len``,
            ``pe_abs_thr``, ``pe_rel_thr``, ``pe_rc_rel_thr``,
            ``rc_abs_thr``, ``rc_rel_edge_thr``, ``rc_rel_prun_thr`` and
            ``out``.

    Returns:
        None. Results are passed to save_fasta, save_to_fasta_tsv and
        save_ordering.
    """
    logging.info('Creating the scaffold graph')
    g = load_from_fasta_tsv(args.fasta, args.edges, args.containment)
    print_stats(g)

    # delete small vertices
    if args.min_ctg_len:
        logging.info('Removing vertices smaller than %d bp', args.min_ctg_len)
        n_removed = 0
        # Iterate over a snapshot: vertices are removed from the graph
        # inside the loop, and mutating a collection while iterating it is
        # unsafe.
        for v in list(g.vertices):
            if len(v.seq) < args.min_ctg_len:
                g.remove_vertex(v)
                n_removed += 1
        # Report once, after the whole pass, rather than once per removal.
        logging.info('Removed %d vertices', n_removed)
        print_stats(g)

    # prune scaffold edges
    if g.edges:
        logging.info('Simplifying the graph using paired-end reads')
        logging.info('Contracting unambigous paths')
        contract_edges(g, store_ordering=True)
        print_stats(g)
        # NOTE(review): fixed relative file name, independent of args.out.
        save_fasta(g, 'contracted.fasta')

        if args.cut_tip_len:
            n_cut = cut_tips(g, d=args.cut_tip_len)
            logging.info('Cut %d tips shorter than %d bp',
                         n_cut, args.cut_tip_len)

        logging.info('Pruning edges with low support')
        n_pruned1 = prune_scaffold_edges(g,
                                         abs_support_thr=args.pe_abs_thr,
                                         rel_support_thr=args.pe_rel_thr)
        n_pruned2 = prune_scaffold_edges_via_wells(g, thr=args.pe_rc_rel_thr)
        logging.info('%d edges pruned', n_pruned1 + n_pruned2)

        logging.info('Contracting unambigous paths')
        contract_edges(g)
        print_stats(g)

    # delete all existing edges from the graph (snapshot first, since the
    # edge collection shrinks while we walk it)
    for e in list(g.edges):
        g.remove_edge(e)

    # create new edges whenever vertices have similar well profiles
    logging.info('Creating edges from read clouds')
    n_edges = make_wellscaff_edges(g,
                                   min_common=args.rc_abs_thr,
                                   min_thr=args.rc_rel_edge_thr)
    logging.info('%d scaffold edges from read clouds', n_edges)

    logging.info('Auto-saving graph with prefix %s.wellscaff', args.out)
    save_to_fasta_tsv(g, '%s.wellscaff.fasta' % args.out,
                      '%s.wellscaff.tsv' % args.out,
                      '%s.wellscaff.containment' % args.out)

    logging.info('Pruning edges with low support')
    n_pruned = prune_via_wells(g,
                               min_common=args.rc_abs_thr,
                               min_thr=args.rc_rel_prun_thr)
    logging.info('%d edges pruned', n_pruned)

    logging.info('Contracting unambigous paths')
    contract_edges(g, store_ordering=True)
    print_stats(g)

    logging.info('Saving scaffolding results')
    save_fasta(g, '%s.fasta' % args.out)
    save_ordering(g, '%s.ordering' % args.out)
Exemplo n.º 3
0
def scaffold_via_wells_mst(g):
    """Simplify ``g`` in place using spanning forests of its well-weighted graph.

    Each iteration (at most 10, and only while ``_has_branches(msf)`` holds):
      1. removes every edge of ``g`` that is absent from the current forest,
      2. contracts edges,
      3. recomputes the forest, collects a trunk from every tree with at
         least 4 nodes via ``_mst_trunk``, and removes edges having an
         endpoint outside the trunk,
      4. contracts again and recomputes the forest for the loop test.

    Progress is printed to stdout.

    Args:
        g: Scaffold graph. The code relies on ``vertices``, ``edges``,
            ``nxgraph`` and ``remove_edge``; edges expose ``v1``, ``v2``
            and ``connection``.

    Returns:
        None. ``g`` is modified in place.
    """
    # initialize internal contig labels (used for downstream qc)
    for v in g.vertices:
        v.initialize_contigs()

    # construct well-based scaffold graph in networkx format and weigh its
    # edges according to how many wells they are sharing
    nxg = g.nxgraph
    _reweigh_edges(nxg, g, type_='wells')

    # NOTE(review): the intent is described as a *maximum* spanning forest,
    # yet the call is minimum_spanning_tree; presumably _reweigh_edges
    # assigns weights for which the minimum is what is wanted -- confirm.
    msf = nx.minimum_spanning_tree(nxg)

    # keep simplifying the graph until the msf has no branching nodes:
    n_iter = 1
    while _has_branches(msf) and n_iter <= 10:
        # Single-argument print(...) gives the same output whether print is
        # a statement (Python 2) or a function (Python 3).
        print('MSF simplificaiton iteration %d' % n_iter)

        # remove edges of g not selected in forest MSF; iterate a snapshot
        # because edges are removed inside the loop
        n_removed = 0
        for e in list(g.edges):
            # forest nodes are (vertex id, connection) pairs
            e_nx = ((e.v1.id, e.connection[e.v1]),
                    (e.v2.id, e.connection[e.v2]))
            if not msf.has_edge(*e_nx):
                g.remove_edge(e)
                n_removed += 1

        print('%d edges not in MST removed.' % n_removed)

        # contract edges
        n_contracted = contract_edges(g, store_ordering=True)
        print('%d edges contracted.' % n_contracted)

        # now we are going to compute the trunk

        # get the networkx graph again
        nxg = g.nxgraph
        _reweigh_edges(nxg, g, type_='wells')  # FIXME: do this once

        # recompute the spanning forest on the graph that was just
        # reweighed (not on a fresh g.nxgraph), matching the other call
        # sites in this function
        msf = nx.minimum_spanning_tree(nxg)

        # for each tree in forest: add its trunk. The components are
        # enumerated with connected_components + subgraph(...).copy(),
        # which is what nx.connected_component_subgraphs did before it was
        # removed in networkx 2.4.
        trunk = list()
        for component in nx.connected_components(msf):
            mst = msf.subgraph(component).copy()
            if len(mst) >= 4:
                trunk.extend(_mst_trunk(mst, g))

        # remove edges not in trunk:
        print(trunk)
        # trunk entries are indexed at [0] to obtain a vertex id
        trunk_v = set(v[0] for v in trunk)
        n_removed = 0
        for e in list(g.edges):
            v1_id, v2_id = e.v1.id, e.v2.id
            if v1_id not in trunk_v or v2_id not in trunk_v:
                g.remove_edge(e)
                n_removed += 1

        # NOTE(review): keyboard() looks like a leftover interactive
        # debugging hook that fires from the 4th iteration on; kept to
        # preserve behaviour -- confirm whether it can be dropped.
        if n_iter >= 4:
            keyboard()

        print('%d edges not in trunk removed.' % n_removed)

        # contract one last time
        n_contracted = contract_edges(g, store_ordering=True)
        print('%d edges contracted.' % n_contracted)

        # rebuild the weighted networkx graph and its spanning forest for
        # the next loop test
        nxg = g.nxgraph
        _reweigh_edges(nxg, g, type_='wells')
        msf = nx.minimum_spanning_tree(nxg)

        n_iter += 1
Exemplo n.º 4
0
def scaffold_via_wells_mst(g):
    """Simplify ``g`` in place using spanning forests of its well-weighted graph.

    Each iteration (at most 10, and only while ``_has_branches(msf)`` holds):
      1. removes every edge of ``g`` that is absent from the current forest,
      2. contracts edges,
      3. recomputes the forest, collects a trunk from every tree with at
         least 4 nodes via ``_mst_trunk``, and removes edges having an
         endpoint outside the trunk,
      4. contracts again and recomputes the forest for the loop test.

    Progress is printed to stdout.

    Args:
        g: Scaffold graph. The code relies on ``vertices``, ``edges``,
            ``nxgraph`` and ``remove_edge``; edges expose ``v1``, ``v2``
            and ``connection``.

    Returns:
        None. ``g`` is modified in place.
    """
    # initialize internal contig labels (used for downstream qc)
    for v in g.vertices:
        v.initialize_contigs()

    # construct well-based scaffold graph in networkx format and weigh its
    # edges according to how many wells they are sharing
    nxg = g.nxgraph
    _reweigh_edges(nxg, g, type_="wells")

    # NOTE(review): the intent is described as a *maximum* spanning forest,
    # yet the call is minimum_spanning_tree; presumably _reweigh_edges
    # assigns weights for which the minimum is what is wanted -- confirm.
    msf = nx.minimum_spanning_tree(nxg)

    # keep simplifying the graph until the msf has no branching nodes:
    n_iter = 1
    while _has_branches(msf) and n_iter <= 10:
        # Single-argument print(...) gives the same output whether print is
        # a statement (Python 2) or a function (Python 3).
        print("MSF simplificaiton iteration %d" % n_iter)

        # remove edges of g not selected in forest MSF; iterate a snapshot
        # because edges are removed inside the loop
        n_removed = 0
        for e in list(g.edges):
            # forest nodes are (vertex id, connection) pairs
            e_nx = ((e.v1.id, e.connection[e.v1]), (e.v2.id, e.connection[e.v2]))
            if not msf.has_edge(*e_nx):
                g.remove_edge(e)
                n_removed += 1

        print("%d edges not in MST removed." % n_removed)

        # contract edges
        n_contracted = contract_edges(g, store_ordering=True)
        print("%d edges contracted." % n_contracted)

        # now we are going to compute the trunk

        # get the networkx graph again
        nxg = g.nxgraph
        _reweigh_edges(nxg, g, type_="wells")  # FIXME: do this once

        # recompute the spanning forest on the graph that was just
        # reweighed (not on a fresh g.nxgraph), matching the other call
        # sites in this function
        msf = nx.minimum_spanning_tree(nxg)

        # for each tree in forest: add its trunk. The components are
        # enumerated with connected_components + subgraph(...).copy(),
        # which is what nx.connected_component_subgraphs did before it was
        # removed in networkx 2.4.
        trunk = list()
        for component in nx.connected_components(msf):
            mst = msf.subgraph(component).copy()
            if len(mst) >= 4:
                trunk.extend(_mst_trunk(mst, g))

        # remove edges not in trunk:
        print(trunk)
        # trunk entries are indexed at [0] to obtain a vertex id
        trunk_v = set(v[0] for v in trunk)
        n_removed = 0
        for e in list(g.edges):
            v1_id, v2_id = e.v1.id, e.v2.id
            if v1_id not in trunk_v or v2_id not in trunk_v:
                g.remove_edge(e)
                n_removed += 1

        # NOTE(review): keyboard() looks like a leftover interactive
        # debugging hook that fires from the 4th iteration on; kept to
        # preserve behaviour -- confirm whether it can be dropped.
        if n_iter >= 4:
            keyboard()

        print("%d edges not in trunk removed." % n_removed)

        # contract one last time
        n_contracted = contract_edges(g, store_ordering=True)
        print("%d edges contracted." % n_contracted)

        # rebuild the weighted networkx graph and its spanning forest for
        # the next loop test
        nxg = g.nxgraph
        _reweigh_edges(nxg, g, type_="wells")
        msf = nx.minimum_spanning_tree(nxg)

        n_iter += 1