示例#1
0
def reorder_mat(A, thr_list, min_cc_len, VERB):
    """ Order the reads of each connected component of A with the spectral algorithm.

    The matrix is split into connected components; each component with more
    than min_cc_len reads is ordered by sorting the vector returned by
    get_fiedler. The ordering is kept when the bandwidth of the reordered
    sub-matrix (largest |i - j| over the entries returned by find) is below 80.
    Otherwise the component is queued and retried in the next pass, where the
    entries of its sub-matrix that do not exceed thr_list[pass number] are
    dropped and remove_bridge_reads is applied before looking for connected
    components again.

    Parameters
    ----------
    A : scipy.sparse matrix (similarity matrix, converted to CSR if needed)
    thr_list : sequence of real (threshold used at each pass, indexed by pass number. Entry 0 is never
    read here: the first pass works on A as given, presumably because the caller already preprocessed it)
    min_cc_len : int (connected components with at most min_cc_len reads are skipped)
    VERB : int (verbosity level, messages are printed when VERB >= 2)

    Returns
    -------
    list (of arrays of read indices in A, one per accepted connected component, sorted by spectral order)
    """
    if not isspmatrix_csr(A):
        A = A.tocsr()

    ccs_ord = []
    # Components still to be ordered; the first pass handles the whole matrix
    todo_ccs = [np.arange(A.shape[0])]
    n_loop = 0

    while todo_ccs:
        # A threshold is only needed from the second pass on. Stop cleanly
        # (instead of raising IndexError) once the thresholds are exhausted.
        if n_loop > 0 and n_loop >= len(thr_list):
            msg = "No threshold left in thr_list, %d connected component(s) "\
                  "left unordered." % (len(todo_ccs))
            oprint(msg, cond=(VERB >= 2))
            break

        todo_next = []
        for cc in todo_ccs:
            if n_loop > 0:
                thr_sub = thr_list[n_loop]
                A_sub = A[cc, :][:, cc]
                A_sub = remove_bridge_reads(A_sub.multiply(A_sub > thr_sub))
            else:
                # First pass: use A directly so that the preprocessing is not
                # done twice.
                A_sub = A

            # Compute connected components
            (n_cc, labels) = connected_components(A_sub,
                                                  directed=False,
                                                  return_labels=True)

            # Reorder each cc with spectral and keep the ordering if it looks OK
            for i_cc in range(n_cc):
                cc_sub = np.argwhere(labels == i_cc)[:, 0]
                if len(cc_sub) <= min_cc_len:
                    continue
                msg = " Running spectral algorithm in connected "\
                      "component of size %d..." % (len(cc_sub))
                oprint(msg, cond=(VERB >= 2))
                (_, fidvec) = get_fiedler(A_sub[cc_sub, :][:, cc_sub])
                permu = np.argsort(fidvec)
                cc_sorted = cc_sub[permu]
                (ii, jj, _) = find(A_sub[cc_sorted, :][:, cc_sorted])
                bw = max(abs(ii - jj))
                if bw >= 80:
                    oprint("Bandwidth larger than 80 in reordered matrix.",
                           cond=(VERB >= 2))
                    # Retry this component at the next pass; cc maps the
                    # local indices back to indices in A.
                    todo_next.append(cc[cc_sub])
                else:
                    ccs_ord.append(cc[cc_sorted])

        todo_ccs = todo_next
        n_loop += 1

    return ccs_ord
def add_next_window(temp_fn, w_idx, cc_idx, whole_cons, opts, trim_margin):
    """ Add the consensus from the current window to the current consensus.

    Only the end of the current consensus (its last len(window consensus) +
    MERGE_MARGIN characters) is merged with the window consensus through
    run_spoa_and_consensus; the beginning is kept untouched and prepended to
    the result. If the consensus file of the window is missing or empty,
    whole_cons is returned unchanged.

    Parameters
    ----------
    temp_fn : str (temporary file to write sequences to align with spoa)
    w_idx : int (index of current window)
    cc_idx : int (index of the connected component)
    whole_cons : str (consensus extracted so far by joining the consensus sequences from windows 0 to w_idx - 1)
    opts : dict (keywords arguments for global parameters; READS_FMT, ROOT_DIR, MERGE_MARGIN, VERB and
    SPOA_PATH are read)
    trim_margin : int (number of bp to trim on each end of the consensus, as the consensus sequence
    is more likely to be erroneous on the ends; passed to get_consensus)

    Returns
    -------
    str (consensus extracted by joining the consensus sequences from windows 0 to w_idx)
    """
    DATATYPE = opts['READS_FMT'][-1]
    ROOT_DIR = opts['ROOT_DIR']
    MERGE_MARGIN = opts['MERGE_MARGIN']
    VERB = opts['VERB']

    fn = "%s/cc_%d/poa_in_cc_%d_win_%d.fast%s.cnsns" % (
        ROOT_DIR, cc_idx, cc_idx, w_idx, DATATYPE)
    if (not (os.path.exists(fn)) or os.path.getsize(fn) == 0):
        msg = "file %s does not exist or is empty" % (fn)
        oprint(msg, cond=(VERB >= 2))
        return whole_cons

    next_win_seq = get_consensus(fn, trim_margin)
    # Split the current consensus: cons0 is kept as is, cons1 (the tail that
    # may overlap the next window) is merged with the window consensus.
    kept_len = max(0, len(whole_cons) - len(next_win_seq) - MERGE_MARGIN)
    cons0 = whole_cons[:kept_len]
    cons1 = whole_cons[kept_len:]

    # Write end of current consensus long sequence and next consensus window
    # sequence in poa_in file. Text mode, since str is written; the context
    # manager closes the file even if a write fails.
    with open(temp_fn, "w") as poa_in_fh:
        poa_in_fh.write(">end_of_current_cons\n%s\n" % (cons1))
        poa_in_fh.write(">cons_in_window_%d\n%s\n" % (w_idx, next_win_seq))

    # Run poa to include next
    out_fn = "%s/cc_%d/poa_out_cons_cc%d_win_%d" % (ROOT_DIR, cc_idx, cc_idx,
                                                    w_idx)
    cons1b = run_spoa_and_consensus(temp_fn, out_fn, opts['SPOA_PATH'])

    return cons0 + cons1b
示例#3
0
def reord_submat(in_tuple, A, opts):
    """ Run one thresholding + spectral ordering pass on a sub-matrix of A.

    The sub-matrix of A restricted to the reads in cc is thresholded (entries
    that do not exceed thr_sub are dropped), passed through
    remove_bridge_reads and split into connected components. Each component
    with more than MIN_CC_LEN reads is ordered, with get_fiedler_julia when
    JULIA_PATH is set and the component has more than 3000 reads, and by
    sorting the vector returned by get_fiedler otherwise. The ordering is
    kept when the bandwidth of the reordered sub-matrix is below 80,
    otherwise the component is returned in the list of components to retry.

    Parameters
    ----------
    in_tuple : tuple ((thr_sub, cc) with thr_sub the threshold applied to the sub-matrix and cc the array of
    indices in A of the reads to reorder)
    A : scipy.sparse matrix (similarity matrix)
    opts : dict (keywords argument containing global parameters and options; MIN_CC_LEN, VERB, JULIA_PATH and
    JULIA_SCRIPT are read)

    Returns
    -------
    sub_ccs_ord : list (of arrays of read indices in A, one per accepted component, sorted by spectral order)
    sub_todo_next : list (of arrays of read indices in A, one per component whose ordering was rejected)
    """
    (thr_sub, cc) = in_tuple
    min_len = int(opts['MIN_CC_LEN'])
    verb = int(opts['VERB'])
    JULIA_PATH = opts['JULIA_PATH']
    JULIA_SCRIPT = opts['JULIA_SCRIPT']

    sub_todo_next = []
    sub_ccs_ord = []

    A_sub = A[cc, :][:, cc]
    A_sub = remove_bridge_reads(A_sub.multiply(A_sub > thr_sub))
    # Compute connected components
    (n_cc, labels) = connected_components(A_sub,
                                          directed=False,
                                          return_labels=True)

    # Reorder each cc with spectral and keep the ordering if it looks OK
    for i_cc in range(n_cc):
        cc_sub = np.argwhere(labels == i_cc)[:, 0]
        if len(cc_sub) <= min_len:
            continue
        msg = " Running spectral algorithm in connected "\
              "component of size %d..." % (len(cc_sub))
        oprint(msg, cond=(verb >= 2))
        # Start of the timing reported in the "Done in" message below
        t1 = time()

        # Use Julia if possible to reorder relatively large matrices
        if JULIA_PATH and (len(cc_sub) > 3000):
            permu = get_fiedler_julia(A_sub[cc_sub, :][:, cc_sub], JULIA_PATH,
                                      JULIA_SCRIPT)
        else:
            (_, fidvec) = get_fiedler(A_sub[cc_sub, :][:, cc_sub])
            permu = np.argsort(fidvec)

        oprint("Done in %3.6fs" % (time() - t1), cond=(verb >= 2))

        cc_sorted = cc_sub[permu]
        (ii, jj, _) = find(A_sub[cc_sorted, :][:, cc_sorted])
        bw = max(abs(ii - jj))
        if bw >= 80:
            oprint("Bandwidth larger than 80 in reordered matrix.",
                   cond=(verb >= 2))
            # cc maps the local indices back to indices in A
            sub_todo_next.append(cc[cc_sub])
        else:
            sub_ccs_ord.append(cc[cc_sorted])

    return sub_ccs_ord, sub_todo_next
def merge_windows_in_cc(cc_idx, opts):
    """ Merge the consensus sequences from all windows into one sequence (contig).

    The consensus is seeded with the consensus of window 0, then the windows
    0 to n_win - 1 are added one after the other with add_next_window, n_win
    being the number of window consensus files found for this connected
    component. The merged sequence is also written to
    ROOT_DIR/consensus_cc_<cc_idx>.fasta.

    Parameters
    ----------
    cc_idx : int (index of the connected component)
    opts : dict (keywords arguments for global parameters; TRIM_MARGIN, READS_FMT, ROOT_DIR and VERB are read
    here, and opts is passed on to add_next_window)

    Returns
    -------
    str (consensus sequence of the whole connected component)
    """
    # Local import: only needed here, to count the window files
    import glob

    # Parse arguments
    TRIM_MARGIN = opts['TRIM_MARGIN']
    DATATYPE = opts['READS_FMT'][-1]
    ROOT_DIR = opts['ROOT_DIR']
    VERB = opts['VERB']

    # Count number of windows (in Python rather than with "ls | wc -l", so
    # that it does not depend on a shell nor on how ROOT_DIR is quoted)
    pattern = "%s/cc_%d/poa_in_cc_%d_win_*.fast*.cnsns" % (ROOT_DIR, cc_idx,
                                                           cc_idx)
    n_win = len(glob.glob(pattern))

    # Initialize
    # NOTE(review): window 0 is used as the (trimmed) seed and is also the
    # first window merged below (untrimmed); kept as in the original code.
    fn = "%s/cc_%d/poa_in_cc_%d_win_%d.fast%s.cnsns" % (ROOT_DIR, cc_idx,
                                                        cc_idx, 0, DATATYPE)
    whole_cons = get_consensus(fn, TRIM_MARGIN)
    oprint(len(whole_cons))

    # Incrementally add consensus between window k and window k+1.
    # trim margin = 0 for first and last 3 windows, TRIM_MARGIN for the rest.
    # A single loop guarantees that each window is merged exactly once, even
    # when there are fewer than 6 windows.
    for w_idx in range(n_win):
        is_edge_win = (w_idx < 3) or (w_idx >= n_win - 3)
        poa_in_fn = "%s/poa_in_cons_cc_%d_win_%d.fasta" % (ROOT_DIR, cc_idx,
                                                           w_idx)
        whole_cons = add_next_window(poa_in_fn, w_idx, cc_idx, whole_cons,
                                     opts, 0 if is_edge_win else TRIM_MARGIN)
        if not is_edge_win:
            msg = "Consensus generation... %dbp extracted so far (window %d)" % (
                len(whole_cons), w_idx)
            condition = (VERB >= 2) and (w_idx % 500 == 0)
            oprint(msg, cond=condition)

    msg = "extracted and merged sequences in windows for contig %d. Consensus length %dbp" % \
          (cc_idx, len(whole_cons))
    oprint(msg, cond=(VERB >= 2))

    # Print consensus to backup file (text mode, since str is written)
    consensus_fn = "%s/consensus_cc_%d.fasta" % (ROOT_DIR, cc_idx)
    with open(consensus_fn, "w") as consensus_fh:
        consensus_fh.write(">consensus_from_windows_contig_%d\n%s\n" %
                           (cc_idx, whole_cons))

    return whole_cons
示例#5
0
                         "less good)")
parser.add_argument("--julia", default=None,
                    help="path to Julia (optional, "
                         "though eigenvector computations are clearly faster "
                         "in Julia than in Python)")

args = parser.parse_args()
opts = fill_args_opts(args)
ROOT_DIR = opts['ROOT_DIR']
VERB = opts['VERB']

# Load reads
# NOTE(review): the "U" flag of mode "rU" was removed in Python 3.11; it is
# kept here because the rest of the file targets Python 2.
with open(args.READS_FN, "rU") as reads_fh:
    record_list = list(SeqIO.parse(reads_fh, opts['READS_FMT']))
oprint("Reads loaded. Compute overlaps from files...",
       dt=(time() - t0),
       cond=(VERB >= 2))

# Compute overlaps from the files
(read_nb2id, ovl_list, I, J, K, num_match, ovl_len,
 n_reads) = compute_overlaps(args.minimapfn, record_list)

# Threshold based on overlaps value (number of matches) and length
THR = mquantiles(num_match, args.sim_qtile)
oprint("THR = %1.1f " % THR)
cond1 = (num_match > THR)
cond2 = (ovl_len > opts['LEN_THR'])
# Indices of the overlaps that satisfy both conditions
idxok = np.argwhere(cond1 * cond2)[:, 0]
# Reference to the unfiltered numbers of matches
# (presumably for later quantile computations -- confirm against the rest of the script)
num_match_l = num_match
I = I[idxok]
J = J[idxok]
示例#6
0
def reorder_submat(A, cc, num_match_l, qtile, ccs_ord, opts):
    """ Reorder matrix A with spectral ordering algorithm.

    Recursive function: each connected component of A with more than
    MIN_CC_LEN reads is reordered, and the ordering is kept when the
    bandwidth of the reordered sub-matrix is below 80. Otherwise the quantile
    is raised, the sub-matrix is thresholded accordingly, passed through
    remove_bridge_reads and the function is called again on it (this
    criterion is empirical and specific to genome assembly of genomes with a
    limited number of repeats).

    Parameters
    ----------
    A : scipy.sparse matrix (similarity matrix, converted to CSR if needed)
    cc : list (index of the reads corresponding to the rows/columns of A)
    num_match_l : list (of number of matches (int) such that A[i,j] = number of matches between i and j, *before*
    preprocessing and not restricted to the reads in cc. It is used to compute the threshold with qtile)
    qtile : real (quantile of num_match_l used so far; when an ordering is rejected it is raised by
    min(0.1, 0.5 * (1 - qtile)) and the values that do not exceed quantile(num_match_l, raised quantile) are
    removed from the sub-matrix)
    ccs_ord : list (of lists of reads index sorted by position inside a given connected component)
    opts : dict (keywords argument containing global parameters and options; VERB, MIN_CC_LEN, JULIA_PATH and
    JULIA_SCRIPT are read)

    Returns
    -------
    None but ccs_ord is modified "passed by reference"

    """
    verbosity = opts['VERB']
    size_cutoff = opts['MIN_CC_LEN']
    julia_bin = opts['JULIA_PATH']
    julia_script = opts['JULIA_SCRIPT']

    csr_mat = A if isspmatrix_csr(A) else A.tocsr()
    (n_comp, comp_labels) = connected_components(csr_mat,
                                                 directed=False,
                                                 return_labels=True)

    for comp_id in range(n_comp):
        members = np.argwhere(comp_labels == comp_id)[:, 0]
        n_members = len(members)
        if n_members <= size_cutoff:
            continue

        oprint(" Running spectral algorithm in connected component of size %d..." % (n_members),
               cond=(verbosity >= 2))
        sub_mat = csr_mat[members, :][:, members]

        # Use Julia if possible to reorder relatively large matrices
        if julia_bin and (n_members > 4000):
            order = get_fiedler_julia(sub_mat, julia_bin, julia_script)
        else:
            (fiedler_val, fiedler_vec) = get_fiedler(sub_mat)
            if fiedler_val < 1e-12:
                oprint(
                    "\n\nWARNING ! Non connected submatrix of size %d!\n\n" %
                    (n_members))
            order = np.argsort(fiedler_vec)

        # Bandwidth of the reordered sub-matrix
        (rows, cols, _) = find(sub_mat[order, :][:, order])
        bandwidth = max(abs(rows - cols))

        if bandwidth < 80:
            # Ordering accepted: store it with the indices given in cc
            ccs_ord.append([cc[members[pos]] for pos in order])
            continue

        oprint(
            "Bandwidth larger than 80 in reordered matrix. Threshold in submatrix increased before reordering.",
            cond=(verbosity >= 2))
        # Ordering rejected: raise the quantile and recurse on the
        # thresholded sub-matrix.
        raised_qtile = qtile + min(0.1, 0.5 * (1. - qtile))
        thr_sub = mquantiles(num_match_l, raised_qtile)
        sub_mat = remove_bridge_reads(sub_mat.multiply(sub_mat > thr_sub))
        members_abs = [cc[pos] for pos in members]
        reorder_submat(sub_mat, members_abs, num_match_l, raised_qtile,
                       ccs_ord, opts)

    return