示例#1
0
def alignWithRef(currentD, reference_dict, pD, outdir):
    '''
    Align the consensus sequence of every cluster to every reference
    sequence and, for each pair which meets the match criteria, write a
    FASTA file containing the cluster consensus, the cluster members and
    the reference sequence.

    Parameters
    ----------
    currentD: dict
        Clustering results - each value is a dictionary providing the keys
        'consensus', 'alignment' and 'names'.  The keys of currentD are
        used in the output file names and in the "*consensus_<key>" label.
    reference_dict: str
        Passed unchanged to UtilityFunctions.FastaToDict to load the
        reference sequences.
        NOTE(review): despite the name this is presumably the path to a
        FASTA file - confirm against the caller.
    pD: dict
        Parameter dictionary, forwarded to the alignment functions
    outdir: str
        Output directory - files are written to
        <outdir>/final_clusters/consensus_<key>_to_<reference>_ali.fasta

    Returns
    -------
    None
    '''
    if not currentD:
        # nothing to align, so there is no need to read the references
        return

    # the reference sequences are the same for every cluster - load them
    # once rather than re-reading the file for each cluster
    candidates = UtilityFunctions.FastaToDict(reference_dict)

    for i in currentD:
        query_seq = currentD[i]['consensus']
        query_ali = currentD[i]['alignment']
        query_names = currentD[i]['names']
        q_matrix, nt_inds = Consensus.makeAlignmentMatrix(query_ali)

        for target_nam, target_seq in candidates.items():
            # work on a copy so each reference starts from the same matrix
            qm = copy.copy(q_matrix)
            target_ali = UtilityFunctions.AlignmentArray([target_seq])
            # Align the query and target consensus sequences
            result = Alignment.SWalign(query_seq, target_seq,
                                       pD, useSub=True)

            # Check if the consensus sequences are a good match
            is_match = Alignment.alignmentMeetsCriteria(result, query_seq,
                                                        target_seq, pD)
            if not is_match[0]:
                continue

            result['alignment'] = is_match[1]
            # get the full alignment for the two consensus sequences
            result = Alignment.getAlignmentFull(result,
                                                query_seq,
                                                target_seq,
                                                pD)

            # add the reference sequence to the cluster alignment
            ali, matrix = Consensus.expandAlignment(result,
                                                    query_ali,
                                                    target_ali,
                                                    qm,
                                                    nt_inds)
            cons = Consensus.collapseAlignment(matrix, nt_inds)
            names = query_names + [target_nam]

            # align the original consensus to the expanded alignment so it
            # can be shown as the first row of the output
            result2 = Alignment.SWalign(query_seq, cons, pD, useSub=True)
            is_match_2 = Alignment.alignmentMeetsCriteria(result2,
                                                          query_seq,
                                                          cons,
                                                          pD)
            # NOTE(review): is_match_2[0] is not checked before its
            # alignment is used - confirm this second alignment can
            # never fail the criteria
            result2['alignment'] = is_match_2[1]
            result2 = Alignment.getAlignmentFull(result2,
                                                 query_seq,
                                                 cons,
                                                 pD)
            a2 = UtilityFunctions.AlignmentArray([query_seq])
            ali, _ = Consensus.expandAlignment(result2,
                                               a2,
                                               ali,
                                               qm,
                                               nt_inds)
            names = ["*consensus_%s" % i] + names
            path = "%s/final_clusters/consensus_%s_to_%s_ali.fasta" % (
                outdir, i, target_nam.split(" ")[0])
            # the context manager closes the file even if a write fails
            with open(path, "w") as out:
                for j, nam in enumerate(names):
                    out.write(">%s\n%s\n" % (nam, "".join(ali[j])))
def test_AlignmentArray():
    '''
    Check that AlignmentArray turns the sequences in the test FASTA file
    into the expected array of single characters.
    '''
    # NOTE(review): test_reverseComplementAlignment reads
    # "tests/test_data/test_fasta.fasta" - confirm which path is correct
    F = UtilityFunctions.FastaToDict("tests/test_fasta.fasta")
    ali = UtilityFunctions.AlignmentArray(F.values())
    # The expected rows have different lengths.  NumPy >= 1.24 raises a
    # ValueError when a ragged nested list is converted without
    # dtype=object; with it the result is the same 1D array of lists
    # which older versions produced implicitly.
    comp = np.array([['A', 'C', 'G', 'T', 'T', "-"],
                     ['A', 'A', 'C', 'T'],
                     ['C', 'T', 'A', 'G']], dtype=object)
    assert np.array_equal(comp, ali)
def test_reverseComplementAlignment():
    '''
    Check that reverseComplementAlignment returns the expected rows for
    the alignment built from the test FASTA file.
    '''
    F = UtilityFunctions.FastaToDict("tests/test_data/test_fasta.fasta")
    ali = UtilityFunctions.AlignmentArray(F.values())
    # The expected rows have different lengths.  NumPy >= 1.24 raises a
    # ValueError when a ragged nested list is converted without
    # dtype=object; with it the result is the same 1D array of lists
    # which older versions produced implicitly.
    rcomp = np.array([['-', 'A', 'A', 'C', 'G', 'T'],
                      ['A', 'G', 'T', 'T'],
                      ['C', 'T', 'A', 'G']], dtype=object)
    assert np.array_equal(UtilityFunctions.reverseComplementAlignment(ali),
                          rcomp)
示例#4
0
def runCandidates(Z, fasta_dict, seqdict, candfile, pD, outdir, rround,
                  currentD=None):
    '''
    Allows the user to specify a set of reference sequences - in the first
    round of clustering, only contigs which align to these references
    (meeting the same minimum criteria as for normal clustering) are
    selected.  In the second round, the fragments identified in the query
    file which matched the reference sequence are used to identify
    further fragments.  From this point clustering of consensus
    sequences continues as normal.

    Parameters
    ----------
    Z: list
        List of two item tuples where the first element is an integer and
        the second the sequence name for all sequences in the main input
        file.  The complete list is passed to buildCluster for every
        candidate.
    fasta_dict: pyfaidx.Fasta
        pyfaidx indexed Fasta object containing the main input file of contigs
    seqdict: dict
        A dictionary where keys are sequence IDs and values are empty
        dictionaries - these are used to store CIAlign logs for sequences
        later but it is not run at this stage when candidate sequences are
        used
    candfile: str
        Path to a file containing the reference sequences to match the contigs
        to in the first round of clustering
    pD: dict
        Dictionary containing the initial parameters set by the user
    outdir: str
        Path to directory in which to save all output files
    rround: int
        Round number - which round of clustering is this - used to
        determine where to save the output.  Only rounds 1 and 2 set up
        an alignment for the candidate; any other value raises a KeyError.
    currentD: dict
        Dictionary containing the results of previous rounds of clustering
        used in round 2 to expand consensus sequences from round 1.  In
        round 2 the candidate names must have the form "*consensus_<n>"
        where n is a key of currentD.

    Returns
    -------
    D: dict
        New dictionary with one entry per candidate, keyed by cluster
        number (counting from 1 in the order the candidates are read).
        Each value has the keys 'consensus', 'alignment', 'names' and
        'seqdict' holding the results of this round of clustering.
    '''
    candidates = UtilityFunctions.FastaToDict(candfile)
    D = dict()
    # iterate through the reference sequences, numbering clusters from 1
    for k, (c_nam, c_seq) in enumerate(candidates.items(), 1):
        # store candidate name and sequence in the query dictionary
        current = {'name': c_nam, 'consensus': c_seq}
        if rround == 1:
            # in the first round, none of the input sequences are
            # consensus sequences
            current['alignment'] = UtilityFunctions.AlignmentArray([c_seq])
            current['seqdict'] = seqdict
            current['names'] = [c_nam]
        elif rround == 2:
            # in the second round, expand the consensus sequences based
            # on the output of the previous round
            consn = int(c_nam.replace("*consensus_", ""))
            previous = currentD[consn]
            current['alignment'] = previous['alignment']
            current['seqdict'] = previous['seqdict']
            current['names'] = previous['names']

        current['matrix'], current['nt_inds'] = Consensus.makeAlignmentMatrix(
            current['alignment'])

        # Every candidate is compared with the complete list Z, so the
        # reduced list returned by buildCluster is discarded.
        # NOTE(review): confirm that sequences assigned to one candidate
        # should remain available to the following candidates.
        _, C = AlignmentSW.buildCluster(Z, fasta_dict, current, pD, k,
                                        candidate=True, skip=True)

        D[k] = {'consensus': C['current_consensus'],
                'alignment': C['current_alignment'],
                'names': C['current_names'],
                'seqdict': C['seqdict']}
        Alignment.writeFastas(D[k], k, rround, outdir, candidates=True,
                              reference=candidates)
        if rround == 2:
            Alignment.mergeFastas(2, len(D), outdir)
    return D
def buildCluster(X,
                 fasta_dict,
                 current,
                 pD,
                 k,
                 cons=False,
                 currentD=None,
                 candidate=False,
                 skip=False,
                 log=None):
    '''
    Build a cluster based on the current query sequence.
    Adapted for large input files - don't store everything in memory

    Each sequence in X is aligned to the current consensus (and, if that
    fails, its reverse complement is aligned).  Matching sequences are
    removed from X and merged into the cluster alignment, after which the
    consensus is recalculated and used as the query for the next
    comparison.

    Parameters
    ----------
    X: list
        Sequences which are not yet clustered - the second element of each
        item is the sequence name used to look the sequence up in
        fasta_dict
    fasta_dict: object
        Indexed by sequence name - fasta_dict[name][0:].seq must give the
        sequence
    current: dict
        The query - must provide the keys 'name', 'matrix', 'nt_inds',
        'seqdict', 'names', 'consensus' and 'alignment'
    pD: dict
        Parameter dictionary, forwarded to the logging and alignment
        functions
    k: int
        Cluster number - only used in log messages in this function
    cons: bool
        If True the targets are consensus sequences named
        "*consensus_<n>" and their alignment and member names are taken
        from currentD[n] rather than built from the single sequence
    currentD: dict
        Results of the previous round of clustering - only read when
        cons is True
    candidate: bool
        Forwarded to the criteria check for the reverse complement
        alignment; also limits the CIAlign cleaning to 'remove_gaponly'
    skip: bool
        If True, targets whose name is already in the cluster are never
        added to it
    log: object
        Forwarded to Alignment.cleanAlignmentCIAlign

    Returns
    -------
    X: list
        The input list with the sequences which joined the cluster removed
    C: dict
        The final state of the cluster with the keys 'current_alignment',
        'current_consensus', 'current_names', 'seqdict', 'matrix' and
        'nt_inds'
    '''
    lp("Starting new cluster with %s" % (current['name']), 2, pD)
    # X is a list of names and integers
    # j counts the number of passes through the outer loop
    j = 0
    # these are updated throughout to represent the current alignment
    matrix = current['matrix']
    nt_inds = current['nt_inds']
    seqdict = current['seqdict']

    current_names = current['names']
    current_consensus = current['consensus']
    current_alignment = current['alignment']

    # Each time a match is found and the cluster is updated, start again
    # with the new cluster as a query.
    # i is the index in X of the next sequence to test
    i = 0
    # run a first pass without going back to the top every time a new sequence
    # is added - just switch to the consensus until you get to the end
    first_pass_done = False
    while j != len(X) and len(X) != 0:
        # after the first pass every search restarts from the top of X
        if first_pass_done:
            i = 0
        any_matches_inner = False
        n_new = 0
        # Look through all sequences which are not yet clustered until a match
        # to the current query is found (or you get to the end)
        while True and i != len(X):
            # update the query sequence
            query_seq = current_consensus
            query_ali = current_alignment
            query_names = current_names
            target_nam = X[i][1]
            target_seq = fasta_dict[target_nam][0:].seq
            if cons:
                # the target is itself a cluster - use its stored alignment
                cons_n_t = int(target_nam.replace("*consensus_", ""))
                target_ali = currentD[cons_n_t]['alignment']
                target_names = currentD[cons_n_t]['names']
            else:
                target_ali = UtilityFunctions.AlignmentArray([target_seq])
                target_names = [target_nam]
            lp("Testing %s" % ", ".join(target_names), 3, pD)
            # Align the query and target consensus sequences
            result = Alignment.SWalign(query_seq, target_seq, pD, useSub=True)

            # Check if the consensus sequences are a good match
            # NOTE(review): `candidate` is passed to the reverse complement
            # check below but not to this one - confirm this is intended
            is_match = Alignment.alignmentMeetsCriteria(
                result, query_seq, target_seq, pD)

            # with skip set, sequences already in the cluster are not added
            if skip:
                skipnames = query_names
            else:
                skipnames = []
            # if they are not a good match try the reverse complement
            if not is_match[0]:
                target_seq = UtilityFunctions.reverseComplement(target_seq)
                result = Alignment.SWalign(query_seq,
                                           target_seq,
                                           pD,
                                           useSub=False)
                is_match = Alignment.alignmentMeetsCriteria(
                    result, query_seq, target_seq, pD, candidate)
                target_ali = UtilityFunctions.reverseComplementAlignment(
                    target_ali)
                # NOTE(review): the flag is set whether or not the reverse
                # complement matched and is never reset in this function -
                # confirm this is what the users of seqdict expect
                for nam in target_names:
                    seqdict[nam]['is_rc'] = True

            if is_match[0] and target_nam not in skipnames:
                lp("Match found.", 2, pD)
                # We found a match - something has changed
                any_matches_inner = True
                n_new += 1
                # remove the current value from X - index i now points to
                # the sequence which followed it
                X = X[:i] + X[i + 1:]
                result['alignment'] = is_match[1]
                # get the full alignment for the two consensus sequences
                result = Alignment.getAlignmentFull(result, query_seq,
                                                    target_seq, pD)
                current_names = query_names + target_names

                lp(
                    "Expanding current alignment to include %s" %
                    (", ".join(target_names)), 3, pD)
                ali, matrix = Consensus.expandAlignment(
                    result, query_ali, target_ali, matrix, nt_inds)
                # make a new sequence based on the new alignment
                current_consensus = Consensus.collapseAlignment(
                    matrix, nt_inds)
                current_alignment = ali
                if first_pass_done:
                    i = 0
                # now you have a match and the consensus is updated,
                # leave the inner loop so the cluster can be cleaned
                break
            else:
                lp("No match.", 3, pD)
            # keep going through the other sequences
            i += 1
        j += 1

        if any_matches_inner:
            # if anything has changed, clean up the alignment etc
            lp("Cluster %i updated - %s sequences" % (k, len(current_names)),
               2, pD)
            lp("Cleaning cluster %i with CIAlign" % (k), 3, pD)
            # candidate clusters only have gap-only columns removed
            if not candidate:
                funcs = ['remove_insertions', 'crop_ends', 'remove_gaponly']
            else:
                funcs = ['remove_gaponly']
            # NOTE(review): query_seq is the consensus from before the last
            # sequence was added, not current_consensus - confirm intended
            R = Alignment.cleanAlignmentCIAlign(current_alignment,
                                                current_names,
                                                query_seq,
                                                matrix,
                                                nt_inds,
                                                seqdict,
                                                pD,
                                                functions=funcs,
                                                log=log)

            current_alignment, matrix, current_consensus, seqdict = R
        elif not first_pass_done:
            # reached the end of X without a match for the first time -
            # from now on every search starts from the top
            first_pass_done = True
            i = 0
        else:
            # a complete search from the top found nothing new - finished
            break

    # collect the final state of the cluster
    C = dict()
    C['current_alignment'] = current_alignment
    C['current_consensus'] = current_consensus
    C['current_names'] = current_names
    C['seqdict'] = seqdict
    C['matrix'] = matrix
    C['nt_inds'] = nt_inds
    return (X, C)
def runClusters(Z,
                fasta_dict,
                pD,
                seqdict,
                rround,
                cons=False,
                currentD=None,
                candidate=False,
                reference_dict=None,
                log=None):
    '''
    Repeatedly take the first unclustered sequence as a query, build a
    cluster around it with buildCluster and save the result, until every
    sequence has been assigned to a cluster.

    Parameters
    ----------
    Z: list
        Sequences to cluster - the second element of each item is the
        sequence name used to look the sequence up in fasta_dict
    fasta_dict: object
        Indexed by sequence name - fasta_dict[name][0:].seq must give the
        sequence
    pD: dict
        Parameter dictionary - pD['outdir'] is the output directory
    seqdict: dict
        Stored as the 'seqdict' of every query passed to buildCluster
    rround: int
        Round number, forwarded to Alignment.writeFastas
    cons: bool
        If True the sequences are consensus sequences named
        "*consensus_<n>" and their alignments and member names are taken
        from currentD[n]
    currentD: dict
        Results of the previous round of clustering - only read when cons
        is True
    candidate: bool
        Forwarded to buildCluster
    reference_dict: object
        Not used by this function
    log: object
        Forwarded to buildCluster

    Returns
    -------
    D: dict
        One entry per cluster, keyed by cluster number (counting from 1),
        each with the keys 'consensus', 'alignment', 'names' and 'log'
    '''
    X = copy.copy(Z)
    # total number of original sequences represented by Z
    if not cons:
        n_total = len(Z)
    else:
        n_total = sum(
            len(currentD[int(z[1].replace("*consensus_", ""))]['names'])
            for z in Z)
    done = set()
    k = 1
    D = dict()
    # keep building clusters until 1 or 0 sequences are left
    while len(done) < n_total and len(X) > 1:
        current = dict()
        current['name'] = X[0][1]
        current['consensus'] = fasta_dict[X[0][1]][0:].seq
        lp("Remaining unaligned sequences %i" % (len(X)), 2, pD)

        # convert the query alignment into a matrix
        if cons:
            cons_n_q = int(current['name'].replace("*consensus_", ""))
            current['alignment'] = currentD[cons_n_q]['alignment']
            current['names'] = currentD[cons_n_q]['names']
        else:
            current['alignment'] = UtilityFunctions.AlignmentArray(
                [current['consensus']])
            current['names'] = [current['name']]

        current['matrix'], current['nt_inds'] = Consensus.makeAlignmentMatrix(
            current['alignment'])

        current['seqdict'] = seqdict
        # remove the query sequence from X
        X = X[1:]
        # Build a cluster based on the current query sequence
        X, C = buildCluster(X,
                            fasta_dict,
                            current,
                            pD,
                            k,
                            cons,
                            currentD,
                            candidate=candidate,
                            log=log)

        lp("Saved cluster %i with %s sequences" % (k, len(C['current_names'])),
           1, pD)

        D[k] = {'consensus': C['current_consensus'],
                'alignment': C['current_alignment'],
                'names': C['current_names'],
                'log': C['seqdict']}
        Alignment.writeFastas(D[k], k, rround, pD['outdir'])
        done |= set(C['current_names'])
        k += 1

    if len(X) == 1:
        # A single sequence is left over - it becomes a cluster on its own.
        # Its consensus is its own sequence: the query of the previous
        # cluster belongs to a different cluster, and does not exist at
        # all if the input only contained one sequence.
        nam = X[0][1]
        leftover_seq = fasta_dict[nam][0:].seq
        if not cons:
            leftover_ali = UtilityFunctions.AlignmentArray([leftover_seq])
            leftover_names = [nam]
        else:
            cons_n_q = int(nam.replace("*consensus_", ""))
            leftover_ali = currentD[cons_n_q]['alignment']
            leftover_names = currentD[cons_n_q]['names']
        D[k] = {'consensus': leftover_seq,
                'alignment': leftover_ali,
                'names': leftover_names,
                'log': dict()}
        lp("Saved cluster %i with %s sequences" % (k, len(D[k]['names'])), 1,
           pD)
        Alignment.writeFastas(D[k], k, rround, pD['outdir'])
    return D