Пример #1
0
def parse(
    handle,
    sequences=None,
    query_file=None,
    query_ids=None,
    max_evalue=0.01,
    min_identity=30,
    min_coverage=50,
):
    """Parse tabular results from a remote BLAST search performed via the API.

    Since the API provides no option for returning query coverage, which is a metric we
    want to use for filtering hits, query sequences must be passed to this function so
    that their lengths can be compared to the alignment length.

    Each non-blank row is split on tabs and must contain at least ten columns. The
    first three columns are read as query ID, subject ID and percent identity. Counting
    from the end of the row, query start and query end are the 7th and 6th last
    columns, and e-value and score are the 3rd and 2nd last columns. All other
    columns are ignored.

    Arguments:
        handle (list):
            File handle (or file handle-like) object corresponding to BLAST results. Note
            that this function expects an iterable of tab-delimited lines; blank lines
            are skipped, but no other validation/error checking is performed
        sequences (dict):
            Query sequences keyed on query ID. If empty or None, they are obtained by
            calling helpers.get_sequences(query_file, query_ids)
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI sequence identifiers
        max_evalue (float): Maximum e-value (hits must be strictly below this value)
        min_identity (float): Minimum percent identity (hits must be strictly above)
        min_coverage (float): Minimum percent query coverage (hits must be strictly above)
    Raises:
        ValueError: No hit passed the thresholds, or a non-blank row has too few columns
        KeyError: A row refers to a query ID that is not in `sequences`
    Returns:
        list: Hit objects corresponding to criteria passing BLAST hits
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file, query_ids)

    hits = []
    for line in handle:
        # Tolerate blank rows (e.g. a trailing empty line) instead of failing on unpack
        if not line.strip():
            continue

        # The final column is discarded, so a trailing newline on it is harmless
        qid, sid, pident, *_, qstart, qend, _, _, evalue, score, _ = line.split(
            "\t")

        # Manually calculate query coverage as a percentage of the query length
        coverage = (int(qend) - int(qstart) + 1) / len(sequences[qid]) * 100

        hit = Hit(
            query=qid,
            subject=sid,
            identity=pident,
            coverage=coverage,
            evalue=evalue,
            bitscore=score,
        )

        # Coerce all three values the same way: the e-value originates from text, so
        # comparing it to a float without conversion may fail if Hit stores it as given
        if (float(hit.identity) > min_identity
                and float(hit.coverage) > min_coverage
                and float(hit.evalue) < max_evalue):
            hits.append(hit)

    if not hits:
        raise ValueError("No results found")

    return hits
Пример #2
0
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new local search using DIAMOND and parse its hit table.

    If `query_file` is given it is passed to DIAMOND directly. Otherwise the query
    sequences (given, or retrieved from `query_ids`) are written to a temporary FASTA
    file which is removed again once DIAMOND has finished.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (str): Path to the file blast results are written to
        **kwargs: Additional keyword arguments, forwarded to diamond() only
    Raises:
        ValueError:
            No value given for query_file or query_ids (presumably raised by
            helpers.get_sequences; confirm against that helper)
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text before creating the temporary file: with delete=False
        # nothing else would remove the file if this step raised after creation
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        # see: https://stackoverflow.com/questions/46497842/passing-namedtemporaryfile-to-a-subprocess-on-windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file)
        # NOTE(review): rows are concatenated without a separator, which assumes each
        # row in `table` still carries its own line ending - confirm against diamond()
        with open(blast_file, "w") as f:
            f.write("".join(table))

    return results
Пример #3
0
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new local search using DIAMOND and parse its hit table.

    If `query_file` is given it is passed to DIAMOND directly. Otherwise the query
    sequences (given, or retrieved from `query_ids`) are written to a temporary FASTA
    file which is removed again once DIAMOND has finished.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (TextIOWrapper):
            Open, writable file the raw DIAMOND hit table is written to. It is not
            closed here; the caller keeps ownership of the handle
        **kwargs: Additional keyword arguments, forwarded to diamond() only
    Raises:
        ValueError:
            No value given for query_file or query_ids (presumably raised by
            helpers.get_sequences; confirm against that helper)
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text before creating the temporary file: with delete=False
        # nothing else would remove the file if this step raised after creation
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file.name)
        # Write the raw table rows, not the parsed results: str.join() only accepts
        # strings, and the hit table is what the log message above announces.
        # NOTE(review): the "\n" separator assumes rows carry no line ending - confirm
        blast = "\n".join(table)
        blast_file.write(blast)

    return results
Пример #4
0
def start(
    sequences=None,
    query_file=None,
    query_ids=None,
    database="nr",
    program="blastp",
    megablast=False,
    filtering="F",
    evalue=0.1,
    nucl_reward=None,
    nucl_penalty=None,
    gap_costs="11 1",
    matrix="BLOSUM62",
    hitlist_size=500,
    threshold=11,
    word_size=6,
    comp_based_stats=2,
    entrez_query=None,
):
    """Launch a remote BLAST search using NCBI BLAST API.

    Note that the HITLIST_SIZE, ALIGNMENTS and DESCRIPTIONS parameters must all be set
    together in order to mimic max_target_seqs behaviour.

    Usage guidelines:

    1. Don't contact server more than once every 10 seconds
    2. Don't poll for a single RID more than once a minute
    3. Use URL parameter email/tool
    4. Run scripts weekends or 9pm-5am Eastern time on weekdays if >50 searches

    For a full description of the parameters, see:

        1. `BLAST API documentation <https://ncbi.github.io/blast-cloud/dev/api.html>`_
        2. `BLAST documentation <https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp>`_

    Parameters:
        sequences (dict): Query sequence dict generated by helpers.get_sequences()
        query_file (str): Path to a query FASTA file
        query_ids (list): Collection of NCBI sequence identifiers
        database (str): Target NCBI BLAST database
        program (str): BLAST variant to run
        megablast (bool): Enable megaBLAST option (only with BLASTn)
        filtering (str): Low complexity filtering
        evalue (float): E-value cutoff
        nucl_reward (int): Reward for matching bases (only with BLASTN/megaBLAST)
        nucl_penalty (int): Penalty for mismatched bases (only with BLASTN/megaBLAST)
        gap_costs (str): Gap existence and extension costs
        matrix (str): Scoring matrix name
        hitlist_size (int): Number of database sequences to keep
        threshold (int): Neighbouring score for initial words (not sent with blastn)
        word_size (int): Size of word for initial matches
        comp_based_stats (int): Composition based statistics algorithm
        entrez_query (str): NCBI Entrez search term for pre-filtering the BLAST database

    Raises:
        ValueError:
            The response does not contain an RID and an RTOE, or the RTOE is not an
            integer

    Returns:
        rid (str): Request Identifier (RID) assigned to the search
        rtoe (int): Request Time Of Execution (RTOE), estimated run time of the search
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file=query_file,
                                          query_ids=query_ids)

    query = helpers.sequences_to_fasta(sequences)

    parameters = {
        "CMD": "PUT",
        "DATABASE": database,
        "PROGRAM": program,
        "FILTER": filtering,
        "EXPECT": evalue,
        "GAPCOSTS": gap_costs,
        "MATRIX": matrix,
        # All three are set to the same value to mimic max_target_seqs
        "HITLIST_SIZE": hitlist_size,
        "ALIGNMENTS": hitlist_size,
        "DESCRIPTIONS": hitlist_size,
        "WORD_SIZE": word_size,
        "COMPOSITION_BASED_STATISTICS": comp_based_stats,
    }

    if entrez_query:
        parameters["ENTREZ_QUERY"] = entrez_query

    if program == "blastn":
        if megablast:
            parameters["MEGABLAST"] = "on"
        if nucl_reward:
            parameters["NUCL_REWARD"] = nucl_reward
        if nucl_penalty:
            parameters["NUCL_PENALTY"] = nucl_penalty
    else:
        # Does not apply to blastn
        parameters["THRESHOLD"] = threshold

    # Log before sending so the parameters are on record even if the request fails
    LOG.debug("Search parameters: %s", parameters)

    response = requests.post(BLAST_API_URL,
                             files={"QUERY": query},
                             params=parameters)

    LOG.debug("Search URL: %s", response.url)

    # Look each value up by its own label rather than relying on the response
    # containing exactly two matches in RID-then-RTOE order
    rid_match = re.search(r"RID = (.+?)[\n\s]", response.text)
    rtoe_match = re.search(r"RTOE = (.+?)[\n\s]", response.text)

    if rid_match is None or rtoe_match is None:
        raise ValueError(
            "Could not find RID and RTOE in BLAST API response (HTTP status %s)"
            % response.status_code
        )

    return rid_match.group(1), int(rtoe_match.group(1))
Пример #5
0
def test_get_sequences_bad_input():
    """Calling get_sequences() without any argument must raise ValueError."""
    expected_error = ValueError

    with pytest.raises(expected_error):
        helpers.get_sequences()
Пример #6
0
def test_get_sequences_query_ids(mocker):
    """Identifiers given as query_ids are handed to efetch_sequences exactly once."""
    # Replace the fetching helper with a mock so the call can be inspected
    mocker.patch("cblaster.helpers.efetch_sequences")
    accessions = ["seq1", "seq2"]

    helpers.get_sequences(query_ids=accessions)

    # Compare against a fresh literal rather than the list that was passed in
    helpers.efetch_sequences.assert_called_once_with(["seq1", "seq2"])
Пример #7
0
def test_get_sequences_query_file(mocker):
    """A query_file argument results in exactly one call to parse_fasta."""
    # Replace the FASTA parser with a mock so the call can be counted
    mocker.patch("cblaster.helpers.parse_fasta")
    fasta_path = TEST_DIR / "test.faa"

    helpers.get_sequences(query_file=fasta_path)

    helpers.parse_fasta.assert_called_once()
Пример #8
0
def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    If every path in `session_file` exists, the session is loaded from those files
    (and optionally re-filtered) and no search is performed. Otherwise a new search is
    run in the given `mode`, the genomic context of the hits is fetched, and the
    session is written to the first path in `session_file`, if any. In both cases the
    requested binary table, summary table and plot are produced afterwards.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int):
            Maximum gap between cluster hits (originally documented as kilobase; the
            default of 20000 suggests base pairs - TODO confirm)
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file; stdout is used if not given
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files
        indent (int): Total spaces to indent JSON files
        plot (bool or str):
            Path to cblaster plot HTML file, or True to plot without an output path
        recompute (bool or str):
            Path to recomputed session JSON file, or True to re-filter the loaded
            session without writing it
        blast_file: Passed through to the local/remote search function
        ipg_file: Passed through to context.search()
        hitlist_size: Passed through to the remote search function
    Raises:
        ValueError: A new search is required but `mode` is not 'local' or 'remote'
    Returns:
        Session: cblaster search Session object
    """

    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )
            # recompute=True means "filter only"; any other truthy value is a path
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )

        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file

        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
        else:
            # Without this, `results` would be unbound below and fail with a
            # confusing NameError instead of pointing at the real problem
            raise ValueError(
                "Invalid search mode %r, expected 'local' or 'remote'" % (mode,)
            )

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # Only GenBank/EMBL query files carry a meaningful gene order. query_file is
        # None when searching with query_ids and may be a Path, so guard and coerce
        # before testing the extension
        ordered_extensions = (".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb")
        if query_file and str(query_file).endswith(ordered_extensions):
            query_sequence_order = list(session.sequences.keys())
        else:
            query_sequence_order = None

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order)

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager guarantees the table is flushed and the handle closed
        with open(binary, "w") as fp:
            session.format(
                "binary",
                fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
            )

    # No output path (or sys.stdout itself) means the summary goes to stdout
    to_stdout = not output or output is sys.stdout
    LOG.info("Writing summary to %s", "stdout" if to_stdout else output)

    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
    }
    if to_stdout:
        session.format("summary", fp=sys.stdout, **summary_options)
    else:
        with open(output, "w") as fp:
            session.format("summary", fp=fp, **summary_options)

    if plot:
        # plot=True means "plot without an output path"; otherwise it is the path
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
Пример #9
0
def test_get_sequences_query_file(mocker):
    """Sequences read from the test FASTA file include the three expected IDs."""
    expected_ids = {'QBE85648.1', 'QBE85647.1', 'QBE85646.1'}
    fasta_path = TEST_DIR / "test.faa"

    sequences = helpers.get_sequences(query_file=fasta_path)

    assert expected_ids.issubset(sequences)