def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new search with DIAMOND and parse the resulting hit table.

    When ``query_file`` is given it is passed to DIAMOND directly. Otherwise
    ``sequences`` (or, if empty, whatever ``helpers.get_sequences`` returns for
    ``query_ids``) are written to a temporary FASTA file, which is handed to
    DIAMOND and deleted afterwards.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (str): Path to the file blast results are written to
        **kwargs: Extra keyword arguments forwarded unchanged to ``diamond()``
    Raises:
        ValueError: No value given for query_file or query_ids.
            NOTE(review): not raised in this function itself; presumably
            raised by ``helpers.get_sequences`` -- confirm.
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temporary file, so that a
        # failure here cannot leave an open, undeleted file behind.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        # see: https://stackoverflow.com/questions/46497842/passing-namedtemporaryfile-to-a-subprocess-on-windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle (flushing the text) before DIAMOND reads the file.
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            # delete=False means cleanup is our responsibility.
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file)
        # NOTE(review): rows are joined without a separator; this assumes each
        # row of the table already carries its own line ending -- confirm.
        blast_table = "".join(table)
        with open(blast_file, "w") as f:
            f.write(blast_table)

    return results
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new search with DIAMOND and parse the resulting hit table.

    When ``query_file`` is given it is passed to DIAMOND directly. Otherwise
    ``sequences`` (or, if empty, whatever ``helpers.get_sequences`` returns for
    ``query_ids``) are written to a temporary FASTA file, which is handed to
    DIAMOND and deleted afterwards.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (TextIOWrapper): Open, writable file the blast results are
            written to. It is not closed by this function.
        **kwargs: Extra keyword arguments forwarded unchanged to ``diamond()``
    Raises:
        ValueError: No value given for query_file or query_ids.
            NOTE(review): not raised in this function itself; presumably
            raised by ``helpers.get_sequences`` -- confirm.
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temporary file, so that a
        # failure here cannot leave an open, undeleted file behind.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle (flushing the text) before DIAMOND reads the file.
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            # delete=False means cleanup is our responsibility.
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file.name)
        # NOTE(review): this joins the *parsed* results, not the raw table;
        # str.join only accepts strings, so confirm what parse() returns.
        blast = "\n".join(results)
        blast_file.write(blast)

    return results
def start(
    sequences=None,
    query_file=None,
    query_ids=None,
    database="nr",
    program="blastp",
    megablast=False,
    filtering="F",
    evalue=0.1,
    nucl_reward=None,
    nucl_penalty=None,
    gap_costs="11 1",
    matrix="BLOSUM62",
    hitlist_size=500,
    threshold=11,
    word_size=6,
    comp_based_stats=2,
    entrez_query=None,
):
    """Launch a remote BLAST search using NCBI BLAST API.

    Note that the HITLIST_SIZE, ALIGNMENTS and DESCRIPTIONS parameters must all be set
    together in order to mimic max_target_seqs behaviour.

    Usage guidelines:

    1. Don't contact server more than once every 10 seconds
    2. Don't poll for a single RID more than once a minute
    3. Use URL parameter email/tool
    4. Run scripts weekends or 9pm-5am Eastern time on weekdays if >50 searches

    For a full description of the parameters, see:

    1. `BLAST API documentation <https://ncbi.github.io/blast-cloud/dev/api.html>`_
    2. `BLAST documentation
       <https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp>`_

    Parameters:
        sequences (dict): Query sequence dict generated by helpers.get_sequences()
        query_file (str): Path to a query FASTA file
        query_ids (list): Collection of NCBI sequence identifiers
        database (str): Target NCBI BLAST database
        program (str): BLAST variant to run
        megablast (bool): Enable megaBLAST option (only with BLASTn)
        filtering (str): Low complexity filtering
        evalue (float): E-value cutoff
        nucl_reward (int): Reward for matching bases (only with BLASTN/megaBLAST)
        nucl_penalty (int): Penalty for mismatched bases (only with BLASTN/megaBLAST)
        gap_costs (str): Gap existence and extension costs
        matrix (str): Scoring matrix name
        hitlist_size (int): Number of database sequences to keep
        threshold (int): Neighbouring score for initial words
        word_size (int): Size of word for initial matches
        comp_based_stats (int): Composition based statistics algorithm
        entrez_query (str): NCBI Entrez search term for pre-filtering the BLAST database
    Raises:
        ValueError: The response does not contain exactly one RID and one RTOE
            value, or the RTOE value is not an integer.
    Returns:
        rid (str): Request Identifier (RID) assigned to the search
        rtoe (int): Request Time Of Execution (RTOE), estimated run time of the search
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file=query_file, query_ids=query_ids)

    query = helpers.sequences_to_fasta(sequences)

    parameters = {
        "CMD": "PUT",
        "DATABASE": database,
        "PROGRAM": program,
        "FILTER": filtering,
        "EXPECT": evalue,
        "GAPCOSTS": gap_costs,
        "MATRIX": matrix,
        # These three are set together to mimic max_target_seqs behaviour.
        "HITLIST_SIZE": hitlist_size,
        "ALIGNMENTS": hitlist_size,
        "DESCRIPTIONS": hitlist_size,
        "WORD_SIZE": word_size,
        "COMPOSITION_BASED_STATISTICS": comp_based_stats,
    }

    if entrez_query:
        parameters["ENTREZ_QUERY"] = entrez_query

    if program == "blastn":
        if megablast:
            parameters["MEGABLAST"] = "on"
        if nucl_reward:
            parameters["NUCL_REWARD"] = nucl_reward
        if nucl_penalty:
            parameters["NUCL_PENALTY"] = nucl_penalty
    else:
        # Does not apply to blastn
        parameters["THRESHOLD"] = threshold

    # Log the parameters before the request so they are available even if the
    # request itself raises.
    LOG.debug("Search parameters: %s", parameters)
    response = requests.post(BLAST_API_URL, files={"QUERY": query}, params=parameters)
    LOG.debug("Search URL: %s", response.url)

    # Matches are returned in document order; RID is expected before RTOE.
    matches = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
    if len(matches) != 2:
        # Fail with an explicit message instead of an opaque tuple-unpacking
        # error when the submission did not yield a RID/RTOE pair.
        raise ValueError(
            "Could not parse RID and RTOE from BLAST response "
            f"(expected 2 values, found {len(matches)})"
        )
    rid, rtoe = matches

    return rid, int(rtoe)