def test_sequences_from_query_string(self):
    """Check correct parsing when input is a string

    A query that matches taxonomy entries must return a dictionary mapping
    each matching sequence identifier to its taxonomy string, and a query
    that matches nothing must return an empty dictionary.
    """
    out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Beggiatoa')
    # assertEqual is used instead of the deprecated assertEquals alias
    self.assertEqual(out_tax_dict, {
        'NZ_ABBZ01000843|640963011': 'VibrioBeggiatoa sp. PS',
        'NZ_ABBZ01000613|640963011': 'VibrioBeggiatoa sp. PS',
        'NZ_ABBZ01002042|640963011': 'VibrioBeggiatoa sp. PS',
        'NZ_ABBY01000503|640963012': 'VibrioBeggiatoa sp. SS',
        'NZ_ABBZ01000140|640963011': 'VibrioBeggiatoa sp. PS',
        'NZ_ABBY01000221|640963012': 'VibrioBeggiatoa sp. SS',
        'NZ_ABBZ01006278|640963011': 'VibrioBeggiatoa sp. PS',
        'NZ_ABBZ01005870|640963011': 'VibrioBeggiatoa sp. PS'})

    # a query without any match yields an empty mapping
    out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Rumba')
    self.assertEqual(out_tax_dict, {})
def test_sequences_from_query_string(self):
    """Check correct parsing when input is a string

    A query that matches taxonomy entries must return a dictionary mapping
    each matching sequence identifier to its taxonomy string, and a query
    that matches nothing must return an empty dictionary.
    """
    out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Beggiatoa')
    # assertEqual is used instead of the deprecated assertEquals alias
    self.assertEqual(
        out_tax_dict, {
            'NZ_ABBZ01000843|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01000613|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01002042|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBY01000503|640963012': 'VibrioBeggiatoa sp. SS',
            'NZ_ABBZ01000140|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBY01000221|640963012': 'VibrioBeggiatoa sp. SS',
            'NZ_ABBZ01006278|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01005870|640963011': 'VibrioBeggiatoa sp. PS'
        })

    # a query without any match yields an empty mapping
    out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Rumba')
    self.assertEqual(out_tax_dict, {})
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id.

    Raises
    ------
    BadParameter
        If the Taxonomy file is empty (any PlatypusValueError or
        PlatypusParseError raised while querying it is re-raised as
        BadParameter).
        If the query you passed retrieved no results.
    """
    if query is not None:
        # query the taxonomy file for the required sequence identifiers; the
        # context manager guarantees the handle is closed even on failure
        with open(tax_fp, 'U') as tax_file:
            try:
                interest_taxonomy = sequences_from_query(tax_file, query)
            except (PlatypusValueError, PlatypusParseError) as e:
                raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id.

    Raises
    ------
    BadParameter
        If the Taxonomy file is empty (any PlatypusValueError or
        PlatypusParseError raised while querying it is re-raised as
        BadParameter).
        If the query you passed retrieved no results.
    """
    if query is not None:
        # query the taxonomy file for the required sequence identifiers; the
        # context manager guarantees the handle is closed even on failure
        with open(tax_fp, 'U') as tax_file:
            try:
                interest_taxonomy = sequences_from_query(tax_file, query)
            except (PlatypusValueError, PlatypusParseError) as e:
                raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Sequences whose identifier is selected are written to ``interest.fna``
    and every other sequence is written to ``rest.fna``; both files are
    created inside `output_fp`. The selection comes from `query` (looked up
    in the taxonomy file) when it is not None, otherwise from the
    identifiers listed in `split_fp`.

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence. Only read when `query` is not
        None.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id. Only read when `query` is
        None.

    Raises
    ------
    BadParameter
        If the Taxonomy file is empty (any PlatypusValueError or
        PlatypusParseError raised while querying it is re-raised as
        BadParameter).
        If the query you passed retrieved no results.
        If `split_fp` does not contain any sequence identifier.
    """
    if query is not None:
        # query the taxonomy file for the required sequence identifiers; the
        # context manager guarantees the handle is closed even on failure
        with open(tax_fp, 'U') as tax_file:
            try:
                interest_taxonomy = sequences_from_query(tax_file, query)
            except (PlatypusValueError, PlatypusParseError) as e:
                raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        interest_taxonomy = {}
        with open(split_fp, 'U') as split_file:
            for line in split_file:
                seq_id = line.strip().split('\t')[0].strip()
                # blank lines would otherwise register an empty identifier
                # and let a whitespace-only file pass the emptiness check
                if seq_id:
                    interest_taxonomy[seq_id] = ''

        if not interest_taxonomy:
            raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    # both outputs are closed on exit, even if reading the FASTA file fails
    with open(join(output_fp, 'interest.fna'), 'w') as interest_fp, \
            open(join(output_fp, 'rest.fna'), 'w') as rest_fp:
        for record in read(seqs_fp, format='fasta'):
            full_name = record.id
            seq = record.sequence

            # the identifier is the header text before the first space
            name = full_name.strip().split(' ')[0].strip()

            target = interest_fp if name in interest_taxonomy else rest_fp
            target.write(">%s\n%s\n" % (full_name, seq))