Exemplo n.º 1
0
    def test_sequences_from_query_string(self):
        """Check correct parsing when the query is given as a string.

        Runs ``sequences_from_query`` over ``self.taxonomy_lines`` twice: once
        with a query that is expected to select eight sequence identifiers,
        and once with a query that is expected to select nothing.
        """
        expected = {
            'NZ_ABBZ01000843|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01000613|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01002042|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBY01000503|640963012': 'VibrioBeggiatoa sp. SS',
            'NZ_ABBZ01000140|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBY01000221|640963012': 'VibrioBeggiatoa sp. SS',
            'NZ_ABBZ01006278|640963011': 'VibrioBeggiatoa sp. PS',
            'NZ_ABBZ01005870|640963011': 'VibrioBeggiatoa sp. PS',
        }

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists in recent unittest versions.
        out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Beggiatoa')
        self.assertEqual(out_tax_dict, expected)

        # a query without any match must produce an empty dictionary
        out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Rumba')
        self.assertEqual(out_tax_dict, {})
Exemplo n.º 2
0
    def test_sequences_from_query_string(self):
        """Check correct parsing when input is a string.

        The 'Beggiatoa' query is expected to return a mapping of eight
        sequence identifiers to their taxonomy strings; the 'Rumba' query is
        expected to return an empty mapping.
        """
        out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Beggiatoa')
        # assertEqual replaces assertEquals, a deprecated alias that has been
        # removed from newer unittest releases.
        self.assertEqual(
            out_tax_dict, {
                'NZ_ABBZ01000843|640963011': 'VibrioBeggiatoa sp. PS',
                'NZ_ABBZ01000613|640963011': 'VibrioBeggiatoa sp. PS',
                'NZ_ABBZ01002042|640963011': 'VibrioBeggiatoa sp. PS',
                'NZ_ABBY01000503|640963012': 'VibrioBeggiatoa sp. SS',
                'NZ_ABBZ01000140|640963011': 'VibrioBeggiatoa sp. PS',
                'NZ_ABBY01000221|640963012': 'VibrioBeggiatoa sp. SS',
                'NZ_ABBZ01006278|640963011': 'VibrioBeggiatoa sp. PS',
                'NZ_ABBZ01005870|640963011': 'VibrioBeggiatoa sp. PS',
            })

        # no taxonomy line matches this query, so nothing should be returned
        out_tax_dict = sequences_from_query(self.taxonomy_lines, 'Rumba')
        self.assertEqual(out_tax_dict, {})
Exemplo n.º 3
0
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id.

    Raises
    ------
    BadParameter
        If querying the taxonomy file raises PlatypusValueError or
        PlatypusParseError (e.g. the taxonomy file is empty).
        If the query you passed retrieved no results.

    Notes
    -----
    NOTE(review): as written here, the body only runs and validates the
    query; ``seqs_fp``, ``output_fp`` and ``split_fp`` are not used, and
    nothing happens when ``query`` is None.
    """

    if query is not None:
        # query the taxonomy file for the required sequence identifiers; the
        # context manager guarantees the handle is closed even when parsing
        # fails. 'U' asks for universal-newline mode (removed in Python 3.11,
        # so this flag needs revisiting before running on newer versions).
        try:
            with open(tax_fp, 'U') as tax_f:
                interest_taxonomy = sequences_from_query(tax_f, query)
        except (PlatypusValueError, PlatypusParseError) as e:
            # NOTE(review): relies on the Platypus exceptions exposing a
            # `message` attribute -- confirm against their definition.
            raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
Exemplo n.º 4
0
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id.

    Raises
    ------
    BadParameter
        If ``sequences_from_query`` raises PlatypusValueError or
        PlatypusParseError while reading the taxonomy file (e.g. the file is
        empty).
        If the query you passed retrieved no results.

    Notes
    -----
    NOTE(review): only the query lookup and its validation are performed by
    this body; ``seqs_fp``, ``output_fp`` and ``split_fp`` go unused, and a
    ``query`` of None makes the call a no-op.
    """

    if query is not None:
        # Look up the sequence identifiers matching the query. Opening the
        # taxonomy file in a `with` block closes it as soon as the lookup is
        # done, including when the lookup raises. 'U' selects universal
        # newlines; that flag was removed in Python 3.11.
        try:
            with open(tax_fp, 'U') as taxonomy_file:
                interest_taxonomy = sequences_from_query(taxonomy_file, query)
        except (PlatypusValueError, PlatypusParseError) as e:
            # NOTE(review): assumes the Platypus exceptions carry a `message`
            # attribute -- verify where they are declared.
            raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
Exemplo n.º 5
0
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp):
    """Split a database in parts that match a query and parts that don't

    The sequences of interest are selected either by querying the taxonomy
    file (when ``query`` is not None) or by reading their identifiers from
    ``split_fp`` (when ``query`` is None). Every record in ``seqs_fp`` is then
    written to ``interest.fna`` or ``rest.fna`` inside ``output_fp``.

    Parameters
    ----------
    tax_fp : str
        Tab-delimited file with two columns, name/identifier of the sequence
        and the taxonomy. The sequence identifier is the longest string before
        a space in the header of the sequence.
    seqs_fp : str
        Path to a FASTA formatted file to split in interest and rest. Note:
        sequence identifiers must match the ones in the taxonomy file.
    query : str
        The query used to split the database, for example: salmonella. The
        query should be an exact match, no wild cards, it can have spaces, and
        it is case insensitive. When None, ``split_fp`` is used instead.
    output_fp : str
        Output folder path where the results are stored.
    split_fp : str
        The tab delimited query file, where each line is a different sequence
        and the first column is the sequence id. Only read when ``query`` is
        None.

    Raises
    ------
    BadParameter
        If querying the taxonomy file raises PlatypusValueError or
        PlatypusParseError (e.g. the taxonomy file is empty).
        If the query you passed retrieved no results.
        If ``split_fp`` contains no sequence identifiers.
    """

    if query is not None:
        # query the taxonomy file for the required sequence identifiers; the
        # handle is closed as soon as the lookup finishes or fails. 'U' is
        # the universal-newlines flag (removed in Python 3.11).
        try:
            with open(tax_fp, 'U') as tax_f:
                interest_taxonomy = sequences_from_query(tax_f, query)
        except (PlatypusValueError, PlatypusParseError) as e:
            # NOTE(review): relies on the Platypus exceptions exposing a
            # `message` attribute -- confirm against their definition.
            raise BadParameter(e.message)

        if not interest_taxonomy:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        # Only membership is tested later, so the values are left empty.
        # Lines whose first column is blank are skipped: otherwise a file
        # holding nothing but whitespace would add an '' key and slip past
        # the emptiness check below.
        with open(split_fp, 'U') as split_f:
            seq_ids = (line.strip().split('\t')[0].strip()
                       for line in split_f)
            interest_taxonomy = {seq_id: '' for seq_id in seq_ids if seq_id}
        if not interest_taxonomy:
            raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    # Both outputs are managed by `with` so they are flushed and closed even
    # if reading the FASTA file or writing a record raises.
    with open(join(output_fp, 'interest.fna'), 'w') as interest_fp:
        with open(join(output_fp, 'rest.fna'), 'w') as rest_fp:
            for record in read(seqs_fp, format='fasta'):
                full_name = record.id
                seq = record.sequence

                # the identifier is whatever precedes the first space
                name = full_name.strip().split(' ')[0].strip()

                if name in interest_taxonomy:
                    interest_fp.write(">%s\n%s\n" % (full_name, seq))
                else:
                    rest_fp.write(">%s\n%s\n" % (full_name, seq))