Пример #1
0
def test_return_list():
    """read_fasta returns a dict by default and a list when return_list=True."""
    data_dir = protfasta._get_data('test_data')
    duplicate_record = '%s/testset_duplicate.fasta' % (data_dir)
    simple_filename = '%s/testset_1.fasta' % (data_dir)

    # default container is a dictionary
    as_dict = protfasta.read_fasta(simple_filename,
                                   duplicate_sequence_action='fail')
    assert type(as_dict) == dict

    # return_list=True switches the container to a list
    as_list = protfasta.read_fasta(simple_filename,
                                   duplicate_sequence_action='fail',
                                   return_list=True)
    assert type(as_list) == list

    # A list can hold the two identical records of this file side by side
    # (a dictionary would collapse them onto one key): 'ignore' keeps all
    # three entries, 'remove' drops the repeated one.
    for action, expected_count in (('ignore', 3), ('remove', 2)):
        records = protfasta.read_fasta(duplicate_record,
                                       duplicate_record_action=action,
                                       return_list=True,
                                       expect_unique_header=False)
        assert len(records) == expected_count
Пример #2
0
def test_duplicate_sequence_action():
    """Exercise every duplicate_sequence_action mode of read_fasta."""

    data_dir = protfasta._get_data('test_data')
    dup_seq_path = '%s/testset_duplicate_seqs.fasta' % (data_dir)
    simple_path = '%s/testset_1.fasta' % (data_dir)

    # the simple file parses under 'fail' and yields all nine records
    parsed = protfasta.read_fasta(simple_path,
                                  duplicate_sequence_action='fail')
    assert len(parsed) == 9

    # 'fail' must raise for the file with duplicate sequences
    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(dup_seq_path,
                                    duplicate_sequence_action='fail')

    # 'ignore' keeps every record, duplicates included
    parsed = protfasta.read_fasta(dup_seq_path,
                                  duplicate_sequence_action='ignore')
    assert len(parsed) == 3

    # 'remove' drops the duplicated sequence
    parsed = protfasta.read_fasta(dup_seq_path,
                                  duplicate_sequence_action='remove',
                                  verbose=True)
    assert len(parsed) == 2

    # only the sequences are duplicated, not whole records, so removing
    # duplicate *records* leaves all three entries
    parsed = protfasta.read_fasta(dup_seq_path,
                                  duplicate_record_action='remove',
                                  verbose=True)
    assert len(parsed) == 3
Пример #3
0
def test_read_fasta_standard():
    """A plain read_fasta call on the simple test file returns nine records."""

    simple_filename = '%s/testset_1.fasta' % (
        protfasta._get_data('test_data'))

    records = protfasta.read_fasta(simple_filename)
    assert len(records) == 9

    # the reference entry must be stored under its header with the
    # expected sequence
    reference = test_data['test1']
    assert records[reference[0]] == reference[1]
Пример #4
0
def test_expect_unique_header_toggle():
    """expect_unique_header accepts both booleans and rejects anything else."""

    data_dir = protfasta._get_data('test_data')
    simple_filename = '%s/testset_1.fasta' % (data_dir)

    # either boolean gives the same result on a file with unique headers
    for flag in (False, True):
        records = protfasta.read_fasta(simple_filename,
                                       expect_unique_header=flag)
        assert len(records) == 9
        assert records[test_data['test1'][0]] == test_data['test1'][1]

    # non-bool values (a string, then an int) must be rejected
    for not_a_bool in ('dog', 1):
        with pytest.raises(ProtfastaException):
            assert protfasta.read_fasta(simple_filename,
                                        expect_unique_header=not_a_bool)
Пример #5
0
def test_duplicate_record_action():
    """Exercise duplicate_record_action with and without unique headers."""

    data_dir = protfasta._get_data('test_data')
    dup_record_path = '%s/testset_duplicate.fasta' % (data_dir)
    simple_path = '%s/testset_1.fasta' % (data_dir)

    # a file without duplicate records parses under 'fail'
    parsed = protfasta.read_fasta(simple_path, duplicate_record_action='fail')
    assert len(parsed) == 9

    # With the implicit expect_unique_header=True every action raises on the
    # duplicate file: 'fail' because of the duplicates themselves, 'ignore'
    # and 'remove' because unique headers are still expected.
    for action in ('fail', 'ignore', 'remove'):
        with pytest.raises(ProtfastaException):
            assert protfasta.read_fasta(dup_record_path,
                                        duplicate_record_action=action)

    # once unique headers are no longer required, 'remove' drops the repeat
    deduplicated = protfasta.read_fasta(dup_record_path,
                                        duplicate_record_action='remove',
                                        expect_unique_header=False)
    assert len(deduplicated) == 2

    # 'ignore' with a list return keeps all three records
    everything = protfasta.read_fasta(dup_record_path,
                                      duplicate_record_action='ignore',
                                      expect_unique_header=False,
                                      return_list=True)
    assert len(everything) == 3

    # Not ideal, but 'ignore' with a dictionary return collapses the
    # duplicate onto a single key, leaving two entries.
    collapsed = protfasta.read_fasta(dup_record_path,
                                     duplicate_record_action='ignore',
                                     expect_unique_header=False)
    assert len(collapsed) == 2

    # so for a dictionary return 'ignore' and 'remove' give the same count
    deduplicated = protfasta.read_fasta(dup_record_path,
                                        duplicate_record_action='remove',
                                        expect_unique_header=False)
    assert len(deduplicated) == 2
Пример #6
0
def test_header_parser():
    """header_parser rewrites headers; invalid parsers are rejected."""

    data_dir = protfasta._get_data('test_data')
    simple_filename = '%s/testset_1.fasta' % (data_dir)

    def first_ten(header):
        # keep only the first ten characters of the header
        return header[0:10]

    def constant_header(header):
        # maps every header onto the same string
        return "asas"

    def takes_no_argument():
        # wrong signature: accepts no header
        return "asas"

    def returns_int(header):
        # wrong return type: not a string
        return 1

    parsed = protfasta.read_fasta(simple_filename, header_parser=first_ten)
    assert len(parsed) == 9
    assert parsed[test_data['test1'][0][0:10]] == test_data['test1'][1]

    # every header becomes identical, so a dictionary return keeps one entry
    collapsed = protfasta.read_fasta(simple_filename,
                                     header_parser=constant_header,
                                     duplicate_sequence_action='ignore',
                                     expect_unique_header=False)
    assert len(collapsed) == 1

    # a list return avoids the overwriting and keeps all nine records
    as_list = protfasta.read_fasta(simple_filename,
                                   header_parser=constant_header,
                                   duplicate_sequence_action='ignore',
                                   expect_unique_header=False,
                                   return_list=True)
    assert len(as_list) == 9

    # with default settings the identical headers must raise
    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(simple_filename,
                                    header_parser=constant_header)

    # parsers with a bad signature or a non-string return must raise
    for broken_parser in (takes_no_argument, returns_int):
        with pytest.raises(ProtfastaException):
            assert protfasta.read_fasta(simple_filename,
                                        header_parser=broken_parser)
Пример #7
0
import os
import pytest
import protfasta

import metapredict as meta
from metapredict.meta import MetapredictError

# The FASTA path is built from the current working directory, so the tests
# must be launched from the directory that contains ``input_data/``.
current_filepath = os.getcwd()
fasta_filepath = "{}/input_data/testing.fasta".format(current_filepath)
# Sequence stored under the 'Q8N6T3' header; read once at import time and
# used as the input for the plotting test in this module.
test_sequence = protfasta.read_fasta(fasta_filepath)['Q8N6T3']


def test_graph_disorder_png():
    """Call meta.graph_disorder on the module-level test sequence with several option sets.

    Output paths are relative (``output/...``). Only the first call is
    followed by a check that the file exists; the later calls just have to
    run without raising.
    """

    # can make PNGs
    fn = 'demo1.png'
    full_fn = 'output/%s' % (fn)
    meta.graph_disorder(test_sequence, output_file=full_fn)
    assert os.path.isfile(full_fn) is True

    # same plot with a user-supplied title
    full_fn = 'output/demo1_custom_title.png'
    meta.graph_disorder(test_sequence,
                        output_file=full_fn,
                        title='Custom title')

    # same plot with an explicit disorder threshold
    full_fn = 'output/demo1_disorder_thresh0p5.png'
    meta.graph_disorder(test_sequence,
                        output_file=full_fn,
                        disorder_threshold=0.5)

    # NOTE(review): this path is assigned but never used in the visible code;
    # the test looks truncated here - confirm against the full file.
    full_fn = 'output/demo1_shaded_1_20.png'
Пример #8
0
def shephard_fasta_to_proteome(filename,
                               proteome=None,
                               force_overwrite=False,
                               invalid_sequence_action='fail'):
    """
    Stand alone function that allows the user to build a proteome from a FASTA
    file generated by SHEPHARD (using the proteome_to_fasta() function).

    Every FASTA header in the file is parsed as follows:

        * the header is split on ``|``
        * the first field must be the literal string ``SHPRD``
        * the second field is used as the unique_ID
        * everything after the second field (re-joined with ``|``) is split on
          ``SHEPHARD_ATTRIBUTE_SPLITTER``; the part before the splitter is the
          protein name and, if present, the part after it is a tab-separated
          list of ``key=value`` attribute pairs.

    Parameters
    ------------

    filename : string
        Name of the FASTA file we're going to parse in.

    proteome : Proteome
        If a Proteome object is provided the FASTA file will be read and added to the existing
        proteome, whereas if set to None a new Proteome will be generated.

    force_overwrite : bool
        [**Default = False**] Flag that if set to true and we encounter a unique_ID that is already in the proteome
        the newer value overwrites the older one without prejudice. This is mostly useful if you are adding in a file
        with known duplicate entries OR combining multiple FASTA files where you know there's some duplications. The
        flag is passed unchanged to ``Proteome(...)`` / ``proteome.add_proteins(...)``.

    invalid_sequence_action : ``'ignore'``, ``'fail'``, ``'remove'``, ``'convert'``, ``'convert-ignore'``
        [**Default = 'fail'**] Selector that determines how to deal with invalid sequences that contain invalid/non-standard
        amino acids. The value is passed unchanged to ``protfasta.read_fasta``; no custom correction dictionary is
        supplied by this function.

        Options are as follows:
            * ``ignore``  - invalid sequences are completely ignored
            * ``fail``    - invalid sequence cause parsing to fail and throw an exception
            * ``remove`` - invalid sequences are removed
            * ``convert`` - invalid sequences are converted
            * ``convert-ignore`` - invalid sequences are converted to valid sequences and any remaining invalid residues are ignored

    Returns
    --------
    Proteome Object
        The Proteome passed in (after the new proteins were added) or, if no
        Proteome was passed, a newly built one.

    Raises
    --------
    APIException
        If a header does not start with ``SHPRD`` or has no unique_ID field.

    """

    # read in the fasta file using protfasta
    fasta_dictionary = protfasta.read_fasta(
        filename, invalid_sequence_action=invalid_sequence_action)

    # initialize the empty list
    proteome_list = []

    # for each entry
    for k in fasta_dictionary:

        # because we know what the header format will be we can be definitive about extracting the relevant information
        fasta_split = k.split('|')

        # ENSURE EVERY single header carries the SHEPHARD marker
        if fasta_split[0] != "SHPRD":
            raise APIException(
                'Trying to parse a FASTA file that is expected to be SHEPHARD generated but formatting does not comply [on entry %s in file %s]'
                % (k, filename))

        # get the unique ID; a header without any '|' has no second field
        try:
            unique_ID = fasta_split[1]
        except IndexError:
            raise APIException(
                'Trying to parse a FASTA file that is expected to be SHEPHARD generated but formatting does not comply [on entry %s in file %s]'
                % (k, filename))

        # then take everything after the unique_ID and separate the name from
        # the (optional) attribute section
        tmp = "|".join(fasta_split[2:])
        attributes_string = tmp.split(SHEPHARD_ATTRIBUTE_SPLITTER)
        name = attributes_string[0]

        attributes_dict = {}

        if len(attributes_string) > 1:
            for fragment in attributes_string[1].split('\t'):
                fragment = fragment.strip()

                # an empty attribute section or a stray tab would otherwise
                # create a meaningless '' -> '' attribute
                if not fragment:
                    continue

                # split on the FIRST '=' only, so the value is what follows
                # the key (the key used to be stored as its own value) and
                # may itself contain '='
                local_k, _, local_v = fragment.partition('=')
                attributes_dict[local_k.strip()] = local_v.strip()

        # now create a protein dictionary object and populate!
        proteome_list.append({
            'sequence': str(fasta_dictionary[k]),
            'name': name,
            'unique_ID': unique_ID,
            'attributes': attributes_dict,
        })

    # finally if a proteome was provided then extend it in place
    if proteome is not None:
        proteome.add_proteins(proteome_list, force_overwrite=force_overwrite)
        return proteome

    # no proteome provided so build a new proteome and return it
    return Proteome(proteome_list, force_overwrite=force_overwrite)
Пример #9
0
def fasta_to_proteome(filename,
                      proteome=None,
                      build_unique_ID=None,
                      build_attributes=None,
                      use_header_as_unique_ID=False,
                      force_overwrite=False,
                      invalid_sequence_action='fail'):
    """
    Stand alone function that allows the user to build a Proteome from a standard
    FASTA file, or add sequences in a FASTA file to an existing Proteome.

    The input filename must be a FASTA file without duplicate headers. If the file
    has duplicate headers and these have to be further processed we suggest using
    the protfasta (https://protfasta.readthedocs.io/) package to parse through the
    FASTA file first creating a sanitized input FASTA.

    Each protein in a Proteome must have a unique_ID associated with it. There
    are three ways this function can generate a unique ID:

        1. By passing the FASTA header through the user-supplied
           ``build_unique_ID`` function.

        2. By using the FASTA header itself (``use_header_as_unique_ID=True``).

        3. By using an automatically incremented integer. This is what happens
           when neither of the two options above is requested.

    Parameters
    ------------

    filename : string
        Name of the FASTA file we're going to parse in. The protein name is always
        set to the full FASTA header of each entry.

    proteome : Proteome
        If a Proteome object is provided the FASTA file will be read and added to the existing
        proteome, whereas if set to None a new Proteome will be generated.

    build_unique_ID : function
        [**Default = None**] ``build_unique_ID`` allows a user-defined function that is
        used to convert the FASTA header to a (hopefully) unique string. This can be
        useful if the FASTA header is well structured and includes a specific, useful
        unique string that can be used as the unique_ID.

    build_attributes : function
        [**Default = None**] ``build_attributes`` allows a user-defined function that allows meta-information
        from the FASTA header to be converted into protein attributes. Specifically, build_attributes
        should be a function which takes in the FASTA header as a string and returns a dictionary where
        key:value pairs are assigned as protein attributes. This can be useful if the FASTA header is well
        structured and includes specific, useful information relevant to the protein of interest.

    use_header_as_unique_ID : bool
        [**Default = False**] ``use_header_as_unique_ID`` is a boolean flag which, if set to True,
        means the unique_ID is set to the FASTA file header. NOTE that the combination of this parameter being
        set to True and ``build_unique_ID`` not being None will trigger an exception as this means
        there are two conflicting definitions of how the unique_ID should be defined.

    force_overwrite : bool
        [**Default = False**] Flag that if set to true and we encounter a unique_ID that is already in the proteome
        the newer value overwrites the older one without prejudice. This is mostly useful if you are adding in a file
        with known duplicate entries OR combining multiple FASTA files where you know there's some duplications. The
        flag is passed unchanged to ``Proteome(...)`` / ``proteome.add_proteins(...)``.

    invalid_sequence_action : ``'ignore'``, ``'fail'``, ``'remove'``, ``'convert'``, ``'convert-ignore'``
        [**Default = 'fail'**] Selector that determines how to deal with invalid sequences. The value is passed
        unchanged to ``protfasta.read_fasta``; no custom correction dictionary is supplied by this function.
        Options are as follows:
            * ``ignore``  - invalid sequences are completely ignored

            * ``fail``    - invalid sequence cause parsing to fail and throw an exception

            * ``remove`` - invalid sequences are removed

            * ``convert`` - invalid sequences are converted

            * ``convert-ignore`` - invalid sequences are converted to valid sequences and any remaining invalid residues are ignored

    Returns
    --------
    Proteome
        The Proteome passed in (after the new proteins were added) or, if no
        Proteome was passed, a newly built one.

    Raises
    --------
    APIException
        If ``use_header_as_unique_ID`` is True while ``build_unique_ID`` is
        also provided.

    """

    # parameter sanity checking
    if use_header_as_unique_ID is True and build_unique_ID is not None:
        raise APIException(
            'Cannot simultaneously set use_header_as_unique_ID = True and build_unique_ID to not None'
        )

    # read in the fasta file using protfasta
    fasta_dictionary = protfasta.read_fasta(
        filename, invalid_sequence_action=invalid_sequence_action)

    # The numeric record_index only becomes a unique_ID when neither of the
    # header-based options is active. This mirrors the if/elif/else in the
    # loop below. (use_header_as_unique_ID is a bool flag, so comparing it
    # against None - as was done previously - never matched.)
    uses_record_index = (not build_unique_ID
                         and use_header_as_unique_ID is not True)

    record_index = 0

    # IF we're adding to an existing proteome, start numbering after the largest
    # integer-like unique_ID already present, so that adding several FASTA files
    # in succession yields numerically contiguous unique_IDs.
    if proteome is not None and uses_record_index:
        numeric_record_ids = []
        for uid in proteome.proteins:
            try:
                numeric_record_ids.append(int(uid))
            except ValueError:
                # non-numeric unique_IDs cannot clash with the counter
                pass
        if numeric_record_ids:
            record_index = max(numeric_record_ids) + 1

    # initialize the empty list
    proteome_list = []

    # for each entry (header -> sequence)
    for header, sequence in fasta_dictionary.items():

        # get unique_ID
        if build_unique_ID:
            unique_ID = build_unique_ID(header)
        elif use_header_as_unique_ID is True:
            unique_ID = header
        else:
            unique_ID = record_index

        # build an attributes dictionary using the user-provided custom function
        if build_attributes:
            attributes = build_attributes(header)
        else:
            attributes = {}

        # now create an input dictionary object
        proteome_list.append({
            'sequence': str(sequence),
            'name': header,
            'unique_ID': unique_ID,
            'attributes': attributes,
        })

        # advanced for every record, whether or not the index was used
        record_index = record_index + 1

    # finally if a proteome was provided then extend it in place
    if proteome is not None:
        proteome.add_proteins(proteome_list, force_overwrite=force_overwrite)
        return proteome

    # no proteome provided so build a new proteome and return it
    return Proteome(proteome_list, force_overwrite=force_overwrite)
Пример #10
0
def test_alignment_files():
    """Check how the alignment flag interacts with invalid_sequence_action."""
    data_dir = protfasta._get_data('test_data')

    all_valid = '%s/aligned_seq_all_valid.fasta' % (data_dir)
    convertable = '%s/aligned_seq_all_valid_convertable.fasta' % (data_dir)
    unconvertable = '%s/aligned_seq_all_valid_unconvertable.fasta' % (
        data_dir)

    # with alignment=True the dashes are kept in the sequence
    aligned = protfasta.read_fasta(all_valid, alignment=True)
    assert aligned['Seq1'] == 'A-----CDEFGHIKLMNPQRSTVWY'

    # by default dashes are invalid and invalid sequences cause a failure
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(all_valid)

    # alignment must be a bool
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(all_valid, alignment=1)

    # the convertable file fails unless conversion is requested
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(convertable, alignment=True)

    aligned = protfasta.read_fasta(convertable,
                                   alignment=True,
                                   invalid_sequence_action='convert')
    assert aligned['Seq1'] == 'A-----CDEFGHIKLMNPQRSTVWY'

    # the unconvertable file fails with 'convert' and with the default action
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(unconvertable,
                             alignment=True,
                             invalid_sequence_action='convert')

    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(unconvertable, alignment=True)

    # 'convert-ignore' lets the unconvertable file through
    aligned = protfasta.read_fasta(unconvertable,
                                   alignment=True,
                                   invalid_sequence_action='convert-ignore')
    assert aligned['Seq2'] == 'ACDEFGHIKL-----MNPQRSTVWYN'

    # 'remove' empties the unconvertable file ...
    remaining = protfasta.read_fasta(unconvertable,
                                     alignment=True,
                                     invalid_sequence_action='remove')
    assert len(remaining) == 0

    # ... keeps all three records of the valid alignment ...
    remaining = protfasta.read_fasta(all_valid,
                                     alignment=True,
                                     invalid_sequence_action='remove')
    assert len(remaining) == 3

    # ... and empties the valid alignment too when alignment is not enabled
    remaining = protfasta.read_fasta(all_valid,
                                     invalid_sequence_action='remove')
    assert len(remaining) == 0
Пример #11
0
def test_sequences_with_bad_chars():
    """Exercise invalid_sequence_action and correction_dictionary on bad input."""
    data_dir = protfasta._get_data('test_data')
    badchar_filename = '%s/test_data_with_bad_chars.fa' % (data_dir)
    nonstandard_filename = '%s/test_data_with_nonstandard_chars.fa' % (
        data_dir)

    # both files fail with the default action ...
    for path in (badchar_filename, nonstandard_filename):
        with pytest.raises(ProtfastaException):
            protfasta.read_fasta(path)

    # ... and when 'fail' is passed explicitly
    for path in (badchar_filename, nonstandard_filename):
        with pytest.raises(ProtfastaException):
            protfasta.read_fasta(path, invalid_sequence_action='fail')

    # 'ignore' reads both files whether or not the characters are convertable
    for path in (nonstandard_filename, badchar_filename):
        ignored = protfasta.read_fasta(path, invalid_sequence_action='ignore')
        assert len(ignored) == 4

    # non-standard characters can be converted
    converted = protfasta.read_fasta(nonstandard_filename,
                                     invalid_sequence_action='convert')
    assert len(converted) == 4

    # invalid characters cannot be converted
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(badchar_filename,
                             invalid_sequence_action='convert')

    # 'convert-ignore' works both when the ignore step is not needed
    # (non-standard file) and when it is needed (bad-character file)
    for path in (nonstandard_filename, badchar_filename):
        converted = protfasta.read_fasta(
            path, invalid_sequence_action='convert-ignore')
        assert len(converted) == 4

    # 'remove' drops every sequence from both files
    for path in (nonstandard_filename, badchar_filename):
        kept = protfasta.read_fasta(path, invalid_sequence_action='remove')
        assert len(kept) == 0

    #CD = {'-': '', '.': 'A', 'X':'Y'}
    CD = {'.': 'A'}

    # fails because no conversion has been requested
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(nonstandard_filename, correction_dictionary=CD)

    # fails because CD replaces the default dictionary
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(nonstandard_filename,
                             correction_dictionary=CD,
                             invalid_sequence_action='convert')

    # fails because CD does not cover every character that must be corrected
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(badchar_filename,
                             correction_dictionary=CD,
                             invalid_sequence_action='convert')

    # a dictionary covering both '.' and '-' converts the file without raising
    CD = {'.': 'A', '-': 'C'}
    protfasta.read_fasta(badchar_filename,
                         correction_dictionary=CD,
                         invalid_sequence_action='convert')

    # the partial dictionary is enough once leftovers may be ignored
    CD = {'.': 'A'}
    protfasta.read_fasta(badchar_filename,
                         correction_dictionary=CD,
                         invalid_sequence_action='convert-ignore')