예제 #1
0
    def setUp(self):
        """ Prepare the encoder fixtures shared by the tests.

        Two sequence-structure encoders are built, one using the dot-bracket
        structure alphabet '().' and one using the annotated structure
        alphabet 'HIMS'. For each encoder a reference (sequence, structure)
        pair is stored together with the result of encoding that pair.

        Note that, despite their names, ``decoded_dot`` and ``decoded_ann``
        hold the output of ``encode`` applied to the reference pair.
        """
        # Fixture for dot-bracket structures.
        dot_encoder = Alphabet_Encoder('ACGU', '().')
        dot_reference = ('ACGUUGCA', '(((..)))')
        self.dot = dot_encoder
        self.ref_dot = dot_reference
        self.decoded_dot = dot_encoder.encode(dot_reference)

        # Fixture for annotated structures.
        ann_encoder = Alphabet_Encoder('ACGU', 'HIMS')
        ann_reference = ('ACGUUGCA', 'SSHHSSIM')
        self.ann = ann_encoder
        self.ref_ann = ann_reference
        self.decoded_ann = ann_encoder.encode(ann_reference)
예제 #2
0
    def __init__(self, class_files, alphabet):
        """ Read and encode the fasta input, then split it 70%/15%/15% into training/validation/test.

        Single-label classification: pass a list of fasta files, one file per class (the
        first file corresponds to 'class_0'). Fasta headers are ignored in this mode.

        Multi-label classification: pass a single fasta file whose headers state the class
        membership as a comma-separated list (e.g. header '>0,2' means that the sequence
        belongs to class 0 and 2).

        Sequence-only files have no format restrictions on their fasta entries. In
        sequence-structure files every sequence and every structure must span exactly one
        line, e.g.:

        '>0,2
        'CCCCAUAGGGG
        '((((...)))) (-3.3)
        'SSSSHHHSSSS

        This format is the default output of RNAfold. The last line, holding the annotated
        structure string, can be omitted if the training should be done on the dot-bracket
        strings (RNAfold does not output the annotated structure string, but a helper
        function to annotate an existing fasta file is provided in the utils file).
        **Important: All sequences in all files must have the same length.**

        The alphabet has to match the content of the fasta files. Provide a single string
        ('ACGT' or 'ACGU') for sequence-only files and a tuple for sequence-structure files
        (('ACGU', 'HIMS') to use the annotated structures or ('ACGU', '().') to use
        dot-bracket structures). Characters that are not part of the provided alphabets
        will be randomly replaced with an alphabet character.

        Parameters
        ----------
        class_files: str or [str]
            A fasta file (multi-label) or a list of fasta files (single-label).

        alphabet: str or tuple(str,str)
            A string for sequence-only files and a tuple for sequence-structure files.
        """
        # A tuple alphabet selects the sequence-structure code path.
        self.is_rna = isinstance(alphabet, tuple)
        if self.is_rna:
            # The combined alphabet built by the encoder is what gets one-hot encoded.
            self.alpha_coder = Alphabet_Encoder(alphabet[0], alphabet[1])
            one_hot_alphabet = self.alpha_coder.alphabet
            load_and_encode = self._load_encode_rna
        else:
            one_hot_alphabet = alphabet
            load_and_encode = self._load_encode_dna

        # Anything that is not a list is treated as one multi-label file.
        self.multilabel = not isinstance(class_files, list)
        if self.multilabel:
            class_files = [class_files]

        self.one_hot_encoder = One_Hot_Encoder(one_hot_alphabet)
        load_and_encode(class_files)
        self._process_labels()
        self.train_val_test_split(0.7, 0.15)
예제 #3
0
    def __init__(self, class_files, alphabet, structure_pwm=False):
        """ Read and encode the fasta input, then split it 70%/15%/15% into training/validation/test.

        Single-label classification: pass a list of fasta files, one file per class (the
        first file corresponds to 'class_0'). Fasta headers are ignored in this mode.

        Multi-label classification: pass a single fasta file whose headers state the class
        membership as a comma-separated list (e.g. header '>0,2' means that the entry
        belongs to class 0 and 2).

        Sequence-only files have no format restrictions on their fasta entries. In
        sequence-structure files every sequence and every structure must span exactly one
        line, e.g.:

        '>header
        'CCCCAUAGGGG
        '((((...))))

        where the second line holds the sequence and the third line the structure.
        **Important: All sequences in all files must have the same length.**

        The alphabet has to match the content of the fasta files. Provide a single string
        (e.g. 'ACGT' or 'ACGU') for sequence-only files and a tuple (e.g. ('ACGU', '().'))
        for sequence-structure files. Characters that are not part of the provided
        alphabets will be randomly replaced with an alphabet character.

        Alphabets may consist of all uppercase alphanumeric characters plus the following
        additional characters: "()[]{}<>,.|*". Arbitrarily defined alphabets can therefore
        be used and (in the sequence-structure case) combined, as long as the data follows
        the fasta format described above. In particular, the package is not restricted to
        RNA secondary structure (this is only an example): structure information for DNA
        or protein data that can be encoded by some alphabet, similar to RNA structure
        information, can be used as well.

        Instead of a single minimum free energy structure (some RNA structure prediction
        tools can output multiple predictions) the structure can also be given as a
        position-weight matrix (matrix entries must be separated by a space or tab):

        '>header
        'GGGGUUCCCC
        '0.9 0.8 0.7 0.9 0.0 0.0 0.0 0.0 0.0 0.0
        '0.0 0.0 0.0 0.0 0.0 0.0 0.2 0.7 0.8 0.9
        '0.1 0.2 0.3 0.1 1.0 1.0 0.8 0.3 0.2 0.1

        With "()." as the structure alphabet, the first matrix line given above corresponds
        to "(", the second to ")" and the third to ".". Each column of the matrix must add
        up to 1. Again, the usage is not restricted to RNA, so the matrix can represent
        whatever you want it to represent, as long as a valid alphabet is provided.

        Parameters
        ----------
        class_files: str or [str]
            A fasta file (multi-label) or a list of fasta files (single-label).

        alphabet: str or tuple(str,str)
            A string for sequence-only files and a tuple for sequence-structure files.

        structure_pwm: bool
            Are structures provided as single strings (False) or as PWMs (True)?

        Raises
        ------
        RuntimeError
            If the loaded entries do not all have the same length.
        """
        self.meta = OrderedDict()

        # A tuple alphabet selects the sequence-structure code path; the PWM
        # flag is only honoured in that case and stays False otherwise.
        self.is_rna = isinstance(alphabet, tuple)
        self.is_rna_pwm = structure_pwm if self.is_rna else False
        if self.is_rna:
            # The combined alphabet built by the encoder is what gets one-hot encoded.
            self.alpha_coder = Alphabet_Encoder(alphabet[0], alphabet[1])
            one_hot_alphabet = self.alpha_coder.alphabet
            load_and_encode = self._load_encode_rna
        else:
            one_hot_alphabet = alphabet
            load_and_encode = self._load_encode_dna

        # Anything that is not a list is treated as one multi-label file.
        self.multilabel = not isinstance(class_files, list)
        if self.multilabel:
            class_files = [class_files]

        self.one_hot_encoder = One_Hot_Encoder(one_hot_alphabet)
        load_and_encode(class_files)

        # Every loaded entry must match the first entry along its first axis.
        expected_length = self.data[0].shape[0]
        if any(self.data[idx].shape[0] != expected_length
               for idx in range(1, len(self.data))):
            raise RuntimeError('All sequences must have the same length.')

        self._process_labels()
        self.train_val_test_split(0.7, 0.15)