def setUp(self):
    """Create the encoder fixtures used by the tests.

    Two ``Alphabet_Encoder`` objects are built over the sequence alphabet
    'ACGU': one combined with the dot-bracket alphabet '().' and one
    combined with the annotation alphabet 'HIMS'. For each encoder a
    reference (sequence, structure) tuple is stored, together with the
    value returned by passing that tuple to the encoder's ``encode``.

    Note: the ``decoded_*`` attributes hold the output of ``encode``,
    despite what their names suggest.
    """
    sequence_alphabet = 'ACGU'
    sequence = 'ACGUUGCA'

    # Fixture for sequence + dot-bracket structure.
    dot_encoder = Alphabet_Encoder(sequence_alphabet, '().')
    dot_reference = (sequence, '(((..)))')
    self.dot = dot_encoder
    self.ref_dot = dot_reference
    self.decoded_dot = dot_encoder.encode(dot_reference)

    # Fixture for sequence + annotated structure.
    ann_encoder = Alphabet_Encoder(sequence_alphabet, 'HIMS')
    ann_reference = (sequence, 'SSHHSSIM')
    self.ann = ann_encoder
    self.ref_ann = ann_reference
    self.decoded_ann = ann_encoder.encode(ann_reference)
def __init__(self, class_files, alphabet):
    """Load and encode the sequences, then split them 70%/15%/15%.

    The data is split into training, validation and test sets using the
    ratios 70%/15%/15%.

    Single-label vs. multi-label classification
        * Single-label: pass a *list* of fasta files, one file per class
          (the first file corresponds to 'class_0'). Fasta headers are
          ignored in this mode.
        * Multi-label: pass a *single* fasta file. Each header must list
          the classes of its entry as comma-separated values, e.g. the
          header '>0,2' marks a sequence belonging to class 0 and 2.

    File format
        Sequence-only files have no format restrictions on the fasta
        entries. In sequence-structure files every sequence and every
        structure must occupy exactly one line, e.g.::

            >0,2
            CCCCAUAGGGG
            ((((...)))) (-3.3)
            SSSSHHHSSSS

        This layout is the default output of RNAfold. The last line
        (the annotated structure string) may be left out when training
        on the dot-bracket strings. RNAfold does not produce the
        annotated structure string; a helper function for annotating an
        existing fasta file is provided in the utils file.

        **Important: All sequences in all files must have the same
        length.**

    Alphabet
        The alphabet must match the content of the fasta files. Use a
        single string ('ACGT' or 'ACGU') for sequence-only files and a
        tuple for sequence-structure files: ('ACGU', 'HIMS') for the
        annotated structures or ('ACGU', '().') for dot-bracket
        structures. Characters that are not part of the provided
        alphabets will be randomly replaced with an alphabet character.

    Parameters
    ----------
    class_files: str or [str]
        A fasta file (multi-label) or a list of fasta files
        (single-label).
    alphabet: str or tuple(str,str)
        A string for sequence-only files and a tuple for
        sequence-structure files.
    """
    # A tuple alphabet selects the sequence-structure code path.
    self.is_rna = isinstance(alphabet, tuple)
    if self.is_rna:
        self.alpha_coder = Alphabet_Encoder(alphabet[0], alphabet[1])
        # From here on, work with the alphabet exposed by the encoder.
        alphabet = self.alpha_coder.alphabet
        load_and_encode = self._load_encode_rna
    else:
        load_and_encode = self._load_encode_dna

    # Anything that is not a list is treated as one multi-label file.
    self.multilabel = not isinstance(class_files, list)
    if self.multilabel:
        class_files = [class_files]

    self.one_hot_encoder = One_Hot_Encoder(alphabet)
    load_and_encode(class_files)
    self._process_labels()
    self.train_val_test_split(0.7, 0.15)
def __init__(self, class_files, alphabet, structure_pwm=False):
    """Load and encode the sequences, then split them 70%/15%/15%.

    The data is split into training, validation and test sets using the
    ratios 70%/15%/15%.

    Single-label vs. multi-label classification
        * Single-label: pass a *list* of fasta files, one file per class
          (the first file corresponds to 'class_0'). Fasta headers are
          ignored in this mode.
        * Multi-label: pass a *single* fasta file. Each header must list
          the classes of its entry as comma-separated values, e.g. the
          header '>0,2' marks an entry belonging to class 0 and 2.

    File format
        Sequence-only files have no format restrictions on the fasta
        entries. In sequence-structure files every sequence and every
        structure must occupy exactly one line; the second line holds
        the sequence and the third line the structure, e.g.::

            >header
            CCCCAUAGGGG
            ((((...))))

        **Important: All sequences in all files must have the same
        length.**

    Alphabet
        The alphabet must match the content of the fasta files. Use a
        single string (e.g. 'ACGT' or 'ACGU') for sequence-only files
        and a tuple (e.g. ('ACGU', '().')) for sequence-structure files.
        Characters that are not part of the provided alphabets will be
        randomly replaced with an alphabet character.

        All uppercase alphanumeric characters are supported, plus the
        additional characters "()[]{}<>,.|*". Arbitrarily defined
        alphabets can therefore be used and (in the sequence-structure
        case) combined, as long as the data follows the fasta format
        described here. In particular, the package is not restricted to
        RNA secondary structure (that is only an example): structure
        information for DNA or protein data that can be encoded by some
        alphabet can be used in the same way.

    Structures as position-weight matrices
        Some RNA structure prediction tools can output multiple
        predictions. Instead of a single minimum free energy structure
        string, a position-weight matrix representing the structure can
        be given (entries separated by a space or tab)::

            >header
            GGGGUUCCCC
            0.9 0.8 0.7 0.9 0.0 0.0 0.0 0.0 0.0 0.0
            0.0 0.0 0.0 0.0 0.0 0.0 0.2 0.7 0.8 0.9
            0.1 0.2 0.3 0.1 1.0 1.0 0.8 0.3 0.2 0.1

        With "()." as the structure alphabet, the first matrix line
        corresponds to "(", the second to ")" and the third to ".".
        Each column of the matrix must add up to 1. As before, the
        matrix is not tied to RNA and may represent anything, provided
        a valid alphabet is supplied.

    Parameters
    ----------
    class_files: str or [str]
        A fasta file (multi-label) or a list of fasta files
        (single-label).
    alphabet: str or tuple(str,str)
        A string for sequence-only files and a tuple for
        sequence-structure files.
    structure_pwm: bool
        Are structures provided as single strings (False) or as PWMs
        (True)?

    Raises
    ------
    RuntimeError
        If the loaded entries in ``self.data`` do not all have the same
        length along their first dimension.
    """
    self.meta = OrderedDict()
    # PWM mode only applies to sequence-structure input; off by default.
    self.is_rna_pwm = False

    # A tuple alphabet selects the sequence-structure code path.
    self.is_rna = isinstance(alphabet, tuple)
    if self.is_rna:
        self.is_rna_pwm = structure_pwm
        self.alpha_coder = Alphabet_Encoder(alphabet[0], alphabet[1])
        # From here on, work with the alphabet exposed by the encoder.
        alphabet = self.alpha_coder.alphabet
        load_and_encode = self._load_encode_rna
    else:
        load_and_encode = self._load_encode_dna

    # Anything that is not a list is treated as one multi-label file.
    self.multilabel = not isinstance(class_files, list)
    if self.multilabel:
        class_files = [class_files]

    self.one_hot_encoder = One_Hot_Encoder(alphabet)
    load_and_encode(class_files)

    # Every entry must match the first entry's length. Entries are read
    # by index so the access pattern on self.data stays the same.
    expected_length = self.data[0].shape[0]
    has_mismatch = any(
        self.data[idx].shape[0] != expected_length
        for idx in range(1, len(self.data))
    )
    if has_mismatch:
        raise RuntimeError('All sequences must have the same length.')

    self._process_labels()
    self.train_val_test_split(0.7, 0.15)