def test_split_fasta(self):
    """Check the files written by `split_fasta` against the references.

    The function is run both without and with a file name prefix, and with
    the input supplied either as the result of `extract_sequences` or as a
    file path. After every run the names and the contents of the `*.fasta`
    files found in the working directory are compared with
    `self.split_1`, `self.split_2` and `self.split_3`.
    """
    # Observed paths are sorted below, so sort the references by file name
    # as well; otherwise the pairwise content comparison depends on the
    # reference attributes happening to be listed in sorted order.
    exp_paths = sorted([self.split_1, self.split_2, self.split_3],
                       key=basename)

    for prefix in [None, 'dupa']:
        for inp_faa in [extract_sequences(self.input_faa), self.input_faa]:
            split_fasta(inp_faa, prefix=prefix, outdir=self.working_dir)

            if prefix is not None:
                exp_fnames = sorted('%s_%s' % (prefix, basename(fp))
                                    for fp in exp_paths)
            else:
                exp_fnames = sorted(basename(fp) for fp in exp_paths)

            obs_paths = sorted(glob(self.working_dir+'/*.fasta'))
            try:
                obs_fnames = list(map(basename, obs_paths))
                self.assertListEqual(obs_fnames, exp_fnames)

                for exp_fp, obs_fp in zip(exp_paths, obs_paths):
                    # context managers close the handles deterministically
                    with open(obs_fp, 'r') as f:
                        obs = f.read()
                    with open(exp_fp, 'r') as f:
                        exp = f.read()
                    self.assertEqual(obs, exp)
            finally:
                # remove the generated fastas even when an assertion
                # failed, so nothing is left behind in the working dir
                for fasta in obs_paths:
                    remove(fasta)
def write_db(fname, step=None, version=1, db_fp='/tmp/protein_db'):
    """Append protein name and other information into the sequence DB

    For every sequence extracted from `fname` one header line is appended
    to `<db_fp>.index` and one FASTA record (sequence wrapped at 70
    characters) is appended to `db_fp`.

    Parameters
    ----------
    fname : str
        fasta file file path
    step : str
        processing step information (e.g. PDB, CM)
    version : int
        processing version
    db_fp : str
        output database information. sequence with header will be appended
        to `db_fp` and header with processing information to
        `db_fp.index`
    """
    prots = process_fasta.extract_sequences(fname)
    if not prots:
        # nothing to record: leave the database files untouched
        return

    # The MSA size depends only on `fname`, so compute it once instead of
    # once per sequence.
    msa_size_len = msa_size(fname)

    # Open both outputs once for the whole batch rather than per record.
    with open('%s.index' % db_fp, 'a') as index_fh, \
            open(db_fp, 'a') as seq_fh:
        for prot in prots:
            prot_name = prot.metadata['id']
            # drop the fractional seconds from the timestamp
            timestamp = str(datetime.now()).split('.')[0]

            # > protein_name # source # msa_size # commit_no # timestamp
            index_fh.write('>%s # %s # %i # %i # %s\n' % (
                prot_name, step, msa_size_len, version, timestamp))

            seq_fh.write('>%s\n%s\n' % (
                prot_name, textwrap.fill(str(prot[:]), 70)))
def write_db(fname, step=None, version=1, db_fp='/tmp/protein_db'):
    """Append protein name and other information into the sequence DB

    For every sequence extracted from `fname` one header line is appended
    to `<db_fp>.index` and one FASTA record (sequence wrapped at 70
    characters) is appended to `db_fp`.

    Parameters
    ----------
    fname : str
        fasta file file path
    step : str
        processing step information (e.g. PDB, CM)
    version : int
        processing version
    db_fp : str
        output database information. sequence with header will be appended
        to `db_fp` and header with processing information to
        `db_fp.index`
    """
    prots = process_fasta.extract_sequences(fname)
    if not prots:
        # nothing to record: leave the database files untouched
        return

    # The MSA size depends only on `fname`, so compute it once instead of
    # once per sequence.
    msa_size_len = msa_size(fname)

    # Open both outputs once for the whole batch rather than per record.
    with open('%s.index' % db_fp, 'a') as index_fh, \
            open(db_fp, 'a') as seq_fh:
        for prot in prots:
            prot_name = prot.metadata['id']
            # drop the fractional seconds from the timestamp
            timestamp = str(datetime.now()).split('.')[0]

            # > protein_name # source # msa_size # commit_no # timestamp
            index_fh.write('>%s # %s # %i # %i # %s\n' % (
                prot_name, step, msa_size_len, version, timestamp))

            seq_fh.write('>%s\n%s\n' % (
                prot_name, textwrap.fill(str(prot[:]), 70)))
def test_write_sequences(self):
    """Write the first input sequence to a file and check the file lines.

    The expected output is a header line made of the sequence id and
    description, followed by the sequence itself.
    """
    first = self.seqs[0]
    header = '>%s %s' % (first.metadata['id'],
                         first.metadata['description'])
    expected_lines = [header, str(first)]

    target = join(self.working_dir, 'output.faa')
    write_sequences(extract_sequences(self.input_faa, identifiers='1'),
                    target)

    with open(target, 'r') as handle:
        content = handle.read()
    # the file is no longer needed once its content has been read
    remove(target)

    observed_lines = content.splitlines()
    self.assertListEqual(observed_lines, expected_lines)
def parse_inputs(inp_fp=None, inp_from=None, inp_to=None, microprot_inp=None,
                 microprot_out=None):
    """ Parse multi-sequence FASTA file into single-sequence, remove any
    problematic characters from the name and add information to
    `processed_sequences.fasta` file

    The characters `/`, `\\` and `|` in a sequence id are replaced with
    `_` before the id is used as a file name.

    Parameters
    ----------
    inp_fp : str
        file path to a multi-sequence FASTA file
    inp_from : int
        number of the first sequence in the input file
    inp_to : int
        number of the last sequence in the input file
    microprot_inp : str
        input directory where individual files from inp_fp will be placed
    microprot_out : str
        output directory path where processed_sequences.fasta file will
        be created

    Returns
    -------
    SEQ_ids : list of str
        list of sequence ids picked from the inp_fp
    """
    for _dir in [microprot_inp, microprot_out]:
        if not os.path.exists(_dir):
            os.makedirs(_dir)

    SEQS = process_fasta.extract_sequences(inp_fp,
                                           identifiers=(inp_from, inp_to))
    SEQ_ids = []
    processed_fp = '%s/%s' % (microprot_out, 'processed_sequences.fasta')
    # The context manager closes the handle even if one of the writes
    # below raises.
    with open(processed_fp, 'a') as processed_fh:
        for SEQ in SEQS:
            _seq = SEQ.metadata['id']
            # these characters are not safe to use in a file name
            for _char in ('/', '\\', '|'):
                _seq = _seq.replace(_char, '_')
            SEQ_ids.append(_seq)
            SEQ.metadata['id'] = _seq

            io.write(SEQ, format='fasta',
                     into='%s/%s.fasta' % (microprot_inp, _seq))
            io.write(SEQ, format='fasta', into=processed_fh)
    return SEQ_ids
def _process_fasta_input(infile, outfile, sort_by_len, min_len, max_len):
    """Filter the sequences of a FASTA file by length and write them out.

    Parameters
    ----------
    infile : str
        input FASTA file path
    outfile : str or None
        output file path. When None, the name is built from `infile`
        without its extension, followed by a suffix describing the applied
        options (`_sorted`, `_min<N>`, `_max<N>`) and `.fasta`
    sort_by_len : bool
        when True, sequences are ordered by ascending length (sequences of
        equal length keep their input order)
    min_len : int
        shortest sequence length to keep (inclusive)
    max_len : int
        longest sequence length to keep (inclusive)
    """
    fp_name = os.path.splitext(infile)[0]
    fasta = process_fasta.extract_sequences(infile)

    suffix = []
    if sort_by_len is True:
        suffix.append('_sorted')
        # stable sort, so ties keep the order they had in the input
        fasta = sorted(fasta, key=len)

    # A single filter covers the sorted and the unsorted case alike, and
    # yields an empty result (instead of an unbound variable) when no
    # sequence reaches `min_len`.
    output_fasta = [seq for seq in fasta if min_len <= len(seq) <= max_len]

    # checks if settings changed from default
    if min_len > 1:
        suffix.append('%s%i' % ('_min', min_len))
    if max_len != 100000:
        suffix.append('%s%i' % ('_max', max_len))

    suffix = ''.join(suffix)
    if outfile is None:
        outfile = '%s%s.fasta' % (fp_name, suffix)
    process_fasta.write_sequences(output_fasta, outfile)
def test_extract_sequences(self):
    """Exercise `extract_sequences` with each accepted `identifiers` form.

    Covers: no identifiers, a single index, a single ID, lists of int or
    str indexes, a comma-separated string, an external list file, a
    (start, end) tuple and a "start..end" string, plus the inputs that
    are expected to raise ValueError.
    """
    faa = self.input_faa
    seqs = self.seqs

    def assert_picks(identifiers, positions):
        # `positions` are 0-based offsets into the reference sequences
        picked = extract_sequences(faa, identifiers=identifiers)
        self.assertListEqual(picked, [seqs[i] for i in positions])

    def assert_rejects(message, *bad_identifiers):
        # NOTE(review): `msg` only customises the failure report of
        # assertRaises; it does not compare the text of the raised
        # exception against `message`.
        for bad in bad_identifiers:
            with self.assertRaises(ValueError, msg=message):
                extract_sequences(faa, identifiers=bad)

    # extract all sequences
    self.assertListEqual(extract_sequences(faa), seqs)

    # protein index, then protein ID
    assert_picks(2, [1])
    assert_picks('1K5N_B', [1])

    # indexes as a list of int, a list of str, a comma-separated string
    assert_picks([1, 3], [0, 2])
    assert_picks(['2', '3'], [1, 2])
    assert_picks('1,3', [0, 2])

    # protein entries listed in an external file
    listfile = join(self.working_dir, 'list.txt')
    with open(listfile, 'w') as handle:
        handle.write('%s\n%s\n' % ('1K5N_B', '2VB1_A'))
    from_file = extract_sequences(faa, identifiers=listfile)
    remove(listfile)
    self.assertListEqual(from_file, [seqs[1], seqs[2]])

    # index range as a tuple (start, end), and malformed tuples
    assert_picks((2, 3), [1, 2])
    assert_rejects('Error: Index range must be a tuple of (start, end).',
                   ('hi', 'there'), (1, 2, 3), (3, 1))

    # index range as a "start..end" string, and malformed strings
    assert_picks('1..3', [0, 1, 2])
    assert_rejects('Error: Index range must be formatted as "start..end".',
                   'not.an..id', '1..2..3', '3..1')

    # identifiers of an unsupported data type
    assert_rejects('Error: Incorrect data type of identifiers.',
                   1.23, {'1K5N_B': 1})