示例#1
0
    def test_split_fasta(self):
        """Check the file names and contents produced by ``split_fasta``.

        ``split_fasta`` is run with and without a file name prefix, once on
        the result of ``extract_sequences(self.input_faa)`` and once on
        ``self.input_faa`` itself. Every run must leave exactly three
        ``*.fasta`` files in ``self.working_dir`` whose contents equal the
        reference files ``self.split_1``, ``self.split_2`` and
        ``self.split_3``.
        """
        exp_paths = [self.split_1, self.split_2, self.split_3]
        for prefix in [None, 'dupa']:
            for inp_faa in [extract_sequences(self.input_faa),
                            self.input_faa]:
                split_fasta(inp_faa, prefix=prefix,
                            outdir=self.working_dir)
                obs_paths = sorted(glob(self.working_dir+'/*.fasta'))
                try:
                    if prefix is not None:
                        exp_fnames = sorted(
                            ['%s_%s' % (prefix, basename(exp_fp))
                             for exp_fp in exp_paths])
                    else:
                        exp_fnames = sorted(
                            [basename(exp_fp) for exp_fp in exp_paths])

                    obs_fnames = list(map(basename, obs_paths))
                    self.assertListEqual(obs_fnames, exp_fnames)

                    # pair each produced file with its reference file and
                    # close both handles as soon as they are read
                    for obs_fp, exp_fp in zip(obs_paths, exp_paths):
                        with open(obs_fp, 'r') as f:
                            obs = f.read()
                        with open(exp_fp, 'r') as f:
                            exp = f.read()
                        self.assertEqual(obs, exp)
                finally:
                    # remove fastas even when an assertion failed, so they
                    # do not linger in the working directory
                    for fasta in obs_paths:
                        remove(fasta)
示例#2
0
def write_db(fname, step=None, version=1, db_fp='/tmp/protein_db'):
    """Append protein name and other information into the sequence DB.

    Parameters
    ----------
    fname : str
        fasta file file path
    step : str
        processing step information (e.g. PDB, CM)
    version : int
        processing version
    db_fp : str
        output database information. sequence with header will be appended
        to `db_fp` and header with processing information to `db_fp.index`

    Notes
    -----
    Nothing is written (and neither file is created) when `fname` yields no
    sequences.
    """
    prots = list(process_fasta.extract_sequences(fname))
    if not prots:
        return

    # depends only on `fname`, so compute it once instead of per protein
    msa_size_len = msa_size(fname)

    # open both targets once; every record is appended in a single pass
    with open('%s.index' % db_fp, 'a') as index_fh, \
            open(db_fp, 'a') as seq_fh:
        for prot in prots:
            prot_name = prot.metadata['id']
            # drop the fractional seconds part
            timestamp = str(datetime.now()).split('.')[0]

            # > protein_name # source # msa_size # commit_no # timestamp
            index_fh.write('>%s # %s # %i # %i # %s\n' % (
                prot_name, step, msa_size_len, version, timestamp))

            # sequence body is wrapped at 70 characters per line
            seq_fh.write('>%s\n%s\n' % (
                prot_name, textwrap.fill(str(prot[:]), 70)))
示例#3
0
def write_db(fname, step=None, version=1, db_fp='/tmp/protein_db'):
    """Append protein name and other information into the sequence DB.

    Parameters
    ----------
    fname : str
        fasta file file path
    step : str
        processing step information (e.g. PDB, CM)
    version : int
        processing version
    db_fp : str
        output database information. sequence with header will be appended
        to `db_fp` and header with processing information to `db_fp.index`

    Notes
    -----
    Nothing is written (and neither file is created) when `fname` yields no
    sequences.
    """
    prots = list(process_fasta.extract_sequences(fname))
    if not prots:
        return

    # depends only on `fname`, so compute it once instead of per protein
    msa_size_len = msa_size(fname)

    # open both targets once; every record is appended in a single pass
    with open('%s.index' % db_fp, 'a') as index_fh, \
            open(db_fp, 'a') as seq_fh:
        for prot in prots:
            prot_name = prot.metadata['id']
            # drop the fractional seconds part
            timestamp = str(datetime.now()).split('.')[0]

            # > protein_name # source # msa_size # commit_no # timestamp
            index_fh.write('>%s # %s # %i # %i # %s\n' % (prot_name,
                                                          step,
                                                          msa_size_len,
                                                          version,
                                                          timestamp))

            # sequence body is wrapped at 70 characters per line
            seq_fh.write('>%s\n%s\n' % (prot_name,
                                        textwrap.fill(str(prot[:]), 70)))
示例#4
0
 def test_write_sequences(self):
     """Write one extracted sequence to disk and verify the file's lines.

     The sequence selected with ``identifiers='1'`` is written to
     ``output.faa`` in the working directory; the file must consist of a
     ``>id description`` header line followed by the sequence string of
     ``self.seqs[0]``.
     """
     selected = extract_sequences(self.input_faa, identifiers='1')
     out_fp = join(self.working_dir, 'output.faa')
     write_sequences(selected, out_fp)
     with open(out_fp, 'r') as handle:
         observed = handle.read().splitlines()
     # the file is no longer needed once its lines are in memory
     remove(out_fp)

     reference = self.seqs[0]
     header = '>%s %s' % (reference.metadata['id'],
                          reference.metadata['description'])
     expected = [header, str(reference)]
     self.assertListEqual(observed, expected)
示例#5
0
def parse_inputs(inp_fp=None,
                 inp_from=None,
                 inp_to=None,
                 microprot_inp=None,
                 microprot_out=None):
    """Parse multi-sequence FASTA file into single-sequence files.

    Problematic characters (slash, backslash and pipe) in every sequence id
    are replaced with underscores, each sequence is written to its own FASTA
    file and is also appended to the `processed_sequences.fasta` file.

    Parameters
    ----------
    inp_fp : str
        file path to a multi-sequence FASTA file
    inp_from : int
        number of the first sequence in the input file
    inp_to : int
        number of the last sequence in the input file
    microprot_inp : str
        input directory where individual files from inp_fp will be placed
    microprot_out : str
        output directory path where processed_sequences.fasta file will
        be created

    Returns
    -------
    SEQ_ids : list of str
        list of (sanitized) sequence ids picked from the inp_fp
    """
    for _dir in [microprot_inp, microprot_out]:
        if not os.path.exists(_dir):
            os.makedirs(_dir)

    SEQS = process_fasta.extract_sequences(inp_fp,
                                           identifiers=(inp_from, inp_to))
    SEQ_ids = []
    processed_fp = '%s/%s' % (microprot_out, 'processed_sequences.fasta')
    # the context manager closes the handle even if a write below raises
    with open(processed_fp, 'a') as processed_fh:
        for SEQ in SEQS:
            _seq = SEQ.metadata['id']
            # these characters would otherwise end up in the file name
            for bad_char in ('/', '\\', '|'):
                _seq = _seq.replace(bad_char, '_')
            SEQ_ids.append(_seq)
            # the sanitized id is stored back so both outputs carry it
            SEQ.metadata['id'] = _seq
            io.write(SEQ,
                     format='fasta',
                     into='%s/%s.fasta' % (microprot_inp, _seq))
            io.write(SEQ, format='fasta', into=processed_fh)
    return SEQ_ids
示例#6
0
def parse_inputs(inp_fp=None, inp_from=None, inp_to=None,
                 microprot_inp=None, microprot_out=None):
    """Parse multi-sequence FASTA file into single-sequence files.

    Problematic characters (slash, backslash and pipe) in every sequence id
    are replaced with underscores, each sequence is written to its own FASTA
    file and is also appended to the `processed_sequences.fasta` file.

    Parameters
    ----------
    inp_fp : str
        file path to a multi-sequence FASTA file
    inp_from : int
        number of the first sequence in the input file
    inp_to : int
        number of the last sequence in the input file
    microprot_inp : str
        input directory where individual files from inp_fp will be placed
    microprot_out : str
        output directory path where processed_sequences.fasta file will
        be created

    Returns
    -------
    SEQ_ids : list of str
        list of (sanitized) sequence ids picked from the inp_fp
    """
    for _dir in [microprot_inp, microprot_out]:
        if not os.path.exists(_dir):
            os.makedirs(_dir)

    SEQS = process_fasta.extract_sequences(inp_fp,
                                           identifiers=(inp_from, inp_to))
    SEQ_ids = []
    processed_fp = '%s/%s' % (microprot_out, 'processed_sequences.fasta')
    # the context manager closes the handle even if a write below raises
    with open(processed_fp, 'a') as processed_fh:
        for SEQ in SEQS:
            _seq = SEQ.metadata['id']
            # these characters would otherwise end up in the file name
            for bad_char in ('/', '\\', '|'):
                _seq = _seq.replace(bad_char, '_')
            SEQ_ids.append(_seq)
            # the sanitized id is stored back so both outputs carry it
            SEQ.metadata['id'] = _seq
            io.write(SEQ, format='fasta',
                     into='%s/%s.fasta' % (microprot_inp, _seq))
            io.write(SEQ, format='fasta', into=processed_fh)
    return SEQ_ids
示例#7
0
def _process_fasta_input(infile, outfile, sort_by_len, min_len, max_len):
    """Filter the sequences of a FASTA file by length and write them out.

    Parameters
    ----------
    infile : str
        file path to the input FASTA file
    outfile : str or None
        output file path; when None it is built from `infile` without its
        extension, followed by a suffix describing the applied settings
        and `.fasta`
    sort_by_len : bool
        when exactly True, sequences are ordered by ascending length before
        filtering and `_sorted` is added to the generated suffix
    min_len : int
        shortest sequence length to keep (inclusive)
    max_len : int
        longest sequence length to keep (inclusive)
    """
    fp = infile
    fp_name = os.path.splitext(fp)[0]

    fasta = process_fasta.extract_sequences(fp)
    fasta = np.array(fasta)

    suffix = []

    if sort_by_len is True:
        suffix.append('_sorted')
        s = [len(seq) for seq in fasta]
        idx = sorted(range(len(s)), key=lambda k: s[k])
        fasta = fasta[idx]
        # start from an empty selection so the name is always bound, even
        # when the input is empty or no sequence reaches min_len
        output_fasta = fasta[:0]
        # lengths are ascending: keep everything from the first sequence
        # that is long enough ...
        for i, seq in enumerate(fasta):
            if len(seq) >= min_len:
                output_fasta = fasta[i:]
                break
        # ... up to (excluding) the first one that is too long
        for i, seq in enumerate(output_fasta):
            if len(seq) > max_len:
                output_fasta = output_fasta[:i]
                break
    else:
        output_fasta = [seq for seq in fasta if min_len <= len(seq) <= max_len]

    # checks if settings changed from default
    if min_len > 1:
        suffix.append('%s%i' % ('_min', min_len))
    if max_len != 100000:
        suffix.append('%s%i' % ('_max', max_len))

    suffix = ''.join(suffix)

    if outfile is None:
        outfile = '%s%s.fasta' % (fp_name, suffix)
    process_fasta.write_sequences(output_fasta, outfile)
示例#8
0
def _process_fasta_input(infile, outfile, sort_by_len, min_len, max_len):
    """Filter the sequences of a FASTA file by length and write them out.

    Parameters
    ----------
    infile : str
        file path to the input FASTA file
    outfile : str or None
        output file path; when None it is built from `infile` without its
        extension, followed by a suffix describing the applied settings
        and `.fasta`
    sort_by_len : bool
        when exactly True, sequences are ordered by ascending length before
        filtering and `_sorted` is added to the generated suffix
    min_len : int
        shortest sequence length to keep (inclusive)
    max_len : int
        longest sequence length to keep (inclusive)
    """
    fp = infile
    fp_name = os.path.splitext(fp)[0]

    fasta = process_fasta.extract_sequences(fp)
    fasta = np.array(fasta)

    suffix = []

    if sort_by_len is True:
        suffix.append('_sorted')
        s = [len(seq) for seq in fasta]
        idx = sorted(range(len(s)), key=lambda k: s[k])
        fasta = fasta[idx]
        # start from an empty selection so the name is always bound, even
        # when the input is empty or no sequence reaches min_len
        output_fasta = fasta[:0]
        # lengths are ascending: keep everything from the first sequence
        # that is long enough ...
        for i, seq in enumerate(fasta):
            if len(seq) >= min_len:
                output_fasta = fasta[i:]
                break
        # ... up to (excluding) the first one that is too long
        for i, seq in enumerate(output_fasta):
            if len(seq) > max_len:
                output_fasta = output_fasta[:i]
                break
    else:
        output_fasta = [seq for seq in fasta if min_len <= len(seq) <= max_len]

    # checks if settings changed from default
    if min_len > 1:
        suffix.append('%s%i' % ('_min', min_len))
    if max_len != 100000:
        suffix.append('%s%i' % ('_max', max_len))

    suffix = ''.join(suffix)

    if outfile is None:
        outfile = '%s%s.fasta' % (fp_name, suffix)
    process_fasta.write_sequences(output_fasta, outfile)
示例#9
0
    def test_extract_sequences(self):
        """Exercise every supported form of the ``identifiers`` argument.

        Valid selections (index, ID, lists, comma-separated string, list
        file, tuple range, ``start..end`` string) are compared against the
        matching entries of ``self.seqs``; malformed or wrongly typed
        selections must raise ``ValueError``.
        """
        faa = self.input_faa
        first, second, third = self.seqs[0], self.seqs[1], self.seqs[2]

        def check(identifiers, expected):
            observed = extract_sequences(faa, identifiers=identifiers)
            self.assertListEqual(observed, expected)

        def check_rejected(message, bad_values):
            # `msg` is only the text shown when no ValueError is raised;
            # the text of the raised exception itself is not compared
            for bad in bad_values:
                with self.assertRaises(ValueError, msg=message):
                    extract_sequences(faa, identifiers=bad)

        # extract all sequences
        self.assertListEqual(extract_sequences(faa), self.seqs)

        # specify protein index
        check(2, [second])
        # specify protein ID
        check('1K5N_B', [second])
        # specify protein indexes using a Python list of int
        check([1, 3], [first, third])
        # specify protein indexes using a Python list of str
        check(['2', '3'], [second, third])
        # specify protein indexes using a comma-separated list
        check('1,3', [first, third])

        # specify protein entries using an external file
        list_fp = join(self.working_dir, 'list.txt')
        with open(list_fp, 'w') as handle:
            handle.write('%s\n%s\n' % ('1K5N_B', '2VB1_A'))
        from_file = extract_sequences(faa, identifiers=list_fp)
        remove(list_fp)
        self.assertListEqual(from_file, [second, third])

        # specify protein index range using a Python tuple (start, end)
        check((2, 3), [second, third])

        # raise error when tuple is not properly formatted
        check_rejected('Error: Index range must be a tuple of (start, end).',
                       [('hi', 'there'), (1, 2, 3), (3, 1)])

        # specify protein index range using a str of "start..end"
        check('1..3', [first, second, third])

        # raise error when "start..end" is not properly formatted
        check_rejected('Error: Index range must be formatted as "start..end".',
                       ['not.an..id', '1..2..3', '3..1'])

        # raise error when identifiers is of incorrect data type
        check_rejected('Error: Incorrect data type of identifiers.',
                       [1.23, {'1K5N_B': 1}])