Example #1
    def test_parser_uniprotkb_decoydb(self):
        header = (
            'sp|P27748|ACOX_RALEH Acetoin catabolism protein X OS=Ralstonia'
            ' eutropha (strain ATCC 17699 / H16 / DSM 428 / Stanier 337)'
            ' GN=acoX PE=4 SV=2')
        sequence = 'SEQUENCE'
        with tempfile.TemporaryFile(mode='r+') as db:
            fasta.write([(header, sequence)], db)
            db.seek(0)
            entries = list(
                fasta.decoy_db(db,
                               prefix='PREFIX_',
                               parser=fasta.parse,
                               decoy_only=True))

        parsed = {
            'GN': 'acoX',
            'OS': 'Ralstonia eutropha '
            '(strain ATCC 17699 / H16 / DSM 428 / Stanier 337)',
            'PE': 4,
            'SV': 2,
            'db': 'PREFIX_sp',
            'entry': 'ACOX_RALEH',
            'id': 'P27748',
            'gene_id': 'ACOX',
            'name': 'Acetoin catabolism protein X',
            'taxon': 'RALEH'
        }
        self.assertEqual(entries[0][0], parsed)
        self.assertEqual(entries[0][1], 'SEQUENCE'[::-1])
        self.assertEqual(len(entries), 1)
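The same call works outside the unittest scaffolding. A minimal sketch with placeholder file names (target.fasta is hypothetical); by default the decoys are reversed copies of the target sequences, which is what the assertion on 'SEQUENCE'[::-1] checks above:

from pyteomics import fasta

# Iterate over decoy entries built from target.fasta. With decoy_only=True only
# the decoys are yielded, and parser=fasta.parse turns each header into a dict.
with open('target.fasta') as source:
    for info, seq in fasta.decoy_db(source, prefix='DECOY_',
                                    parser=fasta.parse, decoy_only=True):
        print(info['id'], seq)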
Example #2
 def test_read_and_write_fasta_short(self):
     with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
         fasta.write(fasta.read(self.fasta_file, ignore_comments=True),
                     new_fasta_file)
         new_fasta_file.seek(0)
         new_entries = list(fasta.read(new_fasta_file,
                                       ignore_comments=True))
         self.assertEqual(new_entries, self.fasta_entries_short)
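The same read-then-write round trip works as a standalone snippet; a minimal sketch with placeholder file names:

from pyteomics import fasta

# Copy all entries from one FASTA file to another, skipping comment lines.
# fasta.read() yields (description, sequence) pairs, which fasta.write() accepts
# directly; output can be a file object or a path, opened here with file_mode='w'.
with open('input.fasta') as src:
    fasta.write(fasta.read(src, ignore_comments=True), 'copy.fasta', file_mode='w')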
Example #3
def make_reverse_fasta(input_file, output_file):
    '''
    Take a FASTA file as input, drop all existing _REVERSED proteins and
    create new _REVERSED decoy proteins for the remaining entries.
    '''
    prots = []
    for prot_desc, prot_seq in fasta.read(input_file):
        if not prot_desc.endswith('_REVERSED'):
            prots.append((prot_desc, prot_seq))
            prots.append((prot_desc + '_REVERSED', smart_reverse(prot_seq)))
    fasta.write(prots, output_file, file_mode='w')
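smart_reverse() is defined elsewhere in the original script and is not shown here. A plausible, purely illustrative stand-in is a "pseudo-reverse" that flips each tryptic peptide but keeps its C-terminal residue, so the decoy preserves the cleavage pattern of the target:

from pyteomics import parser

def smart_reverse(seq, rule=parser.expasy_rules['trypsin']):
    # Hypothetical sketch: reverse the interior of every tryptic peptide while
    # keeping the C-terminal residue in place, then re-join the peptides.
    peptides = [pep for _, pep in parser.icleave(seq, rule, 0)]
    return ''.join(pep[:-1][::-1] + pep[-1] for pep in peptides if pep)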
Example #4
def main():  # pragma: no cover
    """
    Execute pytrapment.

    Returns:
        None
    """
    today = date.today().strftime("%Y%m%d")

    parser = arg_parser()
    try:
        args = parser.parse_args(sys.argv[1:])
    except TypeError:
        parser.print_usage()
        sys.exit(1)  # args would be undefined below, so exit after printing usage

    # create dir if not there
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    start_time = time.time()
    print("Starting pytrapment.")
    fasta_df = entrapment.get_nearest_neighbor_proteins(
        args.fasta_host, args.fasta_trap)
    print("Found neighbors.")
    print("Write fasta.")
    _ = fasta.write(zip(fasta_df.index, fasta_df.sequence),
                    os.path.join(args.out_dir, f"entrapment_{today}.fasta"),
                    file_mode="w")

    end_time = time.time()
    print(f"Took {(end_time-start_time)/60.:.2f} minutes")

    # doing qc
    print("Perform qc.")
    host_peptides = entrapment.digest_protein_df(
        fasta_df[fasta_df["db_type"] == "host"])
    trap_peptides = entrapment.digest_protein_df(
        fasta_df[fasta_df["db_type"] == "trap"])

    features_df_host = qc.compute_sequence_features(host_peptides)
    features_df_host["Type"] = "host"

    features_df_trap = qc.compute_sequence_features(trap_peptides)
    features_df_trap["Type"] = "trap"

    qc.qc_peptides(features_df_host, features_df_trap, args.out_dir)
    print("Done.")
Example #5
def prepare_decoy_db(args):
    add_decoy = args['ad']
    if add_decoy:

        prefix = args['prefix']
        db = args['d']
        out1, out2 = os.path.splitext(db)
        out_db = out1 + '_shuffled' + out2
        logger.info('Creating decoy database: %s', out_db)

        extra_check = False
        if '{' in args['e']:
            extra_check = True
        if extra_check:
            banned_pairs = set()
            banned_aa = set()
            for enzyme_local in args['e'].split(','):
                if '{' in enzyme_local:
                    lpart, rpart = enzyme_local.split('|')
                    for aa_left, aa_right in itertools.product(
                            lpart[1:-1], rpart[1:-1]):
                        banned_aa.add(aa_left)
                        banned_aa.add(aa_right)
                        banned_pairs.add(aa_left + aa_right)

            logger.debug(banned_aa)
            logger.debug(banned_pairs)

        enzyme = get_enzyme(args['e'])
        cleave_rule_custom = enzyme + '|' + '([BXZUO])'
        # cleave_rule_custom = '([RKBXZUO])'
        logger.debug(cleave_rule_custom)

        shuf_map = dict()

        prots = []

        for p in fasta.read(db):
            if not p[0].startswith(prefix):
                target_peptides = [
                    x[1] for x in parser.icleave(p[1], cleave_rule_custom, 0)
                ]

                checked_peptides = set()
                sample_list = []
                for idx, pep in enumerate(target_peptides):

                    if len(pep) > 2:
                        pep_tmp = pep[1:-1]
                        if extra_check:
                            for bp in banned_pairs:
                                if bp in pep_tmp:
                                    pep_tmp = pep_tmp.replace(bp, '')
                                    checked_peptides.add(idx)

                        sample_list.extend(pep_tmp)
                random.shuffle(sample_list)
                idx_for_shuffle = 0

                decoy_peptides = []
                for idx, pep in enumerate(target_peptides):

                    if len(pep) > 2:

                        if pep in shuf_map:
                            tmp_seq = shuf_map[pep]
                        else:
                            if not extra_check or idx not in checked_peptides:
                                tmp_seq = pep[0]
                                for pep_aa in pep[1:-1]:
                                    tmp_seq += sample_list[idx_for_shuffle]
                                    idx_for_shuffle += 1
                                tmp_seq += pep[-1]
                            else:
                                max_l = len(pep)
                                tmp_seq = ''
                                ii = 0
                                while ii < max_l - 1:
                                    # copy banned residue pairs through unchanged
                                    # instead of substituting shuffled residues
                                    if (pep[ii] in banned_aa
                                            and pep[ii + 1] in banned_aa
                                            and pep[ii] + pep[ii + 1] in banned_pairs):
                                        tmp_seq += pep[ii] + pep[ii + 1]
                                        ii += 1
                                    elif ii == 0:
                                        tmp_seq += pep[ii]
                                    else:
                                        tmp_seq += sample_list[idx_for_shuffle]
                                        idx_for_shuffle += 1
                                    ii += 1
                                tmp_seq += pep[max_l - 1]

                            shuf_map[pep] = tmp_seq
                    else:
                        tmp_seq = pep

                    decoy_peptides.append(tmp_seq)

                assert len(target_peptides) == len(decoy_peptides)

                prots.append((p[0], ''.join(target_peptides)))
                prots.append(('DECOY_' + p[0], ''.join(decoy_peptides)))

        fasta.write(prots, open(out_db, 'w')).close()
        args['d'] = out_db
        args['ad'] = 0
    return args
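For orientation, a minimal, hypothetical call; the args dictionary normally comes from the tool's own argument parser, and the key names are taken from the code above ('ad' toggles decoy generation, 'd' is the database path, 'prefix' the decoy prefix, 'e' the cleavage-rule string handed to get_enzyme()):

args = {
    'ad': 1,                  # generate decoys
    'd': 'proteome.fasta',    # hypothetical input database
    'prefix': 'DECOY_',
    'e': 'trypsin',           # whatever format get_enzyme() expects
}
args = prepare_decoy_db(args)
# args['d'] now points to 'proteome_shuffled.fasta' and args['ad'] is reset to 0,
# so the rest of the pipeline uses the combined target + decoy database.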
Example #6
 def test_read_and_write_long(self):
     with tempfile.TemporaryFile(mode='r+') as new_fasta_file:
         fasta.write(fasta.read(self.fasta_file), new_fasta_file)
         new_fasta_file.seek(0)
         new_entries = list(fasta.read(new_fasta_file))
         self.assertEqual(new_entries, self.fasta_entries_long)
Example #7
def get_gene_name(e):
    '''
    Return the given FASTA entry as a Protein in which the
    description was replaced by the gene name.
    '''
    descr, seq = e
    m = GN_PATTERN.search(descr)
    new_descr = m.group('gene') if m else descr
    return Protein(new_descr, seq)
    
parser = argparse.ArgumentParser()
parser.add_argument('files', type=str, nargs='+',
                        help='.fasta files containing plink output')
parser.add_argument('-o', '--output', type=str, default=None,
                        help='file to write output to.')
parser.add_argument('--gname', action='store_true',
                        help='only leave gene name (GN) in the output fasta header')                        
parser.add_argument('-v', '--verbosity', action='count', default=0,
                        help='increase output verbosity')

args = parser.parse_args()
unique_entries = set()
for filename in args.files:
    if args.verbosity > 0:
        print('Processing {0}...'.format(filename))
    for entry in fasta.read(open(filename)):
        if args.gname:
            entry = get_gene_name(entry)
        unique_entries.add(entry)
    if args.verbosity > 1:
        print('\t... found {0} unique entries so far'.format(len(unique_entries)))

fasta.write(unique_entries, output=args.output)
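GN_PATTERN and Protein are defined earlier in the original script and are not part of this excerpt. Purely illustrative definitions consistent with how they are used above could look like this:

import re
from collections import namedtuple

# Hypothetical: GN_PATTERN must expose a 'gene' group matching the GN= field of a
# UniProt header, and Protein must hold (description, sequence) like pyteomics entries.
GN_PATTERN = re.compile(r'GN=(?P<gene>\S+)')
Protein = namedtuple('Protein', ['description', 'sequence'])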