def test_parser_uniprotkb_decoydb(self):
    """Check decoy_db with parser=fasta.parse on a UniProtKB header.

    Writes one entry to a temp file, then reads it back decoy-only and
    verifies that the single decoy has its header fields parsed, the
    'db' field carrying the decoy prefix, and a reversed sequence.
    """
    header = (
        'sp|P27748|ACOX_RALEH Acetoin catabolism protein X OS=Ralstonia'
        ' eutropha (strain ATCC 17699 / H16 / DSM 428 / Stanier 337)'
        ' GN=acoX PE=4 SV=2')
    sequence = 'SEQUENCE'
    with tempfile.TemporaryFile(mode='r+') as handle:
        fasta.write([(header, sequence)], handle)
        handle.seek(0)
        decoys = list(
            fasta.decoy_db(handle, prefix='PREFIX_',
                           parser=fasta.parse, decoy_only=True))
        expected = {
            'GN': 'acoX',
            'OS': 'Ralstonia eutropha '
                  '(strain ATCC 17699 / H16 / DSM 428 / Stanier 337)',
            'PE': 4,
            'SV': 2,
            'db': 'PREFIX_sp',
            'entry': 'ACOX_RALEH',
            'id': 'P27748',
            'gene_id': 'ACOX',
            'name': 'Acetoin catabolism protein X',
            'taxon': 'RALEH',
        }
        self.assertEqual(decoys[0][0], expected)
        self.assertEqual(decoys[0][1], 'SEQUENCE'[::-1])
        self.assertEqual(len(decoys), 1)
def test_read_and_write_fasta_short(self):
    """Round-trip the fixture file (comments ignored) and compare entries."""
    with tempfile.TemporaryFile(mode='r+') as roundtrip:
        original = fasta.read(self.fasta_file, ignore_comments=True)
        fasta.write(original, roundtrip)
        roundtrip.seek(0)
        reread = list(fasta.read(roundtrip, ignore_comments=True))
        self.assertEqual(reread, self.fasta_entries_short)
def make_reverse_fasta(input_file, output_file):
    """Rebuild a FASTA file with freshly generated reversed decoys.

    Reads ``input_file``, drops any entry already marked ``_REVERSED``,
    and writes each remaining protein followed by a new ``_REVERSED``
    decoy (sequence reversed via ``smart_reverse``) to ``output_file``.
    """
    entries = []
    for description, sequence in fasta.read(input_file):
        if description.endswith('_REVERSED'):
            # Stale decoy from a previous run — regenerated below instead.
            continue
        entries.append((description, sequence))
        entries.append((description + '_REVERSED', smart_reverse(sequence)))
    fasta.write(entries, output_file, file_mode='w')
def main():  # pragma: no cover
    """Execute pytrapment.

    Builds an entrapment FASTA from the host and trap databases given on
    the command line, writes it to the output directory (dated filename),
    then digests both database halves and runs peptide-level QC.

    Returns:
        None
    """
    today = date.today().strftime("%Y%m%d")
    parser = arg_parser()
    try:
        args = parser.parse_args(sys.argv[1:])
    except TypeError:
        parser.print_usage()
        # BUGFIX: the original fell through here and crashed with a
        # NameError on the unbound 'args'; bail out cleanly instead.
        sys.exit(1)

    # create dir if not there
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    start_time = time.time()
    print("Starting pytrapment.")
    fasta_df = entrapment.get_nearest_neighbor_proteins(
        args.fasta_host, args.fasta_trap)
    print("Found neighbors.")

    print("Write fasta.")
    _ = fasta.write(zip(fasta_df.index, fasta_df.sequence),
                    os.path.join(args.out_dir, f"entrapment_{today}.fasta"),
                    file_mode="w")
    end_time = time.time()
    print(f"Took {(end_time-start_time)/60.:.2f} minutes")

    # doing qc
    print("Perform qc.")
    host_peptides = entrapment.digest_protein_df(
        fasta_df[fasta_df["db_type"] == "host"])
    trap_peptides = entrapment.digest_protein_df(
        fasta_df[fasta_df["db_type"] == "trap"])

    features_df_host = qc.compute_sequence_features(host_peptides)
    features_df_host["Type"] = "host"
    features_df_trap = qc.compute_sequence_features(trap_peptides)
    features_df_trap["Type"] = "trap"
    qc.qc_peptides(features_df_host, features_df_trap, args.out_dir)
    print("Done.")
def prepare_decoy_db(args):
    """Optionally build a shuffled decoy database next to args['d'].

    When args['ad'] is truthy, reads the FASTA at args['d'], generates a
    shuffled DECOY_ counterpart for every non-decoy protein, writes both
    to ``<db>_shuffled<ext>``, and updates args['d'] to point at it while
    clearing args['ad'] so the step is not repeated.
    """
    add_decoy = args['ad']
    if add_decoy:
        prefix = args['prefix']  # existing decoy prefix; such entries are skipped
        db = args['d']
        out1, out2 = os.path.splitext(db)
        out_db = out1 + '_shuffled' + out2
        logger.info('Creating decoy database: %s', out_db)

        # '{' in the enzyme spec marks rules with banned residue pairs
        # (e.g. "{X}|{Y}" notation) that must survive shuffling intact.
        extra_check = False
        if '{' in args['e']:
            extra_check = True
        if extra_check:
            banned_pairs = set()
            banned_aa = set()
            for enzyme_local in args['e'].split(','):
                if '{' in enzyme_local:
                    # Left/right residue sets flank the '|' separator;
                    # [1:-1] strips the surrounding braces.
                    lpart, rpart = enzyme_local.split('|')
                    for aa_left, aa_right in itertools.product(
                            lpart[1:-1], rpart[1:-1]):
                        banned_aa.add(aa_left)
                        banned_aa.add(aa_right)
                        banned_pairs.add(aa_left + aa_right)
            logger.debug(banned_aa)
            logger.debug(banned_pairs)

        enzyme = get_enzyme(args['e'])
        # Also cleave at ambiguous/rare residues so they stay in place.
        cleave_rule_custom = enzyme + '|' + '([BXZUO])'
        # cleave_rule_custom = '([RKBXZUO])'
        logger.debug(cleave_rule_custom)

        shuf_map = dict()  # peptide -> shuffled form, reused across proteins
        prots = []

        for p in fasta.read(db):
            if not p[0].startswith(prefix):
                # Full in-silico digest; icleave yields (position, peptide).
                target_peptides = [
                    x[1] for x in parser.icleave(p[1], cleave_rule_custom, 0)
                ]
                checked_peptides = set()  # indices needing banned-pair handling
                sample_list = []  # pool of interior residues to shuffle

                # Collect interior residues (terminal residues are kept fixed
                # to preserve cleavage sites) from every peptide of length > 2.
                for idx, pep in enumerate(target_peptides):
                    if len(pep) > 2:
                        pep_tmp = pep[1:-1]
                        if extra_check:
                            # Remove banned pairs from the pool; they are
                            # re-emitted verbatim in the second pass.
                            for bp in banned_pairs:
                                if bp in pep_tmp:
                                    pep_tmp = pep_tmp.replace(bp, '')
                                    checked_peptides.add(idx)
                        sample_list.extend(pep_tmp)
                random.shuffle(sample_list)
                idx_for_shuffle = 0  # cursor into the shuffled residue pool

                decoy_peptides = []
                for idx, pep in enumerate(target_peptides):
                    if len(pep) > 2:
                        if pep in shuf_map:
                            # Same peptide always maps to the same decoy.
                            tmp_seq = shuf_map[pep]
                        else:
                            if not extra_check or idx not in checked_peptides:
                                # Simple case: keep both termini, replace the
                                # interior with residues drawn from the pool.
                                tmp_seq = pep[0]
                                for pep_aa in pep[1:-1]:
                                    tmp_seq += sample_list[idx_for_shuffle]
                                    idx_for_shuffle += 1
                                tmp_seq += pep[-1]
                            else:
                                # Banned pairs present: copy them through
                                # unchanged, shuffle everything else.
                                max_l = len(pep)
                                tmp_seq = ''
                                ii = 0
                                while ii < max_l - 1:
                                    # for ii in range(max_l-1):
                                    if pep[ii] in banned_aa and pep[
                                            ii + 1] in banned_aa and pep[ii] + pep[
                                            ii + 1] in banned_pairs:
                                        # NOTE(review): two residues are
                                        # emitted but ii advances by only 1,
                                        # so pep[ii+1] is revisited next
                                        # iteration — confirm this is intended.
                                        tmp_seq += pep[ii] + pep[ii + 1]
                                        ii += 1
                                    else:
                                        if ii == 0:
                                            # N-terminal residue stays fixed.
                                            tmp_seq += pep[ii]
                                        else:
                                            tmp_seq += sample_list[
                                                idx_for_shuffle]
                                            idx_for_shuffle += 1
                                        ii += 1
                                # C-terminal residue stays fixed.
                                tmp_seq += pep[max_l - 1]
                            shuf_map[pep] = tmp_seq
                    else:
                        # Too short to shuffle; keep as-is.
                        tmp_seq = pep
                    decoy_peptides.append(tmp_seq)
                assert len(target_peptides) == len(decoy_peptides)
                prots.append((p[0], ''.join(target_peptides)))
                prots.append(('DECOY_' + p[0], ''.join(decoy_peptides)))
        # fasta.write returns the output file handle; close it explicitly.
        fasta.write(prots, open(out_db, 'w')).close()
        args['d'] = out_db
        args['ad'] = 0
    return args
def test_read_and_write_long(self):
    """Round-trip the fixture file and compare against the long-form entries."""
    with tempfile.TemporaryFile(mode='r+') as copy_file:
        fasta.write(fasta.read(self.fasta_file), copy_file)
        copy_file.seek(0)
        roundtripped = list(fasta.read(copy_file))
        self.assertEqual(roundtripped, self.fasta_entries_long)
description was replaced by the gene name. ''' descr, seq = e m = GN_PATTERN.search(descr) new_descr = m.group('gene') or descr return Protein(new_descr, seq) parser = argparse.ArgumentParser() parser.add_argument('files', type=str, nargs='+', help='.fasta files containing plink output') parser.add_argument('-o', '--output', type=str, default=None, help='file to write output to.') parser.add_argument('--gname', action='store_true', help='only leave gene name (GN) in the output fasta header') parser.add_argument('-v', '--verbosity', action='count', help='increase output verbosity') args = parser.parse_args() unique_entries = set() for filename in args.files: if args.verbosity > 0: print 'Processing {0}...'.format(filename) for entry in fasta.read(open(filename)): if args.gname: entry = get_gene_name(entry) unique_entries.update({entry,}) if args.verbosity > 1: print '\t... found {0} unique entries so far'.format(len(unique_entries)) fasta.write(unique_entries, output=args.output)