def pre(hla_arr, input_dir, output_dir):
    """Run class I predictions for every input file against every allele.

    For each entry returned by ``os.listdir(input_dir)`` a directory with the
    same name is created under ``output_dir`` and one ``<allele>.csv`` file
    is requested from ``predict`` per allele.  The wall-clock time of each
    prediction (in seconds) is printed.

    Paths are built by plain string concatenation, so ``input_dir`` and
    ``output_dir`` must already end with a path separator.

    hla_arr: iterable of allele names, each passed to ``predict`` as ``mhc``
    input_dir: directory prefix holding the peptide files
    output_dir: directory prefix receiving the per-file result directories

    Return value: None
    """
    for file_name in os.listdir(input_dir):
        input_file = '{0}{1}'.format(input_dir, file_name)
        output_file = '{0}{1}'.format(output_dir, file_name)
        sh.mkdir(output_file)
        for item in hla_arr:
            start = time.time()
            # Tracing stays enabled during the prediction, as before, so the
            # printed timings remain comparable with earlier runs.  It is now
            # always switched off again: previously it was left running, so
            # traces piled up from one prediction to the next.
            # NOTE(review): stop() also ends a tracing session the caller
            # may have started before calling pre() - confirm that is fine.
            tracemalloc.start(10)
            try:
                predict(class_='I', peptides_path=input_file, mhc=item,
                        output='{0}/{1}.csv'.format(output_file, item))
            finally:
                tracemalloc.stop()
            print(time.time() - start)
def main(args_input=sys.argv[1:]):
    """Command-line entry point: predict binding for epitopes of a FASTA file.

    Every record of the input FASTA file is cut into epitopes with
    ``find_neoepitopes``; the unique epitopes are written to a temporary
    file, scored with ``predict`` and the scores are written to the output
    CSV with one row per (epitope, source sequence, start) occurrence.

    args_input: list of command-line arguments; defaults to the process
        arguments captured when this module was imported

    Return value: None (the result is written to ``output_file``)
    """
    import os

    parser = argparse.ArgumentParser(
        'mhcnuggets',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_file', help="Input FASTA file")
    parser.add_argument('allele', help="Allele for which to make prediction")
    parser.add_argument('epitope_length', type=int,
                        choices=[8, 9, 10, 11, 12, 13, 14, 15],
                        help="Length of subpeptides (epitopes) to predict")
    parser.add_argument('class_type', choices=['I', 'II'],
                        help="Class I or class II")
    parser.add_argument('output_file', help="Output file from iedb")
    args = parser.parse_args(args_input)

    # Map each epitope to every (sequence id, start) at which it occurs.
    epitope_seq_nums = defaultdict(list)
    for record in SeqIO.parse(args.input_file, "fasta"):
        seq_num = record.id
        peptide = str(record.seq)
        epitopes = find_neoepitopes(peptide, args.epitope_length)
        for epitope, starts in epitopes.items():
            for start in starts:
                epitope_seq_nums[epitope].append((seq_num, start))

    peptides_path = predictions_path = None
    try:
        with tempfile.NamedTemporaryFile('w', delete=False) as tmp_file:
            peptides_path = tmp_file.name
            tmp_file.writelines(
                "{}\n".format(epitope) for epitope in epitope_seq_nums)
        # Only the name is needed; the handle is closed before predict()
        # writes to the path.
        with tempfile.NamedTemporaryFile('r', delete=False) as tmp_output_file:
            predictions_path = tmp_output_file.name
        predict(args.class_type, peptides_path,
                mhcnuggets_allele(args.allele, args.class_type),
                output=predictions_path)
        df = pd.read_csv(predictions_path)
    finally:
        # The files are created with delete=False, so remove them here even
        # when the prediction or the CSV parsing fails.
        for path in (peptides_path, predictions_path):
            if path is not None and os.path.exists(path):
                os.remove(path)

    # Collect the rows first and build the frame once: DataFrame.append was
    # quadratic in a loop and no longer exists in pandas 2.x.
    output_columns = ['peptide', 'ic50', 'seq_num', 'start', 'allele']
    rows = []
    for _, row in df.iterrows():
        for seq_num, start in epitope_seq_nums[row['peptide']]:
            new_row = row.copy()
            new_row['seq_num'] = seq_num
            new_row['start'] = start
            new_row['allele'] = args.allele
            rows.append(new_row)
    processed_df = pd.DataFrame(rows, columns=output_columns)
    if rows:
        processed_df['start'] = pd.to_numeric(processed_df['start'],
                                              downcast='integer')
    processed_df.to_csv(args.output_file, index=False)
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries):
    """Predict class II binding for the epitopes found in ``input_file``.

    ``input_file`` is read line by line: a line of the form ``>NUMBER``
    starts a new sequence, every other line is cut into epitopes with
    ``self.find_neoepitopes``.  The unique epitopes are scored with the
    module-level ``predict`` function (class ``'II'``) and the scores are
    expanded to one row per (epitope, sequence number, start) occurrence.

    input_file: iterable of text lines (e.g. an open file)
    allele: allele name; ``HLA-`` is prepended and ``*`` removed before it
        is handed to the predictor
    epitope_length, iedb_executable_path, iedb_retries: not used by this
        implementation

    Return value: tuple ``(DataFrame, 'pandas')`` where the frame has the
        columns peptide, ic50, seq_num, start and allele

    Raises ValueError when epitopes are found before the first header line.
    """
    import os

    header_pattern = re.compile(r'^>([0-9]+)$')

    # Map each epitope to every (sequence number, start) at which it occurs.
    epitope_seq_nums = defaultdict(list)
    seq_num = None
    for line in input_file:
        match = header_pattern.search(line)
        if match:
            seq_num = match.group(1)
            continue
        epitopes = self.find_neoepitopes(line.rstrip())
        for epitope, starts in epitopes.items():
            for start in starts:
                if seq_num is None:
                    # Previously this surfaced as an UnboundLocalError.
                    raise ValueError(
                        "sequence data found before the first "
                        "'>NUMBER' header line")
                epitope_seq_nums[epitope].append((seq_num, start))

    mhcnuggets_allele = "HLA-{}".format(allele).replace('*', '')

    peptides_path = predictions_path = None
    try:
        with tempfile.NamedTemporaryFile('w', delete=False) as tmp_file:
            peptides_path = tmp_file.name
            tmp_file.writelines(
                "{}\n".format(epitope) for epitope in epitope_seq_nums)
        # Only the name is needed; the handle is closed before the
        # predictor writes to the path.
        with tempfile.NamedTemporaryFile('r', delete=False) as tmp_output_file:
            predictions_path = tmp_output_file.name
        predict('II', peptides_path, mhcnuggets_allele,
                output=predictions_path)
        df = pd.read_csv(predictions_path)
    finally:
        # The files are created with delete=False, so remove them here even
        # when the prediction or the CSV parsing fails.
        for path in (peptides_path, predictions_path):
            if path is not None and os.path.exists(path):
                os.remove(path)

    # Collect the rows first and build the frame once: DataFrame.append was
    # quadratic in a loop and no longer exists in pandas 2.x.  Building from
    # Series keeps the original row labels as the index.
    output_columns = ['peptide', 'ic50', 'seq_num', 'start', 'allele']
    rows = []
    for _, row in df.iterrows():
        for seq_num, start in epitope_seq_nums[row['peptide']]:
            new_row = row.copy()
            new_row['seq_num'] = seq_num
            new_row['start'] = start
            new_row['allele'] = allele
            rows.append(new_row)
    processed_df = pd.DataFrame(rows, columns=output_columns)
    if rows:
        processed_df['start'] = pd.to_numeric(processed_df['start'],
                                              downcast='integer')
    return (processed_df, 'pandas')
def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb_retries, class_type):
    """Predict binding for the epitopes of every record in a FASTA file.

    Each record parsed from ``input_file`` is cut into epitopes of
    ``epitope_length`` with ``self.find_neoepitopes``.  The unique epitopes
    are scored with the module-level ``predict`` function and the scores
    are expanded to one row per (epitope, record id, start) occurrence.

    input_file: FASTA input accepted by ``SeqIO.parse``
    allele: allele name; converted with ``self.mhcnuggets_allele`` before
        it is handed to the predictor and reported unchanged in the result
    epitope_length: epitope length passed to ``self.find_neoepitopes``
    iedb_executable_path, iedb_retries: not used by this implementation
    class_type: class passed as first argument to the predictor

    Return value: tuple ``(DataFrame, 'pandas')`` where the frame has the
        columns peptide, ic50, seq_num, start and allele
    """
    import os

    # Map each epitope to every (record id, start) at which it occurs.
    epitope_seq_nums = defaultdict(list)
    for record in SeqIO.parse(input_file, "fasta"):
        seq_num = record.id
        peptide = str(record.seq)
        epitopes = self.find_neoepitopes(peptide, epitope_length)
        for epitope, starts in epitopes.items():
            for start in starts:
                epitope_seq_nums[epitope].append((seq_num, start))

    peptides_path = predictions_path = None
    try:
        with tempfile.NamedTemporaryFile('w', delete=False) as tmp_file:
            peptides_path = tmp_file.name
            tmp_file.writelines(
                "{}\n".format(epitope) for epitope in epitope_seq_nums)
        # Only the name is needed; the handle is closed before the
        # predictor writes to the path.
        with tempfile.NamedTemporaryFile('r', delete=False) as tmp_output_file:
            predictions_path = tmp_output_file.name
        predict(class_type, peptides_path, self.mhcnuggets_allele(allele),
                output=predictions_path)
        df = pd.read_csv(predictions_path)
    finally:
        # The files are created with delete=False, so remove them here even
        # when the prediction or the CSV parsing fails.
        for path in (peptides_path, predictions_path):
            if path is not None and os.path.exists(path):
                os.remove(path)

    # Collect the rows first and build the frame once: DataFrame.append was
    # quadratic in a loop and no longer exists in pandas 2.x.  Building from
    # Series keeps the original row labels as the index.
    output_columns = ['peptide', 'ic50', 'seq_num', 'start', 'allele']
    rows = []
    for _, row in df.iterrows():
        for seq_num, start in epitope_seq_nums[row['peptide']]:
            new_row = row.copy()
            new_row['seq_num'] = seq_num
            new_row['start'] = start
            new_row['allele'] = allele
            rows.append(new_row)
    processed_df = pd.DataFrame(rows, columns=output_columns)
    if rows:
        processed_df['start'] = pd.to_numeric(processed_df['start'],
                                              downcast='integer')
    return (processed_df, 'pandas')
def main():
    """Command-line entry point: class II predictions for a peptide file.

    Options:
        -p/--peptides: path of the peptide file handed to ``predict``
        -a/--alleles:  allele specification handed to ``parse_alleles``
        -o/--output:   suffix of the output path; each allele writes to
                       ``<allele><output>``

    When the peptide file is empty no prediction is run and an empty file
    named ``predicted_neoepitopes_class_2`` is created instead.

    Return value: None
    """
    model = argparse.ArgumentParser(
        description='MHCNuggets binding prediction')
    model.add_argument('-p', '--peptides', type=str, help='mhcnuggets input')
    model.add_argument('-a', '--alleles', type=str, help='class 2 alleles')
    model.add_argument('-o', '--output', type=str, help='mhcnuggets output')
    args = model.parse_args()

    # One readline() is enough to tell an empty file from a non-empty one,
    # and the context manager closes the handle the old code left open.
    with open(args.peptides) as peptides_handle:
        has_peptides = peptides_handle.readline() != ''

    if has_peptides:
        for allele in parse_alleles(args.alleles):
            predict(class_='II', peptides_path=args.peptides,
                    mhc=allele, output=allele + args.output)
    else:
        # Create (or truncate) the placeholder output.
        # NOTE(review): presumably a later pipeline step expects this file
        # to exist - confirm.
        with open('predicted_neoepitopes_class_2', 'w'):
            pass
def get_affinity_mhcnuggets(peptides, allele, version, remove_files=True):
    """ Obtains binding affinities from list of peptides

        peptides: peptides of interest (list of strings)
        allele: Allele to use for binding affinity (string); any "*" is
            removed before the allele is looked up and used
        version: version of mhcnuggets (string; only used to name the
            temporary files)
        remove_files: option to remove intermediate files

        Return value: list of (peptide, affinity) tuples in input order,
            where affinity is the value read from the mhcnuggets output
            (a string) or "NA" for peptides without a score. An allele
            unknown to mhcnuggets gives "NA" for every peptide.
    """
    from mhcnuggets.src.predict import predict

    files_to_remove = []
    try:
        # Load the alleles supported by each prediction method
        with open(
            os.path.join(neoepiscope_dir, "neoepiscope", "availableAlleles.pickle"),
            "rb",
        ) as allele_stream:
            avail_alleles = pickle.load(allele_stream)
        # Check that allele is valid for method
        allele = allele.replace("*", "")
        if allele in avail_alleles["mhcnuggets_mhcI"]:
            allele_class = "I"
            max_length = 15
        elif allele in avail_alleles["mhcnuggets_mhcII"]:
            allele_class = "II"
            max_length = 30
        else:
            warnings.warn(
                " ".join([allele, "is not a valid allele for mhcnuggets"]), Warning
            )
            return [(sequence, "NA") for sequence in peptides]
        # Nothing to score; also avoids indexing peptides[0] below
        if not peptides:
            return []
        # Establish sample id used to name the temporary files
        sample_id = ".".join(
            [peptides[0], str(len(peptides)), allele, "mhcnuggets", version]
        )
        # Write one peptide per line to a temporary input file if the
        # peptide is not longer than the maximum for the allele class;
        # count the peptides that are too long
        # mkstemp returns an open descriptor: wrap it instead of discarding
        # it so that it is closed rather than leaked
        peptide_fd, peptide_file = tempfile.mkstemp(
            suffix=".txt", prefix="".join([sample_id, "."]), text=True
        )
        files_to_remove.append(peptide_file)
        na_count = 0
        with os.fdopen(peptide_fd, "w") as f:
            for sequence in peptides:
                if len(sequence) > max_length:
                    na_count += 1
                else:
                    print(sequence, file=f)
        if na_count > 0:
            warnings.warn(
                " ".join(
                    [
                        str(na_count),
                        "peptides not compatible with",
                        "mhcnuggets will not receive score",
                    ]
                ),
                Warning,
            )
        # Establish temporary file to hold output; only its path is needed
        # so the descriptor is closed right away
        out_fd, mhc_out = tempfile.mkstemp(
            suffix=".mhcnuggets.out", prefix="".join([sample_id, "."]), text=True
        )
        os.close(out_fd)
        files_to_remove.append(mhc_out)
        # Run mhcnuggets
        predict(
            class_=allele_class, peptides_path=peptide_file, mhc=allele, output=mhc_out
        )
        # Retrieve scores for valid peptides
        score_dict = {}
        with open(mhc_out, "r") as f:
            # Skip headers
            f.readline()
            for line in f:
                tokens = line.strip("\n").split(",")
                score_dict[tokens[0]] = tokens[1]
        # Produce list of scores for valid peptides
        # Invalid peptides receive "NA" score
        return [(sequence, score_dict.get(sequence, "NA")) for sequence in peptides]
    finally:
        if remove_files:
            for file_to_remove in files_to_remove:
                os.remove(file_to_remove)
def get_normal_binding_scores(blast_dict, alleles, available_alleles,
                              output_dir, pat_id, remove_files=True):
    ''' Creates dictionary linking matched normal epitopes to binding scores
        for different HLA alleles

        blast_dict: dictionary that links epitopes to a list of
            [match E value, set of transcripts it comes from,
            set of genes it comes from, match peptide sequence]
            (from process_blast()); only the match peptide sequence
            (index 3) is used here
        alleles: list of HLA alleles to use for binding predictions;
            alleles not available for mhcnuggets are skipped
        available_alleles: path to pickled dictionary describing
            available HLA alleles for different binding affinity predictors
        output_dir: path to output directory for writing temporary files
        pat_id: patient identifier (used to name the temporary files)
        remove_files: whether to delete the temporary files afterwards

        Return value: nested dictionary, where keys are matched normal
            epitopes and values are dictionaries, where keys are HLA
            alleles and values are binding scores (floats) for that
            epitope/allele combo
    '''
    # Create list of temporary files to remove
    files_to_remove = []
    # Extract matched normal peptide sequences
    normal_epitopes = {match[3] for match in blast_dict.values()}
    # Load available alleles
    # NOTE(review): pickle.load can execute arbitrary code; only pass a
    # trusted file as available_alleles
    with open(available_alleles, 'rb') as allele_stream:
        avail_alleles = pickle.load(allele_stream)
    # Initialize dictionary
    normal_dict = defaultdict(dict)
    try:
        for hla in alleles:
            # Determine if allele is valid for mhcnuggets
            if hla in avail_alleles["mhcnuggets_mhcI"]:
                # Class I allele
                allele_class = "I"
                max_length = 15
            elif hla in avail_alleles["mhcnuggets_mhcII"]:
                # Class II allele
                allele_class = "II"
                max_length = 30
            else:
                # Not a valid allele
                continue
            # Write relevant peptides to file
            peptide_file = os.path.join(
                output_dir, ''.join([pat_id, '.mhc.', hla, '.csv'])
            )
            files_to_remove.append(peptide_file)
            with open(peptide_file, 'w') as f:
                for sequence in normal_epitopes:
                    if len(sequence) <= max_length:
                        print(sequence, file=f)
            # Run binding predictions
            mhc_out = os.path.join(
                output_dir, ''.join([pat_id, '.mhc.', hla, '.out'])
            )
            files_to_remove.append(mhc_out)
            predict(class_=allele_class, peptides_path=peptide_file,
                    mhc=hla, output=mhc_out)
            # Process mhcnuggets results
            score_dict = {}
            with open(mhc_out) as f:
                # Skip header line
                f.readline()
                for line in f:
                    tokens = line.strip().split(',')
                    score_dict[tokens[0]] = tokens[1]
            # Store score for each epitope if available
            for sequence in normal_epitopes:
                if sequence in score_dict:
                    normal_dict[sequence][hla] = float(score_dict[sequence])
    finally:
        # Remove temporary files, also when a prediction failed part-way
        if remove_files:
            for file_to_remove in files_to_remove:
                try:
                    os.remove(file_to_remove)
                except FileNotFoundError:
                    # Never created (prediction failed first) or already
                    # removed (same allele listed twice)
                    pass
    # Return dictionary
    return normal_dict
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Example run of the MHCnuggets predictor on a peptide file.

Created on Mon Jul 2 16:55:19 2018

@author: frank-lsy
"""

# importing the predict module
from mhcnuggets.src.predict import predict

# Predict the newline-separated peptides listed in the peptides_path file
# for the MHC class I allele HLA-A*02:01 (written without the '*').
class_one_run = {
    'class_': 'I',
    'peptides_path': 'test.peps',
    'mhc': 'HLA-A02:01',
    'output': 'new.csv',
}
predict(**class_one_run)
print("\n")

# Disabled examples, kept for reference.
#
# Similarly doing the same prediction for MHC class II allele HLA-DRB1*01:01:
#
# predict(class_='II',
#         peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps',
#         mhc='HLA-DRB101:01', output = 'II.csv')
# print("\n")
#
# As an example of prediction of rare alleles: asking MHCnuggets to make
# predictions for HLA-A*02:60 will make it search for the closest allele
# (HLA-A*02:01 in this case), and use the corresponding network for
# prediction:
#
# predict(class_='I',
#         peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps',
#         mhc='HLA-A02:60', output = 'III.csv')
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Jul 2 16:55:19 2018 @author: frank-lsy """ # importing the predict module from mhcnuggets.src.predict import predict # predicting new line separated peptides present in the peptides_path file # for MHC class_I allele HLA-A*02:01 predict(class_='I', peptides_path= '2018summer/mhcnuggets-2.0/mhcnuggets/data/test/test_peptides.peps', mhc='HLA-A02:01', output='I.csv') print("\n") # similarly doing the same prediction for MHC class_II allele HLA-DRB1*01:01 """ predict(class_='II', peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', mhc='HLA-DRB101:01', output = 'II.csv') print("\n") # as an example of prediction of rare alleles asking MHCnuggets to make predictions for HLA-A*02:60 # will make it search for the closest allele (HLA-A*02:01 in this case), and use the corresponding # network for prediction predict(class_='I', peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', mhc='HLA-A02:60', output = 'III.csv')