def run():
    from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

    args = parser.parse_args(sys.argv[1:])

    configure_logging()
    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    if not args.affinity_predictor:
        args.affinity_predictor = get_path(
            "models_class1_pan", "models.combined")
        print("Using downloaded affinity predictor: ", args.affinity_predictor)

    if not args.frequency_matrices:
        args.frequency_matrices = os.path.join(
            args.affinity_predictor, "frequency_matrices.csv.bz2")

    if not args.length_distributions:
        args.length_distributions = os.path.join(
            args.affinity_predictor, "length_distributions.csv.bz2")

    if not args.train_data:
        args.train_data = os.path.join(
            args.affinity_predictor, "train_data.csv.bz2")

    frequency_matrices_df = pandas.read_csv(args.frequency_matrices)
    length_distributions = pandas.read_csv(args.length_distributions)
    train_data = pandas.read_csv(args.train_data)

    alleles = args.alleles
    if alleles:
        print("Using specified alleles: ", *alleles)
    else:
        alleles = frequency_matrices_df.allele.unique()
        if args.max_alleles:
            alleles = alleles[:args.max_alleles]
        print("Using %d alleles" % len(alleles), alleles)

    amino_acids = sorted(COMMON_AMINO_ACIDS)

    # Background amino acid distribution: mean per-amino-acid frequency over
    # the unfiltered (cutoff_fraction == 1.0) matrices.
    distribution = frequency_matrices_df.loc[
        (frequency_matrices_df.cutoff_fraction == 1.0), amino_acids
    ].mean(0)

    # Normalize position-specific frequencies by the background distribution.
    normalized_frequency_matrices = frequency_matrices_df.copy()
    normalized_frequency_matrices.loc[:, amino_acids] = (
        normalized_frequency_matrices[amino_acids] / distribution)

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["normalized_frequency_matrices"] = normalized_frequency_matrices
    GLOBAL_DATA["length_distributions"] = length_distributions
    GLOBAL_DATA["train_data"] = train_data

    artifacts_out = os.path.join(args.out, "artifacts")
    if not os.path.exists(args.out):
        os.mkdir(args.out)
    if not os.path.exists(artifacts_out):
        os.mkdir(artifacts_out)

    tasks = [
        {
            "task_num": i,
            "allele": allele,
            "out_dir": artifacts_out,
        }
        for (i, allele) in enumerate(alleles)
    ]

    # Pack tasks into jobs of at most args.chunk_size tasks each.
    jobs = []
    for task in tasks:
        if not jobs or len(jobs[-1]['tasks']) >= args.chunk_size:
            jobs.append({'tasks': []})
        jobs[-1]['tasks'].append(task)

    print("Generated %d tasks, packed into %d jobs" % (len(tasks), len(jobs)))

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_job(**job) for job in jobs)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_job,
            work_items=jobs,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None

        for task in tasks:
            task['constant_data'] = GLOBAL_DATA

        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_job),
            jobs,
            chunksize=1)

    print("Reading results")

    task_results = {}
    for job_result in tqdm.tqdm(results, total=len(jobs)):
        for task_result in job_result:
            task_results[task_result['task_num']] = task_result

    print("Received all results in %0.2f sec" % (time.time() - start))

    artifacts_df = pandas.DataFrame(task_results).T.set_index("task_num")

    length_distributions_out = os.path.join(
        args.out, "length_distributions.csv")
    length_distributions.to_csv(length_distributions_out, index=False)
    print("Wrote: ", length_distributions_out)

    artifacts_summary_out = os.path.join(args.out, "artifacts.csv")
    artifacts_df.to_csv(artifacts_summary_out)
    print("Wrote: ", artifacts_summary_out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
def run():
    import mhcflurry

    args = parser.parse_args(sys.argv[1:])

    configure_logging()
    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    if not args.data:
        args.data = os.path.join(args.predictor, 'model_selection_data.csv.bz2')
        print("Defaulting data to: ", args.data)

    data_df = pandas.read_csv(args.data)
    print("Read %d rows:" % len(data_df))
    print(data_df)

    fold_cols = [col for col in data_df.columns if col.startswith("fold_")]
    print("Fold cols", fold_cols)
    assert len(fold_cols) > 1

    # Keep only rows that were held out of at least one fold.
    eval_df = data_df.loc[data_df[fold_cols].sum(1) < len(fold_cols)].copy()
    print("Reduced to data held-out at least once: ", len(eval_df))

    predictor = mhcflurry.Class1AffinityPredictor.load(
        args.predictor, optimization_level=0)
    print("Loaded predictor", predictor)

    fold_to_ensemble = collections.defaultdict(list)
    for n in predictor.neural_networks:
        fold = n.fit_info[-1]['training_info']['fold_num']
        fold_to_ensemble[fold].append(n)
    print("Constructed fold_to_ensemble", fold_to_ensemble)

    # Per-row key such as "False_True_False": True where the row was held out
    # of that fold (note the negation of the fold membership columns).
    eval_df["ensemble_key"] = (
        (~eval_df[fold_cols]).astype(str) + "_").sum(1).str.strip("_")

    print("Established ensemble keys:")
    print(eval_df.ensemble_key.value_counts())

    def predictor_for_ensemble_key(key_string):
        # Parse "True"/"False" strings back into booleans.
        indicators = [eval(s) for s in key_string.split("_")]
        ensemble = []
        for (fold, indicator) in enumerate(indicators):
            if indicator:
                ensemble.extend(fold_to_ensemble[fold])
        pred = mhcflurry.Class1AffinityPredictor(
            class1_pan_allele_models=ensemble,
            allele_to_sequence=predictor.allele_to_sequence)
        return pred

    tasks = []
    for (key, sub_df) in eval_df.groupby("ensemble_key"):
        print(key)
        pred = predictor_for_ensemble_key(key)
        assert len(pred.neural_networks) > 0
        eval_df.loc[sub_df.index, "ensemble_size"] = len(pred.neural_networks)
        tasks.append({
            "key": key,
            "predictor": pred,
            "sub_df": sub_df[["peptide", "allele"]].copy(),
        })

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predict(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predict,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_predict),
            tasks,
            chunksize=1)

    print("Reading results")

    for worker_result in tqdm.tqdm(results, total=len(tasks)):
        print("Received worker result:", worker_result['key'])
        print(worker_result)
        eval_df.loc[
            worker_result['index'], "prediction"] = worker_result["prediction"]

    print("Received all results in %0.2f sec" % (time.time() - start))

    eval_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
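
# Illustrative sketch (toy values, not from the real data): consider a row
# with fold_0=True, fold_1=False, fold_2=True. Negating and concatenating the
# fold columns gives the key "False_True_False". Calling
# predictor_for_ensemble_key("False_True_False") then assembles an ensemble
# from fold_to_ensemble[1] only, i.e. only models that never saw this row
# during training, which is what makes the resulting predictions valid
# held-out evaluation scores.
#
#     row = {"fold_0": True, "fold_1": False, "fold_2": True}
#     key = "_".join(str(not row["fold_%d" % i]) for i in range(3))
#     assert key == "False_True_False"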
def run():
    import mhcflurry

    args = parser.parse_args(sys.argv[1:])

    configure_logging()
    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    hit_df = pandas.read_csv(args.hits)
    numpy.testing.assert_equal(hit_df.hit_id.nunique(), len(hit_df))

    # Keep class I hits of length 8-11 composed of standard amino acids, with
    # a known Ensembl protein.
    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 11) &
        (hit_df.peptide.str.len() >= 8) &
        (~hit_df.protein_ensembl.isnull()) &
        (hit_df.peptide.str.match("^[%s]+$" % "".join(
            mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
    ]
    print("Loaded hits from %d samples" % hit_df.sample_id.nunique())

    hit_df = hit_df.loc[hit_df.format == "MONOALLELIC"].copy()
    print("Subselected to %d monoallelic samples" % hit_df.sample_id.nunique())

    hit_df["allele"] = hit_df.hla
    hit_df = hit_df.loc[hit_df.allele.str.match("^HLA-[ABC]")]
    print("Subselected to %d HLA-A/B/C samples" % hit_df.sample_id.nunique())

    if args.exclude_contig:
        new_hit_df = hit_df.loc[
            hit_df.protein_primary_ensembl_contig.astype(str) !=
            args.exclude_contig
        ]
        print(
            "Excluding contig",
            args.exclude_contig,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()

    if args.alleles:
        filter_alleles = set(args.alleles)
        new_hit_df = hit_df.loc[hit_df.allele.isin(filter_alleles)]
        print(
            "Selecting alleles",
            args.alleles,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()

    # Per-sample table: keep only columns that are constant within a sample.
    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()

    print("Loading proteome peptides")
    all_peptides_df = pandas.read_csv(args.proteome_peptides)
    print("Loaded: ", all_peptides_df.shape)

    all_peptides_df = all_peptides_df.loc[
        all_peptides_df.protein_accession.isin(
            hit_df.protein_accession.unique()) &
        all_peptides_df.peptide.str.match("^[%s]+$" % "".join(
            mhcflurry.amino_acid.COMMON_AMINO_ACIDS))
    ].copy()
    all_peptides_df["length"] = all_peptides_df.peptide.str.len()
    print("Subselected proteome peptides by accession: ", all_peptides_df.shape)

    all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))

    print("Selecting decoys.")

    GLOBAL_DATA['args'] = args
    GLOBAL_DATA['lengths'] = [8, 9, 10, 11]
    GLOBAL_DATA['all_peptides_by_length'] = all_peptides_by_length
    GLOBAL_DATA['sample_table'] = sample_table
    GLOBAL_DATA['hit_df'] = hit_df

    worker_pool = None
    start = time.time()

    tasks = [
        {"samples": [sample]}
        for sample in hit_df.sample_id.unique()
    ]

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = [do_process_samples(hit_df.sample_id.unique())]
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_process_samples,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_process_samples),
            tasks,
            chunksize=1)

    print("Reading results")

    result_df = []
    for worker_result in tqdm.tqdm(results, total=len(tasks)):
        for (sample_id, selected_df) in worker_result.groupby("sample_id"):
            print(
                "Received result for sample",
                sample_id,
                "with hit and decoys:\n",
                selected_df.hit.value_counts())
        result_df.append(worker_result)

    print("Received all results in %0.2f sec" % (time.time() - start))

    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
    result_df["hla"] = result_df.sample_id.map(sample_table.hla)

    print(result_df)
    print("Counts:")
    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())

    print("Hit counts:")
    print(
        result_df.loc[
            result_df.hit == 1
        ].groupby("sample_id").hit.count().sort_values())

    print("Hit rates:")
    print(result_df.groupby("sample_id").hit.mean().sort_values())

    result_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
def run(argv=sys.argv[1:]):
    global GLOBAL_DATA

    # On SIGUSR1, print a stack trace (useful for debugging hangs).
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    configure_logging()
    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    alleles = [
        normalize_allele_name(a, raise_on_error=False) for a in args.allele
    ]
    n_bad_alleles = sum([a is None for a in alleles])
    if n_bad_alleles > 0:
        print("Dropping %d bad alleles" % n_bad_alleles)

    alleles = numpy.array(sorted({a for a in alleles if a}))

    peptides = pandas.read_csv(
        args.input_peptides, nrows=args.max_peptides).peptide.drop_duplicates()
    print("Filtering to valid peptides. Starting at: ", len(peptides))
    peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
    print("Filtered to: ", len(peptides))
    peptides = peptides.unique()
    num_peptides = len(peptides)

    print("Predictions for %d alleles x %d peptides." % (
        len(alleles), num_peptides))

    if not os.path.exists(args.out):
        print("Creating", args.out)
        os.mkdir(args.out)

    GLOBAL_DATA["predictor"] = args.predictor
    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]

    # Write peptide and allele lists to out dir.
    out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
    pandas.DataFrame({"peptide": peptides}).to_csv(out_peptides, index=False)
    print("Wrote: ", out_peptides)

    # One output column (and one .npz file) per (allele, prediction kind) pair.
    manifest_df = []
    for allele in alleles:
        for col in PREDICTOR_TO_COLS[args.predictor]:
            manifest_df.append((allele, col))
    manifest_df = pandas.DataFrame(manifest_df, columns=["allele", "kind"])
    manifest_df["col"] = (manifest_df.allele + " " + manifest_df.kind)
    manifest_df["path"] = manifest_df.col.map(
        lambda s: s.replace("*", "").replace(" ", ".")) + ".npz"

    out_manifest = os.path.abspath(os.path.join(args.out, "alleles.csv"))
    manifest_df.to_csv(out_manifest, index=False)
    col_to_filename = manifest_df.set_index("col").path.map(
        lambda s: os.path.abspath(os.path.join(args.out, s)))
    print("Wrote: ", out_manifest)

    result_df = pandas.DataFrame(
        index=peptides, columns=manifest_df.col.values, dtype=args.result_dtype)
    result_df[:] = numpy.nan

    if args.reuse_predictions:
        # Allocating this here to hit any memory errors as early as possible.
        is_null_matrix = numpy.ones(
            shape=(result_df.shape[0], len(alleles)), dtype="int8")

        for dirname in args.reuse_predictions:
            if not dirname:
                continue  # ignore empty strings
            if os.path.exists(dirname):
                print("Loading predictions", dirname)
                result_df = load_results(
                    dirname, result_df, dtype=args.result_dtype)
            else:
                print("WARNING: skipping because does not exist", dirname)

        # We rerun any alleles that have nulls for any kind of values
        # (e.g. affinity, percentile rank, elution score).
        for (i, allele) in enumerate(alleles):
            sub_df = manifest_df.loc[manifest_df.allele == allele]
            is_null_matrix[:, i] = result_df[sub_df.col.values].isnull().any(1)
        print("Fraction null", is_null_matrix.mean())

        print("Grouping peptides by alleles")
        allele_indices_to_peptides = collections.defaultdict(list)
        for (i, peptide) in tqdm.tqdm(enumerate(peptides), total=len(peptides)):
            (allele_indices,) = numpy.where(is_null_matrix[i])
            if len(allele_indices) > 0:
                allele_indices_to_peptides[tuple(allele_indices)].append(peptide)

        del is_null_matrix

        work_items = []
        print("Assigning peptides to work items.")
        for (indices, block_peptides) in allele_indices_to_peptides.items():
            num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
            peptide_chunks = numpy.array_split(block_peptides, num_chunks)
            for chunk_peptides in peptide_chunks:
                work_items.append({
                    'alleles': alleles[list(indices)],
                    'peptides': chunk_peptides,
                })
    else:
        # Same number of chunks for all alleles.
        num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
        print("Splitting peptides into %d chunks" % num_chunks)
        peptide_chunks = numpy.array_split(peptides, num_chunks)

        work_items = []
        for (_, chunk_peptides) in enumerate(peptide_chunks):
            work_item = {
                'alleles': alleles,
                'peptides': chunk_peptides,
            }
            work_items.append(work_item)
    print("Work items: ", len(work_items))

    for (i, work_item) in enumerate(work_items):
        work_item["work_item_num"] = i

    # Combine work items to form tasks.
    tasks = []
    peptides_in_last_task = None

    # We sort work_items to put small items first so they get combined.
    for work_item in sorted(work_items, key=lambda d: len(d['peptides'])):
        if peptides_in_last_task is not None and (
                len(work_item['peptides']) + peptides_in_last_task <
                args.chunk_size):
            # Add to last task.
            tasks[-1]['work_item_dicts'].append(work_item)
            peptides_in_last_task += len(work_item['peptides'])
        else:
            # New task.
            tasks.append({'work_item_dicts': [work_item]})
            peptides_in_last_task = len(work_item['peptides'])

    print("Collected %d work items into %d tasks" % (
        len(work_items), len(tasks)))

    if args.predictor == "mhcflurry":
        do_predictions_function = do_predictions_mhcflurry
    else:
        do_predictions_function = do_predictions_mhctools

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predictions_function(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predictions_function,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=True)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_predictions_function),
            tasks,
            chunksize=1)

    allele_to_chunk_index_to_predictions = {}
    for allele in alleles:
        allele_to_chunk_index_to_predictions[allele] = {}

    def write_col(col):
        out_path = os.path.join(args.out, col_to_filename[col])
        numpy.savez(out_path, result_df[col].values)
        print(
            "Wrote [%f%% null]:" % (result_df[col].isnull().mean() * 100.0),
            out_path)

    print("Writing all columns.")
    last_write_time_per_column = {}
    for col in result_df.columns:
        write_col(col)
        last_write_time_per_column[col] = time.time()
    print("Done writing all columns. Reading results.")

    for worker_results in tqdm.tqdm(results, total=len(work_items)):
        for (work_item_num, col_to_predictions) in worker_results:
            for (col, predictions) in col_to_predictions.items():
                result_df.loc[
                    work_items[work_item_num]['peptides'], col] = predictions
                # Periodically checkpoint columns that haven't been written
                # in the last three minutes.
                if time.time() - last_write_time_per_column[col] > 180:
                    write_col(col)
                    last_write_time_per_column[col] = time.time()

    print("Done processing. Final write for each column.")
    for col in result_df.columns:
        write_col(col)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    prediction_time = time.time() - start
    print("Done generating predictions in %0.2f min." % (
        prediction_time / 60.0))