Example #1
def run():
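    # Generate per-allele artifacts from a pan-allele class I affinity
    # predictor: load its frequency matrices, length distributions, and
    # training data, normalize the matrices against a background amino-acid
    # distribution, then run one task per allele (serially, on a cluster, or
    # in a local worker pool) and write summary CSVs to args.out.
    # Assumes the module-level imports and helpers (parser, GLOBAL_DATA,
    # do_job, cluster_results_from_args, ...) defined elsewhere in the script.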
    from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

    args = parser.parse_args(sys.argv[1:])

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    if not args.affinity_predictor:
        args.affinity_predictor = get_path(
            "models_class1_pan", "models.combined")
        print("Using downloaded affinity predictor: ", args.affinity_predictor)

    if not args.frequency_matrices:
        args.frequency_matrices = os.path.join(
            args.affinity_predictor, "frequency_matrices.csv.bz2")

    if not args.length_distributions:
        args.length_distributions = os.path.join(args.affinity_predictor,
            "length_distributions.csv.bz2")

    if not args.train_data:
        args.train_data = os.path.join(args.affinity_predictor,
            "train_data.csv.bz2")

    frequency_matrices_df = pandas.read_csv(args.frequency_matrices)
    length_distributions = pandas.read_csv(args.length_distributions)
    train_data = pandas.read_csv(args.train_data)

    alleles = args.alleles
    if alleles:
        print("Using specified alleles:", *alleles)
    else:
        alleles = frequency_matrices_df.allele.unique()

    if args.max_alleles:
        alleles = alleles[:args.max_alleles]

    print("Using %d alleles" % len(alleles), alleles)

    amino_acids = sorted(COMMON_AMINO_ACIDS)

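    # The mean amino-acid frequencies across alleles at cutoff_fraction == 1.0
    # serve as a background distribution; each frequency matrix is rescaled
    # to enrichment relative to that background.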
    distribution = frequency_matrices_df.loc[
        (frequency_matrices_df.cutoff_fraction == 1.0), amino_acids
    ].mean(0)

    normalized_frequency_matrices = frequency_matrices_df.copy()
    normalized_frequency_matrices.loc[:, amino_acids] = (
            normalized_frequency_matrices[amino_acids] / distribution)

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["normalized_frequency_matrices"] = normalized_frequency_matrices
    GLOBAL_DATA["length_distributions"] = length_distributions
    GLOBAL_DATA["train_data"] = train_data

    artifacts_out = os.path.join(args.out, "artifacts")

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    if not os.path.exists(artifacts_out):
        os.mkdir(artifacts_out)

    tasks = [
        {
            "task_num": i,
            "allele": allele,
            "out_dir": artifacts_out,
        }
        for (i, allele) in enumerate(alleles)
    ]

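    # Pack tasks into jobs of at most args.chunk_size tasks each.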
    jobs = []
    for task in tasks:
        if not jobs or len(jobs[-1]['tasks']) >= args.chunk_size:
            jobs.append({'tasks': []})
        jobs[-1]['tasks'].append(task)

    print("Generated %d tasks, packed into %d jobs" % (len(tasks), len(jobs)))

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (
            do_job(**job) for job in jobs)
    elif args.cluster_parallelism:
        # Run as separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_job,
            work_items=jobs,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None

        for task in tasks:
            task['constant_data'] = GLOBAL_DATA

        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_job),
            jobs,
            chunksize=1)

    print("Reading results")

    task_results = {}

    for job_result in tqdm.tqdm(results, total=len(jobs)):
        for task_result in job_result:
            task_results[task_result['task_num']] = task_result

    print("Received all results in %0.2f sec" % (time.time() - start))

    artifacts_df = pandas.DataFrame(task_results).T.set_index("task_num")

    length_distributions_out = os.path.join(args.out,
        "length_distributions.csv")
    length_distributions.to_csv(length_distributions_out,
        index=False)
    print("Wrote: ", length_distributions_out)

    artifacts_summary_out = os.path.join(args.out, "artifacts.csv")
    artifacts_df.to_csv(artifacts_summary_out)
    print("Wrote: ", artifacts_summary_out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
Example #2
def run():
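    # Evaluate a trained pan-allele predictor on its model-selection data:
    # for every row held out of at least one training fold, assemble an
    # ensemble of only the models whose folds did not train on that row,
    # predict with it, and write the per-row predictions to args.out.
    # Assumes the module-level imports and helpers (parser, GLOBAL_DATA,
    # do_predict, cluster_results_from_args, ...) defined elsewhere.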
    import mhcflurry

    args = parser.parse_args(sys.argv[1:])

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    if not args.data:
        args.data = os.path.join(args.predictor,
                                 'model_selection_data.csv.bz2')
        print("Defaulting data to: ", args.data)

    data_df = pandas.read_csv(args.data)
    print("Read %d rows:" % len(data_df))
    print(data_df)

    fold_cols = [col for col in data_df.columns if col.startswith("fold_")]
    print("Fold cols", fold_cols)
    assert len(fold_cols) > 1

    eval_df = data_df.loc[data_df[fold_cols].sum(1) < len(fold_cols)].copy()

    print("Reduced to data held-out at least once: ", len(eval_df))

    predictor = mhcflurry.Class1AffinityPredictor.load(args.predictor,
                                                       optimization_level=0)
    print("Loaded predictor", predictor)

    fold_to_ensemble = collections.defaultdict(list)
    for n in predictor.neural_networks:
        fold = n.fit_info[-1]['training_info']['fold_num']
        fold_to_ensemble[fold].append(n)
    print("Constructed fold_to_ensemble", fold_to_ensemble)

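    # ensemble_key encodes, per fold, whether this row was excluded from that
    # fold's training data (True); only models from those folds may predict it.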
    eval_df["ensemble_key"] = ((~eval_df[fold_cols]).astype(str) +
                               "_").sum(1).str.strip("_")
    print("Established ensemble keys:")
    print(eval_df.ensemble_key.value_counts())

    def predictor_for_ensemble_key(key_string):
        indicators = [eval(s) for s in key_string.split("_")]
        ensemble = []
        for fold, indicator in enumerate(indicators):
            if indicator:
                ensemble.extend(fold_to_ensemble[fold])
        pred = mhcflurry.Class1AffinityPredictor(
            class1_pan_allele_models=ensemble,
            allele_to_sequence=predictor.allele_to_sequence)
        return pred

    tasks = []
    for (key, sub_df) in eval_df.groupby("ensemble_key"):
        print(key)
        pred = predictor_for_ensemble_key(key)
        assert len(pred.neural_networks) > 0
        eval_df.loc[sub_df.index, "ensemble_size"] = len(pred.neural_networks)
        tasks.append({
            "key": key,
            "predictor": pred,
            "sub_df": sub_df[["peptide", "allele"]].copy()
        })

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predict(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run as separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predict,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(partial(call_wrapped_kwargs,
                                                     do_predict),
                                             tasks,
                                             chunksize=1)

    print("Reading results")

    for worker_result in tqdm.tqdm(results, total=len(tasks)):
        print("Received worker result:", worker_result['key'])
        print(worker_result)

        eval_df.loc[worker_result['index'],
                    "prediction"] = worker_result["prediction"]

    print("Received all results in %0.2f sec" % (time.time() - start))

    eval_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
Example #3
def run():
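    # Build a hits-plus-decoys dataset from mass-spec data: keep hits from
    # monoallelic HLA-A/B/C class I samples with 8-11mer peptides of standard
    # amino acids, optionally drop a contig or restrict alleles, then have
    # per-sample workers select decoys from proteome peptides and write the
    # combined result to args.out.
    # Assumes the module-level imports and helpers (parser, GLOBAL_DATA,
    # do_process_samples, cluster_results_from_args, ...) defined elsewhere.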
    import mhcflurry

    args = parser.parse_args(sys.argv[1:])

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    hit_df = pandas.read_csv(args.hits)
    numpy.testing.assert_equal(hit_df.hit_id.nunique(), len(hit_df))
    hit_df = hit_df.loc[
        (hit_df.mhc_class == "I") &
        (hit_df.peptide.str.len() <= 11) &
        (hit_df.peptide.str.len() >= 8) &
        (~hit_df.protein_ensembl.isnull()) &
        (hit_df.peptide.str.match("^[%s]+$" % "".join(
            mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
    ]
    print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
    hit_df = hit_df.loc[hit_df.format == "MONOALLELIC"].copy()
    print("Subselected to %d monoallelic samples" % hit_df.sample_id.nunique())
    hit_df["allele"] = hit_df.hla

    hit_df = hit_df.loc[hit_df.allele.str.match("^HLA-[ABC]")]
    print("Subselected to %d HLA-A/B/C samples" % hit_df.sample_id.nunique())

    if args.exclude_contig:
        new_hit_df = hit_df.loc[
            hit_df.protein_primary_ensembl_contig.astype(str) !=
            args.exclude_contig
        ]
        print(
            "Excluding contig",
            args.exclude_contig,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()
    if args.alleles:
        filter_alleles = set(args.alleles)
        new_hit_df = hit_df.loc[
            hit_df.allele.isin(filter_alleles)
        ]
        print(
            "Selecting alleles",
            args.alleles,
            "reduced dataset from",
            len(hit_df),
            "to",
            len(new_hit_df))
        hit_df = new_hit_df.copy()

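    # Build a per-sample metadata table, dropping any column that varies
    # within a sample, and record each sample's number of unique hit peptides.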
    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
    grouped = hit_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique()

    print("Loading proteome peptides")
    all_peptides_df = pandas.read_csv(args.proteome_peptides)
    print("Loaded: ", all_peptides_df.shape)

    all_peptides_df = all_peptides_df.loc[
        all_peptides_df.protein_accession.isin(hit_df.protein_accession.unique()) &
        all_peptides_df.peptide.str.match("^[%s]+$" % "".join(
            mhcflurry.amino_acid.COMMON_AMINO_ACIDS))
    ].copy()
    all_peptides_df["length"] = all_peptides_df.peptide.str.len()
    print("Subselected proteome peptides by accession: ", all_peptides_df.shape)

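    # Group candidate proteome peptides by length; the per-sample workers
    # draw decoys from these pools via GLOBAL_DATA.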
    all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))

    print("Selecting decoys.")

    GLOBAL_DATA['args'] = args
    GLOBAL_DATA['lengths'] = [8, 9, 10, 11]
    GLOBAL_DATA['all_peptides_by_length'] = all_peptides_by_length
    GLOBAL_DATA['sample_table'] = sample_table
    GLOBAL_DATA['hit_df'] = hit_df

    worker_pool = None
    start = time.time()

    tasks = [
        {"samples": [sample]} for sample in hit_df.sample_id.unique()
    ]

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = [do_process_samples(hit_df.sample_id.unique())]
    elif args.cluster_parallelism:
        # Run as separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_process_samples,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_process_samples),
            tasks,
            chunksize=1)

    print("Reading results")

    result_df = []
    for worker_result in tqdm.tqdm(results, total=len(tasks)):
        for sample_id, selected_df in worker_result.groupby("sample_id"):
            print(
                "Received result for sample",
                sample_id,
                "with hit and decoys:\n",
                selected_df.hit.value_counts())
        result_df.append(worker_result)

    print("Received all results in %0.2f sec" % (time.time() - start))

    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
    result_df["hla"] = result_df.sample_id.map(sample_table.hla)

    print(result_df)
    print("Counts:")
    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())

    print("Hit counts:")
    print(
        result_df.loc[
            result_df.hit == 1
        ].groupby("sample_id").hit.count().sort_values())

    print("Hit rates:")
    print(result_df.groupby("sample_id").hit.mean().sort_values())

    result_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
Example #4
def run(argv=sys.argv[1:]):
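    # Precompute predictions for all requested alleles across a large peptide
    # list: write peptides.csv, an alleles.csv manifest, and one .npz file
    # per (allele, prediction-kind) column. Results from previous runs can be
    # reused so only missing entries are recomputed. Work is chunked and run
    # serially, on a cluster, or in a local worker pool, with columns
    # periodically flushed to disk.
    # Assumes the module-level imports and helpers (parser, GLOBAL_DATA,
    # PREDICTOR_TO_COLS, load_results, do_predictions_mhcflurry,
    # do_predictions_mhctools, cluster_results_from_args, ...) defined elsewhere.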
    global GLOBAL_DATA

    # On sigusr1 print stack trace
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    alleles = [
        normalize_allele_name(a, raise_on_error=False) for a in args.allele
    ]
    n_bad_alleles = sum([a is None for a in alleles])
    if n_bad_alleles > 0:
        print("Dropping %d bad alleles" % n_bad_alleles)

    alleles = numpy.array(sorted({a for a in alleles if a}))

    peptides = pandas.read_csv(
        args.input_peptides,
        nrows=args.max_peptides).peptide.drop_duplicates()
    print("Filtering to valid peptides. Starting at: ", len(peptides))
    peptides = peptides[peptides.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")]
    print("Filtered to: ", len(peptides))
    peptides = peptides.unique()
    num_peptides = len(peptides)

    print("Predictions for %d alleles x %d peptides." %
          (len(alleles), num_peptides))

    if not os.path.exists(args.out):
        print("Creating", args.out)
        os.mkdir(args.out)

    GLOBAL_DATA["predictor"] = args.predictor
    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]

    # Write peptide and allele lists to out dir.
    out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
    pandas.DataFrame({"peptide": peptides}).to_csv(out_peptides, index=False)
    print("Wrote: ", out_peptides)

    manifest_df = []
    for allele in alleles:
        for col in PREDICTOR_TO_COLS[args.predictor]:
            manifest_df.append((allele, col))
    manifest_df = pandas.DataFrame(manifest_df, columns=["allele", "kind"])
    manifest_df["col"] = (manifest_df.allele + " " + manifest_df.kind)
    manifest_df["path"] = manifest_df.col.map(
        lambda s: s.replace("*", "").replace(" ", ".")) + ".npz"
    out_manifest = os.path.abspath(os.path.join(args.out, "alleles.csv"))
    manifest_df.to_csv(out_manifest, index=False)
    col_to_filename = manifest_df.set_index("col").path.map(
        lambda s: os.path.abspath(os.path.join(args.out, s)))
    print("Wrote: ", out_manifest)

    result_df = pandas.DataFrame(index=peptides,
                                 columns=manifest_df.col.values,
                                 dtype=args.result_dtype)
    result_df[:] = numpy.nan

    if args.reuse_predictions:
        # Allocating this here to hit any memory errors as early as possible.
        is_null_matrix = numpy.ones(shape=(result_df.shape[0], len(alleles)),
                                    dtype="int8")

        for dirname in args.reuse_predictions:
            if not dirname:
                continue  # ignore empty strings
            if os.path.exists(dirname):
                print("Loading predictions", dirname)
                result_df = load_results(dirname,
                                         result_df,
                                         dtype=args.result_dtype)
            else:
                print("WARNING: skipping because does not exist", dirname)

        # We rerun any alleles that have nulls for any kind of values
        # (e.g. affinity, percentile rank, elution score).
        for (i, allele) in enumerate(alleles):
            sub_df = manifest_df.loc[manifest_df.allele == allele]
            is_null_matrix[:, i] = result_df[sub_df.col.values].isnull().any(1)
        print("Fraction null", is_null_matrix.mean())

        print("Grouping peptides by alleles")
        allele_indices_to_peptides = collections.defaultdict(list)
        for (i, peptide) in tqdm.tqdm(enumerate(peptides),
                                      total=len(peptides)):
            (allele_indices, ) = numpy.where(is_null_matrix[i])
            if len(allele_indices) > 0:
                allele_indices_to_peptides[tuple(allele_indices)].append(
                    peptide)

        del is_null_matrix

        work_items = []
        print("Assigning peptides to work items.")
        for (indices, block_peptides) in allele_indices_to_peptides.items():
            num_chunks = int(math.ceil(len(block_peptides) / args.chunk_size))
            peptide_chunks = numpy.array_split(block_peptides, num_chunks)
            for chunk_peptides in peptide_chunks:
                work_items.append({
                    'alleles': alleles[list(indices)],
                    'peptides': chunk_peptides,
                })
    else:
        # Same number of chunks for all alleles
        num_chunks = int(math.ceil(len(peptides) / args.chunk_size))
        print("Splitting peptides into %d chunks" % num_chunks)
        peptide_chunks = numpy.array_split(peptides, num_chunks)

        work_items = []
        for chunk_peptides in peptide_chunks:
            work_item = {
                'alleles': alleles,
                'peptides': chunk_peptides,
            }
            work_items.append(work_item)
    print("Work items: ", len(work_items))

    for (i, work_item) in enumerate(work_items):
        work_item["work_item_num"] = i

    # Combine work items to form tasks.
    tasks = []
    peptides_in_last_task = None
    # We sort work_items to put small items first so they get combined.
    for work_item in sorted(work_items, key=lambda d: len(d['peptides'])):
        if peptides_in_last_task is not None and (
                len(work_item['peptides']) + peptides_in_last_task <
                args.chunk_size):

            # Add to last task.
            tasks[-1]['work_item_dicts'].append(work_item)
            peptides_in_last_task += len(work_item['peptides'])
        else:
            # New task
            tasks.append({'work_item_dicts': [work_item]})
            peptides_in_last_task = len(work_item['peptides'])

    print("Collected %d work items into %d tasks" %
          (len(work_items), len(tasks)))

    if args.predictor == "mhcflurry":
        do_predictions_function = do_predictions_mhcflurry
    else:
        do_predictions_function = do_predictions_mhctools

    worker_pool = None
    start = time.time()
    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (do_predictions_function(**task) for task in tasks)
    elif args.cluster_parallelism:
        # Run as separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_predictions_function,
            work_items=tasks,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=True)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None
        results = worker_pool.imap_unordered(partial(call_wrapped_kwargs,
                                                     do_predictions_function),
                                             tasks,
                                             chunksize=1)

    allele_to_chunk_index_to_predictions = {}
    for allele in alleles:
        allele_to_chunk_index_to_predictions[allele] = {}

    def write_col(col):
        out_path = os.path.join(args.out, col_to_filename[col])
        numpy.savez(out_path, result_df[col].values)
        print("Wrote [%f%% null]:" % (result_df[col].isnull().mean() * 100.0),
              out_path)

    print("Writing all columns.")
    last_write_time_per_column = {}
    for col in result_df.columns:
        write_col(col)
        last_write_time_per_column[col] = time.time()
    print("Done writing all columns. Reading results.")

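    # As results stream in, update the in-memory frame and re-write any
    # column that has not been flushed to disk in the last 180 seconds.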
    for worker_results in tqdm.tqdm(results, total=len(work_items)):
        for (work_item_num, col_to_predictions) in worker_results:
            for (col, predictions) in col_to_predictions.items():
                result_df.loc[work_items[work_item_num]['peptides'],
                              col] = predictions
                if time.time() - last_write_time_per_column[col] > 180:
                    write_col(col)
                    last_write_time_per_column[col] = time.time()

    print("Done processing. Final write for each column.")
    for col in result_df.columns:
        write_col(col)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    prediction_time = time.time() - start
    print("Done generating predictions in %0.2f min." %
          (prediction_time / 60.0))