Example #1
def test_netmhcii_pan_multiple_alleles():
    alleles = [
        normalize_allele_name("HLA-DPA1*01:05-DPB1*100:01"),
        normalize_allele_name("HLA-DQA1*05:11-DQB1*03:02"),
        normalize_allele_name("HLA-DRA1*01:01-DRB1*01:01")
    ]
    ii_pan_predictor = NetMHCIIpan(alleles=alleles)
    fasta_dictionary = {"TP53-001": "SQAMDDLMLSPDDIEQWFTED"}
    binding_predictions = ii_pan_predictor.predict_subsequences(
        sequence_dict=fasta_dictionary, peptide_lengths=[15, 16])

    unique_lengths = {x.length for x in binding_predictions}
    eq_(unique_lengths, {15, 16})

    unique_alleles = {x.allele for x in binding_predictions}
    eq_(
        unique_alleles, {
            "HLA-DPA1*01:05-DPB1*100:01", "HLA-DQA1*05:11-DQB1*03:02",
            "HLA-DRA1*01:01-DRB1*01:01"
        })

    # length of "SQAMDDLMLSPDDIEQWFTED" is 21
    # Expect 3 * ((21-15+1) + (21-16+1)) = 39 entries
    assert len(binding_predictions) == 39, \
        "Expected 39 epitopes from %s" % (binding_predictions,)
Example #2
def normalize_allele_name(s):
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        try:
            (a, b, c) = s.split("-")
            return mhcnames.normalize_allele_name("%s-%s*%s" % (a, b, c))
        except Exception:
            return "UNKNOWN"
Example #3
def test_mouse_class1_alleles_H2_Kk():
    # H2-Kk
    eq_(parse_allele_name("H2-Kk"), AlleleName("H-2", "K", "", "k"))
    eq_(normalize_allele_name("H2-Kk"), "H-2-Kk")
    eq_(compact_allele_name("H-2-Kk"), "Kk")

    # with a hyphen in "H-2"
    eq_(parse_allele_name("H-2-Kk"), AlleleName("H-2", "K", "", "k"))
    eq_(normalize_allele_name("H-2-Kk"), "H-2-Kk")
    eq_(compact_allele_name("H-2-Kk"), "Kk")
Example #4
def test_mouse_class2_alleles():
    # H2-IAb
    eq_(parse_allele_name("H2-IAb"), AlleleName("H-2", "IA", "", "b"))
    eq_(normalize_allele_name("H2-IAb"), "H-2-IAb")
    eq_(compact_allele_name("H2-IAb"), "IAb")

    # with hyphen in "H-2"
    eq_(parse_allele_name("H-2-IAb"), AlleleName("H-2", "IA", "", "b"))
    eq_(normalize_allele_name("H-2-IAb"), "H-2-IAb")
    eq_(compact_allele_name("H-2-IAb"), "IAb")
Example #5
def test_macaque_alleles():
    allele_name = "Mamu-B*082:02"
    eq_(normalize_allele_name(allele_name), "Mamu-B*82:02")
    eq_(compact_allele_name(allele_name), "B8202")

    # expect the leading zero of the three-digit family "007" to be trimmed
    # in the normalized form of this allele
    allele_name = "Mamu-B*007:02"
    eq_(normalize_allele_name(allele_name), "Mamu-B*07:02")
    eq_(compact_allele_name(allele_name), "B0702")
Example #6
def test_mouse_class1_alleles_H2_Db():
    # H2-Db
    eq_(parse_allele_name("H2-Db"), AlleleName("H-2", "D", "", "b"))
    eq_(normalize_allele_name("H2-Db"), "H-2-Db")
    eq_(compact_allele_name("H2-Db"), "Db")

    # with hyphen in "H-2"
    eq_(parse_allele_name("H-2-Db"), AlleleName("H-2", "D", "", "b"))
    eq_(normalize_allele_name("H-2-Db"), "H-2-Db")
    eq_(compact_allele_name("H-2-Db"), "Db")
Example #7
def test_hla_with_3_digit_allele_code():
    # B*15:120
    eq_(normalize_allele_name("HLA-B*15:120"), "HLA-B*15:120")
    eq_(compact_allele_name("HLA-B*15:120"), "B15120")
    eq_(normalize_allele_name("B15120"), "HLA-B*15:120")
    eq_(compact_allele_name("B15120"), "B15120")

    # A*02:123
    eq_(normalize_allele_name("HLA-A*02:123"), "HLA-A*02:123")
    eq_(compact_allele_name("HLA-A*02:123"), "A02123")
    eq_(normalize_allele_name("A02123"), "HLA-A*02:123")
    eq_(compact_allele_name("A02123"), "A02123")
Example #8
def test_mouse_class2_alleles():
    # H2-IAb
    eq_(parse_allele_name("H2-IAb"),
        AlleleName("H-2", "IA", "", "b"))
    eq_(normalize_allele_name("H2-IAb"), "H-2-IAb")
    eq_(compact_allele_name("H2-IAb"), "IAb")

    # with hyphen in "H-2"
    eq_(parse_allele_name("H-2-IAb"),
        AlleleName("H-2", "IA", "", "b"))
    eq_(normalize_allele_name("H-2-IAb"), "H-2-IAb")
    eq_(compact_allele_name("H-2-IAb"), "IAb")
Example #9
def test_hla_with_3_digit_allele_code():
    # B*15:120
    eq_(normalize_allele_name("HLA-B*15:120"), "HLA-B*15:120")
    eq_(compact_allele_name("HLA-B*15:120"), "B15120")
    eq_(normalize_allele_name("B15120"), "HLA-B*15:120")
    eq_(compact_allele_name("B15120"), "B15120")

    # A*02:123
    eq_(normalize_allele_name("HLA-A*02:123"), "HLA-A*02:123")
    eq_(compact_allele_name("HLA-A*02:123"), "A02123")
    eq_(normalize_allele_name("A02123"), "HLA-A*02:123")
    eq_(compact_allele_name("A02123"), "A02123")
Example #10
def test_mouse_class1_alleles_H2_Db():
    # H2-Db
    eq_(parse_allele_name("H2-Db"),
        AlleleName("H-2", "D", "", "b"))
    eq_(normalize_allele_name("H2-Db"), "H-2-Db")
    eq_(compact_allele_name("H2-Db"), "Db")

    # with hyphen in "H-2"
    eq_(parse_allele_name("H-2-Db"),
        AlleleName("H-2", "D", "", "b"))
    eq_(normalize_allele_name("H-2-Db"), "H-2-Db")
    eq_(compact_allele_name("H-2-Db"), "Db")
Example #11
def test_mouse_class1_alleles_H2_Kk():
    # H2-Kk
    eq_(parse_allele_name("H2-Kk"),
        AlleleName("H-2", "K", "", "k"))
    eq_(normalize_allele_name("H2-Kk"), "H-2-Kk")
    eq_(compact_allele_name("H-2-Kk"), "Kk")

    # with a hyphen in "H-2"
    eq_(parse_allele_name("H-2-Kk"),
        AlleleName("H-2", "K", "", "k"))
    eq_(normalize_allele_name("H-2-Kk"), "H-2-Kk")
    eq_(compact_allele_name("H-2-Kk"), "Kk")
Example #12
def normalize(s, disallowed=("MIC", "HFE")):
    if any(item in s for item in disallowed):
        return None
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        # Drop the last ":"-delimited field until the name parses
        # or nothing is left.
        while s:
            s = ":".join(s.split(":")[:-1])
            try:
                return mhcnames.normalize_allele_name(s)
            except Exception:
                pass
        return None
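
A minimal sketch of the truncation fallback above (assuming mhcnames is importable; whether truncation is needed at all depends on the installed version's parser):

# Disallowed loci are rejected outright.
print(normalize("HLA-MICA*008"))    # None
# Trailing ":"-delimited fields are dropped until the name parses.
print(normalize("HLA-A*02:01:03"))  # e.g. "HLA-A*02:01" if the full name fails to parse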
Example #13
def mhc_alleles_from_args(args):
    alleles = [
        normalize_allele_name(allele.strip())
        for allele in args.mhc_alleles.split(",") if allele.strip()
    ]
    if args.mhc_alleles_file:
        with open(args.mhc_alleles_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    alleles.append(normalize_allele_name(line))
    if len(alleles) == 0:
        raise ValueError(
            "MHC alleles required (use --mhc-alleles or --mhc-alleles-file)")
    return alleles
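
Because this helper only reads the mhc_alleles and mhc_alleles_file attributes, it can be smoke-tested without a full CLI; a hypothetical invocation:

from argparse import Namespace

# Namespace stands in for parsed argparse arguments.
args = Namespace(mhc_alleles="HLA-A*02:01, HLA-B*07:02", mhc_alleles_file=None)
print(mhc_alleles_from_args(args))  # e.g. ['HLA-A*02:01', 'HLA-B*07:02']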
Example #14
    def predict_min_across_alleles(self, alleles, peptides):
        alleles = list(
            set([normalize_allele_name(allele) for allele in alleles]))
        peptides = EncodableSequences.create(peptides)
        df = pandas.DataFrame()
        df["peptide"] = peptides.sequences
        for allele in alleles:
            df[allele] = self.predictor.predict(peptides, allele=allele)
        result = {
            self.predictor_name + '_affinity':
            (df[list(df.columns)[1:]].min(axis=1))
        }
        if self.percent_rank_transforms is not None:
            self.fit_percentile_rank_if_needed(alleles)
            percentile_ranks = pandas.DataFrame(index=df.index)
            for allele in alleles:
                percentile_ranks[allele] = (
                    self.percent_rank_transforms[allele].transform(
                        df[allele].values))
            result[self.predictor_name +
                   '_percentile_rank'] = (percentile_ranks.min(axis=1).values)

        for (key, value) in result.items():
            assert len(value) == len(peptides), (len(peptides), result)
        return result
Example #15
def test_netmhc_cons_process_limits():
    alleles = [normalize_allele_name(DEFAULT_ALLELE)]
    sequence_dict = {
        "SMAD4-001": "ASIINFKELA",
        "TP53-001": "ASILLLVFYW",
        "SMAD4-002": "ASIINFKELS",
        "TP53-002": "ASILLLVFYS",
        "TP53-003": "ASILLLVFYT",
        "TP53-004": "ASILLLVFYG",
        "TP53-005": "ASILLLVFYG"
    }
    for process_limit in [1, 2, 10]:
        cons_predictor = NetMHCcons(alleles=alleles,
                                    default_peptide_lengths=[9],
                                    process_limit=process_limit)
        binding_predictions = cons_predictor.predict_subsequences(
            sequence_dict=sequence_dict)
        assert len(binding_predictions) == 14, \
            "Expected 14 binding predictions but got %d: %s" % (
                len(binding_predictions),
                binding_predictions)

        source_names = [bp.source_sequence_name for bp in binding_predictions]
        for fasta_key in sequence_dict.keys():
            fasta_count = source_names.count(fasta_key)
            assert fasta_count == 2, \
                ("Expected each fasta key to appear twice (two 9-mer "
                 "subsequences per 10-mer sequence), but saw %s %d time(s)" % (
                     fasta_key, fasta_count))
Example #16
    def _check_hla_alleles(alleles, valid_alleles=None):
        """
        Given a list of HLA alleles and an optional list of valid
        HLA alleles, return a set of alleles that we will pass into
        the MHC binding predictor.
        """
        require_iterable_of(alleles, string_types, "HLA alleles")

        # Don't run the MHC predictor twice for homozygous alleles,
        # only run it for unique alleles
        alleles = {
            normalize_allele_name(allele.strip().upper())
            for allele in alleles
        }
        if valid_alleles:
            # For some reason netMHCpan drops the '*' in names, so
            # 'HLA-A*03:01' becomes 'HLA-A03:01'
            missing_alleles = [
                allele for allele in alleles if allele not in valid_alleles
            ]
            if len(missing_alleles) > 0:
                raise UnsupportedAllele("Unsupported HLA alleles: %s" %
                                        missing_alleles)

        return list(alleles)
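
A sketch of how the check behaves for a homozygous pair plus an explicit whitelist (the allele names are illustrative):

# Case differences and duplicates collapse after normalization.
alleles = _check_hla_alleles(
    ["HLA-A*02:01", "hla-a*02:01"],
    valid_alleles={"HLA-A*02:01", "HLA-B*07:02"})
assert alleles == ["HLA-A*02:01"]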
Example #17
def mhc_alleles_from_args(args):
    alleles = [
        normalize_allele_name(allele.strip())
        for allele in args.mhc_alleles.split(",")
        if allele.strip()
    ]
    if args.mhc_alleles_file:
        with open(args.mhc_alleles_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    alleles.append(normalize_allele_name(line))
    if len(alleles) == 0:
        raise ValueError(
            "MHC alleles required (use --mhc-alleles or --mhc-alleles-file)")
    return alleles
Example #18
def parse_mixmhcpred_results(filename):
    """
    Parses output files of MixMHCpred that are expected to look like:

        Peptide  Score_bestAllele BestAllele  %Rank_bestAllele  Score_A0201  %Rank_A0201
        MLDDFSAGA          0.182093      A0201               0.3     0.182093          0.3
        SPEGEETII         -0.655341      A0201              51.0    -0.655341         51.0
        ILDRIITNA          0.203906      A0201               0.3     0.203906          0.3

    Parameters
    ----------
    filename : str

    Returns
    -------
    list of BindingPrediction
    """
    df = pd.read_csv(filename, comment="#", sep="\t")
    binding_predictions = []
    for peptide, allele, score, pr in zip(df["Peptide"], df["BestAllele"],
                                          df["Score_bestAllele"],
                                          df["%Rank_bestAllele"]):
        binding_predictions.append(
            BindingPrediction(peptide=peptide,
                              allele=normalize_allele_name(allele),
                              score=score,
                              percentile_rank=pr,
                              prediction_method_name="mixmhcpred"))
    return binding_predictions
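
A hedged round trip: write a tiny MixMHCpred-style table and parse it back (column names are taken from the docstring above; pd and BindingPrediction come from this module's imports):

import tempfile

tsv = ("Peptide\tScore_bestAllele\tBestAllele\t%Rank_bestAllele\n"
       "MLDDFSAGA\t0.182093\tA0201\t0.3\n")
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write(tsv)
preds = parse_mixmhcpred_results(f.name)
print(preds[0].peptide, preds[0].allele)  # MLDDFSAGA HLA-A*02:01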
Example #19
    def _determine_supported_alleles(command, supported_allele_flag):
        """
        Try asking the commandline predictor (e.g. netMHCpan)
        which alleles it supports.
        """
        try:
            # convert to str since Python3 returns a `bytes` object
            supported_alleles_output = check_output(
                [command, supported_allele_flag])
            supported_alleles_str = supported_alleles_output.decode(
                "ascii", "ignore")
            assert len(supported_alleles_str) > 0, \
                '%s returned empty allele list' % command
            supported_alleles = set([])
            for line in supported_alleles_str.split("\n"):
                line = line.strip()
                if not line.startswith('#') and len(line) > 0:
                    try:
                        # Normalize the predictor's reported alleles so they
                        # are comparable with our own normalized names.
                        supported_alleles.add(normalize_allele_name(line))
                    except AlleleParseError as error:
                        logger.info("Skipping allele %s: %s", line, error)
                        continue
            if len(supported_alleles) == 0:
                raise ValueError("Unable to determine supported alleles")
            return supported_alleles
        except Exception as e:
            logger.exception(e)
            raise SystemError(
                "Failed to run %s %s. Possibly an incorrect executable version?"
                % (command, supported_allele_flag))
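
An illustrative call, assuming the predictor binary is on PATH (netMHCpan, for example, lists its alleles via -listMHC):

supported = _determine_supported_alleles("netMHCpan", "-listMHC")
print("%d supported alleles" % len(supported))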
Example #20
def parse_allele_name(allele_name):
    # `allele_list` is assumed to be defined at module level.
    if allele_name not in allele_list:
        return None
    try:
        return mhcnames.normalize_allele_name(allele_name)
    except Exception:
        return None
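
Since the wrapper consults a module-level allele_list before parsing, a minimal sketch of its behavior (allele_list here is a stand-in):

allele_list = {"HLA-A*02:01"}
print(parse_allele_name("HLA-A*02:01"))  # "HLA-A*02:01"
print(parse_allele_name("HLA-A*99:99"))  # None: not in allele_list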
Example #21
def parse_stdout(stdout,
                 prediction_method_name,
                 sequence_key_mapping,
                 key_index,
                 offset_index,
                 peptide_index,
                 allele_index,
                 ic50_index,
                 rank_index,
                 log_ic50_index,
                 ignored_value_indices={},
                 transforms={}):
    """
    Generic function for parsing any NetMHC* output, given expected indices
    of values of interest.

    Parameters
    ----------
    ignored_value_indices : dict
        Map from values to the positions we'll ignore them at. See clean_fields.

    transforms  : dict
        Map from field index to a transform function to be applied to values in
        that field. See clean_fields.

    Returns BindingPredictionCollection
    """

    binding_predictions = []
    for fields in split_stdout_lines(stdout):
        fields = clean_fields(fields, ignored_value_indices, transforms)

        offset = int(fields[offset_index])
        peptide = str(fields[peptide_index])
        allele = str(fields[allele_index])
        ic50 = float(fields[ic50_index])
        rank = float(fields[rank_index]) if rank_index is not None else 0.0
        log_ic50 = float(fields[log_ic50_index])

        key = str(fields[key_index])
        if sequence_key_mapping:
            original_key = sequence_key_mapping[key]
        else:
            # if sequence_key_mapping isn't provided then let's assume it's the
            # identity function
            original_key = key

        binding_predictions.append(
            BindingPrediction(source_sequence_name=original_key,
                              offset=offset,
                              peptide=peptide,
                              allele=normalize_allele_name(allele),
                              affinity=ic50,
                              percentile_rank=rank,
                              log_affinity=log_ic50,
                              prediction_method_name=prediction_method_name))
    return binding_predictions
Example #22
def test_netmhc_cons():
    alleles = [normalize_allele_name(DEFAULT_ALLELE)]
    cons_predictor = NetMHCcons(alleles=alleles, default_peptide_lengths=[9])
    sequence_dict = {"SMAD4-001": "ASIINFKELA", "TP53-001": "ASILLLVFYW"}
    binding_predictions = cons_predictor.predict_subsequences(
        sequence_dict=sequence_dict)

    assert len(binding_predictions) == 4, \
        "Expected 4 epitopes from %s" % (binding_predictions,)
Example #23
def test_wrapper_function():
    alleles = [normalize_allele_name("HLA-A*02:01")]
    wrapped_4 = NetMHC(alleles=alleles,
                       default_peptide_lengths=[9],
                       program_name="netMHC")
    eq_(type(wrapped_4), NetMHC4)
    wrapped_3 = NetMHC(alleles=alleles,
                       default_peptide_lengths=[9],
                       program_name="netMHC-3.4")
    eq_(type(wrapped_3), NetMHC3)
Example #24
def test_human_class2_alpha_beta():
    expected = "HLA-DPA1*01:05-DPB1*100:01"
    expected_compact = "DPA10105-DPB110001"
    for name in ["DPA10105-DPB110001",
                 "HLA-DPA1*01:05-DPB1*100:01",
                 "hla-dpa1*0105-dpb1*10001",
                 "dpa1*0105-dpb1*10001",
                 "HLA-DPA1*01:05/DPB1*100:01"]:
        eq_(normalize_allele_name(name), expected)
        eq_(compact_allele_name(name), expected_compact)
Example #25
def run_class1_presentation_predictor(args, peptides):
    # This will trigger a Keras import - will break local parallelism.
    predictor = Class1PresentationPredictor.load(args.models_dir)

    if args.allele:
        alleles = [normalize_allele_name(a) for a in args.allele]
    elif args.alleles_file:
        alleles = pandas.read_csv(args.alleles_file).allele.unique()
    else:
        alleles = predictor.supported_alleles

    print("Num alleles", len(alleles))

    genotypes = {}
    if args.alleles_per_genotype == 6:
        gene_to_alleles = collections.defaultdict(list)
        for a in alleles:
            for gene in ["A", "B", "C"]:
                if a.startswith("HLA-%s" % gene):
                    gene_to_alleles[gene].append(a)

        for _ in range(args.num_genotypes):
            genotype = []
            for gene in ["A", "A", "B", "B", "C", "C"]:
                genotype.append(numpy.random.choice(gene_to_alleles[gene]))
            genotypes[",".join(genotype)] = genotype
    elif args.alleles_per_genotype == 1:
        for _ in range(args.num_genotypes):
            genotype = [numpy.random.choice(alleles)]
            genotypes[",".join(genotype)] = genotype
    else:
        raise ValueError("Alleles per genotype must be 6 or 1")

    print("Sampled genotypes: ", list(genotypes))
    print("Num peptides: ", len(peptides))

    start = time.time()
    print("Generating predictions")
    predictions_df = predictor.predict(peptides=peptides, alleles=genotypes)
    print("Finished in %0.2f sec." % (time.time() - start))
    print(predictions_df)

    print("Calibrating ranks")
    scores = predictions_df.presentation_score.values
    predictor.calibrate_percentile_ranks(scores)
    print("Done. Saving.")

    predictor.save(args.models_dir,
                   write_affinity_predictor=False,
                   write_processing_predictor=False,
                   write_weights=False,
                   write_percent_ranks=True,
                   write_info=False,
                   write_metdata=False)
    print("Wrote predictor to: %s" % args.models_dir)
Example #26
def test_multiple_lengths_netmhc3():
    alleles = [normalize_allele_name("H-2-Kb")]
    predictor = NetMHC3(alleles=alleles,
                        default_peptide_lengths=[9],
                        program_name="netMHC-3.4")
    protein_sequence_dict = {
        'seq': 'AETDEIKILLEE',
    }
    binding_predictions = predictor.predict_subsequences(
        protein_sequence_dict, peptide_lengths=[10, 11])
    eq_(5, len(binding_predictions))
Example #27
def test_human_class2_alpha_beta():
    expected = "HLA-DPA1*01:05-DPB1*100:01"
    expected_compact = "DPA10105-DPB110001"
    for name in ["DPA10105-DPB110001",
                 "HLA-DPA1*01:05-DPB1*100:01",
                 "hla-dpa1*0105-dpb1*10001",
                 "dpa1*0105-dpb1*10001",
                 "HLA-DPA1*01:05/DPB1*100:01",
                 "DPA10105/DPB110001"]:
        eq_(normalize_allele_name(name), expected)
        eq_(compact_allele_name(name), expected_compact)
Example #28
def test_human_class2():
    expected = "HLA-DRB1*01:02"
    expected_compact = "DRB10102"
    for name in ["DRB1_0102",
                 "DRB101:02",
                 "HLA-DRB1_0102",
                 "DRB10102",
                 "DRB1*0102",
                 "HLA-DRB1*0102",
                 "HLA-DRB1*01:02"]:
        eq_(normalize_allele_name(name), expected)
        eq_(compact_allele_name(name), expected_compact)
Example #29
def test_human_class2():
    expected = "HLA-DRA1*01:01-DRB1*01:02"
    expected_compact = "DRB10102"
    for name in ["DRB1_0102",
                 "DRB101:02",
                 "HLA-DRB1_0102",
                 "DRB10102",
                 "DRB1*0102",
                 "HLA-DRB1*0102",
                 "HLA-DRB1*01:02",
                 "DRB0102"]:
        eq_(normalize_allele_name(name), expected)
        eq_(compact_allele_name(name), expected_compact)
Example #30
    def predict_peptides(self, peptides):
        """
        Parameters
        ----------
        peptides : list of str

        Returns
        -------
        list of BindingPrediction
        """
        self._check_peptide_inputs(peptides)
        results = []
        for allele in self.alleles:

            temp_dir = mkdtemp(prefix="mhctools", suffix="mixmhcpred")
            input_file_path = join(temp_dir, "mixmhcpred_inputs.txt")
            output_file_path = join(temp_dir, "mixmhcpred_outputs.txt")

            with open(input_file_path, "w") as f:
                for i, p in enumerate(peptides):
                    f.write(p)
                    if i < len(peptides) - 1:
                        f.write("\n")
            with CleanupFiles(filenames=[input_file_path, output_file_path],
                              directories=[temp_dir]):
                with NamedTemporaryFile(prefix="MixMHCpred_stdout",
                                        mode="w",
                                        delete=False) as stdout_file:
                    stdout_file_name = stdout_file.name
                    run_command([
                        self.program_name, "-i", input_file_path, "-o",
                        output_file_path, "-a",
                        normalize_allele_name(allele)
                    ] + self.extra_commandline_args,
                                suppress_stderr=False,
                                redirect_stdout_file=stdout_file)
                if exists(output_file_path):
                    results.extend(parse_mixmhcpred_results(output_file_path))
                else:
                    with open(stdout_file_name, "r") as f:
                        stdout = f.read().strip()
                    raise ValueError(
                        "MixMHCpred failed on allele '%s' with stdout '%s'" %
                        (allele, stdout))
                remove(stdout_file_name)
        return BindingPredictionCollection(results)
Example #31
    def fit_allele(self, allele, hit_list, decoys_list):
        allele = normalize_allele_name(allele)
        hit_list = set(hit_list)
        df = pandas.DataFrame(
            {"peptide": sorted(set(hit_list).union(decoys_list))})
        df["allele"] = allele
        df["species"] = "human"
        df["affinity"] = ((~df.peptide.isin(hit_list)).astype(float) *
                          (self.decoy_affinity - self.hit_affinity) +
                          self.hit_affinity)
        df["sample_weight"] = 1.0
        df["peptide_length"] = 9
        self.predictor.fit_allele_specific_predictors(
            n_models=self.ensemble_size,
            architecture_hyperparameters=self.mhcflurry_hyperparameters,
            allele=allele,
            peptides=df.peptide.values,
            affinities=df.affinity.values,
        )
Example #32
def test_netmhcii_pan_mouse():
    alleles = [normalize_allele_name("H2-IAb")]
    ii_pan_predictor = NetMHCIIpan(alleles=alleles)
    fasta_dictionary = {
        "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT",
        "TP53-001": "SQAMDDLMLSPDDIEQWFTED"
    }
    binding_predictions = ii_pan_predictor.predict_subsequences(
        sequence_dict=fasta_dictionary, peptide_lengths=[15, 16])

    unique_lengths = {x.length for x in binding_predictions}
    eq_(unique_lengths, {15, 16})

    unique_alleles = {x.allele for x in binding_predictions}
    eq_(unique_alleles, {"H-2-IAb"})

    # length of "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT" is 34
    # length of "SQAMDDLMLSPDDIEQWFTED" is 21
    # Expect (34-15+1) + (34-16+1) + (21-15+1) + (21-16+1) = 52 entries
    assert len(binding_predictions) == 52, \
        "Expected 52 epitopes from %s" % (binding_predictions,)
Example #33
    def predict_to_dataframe(self,
                             peptides,
                             alleles=None,
                             allele=None,
                             throw=True,
                             include_individual_model_predictions=False,
                             include_percentile_ranks=True,
                             include_confidence_intervals=True,
                             centrality_measure=DEFAULT_CENTRALITY_MEASURE):
        """
        Predict nM binding affinities. Gives more detailed output than `predict`
        method, including 5-95% prediction intervals.
        
        If multiple predictors are available for an allele, the predictions are
        the geometric means of the individual model predictions.
        
        One of 'allele' or 'alleles' must be specified. If 'allele' is specified
        all predictions will be for the given allele. If 'alleles' is specified
        it must be the same length as 'peptides' and give the allele
        corresponding to each peptide. 
        
        Parameters
        ----------
        peptides : `EncodableSequences` or list of string
        alleles : list of string
        allele : string
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles or peptide lengths. If False, a warning will be logged and
            the predictions for the unsupported alleles or peptides will be NaN.
        include_individual_model_predictions : boolean
            If True, the predictions of each individual model are included as
            columns in the result dataframe.
        include_percentile_ranks : boolean, default True
            If True, a "prediction_percentile" column will be included giving the
            percentile ranks. If no percentile rank information is available,
            this will be ignored with a warning.
        include_confidence_intervals : boolean, default True
            If True, "prediction_low" and "prediction_high" columns giving a
            5-95% prediction interval are included.
        centrality_measure : string or callable
            Measure of central tendency to use to combine predictions in the
            ensemble. Options include: mean, median, robust_mean.

        Returns
        -------
        `pandas.DataFrame` of predictions
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or array, not a string")
        if allele is None and alleles is None:
            raise ValueError("Must specify 'allele' or 'alleles'.")

        peptides = EncodableSequences.create(peptides)
        df = pandas.DataFrame({'peptide': peptides.sequences}, copy=False)

        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            df["allele"] = allele
            normalized_allele = mhcnames.normalize_allele_name(allele)
            df["normalized_allele"] = normalized_allele
            unique_alleles = [normalized_allele]
        else:
            df["allele"] = numpy.array(alleles)
            df["normalized_allele"] = df.allele.map(
                mhcnames.normalize_allele_name)
            unique_alleles = df.normalized_allele.unique()

        if len(df) == 0:
            # No predictions.
            logging.warning("Predicting for 0 peptides.")
            empty_result = pandas.DataFrame(columns=[
                'peptide', 'allele', 'prediction', 'prediction_low',
                'prediction_high'
            ])
            return empty_result

        (min_peptide_length,
         max_peptide_length) = (self.supported_peptide_lengths)

        if (peptides.min_length < min_peptide_length
                or peptides.max_length > max_peptide_length):
            # Only compute this if needed
            all_peptide_lengths_supported = False
            sequence_length = df.peptide.str.len()
            df["supported_peptide_length"] = (
                (sequence_length >= min_peptide_length) &
                (sequence_length <= max_peptide_length))
            if (~df.supported_peptide_length).any():
                msg = (
                    "%d peptides have lengths outside of supported range [%d, %d]: "
                    "%s" %
                    ((~df.supported_peptide_length).sum(), min_peptide_length,
                     max_peptide_length,
                     str(df.loc[~df.supported_peptide_length].peptide.unique()))
                )
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
        else:
            # Handle common case efficiently.
            df["supported_peptide_length"] = True
            all_peptide_lengths_supported = True

        num_pan_models = len(self.class1_pan_allele_models)
        max_single_allele_models = max(
            len(self.allele_to_allele_specific_models.get(allele, []))
            for allele in unique_alleles)
        predictions_array = numpy.zeros(shape=(df.shape[0], num_pan_models +
                                               max_single_allele_models),
                                        dtype="float64")
        predictions_array[:] = numpy.nan

        if self.class1_pan_allele_models:
            unsupported_alleles = [
                allele for allele in df.normalized_allele.unique()
                if allele not in self.allele_to_fixed_length_sequence
            ]
            if unsupported_alleles:
                msg = ("No sequences for allele(s): %s.\n"
                       "Supported alleles: %s" %
                       (" ".join(unsupported_alleles), " ".join(
                           sorted(self.allele_to_fixed_length_sequence))))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
            mask = df.supported_peptide_length
            if mask.sum() > 0:
                masked_allele_encoding = AlleleEncoding(
                    df.loc[mask].normalized_allele,
                    allele_to_fixed_length_sequence=(
                        self.allele_to_fixed_length_sequence))
                masked_peptides = peptides.sequences[mask]
                for (i, model) in enumerate(self.class1_pan_allele_models):
                    predictions_array[mask, i] = model.predict(
                        masked_peptides,
                        allele_encoding=masked_allele_encoding)

        if self.allele_to_allele_specific_models:
            unsupported_alleles = [
                allele for allele in unique_alleles
                if not self.allele_to_allele_specific_models.get(allele)
            ]
            if unsupported_alleles:
                msg = ("No single-allele models for allele(s): %s.\n"
                       "Supported alleles are: %s" %
                       (" ".join(unsupported_alleles), " ".join(
                           sorted(self.allele_to_allele_specific_models))))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)

            for allele in unique_alleles:
                models = self.allele_to_allele_specific_models.get(allele, [])
                if len(unique_alleles) == 1 and all_peptide_lengths_supported:
                    mask = None
                else:
                    mask = ((df.normalized_allele == allele)
                            & df.supported_peptide_length).values
                if mask is None or mask.all():
                    # Common case optimization
                    for (i, model) in enumerate(models):
                        predictions_array[:, num_pan_models +
                                          i] = (model.predict(peptides))
                elif mask.sum() > 0:
                    peptides_for_allele = EncodableSequences.create(
                        df.loc[mask].peptide.values)
                    for (i, model) in enumerate(models):
                        predictions_array[mask, num_pan_models +
                                          i, ] = model.predict(
                                              peptides_for_allele)

        if callable(centrality_measure):
            centrality_function = centrality_measure
        else:
            centrality_function = CENTRALITY_MEASURES[centrality_measure]

        logs = numpy.log(predictions_array)
        log_centers = centrality_function(logs)
        df["prediction"] = numpy.exp(log_centers)

        if include_confidence_intervals:
            df["prediction_low"] = numpy.exp(
                numpy.nanpercentile(logs, 5.0, axis=1))
            df["prediction_high"] = numpy.exp(
                numpy.nanpercentile(logs, 95.0, axis=1))

        if include_individual_model_predictions:
            for i in range(num_pan_models):
                df["model_pan_%d" % i] = predictions_array[:, i]

            for i in range(max_single_allele_models):
                df["model_single_%d" %
                   i] = predictions_array[:, num_pan_models + i]

        if include_percentile_ranks:
            if self.allele_to_percent_rank_transform:
                df["prediction_percentile"] = self.percentile_ranks(
                    df.prediction,
                    alleles=df.normalized_allele.values,
                    throw=throw)
            else:
                warnings.warn("No percentile rank information available.")

        del df["supported_peptide_length"]
        del df["normalized_allele"]
        return df
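
A hedged sketch of calling the method above on a loaded predictor (assumes mhcflurry with its models downloaded; output columns follow the docstring):

from mhcflurry import Class1AffinityPredictor

predictor = Class1AffinityPredictor.load()
df = predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEKQ"], allele="HLA-A*02:01")
print(df[["peptide", "prediction", "prediction_percentile"]])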
Example #34
    def fit_allele_specific_predictors(self,
                                       n_models,
                                       architecture_hyperparameters_list,
                                       allele,
                                       peptides,
                                       affinities,
                                       inequalities=None,
                                       train_rounds=None,
                                       models_dir_for_save=None,
                                       verbose=0,
                                       progress_preamble="",
                                       progress_print_interval=5.0):
        """
        Fit one or more allele specific predictors for a single allele using one
        or more neural network architectures.
        
        The new predictors are saved in the Class1AffinityPredictor instance
        and will be used on subsequent calls to `predict`.
        
        Parameters
        ----------
        n_models : int
            Number of neural networks to fit
        
        architecture_hyperparameters_list : list of dict
            List of hyperparameter sets.
               
        allele : string
        
        peptides : `EncodableSequences` or list of string
        
        affinities : list of float
            nM affinities

        inequalities : list of string, each element one of ">", "<", or "="
            See Class1NeuralNetwork.fit for details.

        train_rounds : sequence of int
            Each training point i will be used on training rounds r for which
            train_rounds[i] > r, r >= 0.
        
        models_dir_for_save : string, optional
            If specified, the Class1AffinityPredictor is (incrementally) written
            to the given models dir after each neural network is fit.
        
        verbose : int
            Keras verbosity

        progress_preamble : string
            Optional string of information to include in each progress update

        progress_print_interval : float
            How often (in seconds) to print progress. Set to None to disable.

        Returns
        -------
        list of `Class1NeuralNetwork`
        """

        allele = mhcnames.normalize_allele_name(allele)
        if allele not in self.allele_to_allele_specific_models:
            self.allele_to_allele_specific_models[allele] = []

        encodable_peptides = EncodableSequences.create(peptides)
        peptides_affinities_inequalities_per_round = [
            (encodable_peptides, affinities, inequalities)
        ]

        if train_rounds is not None:
            for round in sorted(set(train_rounds)):
                round_mask = train_rounds > round
                if round_mask.any():
                    sub_encodable_peptides = EncodableSequences.create(
                        encodable_peptides.sequences[round_mask])
                    peptides_affinities_inequalities_per_round.append(
                        (sub_encodable_peptides, affinities[round_mask],
                         None if inequalities is None else
                         inequalities[round_mask]))
        n_rounds = len(peptides_affinities_inequalities_per_round)

        n_architectures = len(architecture_hyperparameters_list)

        # Adjust progress info to indicate number of models and
        # architectures.
        pieces = []
        if n_models > 1:
            pieces.append("Model {model_num:2d} / {n_models:2d}")
        if n_architectures > 1:
            pieces.append(
                "Architecture {architecture_num:2d} / {n_architectures:2d}")
        if len(peptides_affinities_inequalities_per_round) > 1:
            pieces.append("Round {round:2d} / {n_rounds:2d}")
        pieces.append("{n_peptides:4d} peptides")
        progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
            ", ".join(pieces))

        models = []
        for model_num in range(n_models):
            for (architecture_num, architecture_hyperparameters
                 ) in enumerate(architecture_hyperparameters_list):
                model = Class1NeuralNetwork(**architecture_hyperparameters)
                for round_num in range(n_rounds):
                    (round_peptides, round_affinities, round_inequalities) = (
                        peptides_affinities_inequalities_per_round[round_num])
                    model.fit(
                        round_peptides,
                        round_affinities,
                        inequalities=round_inequalities,
                        verbose=verbose,
                        progress_preamble=progress_preamble_template.format(
                            n_peptides=len(round_peptides),
                            round=round_num,
                            n_rounds=n_rounds,
                            user_progress_preamble=progress_preamble,
                            model_num=model_num + 1,
                            n_models=n_models,
                            architecture_num=architecture_num + 1,
                            n_architectures=n_architectures),
                        progress_print_interval=progress_print_interval)

                model_name = self.model_name(allele, model_num)
                row = pandas.Series(
                    collections.OrderedDict([
                        ("model_name", model_name),
                        ("allele", allele),
                        ("config_json", json.dumps(model.get_config())),
                        ("model", model),
                    ])).to_frame().T
                self._manifest_df = pandas.concat([self.manifest_df, row],
                                                  ignore_index=True)
                self.allele_to_allele_specific_models[allele].append(model)
                if models_dir_for_save:
                    self.save(models_dir_for_save,
                              model_names_to_write=[model_name])
                models.append(model)

        self.clear_cache()
        return models
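
A hedged sketch of fitting a single small network for one allele (the hyperparameters shown are illustrative, not mhcflurry's defaults):

from mhcflurry import Class1AffinityPredictor

predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
    n_models=1,
    architecture_hyperparameters_list=[{"max_epochs": 5}],
    allele="HLA-A*02:01",
    peptides=["SIINFEKL", "SIINFEKQ", "SIINFEKD"],
    affinities=[120.0, 18000.0, 25000.0])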
Example #35
def parse_stdout(
        stdout,
        prediction_method_name,
        sequence_key_mapping,
        key_index,
        offset_index,
        peptide_index,
        allele_index,
        score_index,
        rank_index=None,
        ic50_index=None,
        ignored_value_indices={},
        transforms={}):
    """
    Generic function for parsing any NetMHC* output, given expected indices
    of values of interest.

    Parameters
    ----------
    stdout : str

    prediction_method_name : str

    key_index : int

    offset_index : int

    peptide_index : int

    allele_index : int

    score_index : int

    rank_index : int, optional

    ic50_index : int, optional

    sequence_key_mapping : dict
        Dictionary mapping sequence names (which might be hashes or truncated)
        to the sequence names which should be used in the parsed
        BindingPrediction objects

    ignored_value_indices : dict
        Map from values to the positions we'll ignore them at.
        See clean_fields.

    transforms  : dict
        Map from field index to a transform function to be applied to values in
        that field. See clean_fields.

    Returns BindingPredictionCollection
    """

    binding_predictions = []
    for fields in split_stdout_lines(stdout):
        fields = clean_fields(fields, ignored_value_indices, transforms)

        offset = int(fields[offset_index])
        peptide = str(fields[peptide_index])
        allele = str(fields[allele_index])

        if score_index is None:
            score = None
        else:
            score = float(fields[score_index])

        if rank_index is None:
            rank = None
        else:
            rank = float(fields[rank_index])

        if ic50_index is None:
            ic50 = None
        else:
            ic50 = float(fields[ic50_index])

        key = str(fields[key_index])
        if sequence_key_mapping:
            original_key = sequence_key_mapping[key]
        else:
            # if sequence_key_mapping isn't provided then let's assume it's the
            # identity function
            original_key = key

        # if we have a bad IC50 score we might still get a salvageable
        # log of the score. Strangely, this is necessary sometimes!
        if ic50_index is not None and (not valid_affinity(ic50)) and np.isfinite(score):
            # pylint: disable=invalid-unary-operand-type
            ic50 = 50000 ** (1 - score)

        binding_predictions.append(BindingPrediction(
            source_sequence_name=original_key,
            offset=offset,
            peptide=peptide,
            allele=normalize_allele_name(allele),
            score=score,
            affinity=ic50,
            percentile_rank=rank,
            prediction_method_name=prediction_method_name))
    return binding_predictions
Example #36
def run(argv=sys.argv[1:]):
    global GLOBAL_DATA

    # On sigusr1 print stack trace
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    args.out_models_dir = os.path.abspath(args.out_models_dir)

    configure_logging(verbose=args.verbosity > 1)

    input_predictor = Class1AffinityPredictor.load(args.models_dir)
    print("Loaded: %s" % input_predictor)

    if args.allele:
        alleles = [normalize_allele_name(a) for a in args.allele]
    else:
        alleles = input_predictor.supported_alleles

    metadata_dfs = {}
    if args.data:
        df = pandas.read_csv(args.data)
        print("Loaded data: %s" % (str(df.shape)))

        df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)]
        print("Subselected to 8-15mers: %s" % (str(df.shape)))

        # Allele names in data are assumed to be already normalized.
        df = df.loc[df.allele.isin(alleles)].dropna()
        print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))

        if args.exclude_data:
            exclude_df = pandas.read_csv(args.exclude_data)
            metadata_dfs["model_selection_exclude"] = exclude_df
            print("Loaded exclude data: %s" % (str(df.shape)))

            df["_key"] = df.allele + "__" + df.peptide
            exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
            df["_excluded"] = df._key.isin(exclude_df._key.unique())
            print("Excluding measurements per allele (counts): ")
            print(df.groupby("allele")._excluded.sum())

            print("Excluding measurements per allele (fractions): ")
            print(df.groupby("allele")._excluded.mean())

            df = df.loc[~df._excluded]
            del df["_excluded"]
            del df["_key"]
            print("Reduced data to: %s" % (str(df.shape)))

        metadata_dfs["model_selection_data"] = df

        df["mass_spec"] = df.measurement_source.str.contains(
            args.mass_spec_regex)
    else:
        df = None

    if args.out_unselected_predictions:
        df["unselected_prediction"] = input_predictor.predict(
            alleles=df.allele.values, peptides=df.peptide.values)
        df.to_csv(args.out_unselected_predictions)
        print("Wrote: %s" % args.out_unselected_predictions)

    selectors = {}
    selector_to_model_selection_kwargs = {}

    def make_selector(
            scoring,
            combined_min_contribution_percent=args.combined_min_contribution_percent):
        if scoring in selectors:
            return (selectors[scoring],
                    selector_to_model_selection_kwargs[scoring])

        start = time.time()
        if scoring.startswith("combined:"):
            model_selection_kwargs = {
                'min_models': args.combined_min_models,
                'max_models': args.combined_max_models,
            }
            component_selectors = []
            for component_selector in scoring.split(":", 1)[1].split(","):
                component_selectors.append(
                    make_selector(component_selector)[0])
            selector = CombinedModelSelector(
                component_selectors,
                min_contribution_percent=combined_min_contribution_percent)
        elif scoring == "mse":
            model_selection_kwargs = {
                'min_models': args.mse_min_models,
                'max_models': args.mse_max_models,
            }
            min_measurements = args.mse_min_measurements
            selector = MSEModelSelector(df=df.loc[~df.mass_spec],
                                        predictor=input_predictor,
                                        min_measurements=min_measurements)
        elif scoring == "mass-spec":
            mass_spec_df = df.loc[df.mass_spec]
            model_selection_kwargs = {
                'min_models': args.mass_spec_min_models,
                'max_models': args.mass_spec_max_models,
            }
            min_measurements = args.mass_spec_min_measurements
            selector = MassSpecModelSelector(df=mass_spec_df,
                                             predictor=input_predictor,
                                             min_measurements=min_measurements)
        elif scoring == "consensus":
            model_selection_kwargs = {
                'min_models': args.consensus_min_models,
                'max_models': args.consensus_max_models,
            }
            selector = ConsensusModelSelector(
                predictor=input_predictor,
                num_peptides_per_length=args.consensus_num_peptides_per_length)
        else:
            raise ValueError("Unsupported scoring method: %s" % scoring)
        print("Instantiated model selector %s in %0.2f sec." %
              (scoring, time.time() - start))
        return (selector, model_selection_kwargs)

    for scoring in args.scoring:
        (selector, model_selection_kwargs) = make_selector(scoring)
        selectors[scoring] = selector
        selector_to_model_selection_kwargs[scoring] = model_selection_kwargs

    unselected_accuracy_scorer = None
    if args.unselected_accuracy_scorer:
        # Force running all selectors by setting combined_min_contribution_percent=0.
        unselected_accuracy_scorer = make_selector(
            args.unselected_accuracy_scorer,
            combined_min_contribution_percent=0.0)[0]
        print("Using unselected accuracy scorer: %s" %
              unselected_accuracy_scorer)
    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer

    print("Selectors for alleles:")
    allele_to_selector = {}
    allele_to_model_selection_kwargs = {}
    for allele in alleles:
        selector = None
        for possible_selector in args.scoring:
            if selectors[possible_selector].usable_for_allele(allele=allele):
                selector = selectors[possible_selector]
                print("%20s %s" % (allele, selector.plan_summary(allele)))
                break
        if selector is None:
            raise ValueError("No selectors usable for allele: %s" % allele)
        allele_to_selector[allele] = selector
        allele_to_model_selection_kwargs[allele] = (
            selector_to_model_selection_kwargs[possible_selector])

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["input_predictor"] = input_predictor
    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
    GLOBAL_DATA["allele_to_selector"] = allele_to_selector
    GLOBAL_DATA[
        "allele_to_model_selection_kwargs"] = allele_to_model_selection_kwargs

    if not os.path.exists(args.out_models_dir):
        print("Attempting to create directory: %s" % args.out_models_dir)
        os.mkdir(args.out_models_dir)
        print("Done.")

    result_predictor = Class1AffinityPredictor(
        metadata_dataframes=metadata_dfs)

    worker_pool = worker_pool_with_gpu_assignments_from_args(args)

    start = time.time()

    if worker_pool is None:
        # Serial run
        print("Running in serial.")
        results = (model_select(allele) for allele in alleles)
    else:
        # Parallel run
        random.shuffle(alleles)
        results = worker_pool.imap_unordered(partial(
            model_select, constant_data=GLOBAL_DATA),
                                             alleles,
                                             chunksize=1)

    unselected_summary = []
    model_selection_dfs = []
    for result in tqdm.tqdm(results, total=len(alleles)):
        pprint(result)

        summary_dict = dict(result)
        summary_dict["retained"] = result["selected"] is not None
        del summary_dict["selected"]

        unselected_summary.append(summary_dict)
        if result['selected'] is not None:
            model_selection_dfs.append(
                result['selected'].metadata_dataframes['model_selection'])
            result_predictor.merge_in_place([result['selected']])

    if model_selection_dfs:
        model_selection_df = pandas.concat(model_selection_dfs,
                                           ignore_index=True)
        model_selection_df["selector"] = model_selection_df.allele.map(
            allele_to_selector)
        result_predictor.metadata_dataframes["model_selection"] = (
            model_selection_df)

    result_predictor.metadata_dataframes["unselected_summary"] = (
        pandas.DataFrame(unselected_summary))

    print("Done model selecting for %d alleles." % len(alleles))
    result_predictor.save(args.out_models_dir)

    model_selection_time = time.time() - start

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    print("Model selection time %0.2f min." % (model_selection_time / 60.0))
    print("Predictor written to: %s" % args.out_models_dir)
Example #37
def test_extra_text_after_allele():
    normalize_allele_name("HLA-A*02:01 zipper")
Example #38
def test_hla_long_names():
    expected = "HLA-A*02:01"
    for name in hla_02_01_names:
        result = normalize_allele_name(name)
        eq_(result, expected)
Example #39
def test_extra_text_after_allele():
    normalize_allele_name("HLA-A*02:01 zipper")