Example #1
File: hla_file.py Project: xflicsu/immuno
def read_hla_file(path, permissive_parsing=True):
    """
    Read in HLA alleles and normalize them, returning a list of HLA allele
    names.
    """
    assert path.endswith(".hla"), \
        "Expected HLA file %s to end with suffix .hla" % path

    logging.info("Reading HLA file %s", path)
    alleles = []
    with open(path, 'r') as f:
        contents = f.read()
        for line in contents.split("\n"):
            for raw_allele in line.split(","):
                if permissive_parsing:
                    # get rid of surrounding whitespace
                    raw_allele = raw_allele.strip()
                    # sometimes we get extra columns with scores,
                    # ignore those
                    raw_allele = raw_allele.split(" ")[0]
                    raw_allele = raw_allele.split("\t")[0]
                    raw_allele = raw_allele.split("'")[0]
                if len(raw_allele) > 0:
                    alleles.append(normalize_hla_allele_name(raw_allele))
    return alleles
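A minimal usage sketch for the function above (the file contents, the extra score column, and the resulting allele spellings are illustrative assumptions, not taken from the project's docs):

# hypothetical input file: comma/newline separated alleles, one with a trailing score column
with open("patient1.hla", "w") as f:
    f.write("HLA-A*02:01, HLA-B*07:02 0.93\n")
    f.write("hla-c07:02\n")

# permissive parsing strips whitespace and the trailing score column,
# then each surviving token is passed through normalize_hla_allele_name
alleles = read_hla_file("patient1.hla")
print(alleles)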
Example #2
def read_hla_file(path, permissive_parsing=True):
    """
    Read in HLA alleles and normalize them, returning a list of HLA allele
    names.
    """
    assert path.endswith(".hla"), \
        "Expected HLA file %s to end with suffix .hla" % path

    logging.info("Reading HLA file %s", path)
    alleles = []
    with open(path, 'r') as f:
        contents = f.read()
        for line in contents.split("\n"):
            for raw_allele in line.split(","):
                if permissive_parsing:
                    # get rid of surrounding whitespace
                    raw_allele = raw_allele.strip()
                    # sometimes we get extra columns with scores,
                    # ignore those
                    raw_allele = raw_allele.split(" ")[0]
                    raw_allele = raw_allele.split("\t")[0]
                    raw_allele = raw_allele.split("'")[0]
                if len(raw_allele) > 0:
                    alleles.append(
                        normalize_hla_allele_name(
                            raw_allele))
    return alleles
Example #3
    def __init__(
            self,
            hla_alleles,
            netmhc_command = "netMHCcons"):
        self.netmhc_command = netmhc_command

        try:
            subprocess.check_output([self.netmhc_command],
                stderr=subprocess.STDOUT)
        except:
            assert False, "Failed to run %s" % self.netmhc_command

        # normalize alleles and keep only unique names
        normalized_alleles = {
            normalize_hla_allele_name(allele.strip().upper()).replace("*", "")
            for allele in hla_alleles
        }

        self.alleles = []

        # try running "netMHCcons -a" with each allele name
        # and check if it gives you back a "wrong format" error
        for allele in normalized_alleles:
            try:
                subprocess.check_output(
                    [self.netmhc_command, '-a', allele],
                    stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as e:
                if "allele" in e.output and "wrong format" in e.output:
                    logging.warning(
                        "Allele %s not recognized by NetMHCcons", allele)
                    continue
            except:
Example #4
    def __init__(
            self,
            hla_alleles,
            netmhc_command = "netMHCpan"):
        self.netmhc_command = netmhc_command

        try:
            run_command([self.netmhc_command])
        except:
            assert False, "Failed to run %s" % self.netmhc_command

        try:
            valid_alleles_str = check_output([self.netmhc_command, "-listMHC"])
            assert len(valid_alleles_str) > 0, \
                "%s returned empty allele list" % self.self.netmhc_command
            valid_alleles = set([])
            for line in valid_alleles_str.split("\n"):
                if not line.startswith("#"):
                    valid_alleles.add(line)
        except:
            logging.warning("Failed to run %s -listMHC", self.netmhc_command)
            valid_alleles = None

        self.alleles = []
        for allele in hla_alleles:
            allele = normalize_hla_allele_name(allele.strip().upper())
            # for some reason netMHCpan drops the "*" in names,
            # e.g. "HLA-A*03:01" becomes "HLA-A03:01"
            if valid_alleles and allele.replace("*", "") not in valid_alleles:
                print("Skipping %s (not available in NetMHCpan)" % allele)
            else:
                self.alleles.append(allele)
        # don't run the MHC predictor twice for homozygous alleles,
        # only run it for unique alleles
        self.alleles = set(self.alleles)
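Example #10 below calls these constructors through PanBindingPredictor and ConsensusBindingPredictor; a sketch of that usage, assuming this __init__ belongs to the PanBindingPredictor class and that normalization maps both spellings below to the same allele:

# hypothetical usage: duplicate spellings of one allele should collapse to a
# single entry after normalization, and alleles rejected by
# "netMHCpan -listMHC" are skipped with a warning
predictor = PanBindingPredictor(["HLA-A*02:01", "hla-a0201", "HLA-B*07:02"])
print(predictor.alleles)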
Example #5
File: ui.py Project: xflicsu/immuno
def run_pipeline(patient_id, score_epitopes):
    """Run the pipeline for this patient, and save the output to the DB as a
    Run."""
    hla_types = HLAType.query.with_entities(HLAType.allele,
        HLAType.mhc_class).filter_by(patient_id=patient_id).all()

    peptide_length = 31
    alleles = [normalize_hla_allele_name(allele)
               for allele, mhc_class in hla_types]

    vcf_df = get_vcf_df(patient_id)
    transcripts_df, vcf_df, variant_report = expand_transcripts(
        vcf_df,
        patient_id,
        min_peptide_length = peptide_length,
        max_peptide_length = peptide_length)

    scored_epitopes = score_epitopes(transcripts_df, alleles)
    imm = ImmunogenicityPredictor(alleles=alleles)
    scored_epitopes = imm.predict(scored_epitopes)

    # TODO(tavi) Make this expansion more robust. It breaks the IEDB predictor,
    # for example.
    short_transcripts_df = transcripts_df[['chr', 'pos', 'ref',
        'alt', 'TranscriptId']]
    scored_epitopes = merge(scored_epitopes, short_transcripts_df,
        on='TranscriptId', how='left')
    peptides = group_epitopes_dataframe(
        scored_epitopes, use_transcript_name = True)

    run = Run(patient_id=patient_id, output=dumps(peptides))
    db.session.add(run)
Example #6
    def __init__(self, hla_alleles, netmhc_command="netMHCpan"):
        self.netmhc_command = netmhc_command

        try:
            run_command([self.netmhc_command])
        except:
            assert False, "Failed to run %s" % self.netmhc_command

        try:
            valid_alleles_str = check_output([self.netmhc_command, "-listMHC"])
            assert len(valid_alleles_str) > 0, \
                "%s returned empty allele list" % self.self.netmhc_command
            valid_alleles = set([])
            for line in valid_alleles_str.split("\n"):
                if not line.startswith("#"):
                    valid_alleles.add(line)
        except:
            logging.warning("Failed to run %s -listMHC", self.netmhc_command)
            valid_alleles = None

        self.alleles = []
        for allele in hla_alleles:
            allele = normalize_hla_allele_name(allele.strip().upper())
            # for some reason netMHCpan drops the "*" in names,
            # e.g. "HLA-A*03:01" becomes "HLA-A03:01"
            if valid_alleles and allele.replace("*", "") not in valid_alleles:
                print("Skipping %s (not available in NetMHCpan)" % allele)
            else:
                self.alleles.append(allele)
        # don't run the MHC predictor twice for homozygous alleles,
        # only run it for unique alleles
        self.alleles = set(self.alleles)
Example #7
def create_binding_result_row(mutation_entry,
                              allele,
                              pos,
                              epitope,
                              log_ic50,
                              ic50,
                              rank,
                              mutation_window_size=None):
    # if we have a bad IC50 score we might still get a salvageable
    # log of the score. Strangely, this is necessary sometimes!
    if invalid_binding_score(ic50):
        ic50 = 50000**(-log_ic50 + 1)
        # if IC50 is still NaN or otherwise invalid, abort
        if invalid_binding_score(ic50):
            logging.warn("Invalid IC50 value %0.4f for %s w/ allele %s", ic50,
                         epitope, allele)
            return None

    if invalid_binding_score(rank) or rank > 100:
        logging.warn("Invalid percentile rank %s for %s w/ allele %s", rank,
                     epitope, allele)
        return None

    if mutation_window_size:
        # if we clipped parts of the amino acid sequence which don't
        # overlap mutations then we have to offset epitope positions by
        # however much was removed from the beginning of the sequence
        original_start = max(
            0, mutation_entry.MutationStart - mutation_window_size)
        pos += original_start

    # keep track of original genetic variant that
    # gave rise to this epitope
    new_row = {}

    # fields shared by all epitopes from this sequence
    new_row['chr'] = mutation_entry.chr
    new_row['pos'] = mutation_entry.pos
    new_row['ref'] = mutation_entry.ref
    new_row['alt'] = mutation_entry.alt

    new_row['SourceSequence'] = mutation_entry.SourceSequence
    new_row['MutationStart'] = mutation_entry.MutationStart
    new_row['MutationEnd'] = mutation_entry.MutationEnd
    new_row['GeneInfo'] = mutation_entry.GeneInfo
    new_row['Gene'] = mutation_entry.Gene
    new_row["GeneMutationInfo"] = mutation_entry.GeneMutationInfo
    new_row['PeptideMutationInfo'] = mutation_entry.PeptideMutationInfo
    new_row['TranscriptId'] = mutation_entry.TranscriptId

    # fields specific to this epitope
    new_row['Allele'] = normalize_hla_allele_name(allele)
    new_row['EpitopeStart'] = pos
    new_row['EpitopeEnd'] = pos + len(epitope)
    new_row['Epitope'] = epitope
    new_row[IC50_FIELD_NAME] = ic50
    new_row[PERCENTILE_RANK_FIELD_NAME] = rank
    return new_row
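The fallback on the invalid-IC50 branch is the inverse of the usual NetMHC log-scaling, in which the reported score is 1 - log(IC50)/log(50000); that this is where log_ic50 comes from is an assumption here, but under it 50000 ** (-log_ic50 + 1) recovers the affinity in nM:

def ic50_from_log_score(log_ic50, max_ic50=50000.0):
    # hypothetical helper: invert score = 1 - log(ic50) / log(max_ic50)
    return max_ic50 ** (1.0 - log_ic50)

ic50_from_log_score(1.0)   # -> 1.0 nM (strong binder)
ic50_from_log_score(0.0)   # -> 50000.0 nM (no measurable binding)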
Example #8
File: ui.py Project: xflicsu/immuno
def create_hla_types(file, patient_id):
    filename = secure_filename(file.filename)
    filepath = join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)
    alleles = read_hla_file(filepath)
    hla_types = []
    for allele in alleles:
        allele_normalized = normalize_hla_allele_name(allele)
        mhc_class = mhc_class_from_normalized_allele_name(allele_normalized)
        hla_type = HLAType(patient_id=patient_id, allele=allele_normalized,
            mhc_class=mhc_class)
        hla_types.append(hla_type)
    return hla_types
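For context, a sketch of how this helper could be wired into a Flask upload view (the route, form field name, and commit logic are hypothetical; only request.files and the file.save call used above are standard Flask/Werkzeug behaviour):

from flask import request

@app.route("/patients/<int:patient_id>/hla", methods=["POST"])
def upload_hla(patient_id):
    # hypothetical view: the real immuno UI may organize this differently
    uploaded_file = request.files["hla_file"]
    hla_types = create_hla_types(uploaded_file, patient_id)
    for hla_type in hla_types:
        db.session.add(hla_type)
    db.session.commit()
    return "added %d HLA alleles" % len(hla_types)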
Example #9
def create_binding_result_row(
        mutation_entry,
        allele,
        pos,
        epitope,
        log_ic50,
        ic50,
        rank,
        mutation_window_size = None):
    # if we have a bad IC50 score we might still get a salvageable
    # log of the score. Strangely, this is necessary sometimes!
    if invalid_binding_score(ic50):
        ic50 = 50000 ** (-log_ic50 + 1)
        # if IC50 is still NaN or otherwise invalid, abort
        if invalid_binding_score(ic50):
            logging.warn(
                "Invalid IC50 value %0.4f for %s w/ allele %s",
                ic50,
                epitope,
                allele)
            return None

    if invalid_binding_score(rank) or rank > 100:
        logging.warn(
            "Invalid percentile rank %s for %s w/ allele %s",
            rank, epitope, allele)
        return None

    if mutation_window_size:
        # if we clipped parts of the amino acid sequence which don't
        # overlap mutations then we have to offset epitope positions by
        # however much was removed from the beginning of the sequence
        original_start = max(
            0,
            mutation_entry.MutationStart - mutation_window_size)
        pos += original_start

    # keep track of original genetic variant that
    # gave rise to this epitope
    new_row = {}

    # fields shared by all epitopes from this sequence
    new_row['chr'] = mutation_entry.chr
    new_row['pos'] = mutation_entry.pos
    new_row['ref'] = mutation_entry.ref
    new_row['alt'] = mutation_entry.alt

    new_row['SourceSequence'] = mutation_entry.SourceSequence
    new_row['MutationStart'] = mutation_entry.MutationStart
    new_row['MutationEnd'] = mutation_entry.MutationEnd
    new_row['GeneInfo'] = mutation_entry.GeneInfo
    new_row['Gene'] = mutation_entry.Gene
    new_row["GeneMutationInfo"] = mutation_entry.GeneMutationInfo
    new_row['PeptideMutationInfo'] = mutation_entry.PeptideMutationInfo
    new_row['TranscriptId'] = mutation_entry.TranscriptId

    # fields specific to this epitope
    new_row['Allele'] = normalize_hla_allele_name(allele)
    new_row['EpitopeStart'] = pos
    new_row['EpitopeEnd'] = pos + len(epitope)
    new_row['Epitope'] = epitope
    new_row[IC50_FIELD_NAME] = ic50
    new_row[PERCENTILE_RANK_FIELD_NAME] = rank
    return new_row
Example #10
        predictor = ConsensusBindingPredictor(alleles)
        return predictor.predict(mutated_regions)
    else:
        predictor = PanBindingPredictor(alleles)
        return predictor.predict(mutated_regions)

if __name__ == '__main__':
    args = parser.parse_args()

    init_logging(args.quiet)

    peptide_length = int(args.vaccine_peptide_length)

    # get rid of gene descriptions if they're in the dataframe
    if args.hla_file:
        alleles = [normalize_hla_allele_name(l) for l in open(args.hla_file)]
    elif args.hla:
        alleles = [normalize_hla_allele_name(l) for l in args.hla.split(",")]
    else:
        alleles = [normalize_hla_allele_name(DEFAULT_ALLELE)]

    # stack up the dataframes and later concatenate in case we
    # want both commandline strings (for weird mutations like translocations)
    # and files
    mutated_region_dfs = []

    if args.string:
        df = load_comma_string(args.string)
        mutated_region_dfs.append(df)

    # loop over all the input files and