Пример #1
0
    def seqannotation(self, seqrecord, allele, loc):
        """
        Build a complete 'match' Annotation for a sequence record.

        :param seqrecord: Record whose features are extracted with
            ``get_features``.
        :param allele: Allele key used to look up the annotated alignments.
        :param loc: Locus key used to look up the annotated alignments.
        :return: The Annotation built from the record; when
            ``self.alignments`` is truthy its ``aligned`` attribute holds the
            per-feature ``'Seq'`` entries for ``loc``/``allele``.
        :rtype: Annotation
        """
        result = Annotation(annotation=get_features(seqrecord),
                            method='match',
                            complete_annotation=True)

        # Nothing more to attach when alignments are not loaded
        if not self.alignments:
            return result

        result.aligned = {
            feat_name: self.annoated_alignments[loc][allele][feat_name]['Seq']
            for feat_name in self.annoated_alignments[loc][allele].keys()
        }
        return result
Пример #2
0
 def test_002_gfe(self):
     """
     Check that GFE.get_gfe reproduces every expected GFE notation.

     For each entry of ``self.expected['gfe']`` the feature sequences are
     wrapped in SeqRecords, turned into an Annotation and passed to
     ``get_gfe``; every returned feature must be a ``Feature`` and the
     returned notation must equal the expected one.
     """
     # Keep the generator under its own name: the original rebound ``gfe``
     # to the returned notation string, so the second expected entry would
     # have called ``.get_gfe`` on a str.
     gfe_maker = GFE()
     for ex in self.expected['gfe']:
         loc = ex['locus']
         ann = ex['annotation']
         exp = ex['gfe']
         annotation = {
             f: SeqRecord(seq=Seq(ann[f], generic_dna), id="002_gfe")
             for f in ann
         }
         a = Annotation(annotation=annotation)
         features, gfe = gfe_maker.get_gfe(a, loc)
         for feat in features:
             self.assertIsInstance(feat, Feature)
         self.assertEqual(gfe, exp)
Пример #3
0
def gfe_from_allele(allele, gfe_maker):
    """
    Compute the GFE notation and features for one allele record.

    :param allele: Record exposing ``description`` and ``id``; the locus is
        the text before the first ``*`` in the first comma-separated field
        of ``description``.
    :param gfe_maker: Object providing ``get_gfe(annotation, locus)``.
    :return: ``{"name": <gfe>, "features": <features>}`` as returned by
        ``gfe_maker.get_gfe``.
    :rtype: dict
    """
    allele_name = allele.description.split(",")[0]
    locus = allele_name.split("*")[0]

    annotation = Annotation(annotation=get_features(allele),
                            method='match',
                            complete_annotation=True)

    # Announce the lookup first; per the original author this step is slow
    logging.info(f"Getting GFE data for allele {allele.id}...")
    features, gfe = gfe_maker.get_gfe(annotation, locus)

    return dict(name=gfe, features=features)
Пример #4
0
def main():
    """
    Command-line entry point: build the GFE graph CSV files.

    Parses the command-line options, optionally loads the KIR dat file and
    then every requested IMGT/HLA release, runs each allele through
    ``pyGFE.get_gfe`` and ``build_graph``, and finally writes the collected
    node and edge rows as CSV files into the ``--outdir`` directory.

    Kept in a function (rather than at module level) so the file can be
    imported into an interactive session without running the build.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-k",
                        "--kir",
                        required=False,
                        help="Bool for KIR",
                        action='store_true')

    parser.add_argument("-a",
                        "--align",
                        required=False,
                        help="Bool for loading alignments",
                        action='store_true')

    parser.add_argument("-d",
                        "--debug",
                        required=False,
                        help="Bool for debugging",
                        action='store_true')

    parser.add_argument("-o",
                        "--outdir",
                        required=True,
                        help="Output directory",
                        type=str)

    parser.add_argument("-n",
                        "--number",
                        required=False,
                        help="Number of IMGT/DB releases",
                        default=1,
                        type=int)

    parser.add_argument("-r",
                        "--releases",
                        required=False,
                        help="IMGT/DB releases",
                        type=str)

    parser.add_argument("-v",
                        "--verbose",
                        help="Option for running in verbose",
                        action='store_true')

    # Directory of this source file; dat files are stored relative to it
    data_dir = os.path.dirname(__file__)
    args = parser.parse_args()

    outdir = args.outdir

    # Initial value only: load_loci is reassigned below once --kir is known
    load_loci = hla_loci + kir_loci
    release_n = args.number
    releases = args.releases
    verbosity = 1

    align = False
    kir = False
    debug = False
    verbose = False

    if args.kir:
        kir = True

    if args.align:
        align = True

    if args.verbose:
        verbose = True

    if kir:
        load_loci = hla_loci + kir_loci
    else:
        load_loci = hla_loci

    # Debug mode overrides the options above: only HLA-A is loaded, KIR is
    # disabled, logging is verbose and a single release is processed
    if args.debug:
        logging.info("Running in debug mode")
        load_loci = ["HLA-A"]
        kir = False
        debug = True
        verbose = True
        verbosity = 2
        release_n = 1

    # Accumulators for the rows returned by build_graph; each one becomes
    # a DataFrame / CSV file at the end
    gfe_e = []
    seq_e = []
    seq_n = []
    cds_n = []
    grp_e = []
    trs_e = []
    allele_n = []

    # Releases to load: the comma-separated --releases value when given,
    # otherwise the first `release_n` entries of the 'Release' column of
    # the first HTML table found at `imgt_hla`
    if releases:
        dbversions = [db for db in releases.split(",")]
    else:
        dbversions = pd.read_html(imgt_hla)[0]['Release'][0:release_n].tolist()

    # Get latest IMGT/KIR release (column 0, row 1 of the first HTML table
    # at `imgt_kir`)
    # NOTE(review): this network fetch happens even when --kir is not set,
    # although kir_release is only used inside the KIR branch below.
    kir_release = pd.read_html(imgt_kir)[0][0][1]

    gfe_maker = pyGFE(verbose=verbose,
                      verbosity=verbosity,
                      load_features=True,
                      store_features=True,
                      loci=load_loci)

    if kir:
        if verbose:
            logging.info("Adding KIR to GFE DB")

        kir_file = data_dir + '/../data/KIR.dat'

        if align:
            aligned = kir_alignments()

        # Download the KIR dat file only if it is not already on disk
        if not os.path.isfile(kir_file):
            if verbose:
                logging.info("Downloading KIR dat file from " + kir_url)
            urllib.request.urlretrieve(kir_url, kir_file)

        kir_gen = SeqIO.parse(kir_file, "imgt")
        # NOTE(review): presumably SeqIO.parse is lazy, in which case this
        # message is logged before any record has been read - confirm.
        if verbose:
            logging.info("Finished parsing KIR dat file")

        # Counts loaded alleles; not read anywhere in the KIR loop
        i = 0
        for allele in kir_gen:
            if hasattr(allele, 'seq'):
                # Locus = text before '*' in the first comma-separated
                # field of the description
                loc = allele.description.split(",")[0].split("*")[0]
                if loc in kir_loci and len(str(allele.seq)) > 5:
                    if verbose:
                        logging.info("KIR = " +
                                     allele.description.split(",")[0] + " " +
                                     kir_release)

                    groups = []
                    complete_annotation = get_features(allele)
                    # Feature names containing "/" are treated as ambiguous
                    ambigs = [
                        a for a in complete_annotation if re.search("/", a)
                    ]

                    aligned_seq = ''
                    if align:
                        if allele.description.split(",")[0] in aligned[loc]:
                            aligned_seq = aligned[loc][
                                allele.description.split(",")[0]]

                    if ambigs:
                        logging.info("AMBIGS " +
                                     allele.description.split(",")[0] + " " +
                                     kir_release)
                        annotations = []
                        # Each ambiguous name "X_n/m" yields two candidate
                        # annotations that share all unambiguous features:
                        # one keyed by the part before "/", one keyed by
                        # the prefix before "_" joined with the part after
                        # "/"
                        for ambig in ambigs:
                            logging.info("AMBIG = " + ambig)
                            aterm = ambig.split("/")[0].split("_")[0]
                            anno = {
                                a: complete_annotation[a]
                                for a in complete_annotation if a not in ambigs
                            }
                            anno.update({
                                ambig.split("/")[0]:
                                complete_annotation[ambig]
                            })
                            annotations.append(anno)

                            anno2 = {
                                a: complete_annotation[a]
                                for a in complete_annotation if a not in ambigs
                            }
                            anno2.update({
                                aterm + "_" + ambig.split("/")[1]:
                                complete_annotation[ambig]
                            })
                            annotations.append(anno2)

                        # Build graph rows for every candidate annotation
                        for annotation in annotations:
                            ann = Annotation(annotation=annotation,
                                             method='match',
                                             complete_annotation=True)

                            features, gfe = gfe_maker.get_gfe(ann, loc)
                            (allelenode, gfeedge, seq_nodes, cds_nodes,
                             seq_edges, trans_edge, grp_edges) = build_graph(
                                 groups, gfe, allele, features, kir_release,
                                 aligned_seq, '', '', "IMGT_KIR", align)

                            gfe_e += gfeedge
                            seq_e += seq_edges
                            seq_n += seq_nodes
                            allele_n += allelenode
                            grp_e += grp_edges
                            trs_e += trans_edge
                            cds_n += cds_nodes
                        i += 1

                    else:
                        # No ambiguous features: one annotation per allele
                        ann = Annotation(annotation=complete_annotation,
                                         method='match',
                                         complete_annotation=True)
                        features, gfe = gfe_maker.get_gfe(ann, loc)

                        (allelenode, gfeedge, seq_nodes, cds_nodes, seq_edges,
                         trans_edge, grp_edges) = build_graph(
                             groups, gfe, allele, features, kir_release,
                             aligned_seq, '', '', "IMGT_KIR", align)

                        gfe_e += gfeedge
                        seq_e += seq_edges
                        seq_n += seq_nodes
                        allele_n += allelenode
                        grp_e += grp_edges
                        trs_e += trans_edge
                        cds_n += cds_nodes
                        i += 1

    # Loop through DB versions
    for dbversion in dbversions:

        # Release string with the dots removed; used in the URL, the local
        # file name, and passed to ARD / hla_alignments
        db_striped = ''.join(dbversion.split("."))

        if align:
            gen_aln, nuc_aln, prot_aln = hla_alignments(db_striped)

        ard = ARD(db_striped)

        dat_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' \
                  + db_striped + '/hla.dat'
        dat_file = data_dir + '/hla.' + str(db_striped) + ".dat"

        # Download the dat file only if it is not already on disk
        if not os.path.isfile(dat_file):
            if verbose:
                logging.info("Downloading dat file from " + dat_url)
            urllib.request.urlretrieve(dat_url, dat_file)

        # Strip non-ASCII bytes from the dat file in place using perl
        # NOTE(review): the command is a shell string built by
        # concatenation; a data_dir containing spaces or shell
        # metacharacters would break it, and the exit status is ignored.
        cmd = "perl -p -i -e 's/[^\\x00-\\x7F]//g' " + dat_file
        os.system(cmd)

        a_gen = SeqIO.parse(dat_file, "imgt")
        if verbose:
            logging.info("Finished parsing dat file")

        # Number of alleles loaded for this release (used by the debug
        # filter below)
        i = 0
        for allele in a_gen:
            if hasattr(allele, 'seq'):
                hla_name = allele.description.split(",")[0]
                loc = allele.description.split(",")[0].split("*")[0]
                if hla_name in skip_alleles:
                    logging.info("SKIPPING = " +
                                 allele.description.split(",")[0] + " " +
                                 dbversion)
                    continue

                # In debug mode, skip non-HLA-A alleles once more than 20
                # alleles have been loaded
                # NOTE(review): `and` means HLA-A alleles are never capped
                # and other loci pass until i > 20 - confirm this is the
                # intended filter.
                if (debug and (loc != "HLA-A" and i > 20)):
                    continue

                if (loc in hla_loci
                        or loc == "DRB5") and (len(str(allele.seq)) > 5):
                    if verbose:
                        logging.info("HLA = " +
                                     allele.description.split(",")[0] + " " +
                                     dbversion)

                    # Allele name without the text before the first "-"
                    a_name = allele.description.split(",")[0].split("-")[1]
                    # One [reduced name, group type] pair per ARD group
                    # type whose reduction differs from the allele name;
                    # unchanged reductions yield None and are filtered out
                    groups = [["HLA-" + ard.redux(a_name, grp), grp]
                              if ard.redux(a_name, grp) != a_name else None
                              for grp in ard_groups]
                    seco = [[to_second(a_name), "2nd_FIELD"]]
                    groups = list(filter(None, groups)) + seco
                    complete_annotation = get_features(allele)
                    ann = Annotation(annotation=complete_annotation,
                                     method='match',
                                     complete_annotation=True)
                    features, gfe = gfe_maker.get_gfe(ann, loc)

                    # Aligned sequences default to '' and are filled from
                    # gen_aln / nuc_aln / prot_aln only when --align is set
                    aligned_gen = ''
                    aligned_nuc = ''
                    aligned_prot = ''

                    if align:
                        if allele.description.split(",")[0] in gen_aln[loc]:
                            aligned_gen = gen_aln[loc][
                                allele.description.split(",")[0]]

                        if allele.description.split(",")[0] in nuc_aln[loc]:
                            aligned_nuc = nuc_aln[loc][
                                allele.description.split(",")[0]]

                        if allele.description.split(",")[0] in prot_aln[loc]:
                            aligned_prot = prot_aln[loc][
                                allele.description.split(",")[0]]

                    (allelenode, gfeedge, seq_nodes, cds_nodes, seq_edges,
                     trans_edge, grp_edges) = build_graph(
                         groups, gfe, allele, features, dbversion, aligned_gen,
                         aligned_nuc, aligned_prot, "IMGT_HLA", align)

                    gfe_e += gfeedge
                    seq_e += seq_edges
                    seq_n += seq_nodes
                    allele_n += allelenode
                    grp_e += grp_edges
                    trs_e += trans_edge
                    cds_n += cds_nodes
                    i += 1
        if verbose:
            logging.info("Finished loading IMGT DB " + dbversion)
    if verbose:
        logging.info("Finished loading ALL DB versions")
    # Turn the accumulated rows into DataFrames. The column names are
    # written verbatim as the CSV headers (presumably the header syntax of
    # a graph-database bulk importer - confirm with the consumer).
    gfe_df = pd.DataFrame(
        gfe_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgt_release,:TYPE".split(
            ","))
    seq_df = pd.DataFrame(
        seq_e,
        columns=
        ":START_ID(ALLELE),:END_ID(SEQUENCE),imgt_release,accession,:TYPE".
        split(","))
    seqn_df = pd.DataFrame(
        seq_n,
        columns=
        "sequenceId:ID(SEQUENCE),sequence,name,feature:LABEL,rank,length,seq:string[]"
        .split(","))
    allele_df = pd.DataFrame(
        allele_n,
        columns="alleleId:ID(ALLELE),name,alleletype:LABEL,locus".split(","))
    group_df = pd.DataFrame(
        grp_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgtdb,:TYPE".split(","))
    cdsn_df = pd.DataFrame(
        cds_n,
        columns="cdsId:ID(CDS),name,cdstype:LABEL,cds,protein".split(","))
    trs_df = pd.DataFrame(
        trs_e, columns=":START_ID(SEQUENCE),:END_ID(CDS),:TYPE".split(","))

    # Log the row count of every table
    if verbose:
        gfe_es = str(len(gfe_df))
        seq_es = str(len(seq_df))
        seq_ns = str(len(seqn_df))
        all_ns = str(len(allele_df))
        grp_es = str(len(group_df))
        cds_ns = str(len(cdsn_df))
        # "CDS Edges" is the row count of trs_df (sequence -> CDS edges)
        cds_es = str(len(trs_df))
        logging.info("GFE Edges    = " + gfe_es)
        logging.info("Seq Edges    = " + seq_es)
        logging.info("Group Edges  = " + grp_es)
        logging.info("CDS Edges    = " + cds_es)
        logging.info("Seq Nodes    = " + seq_ns)
        logging.info("CDS Nodes    = " + cds_ns)
        logging.info("Allele Nodes = " + all_ns)

    # Write one CSV per table into the output directory
    gfe_df.to_csv(outdir + "/gfe_edges.csv", header=True, index=False)
    seq_df.to_csv(outdir + "/seq_edges.csv", header=True, index=False)
    seqn_df.to_csv(outdir + "/sequence_nodes.csv", header=True, index=False)
    allele_df.to_csv(outdir + "/allele_nodes.csv", header=True, index=False)
    cdsn_df.to_csv(outdir + "/cds_nodes.csv", header=True, index=False)
    group_df.to_csv(outdir + "/group_edges.csv", header=True, index=False)
    trs_df.to_csv(outdir + "/cds_edges.csv", header=True, index=False)

    if verbose:
        logging.info("** Finshed build **")
Пример #5
0
def align_seqs(found_seqs,
               sequence,
               locus,
               start_pos,
               missing,
               annotated,
               cutoff=0.90,
               verbose=False,
               verbosity=0):
    """
    align_seqs - Aligns sequences with clustalo

    The reference record and the input record are aligned with Clustal
    Omega (through temporary files for inputs longer than 7000 positions,
    through a pipe otherwise). The aligned rows are then passed to
    ``count_diffs``, ``find_features`` and ``resolve_feats``.

    :param found_seqs: The reference sequence record (its ``id`` and
        ``seq`` are read)
    :type found_seqs: SeqRecord
    :param sequence: The input consensus sequence.
    :type sequence: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: ``str``
    :param start_pos: Where the reference sequence starts
    :type start_pos: ``int``
    :param missing: The unmapped features (its ``keys()`` are read)
    :type missing: ``dict``
    :param annotated: dictionary of the annotated features
    :type annotated: ``dict``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(annotation, insertions, deletions)``; the counts are 0
        whenever ``count_diffs`` rejected the alignment or no alignment
        rows were produced.
    :rtype: ``tuple``
    """
    logger = logging.getLogger("Logger." + __name__)

    if verbose and verbosity > 0:
        logger.info("found_seqs length = " + str(len(found_seqs)))
        logger.info("sequence length = " + str(len(sequence)))

    seqs = [found_seqs, sequence]
    align = []

    # piping to clustalo failed
    # when sequences were over ~7k bp
    if len(sequence) > 7000:

        # Writing sequences out to fasta files..
        if verbose:
            logger.info("Sequence too large to use pipe")
        randid = randomid()
        input_fasta = str(randid) + ".fasta"
        output_clu = str(randid) + ".clu"
        try:
            SeqIO.write(seqs, input_fasta, "fasta")
            clustalomega_cline = ClustalOmegaCommandline(infile=input_fasta,
                                                         outfile=output_clu,
                                                         outfmt='clu',
                                                         wrap=20000,
                                                         verbose=True,
                                                         auto=True)
            clustalomega_cline()
            aligns = AlignIO.read(output_clu, "clustal")
            for aln in aligns:
                align.append(str(aln.seq))
        finally:
            # Delete the temporary files even when clustalo or the parser
            # raises, so failed runs do not leave files behind.
            # NOTE(review): assumes cleanup() tolerates files that were
            # never created - confirm against its implementation.
            cleanup(randid)
    else:
        # Running clustalo by piping in sequences
        indata = flatten([[">" + str(s.id), str(s.seq)] for s in seqs])
        child = Popen([
            'clustalo', '--outfmt', 'clu', '--wrap=50000', '--auto', '-i', '-'
        ],
                      stdout=PIPE,
                      stderr=STDOUT,
                      stdin=PIPE)

        # communicate() feeds stdin, drains stdout and waits for the
        # process to exit, so no separate wait()/terminate() is needed
        stdout = child.communicate(input=str.encode("\n".join(indata)))

        lines = bytes.decode(stdout[0]).split("\n")
        for line in lines:
            # Keep only "<id> <aligned row>" lines; skip the CLUSTAL header,
            # blank lines and the conservation line
            if re.search(r"\w", line) and not re.search("CLUSTAL", line):
                alignment = re.findall(r"[\S']+", line)
                if len(alignment) == 2:
                    align.append(list(alignment[1]))

    # Report that clustalo produced at least one aligned row
    if verbose and len(align) > 0:
        logger.info("* ClustalOmega alignment succeeded *")

    insers, dels = 0, 0
    all_features = []
    if len(align) == 2:
        infeats = get_seqfeat(seqs[0])
        diffs = count_diffs(align, infeats, sequence, locus, cutoff, verbose,
                            verbosity)
        # count_diffs returns an (incomplete) Annotation when the alignment
        # is rejected, otherwise the (insertions, deletions) counts
        if isinstance(diffs, Annotation):
            if verbose:
                logger.info("Run alignment with " + found_seqs.id)
                logger.info("***********************")
            return diffs, 0, 0
        else:
            insers, dels = diffs[0], diffs[1]
        f = find_features(infeats, align[0], annotated, start_pos, cutoff)
        all_features.append(f)
    else:
        # With any other number of rows, features are looked up for all
        # but the last two rows (nothing at all for fewer than three rows)
        for i in range(0, len(align) - 2):
            infeats = get_seqfeat(seqs[i])
            f = find_features(infeats, align[i], annotated, start_pos, cutoff)
            all_features.append(f)

    if len(all_features) > 0:
        if verbose:
            logger.info("-- Resolving features -- ")
            for f in all_features[0]:
                logger.info("Resolving -> " + f)

        # Last row is the input sequence, first row is the reference
        annotation = resolve_feats(all_features, align[len(align) - 1],
                                   align[0], start_pos, locus, missing,
                                   verbose, verbosity)
        if verbose:
            logger.info("Run alignment with " + found_seqs.id)
            logger.info("Missing features = " + ",".join(list(missing.keys())))
            logger.info("Number of features found = " + str(len(all_features)))
            logger.info("Features found = " +
                        ",".join(list(all_features[0].keys())))
            logger.info("Features annotated = " +
                        ",".join(list(annotation.annotation.keys())))
            logger.info("***********************")

        return annotation, insers, dels
    else:
        if verbose:
            logger.info("***********************")
        return Annotation(complete_annotation=False), 0, 0
Пример #6
0
def count_diffs(align,
                feats,
                inseq,
                locus,
                cutoff,
                verbose=False,
                verbosity=0):
    """
    count_diffs - Counts the number of mismatches, gaps, and insertions and then determines if those are within an acceptable range.

    A column with ``-`` in the first row counts as an insertion, a column
    with ``-`` in the second row counts as a deletion, and each run of
    consecutive gap columns counts as one gap. If the rows differ in
    length, the shorter row is treated as padded with ``-``.

    :param align: The alignment; ``align[0]`` and ``align[1]`` are the two
        aligned rows (strings or lists of characters)
    :type align: ``List``
    :param feats: Dictionary of the features
    :type feats: ``dict``
    :param inseq: The input sequence (only its length is used)
    :type inseq: ``str``
    :param locus: The gene locus associated with the sequence.
    :type locus: ``str``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(insertions, deletions)`` when the alignment is accepted,
        otherwise ``Annotation(complete_annotation=False)``.
    :rtype: ``tuple`` or Annotation
    """
    logger = logging.getLogger("Logger." + __name__)

    row_a, row_b = align[0], align[1]
    len_a, len_b = len(row_a), len(row_b)
    nfeats = len(feats)
    l = max(len_a, len_b)

    # Nothing to measure: every ratio below would divide by zero
    if l == 0 or len(inseq) == 0:
        if verbose:
            logger.info(
                "Alignment coverage NOT high enough to return annotation")
        return Annotation(complete_annotation=False)

    mm = 0
    insr = 0
    dels = 0
    gaps = 0
    match = 0
    in_gap = False

    # Counting gaps, mismatches and insertions
    for i in range(l):
        # Pad the shorter row with gaps instead of indexing past its end
        base_a = row_a[i] if i < len_a else "-"
        base_b = row_b[i] if i < len_b else "-"
        if base_a == "-" or base_b == "-":
            if base_a == "-":
                insr += 1
            if base_b == "-":
                dels += 1
            # A run of adjacent gap columns is a single gap
            if not in_gap:
                gaps += 1
            in_gap = True
        else:
            in_gap = False
            if base_a != base_b:
                mm += 1
            else:
                match += 1

    # gper is only reported in the log; do not fail on an empty feature set
    gper = gaps / nfeats if nfeats else 0.0
    delper = dels / l
    iper = insr / l
    mmper = mm / l
    mper = match / l
    mper2 = match / len(inseq)

    if verbose and verbosity > 0:
        logger.info("Features algined = " + ",".join(list(feats.keys())))
        logger.info('{:<22}{:<6d}'.format("Number of feats: ", nfeats))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of gaps: ", gaps,
                                                  gper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of deletions: ",
                                                  dels, delper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of insertions: ",
                                                  insr, iper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of mismatches: ", mm,
                                                  mmper))
        # Matches relative to the alignment length ...
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match,
                                                  mper))
        # ... and relative to the input sequence length
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match,
                                                  mper2))
    indel = iper + delper

    # ** HARD CODED LOGIC ** #
    # Long inputs are accepted on mismatch rate and input coverage alone
    if len(inseq) > 6000 and mmper < .10 and mper2 > .80:
        if verbose:
            logger.info(
                "Alignment coverage high enough to complete annotation 11")
        return insr, dels

    # TODO: These numbers need to be fine tuned
    indel_mm = indel + mper2
    if (indel > 0.5 or mmper > 0.05) and mper2 < cutoff and indel_mm != 1:
        if verbose:
            logger.info(
                "Alignment coverage NOT high enough to return annotation")
        return Annotation(complete_annotation=False)

    if verbose:
        logger.info(
            "Alignment coverage high enough to complete annotation")
    return insr, dels
Пример #7
0
def resolve_feats(feat_list,
                  seqin,
                  seqref,
                  start,
                  locus,
                  missing,
                  verbose=False,
                  verbosity=0):
    """
    resolve_feats - Resolves features from alignments

    :param feat_list: List of the found features; must hold exactly one
        feature dictionary, otherwise an incomplete annotation is returned
    :type feat_list: ``List``
    :param seqin: The aligned input sequence (joined into one string)
    :type seqin: ``str``
    :param seqref: The aligned reference sequence (not used by this
        function)
    :type seqref: ``str``
    :param start: Where the sequence start in the alignment
    :type start: ``int``
    :param locus: The input locus
    :type locus: ``str``
    :param missing: The unmapped features; only features named in it are
        added to the returned annotation
    :type missing: ``List``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :rtype: :ref:`ann`
    """
    structures = get_structures()
    logger = logging.getLogger("Logger." + __name__)
    seq = SeqRecord(seq=Seq("".join(seqin), SingleLetterAlphabet()))

    # Every position 0..len(seq) starts out unmapped (value 1)
    n_positions = len(seq.seq) + 1
    coordinates = dict.fromkeys(range(n_positions), 1)
    mapping = dict.fromkeys(range(n_positions), 1)

    if len(feat_list) > 1:
        if verbose:
            logger.error("resolve_feats error")
        return Annotation(complete_annotation=False)

    features = feat_list[0]
    full_annotation = {}

    # Running count of gap characters removed so far; later coordinates
    # are shifted left by this amount
    gap_total = 0
    first_shift_pending = True

    # Walk the features in their structural order for this locus
    ordered_names = sorted(features.keys(),
                           key=lambda name: structures[locus][name])

    for feat in ordered_names:
        ref_feat = features[feat]
        seqrec = ref_feat.extract(seq)

        # Strip alignment gaps; this is counted for every feature, whether
        # or not it is one of the missing ones
        gapped = str(seqrec.seq)
        if "-" in gapped:
            ungapped = gapped.replace("-", "")
            seqrec.seq = Seq(ungapped, IUPAC.unambiguous_dna)
            gap_total += len(gapped) - len(ungapped)

        # Only missing features that still have sequence are annotated
        if feat not in missing or not seqrec.seq:
            continue

        if first_shift_pending and gap_total > 0:
            # The first feature seen after gaps appear keeps its unshifted
            # start
            sp = ref_feat.location.start + start
            first_shift_pending = False
        else:
            sp = ref_feat.location.start + start - gap_total
        ep = ref_feat.location.end + start - gap_total

        location = FeatureLocation(ExactPosition(sp),
                                   ExactPosition(ep),
                                   strand=1)
        featn = SeqFeature(location, type=ref_feat.type)

        features.update({feat: featn})
        full_annotation.update({feat: seqrec})

        # Mark the covered positions as mapped to this feature
        for pos in range(featn.location.start, featn.location.end):
            coordinates.pop(pos, None)
            mapping[pos] = feat

    blocks = getblocks(coordinates)
    rmapping = {pos + start: mapping[pos] for pos in mapping.keys()}

    # Report the resolved features (or the lack of them)
    if verbose and verbosity > 0 and len(full_annotation.keys()) > 1:
        logger.info("Features resolved:")
        for name in full_annotation:
            logger.info(name)
    elif verbose:
        logger.info("Failed to resolve")

    if not full_annotation:
        if verbose:
            logger.info("Failed to align missing features")
        return Annotation(complete_annotation=False)

    return Annotation(annotation=full_annotation,
                      method="clustalo",
                      features=features,
                      mapping=rmapping,
                      blocks=blocks,
                      seq=seq)
Пример #8
0
    def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
        """
        search_seqs - method for annotating a BioPython sequence without alignment

        :param seqrec: The reference sequence
        :type seqrec: SeqRecord
        :param locus: The gene locus associated with the sequence.
        :type locus: str
        :param in_seq: The input sequence
        :type in_seq: SeqRecord
        :param run: The number of runs that have been done
        :type run: int
        :param partial_ann: A partial annotation from a previous step
        :type partial_ann: :ref:`ann`
        :rtype: :ref:`ann`

        Example usage:

            >>> from Bio.Seq import Seq
            >>> from seqann.seq_search import SeqSearch
            >>> inseq = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
            >>> sqsrch = SeqSearch()
            >>> ann = sqsrch.search_seqs(refseqs, inseq)

        """
        # Extract out the sequences and feature names
        # from the reference sequences

        # The mapped features will be subtracted from seq_covered
        # so the final seq_covered number will reflect the remaining
        # number of base pairs that haven't been mapped.
        #
        # The coordinates and mapping will help determine what positions
        # in the sequence have been mapped and to what features. The
        # missing blocks variable will be generated using these.
        structures = get_structures()
        seq_covered = len(in_seq.seq)
        coordinates = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        mapping = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        ambig_map = {}
        found_feats = {}
        feat_missing = {}

        method = "nt_search" if not partial_ann else partial_ann.method

        # If the partial annotation is provided
        # then make the found_feats equal to
        # what has already been annotated
        feats = get_features(seqrec)
        if partial_ann:

            found_feats = partial_ann.features

            if self.verbose and self.verbosity > 4:
                self.logger.info("Found partial features:")
                for f in found_feats:
                    self.logger.info(f)

            # Skip references that only have features
            # that have already been annoated
            if len([f for f in feats if f in found_feats]) == len(feats):
                if self.verbose:
                    self.logger.info("Skipping incomplete refseq")
                return partial_ann

            if self.verbose and self.verbosity > 1:
                self.logger.info("Using partial annotation | " + locus + " " +
                                 str(len(partial_ann.features)))

            coordinates = dict(
                map(lambda l: [l, 1], [
                    item for sublist in partial_ann.blocks for item in sublist
                ]))
            seq_covered = partial_ann.covered
            mapping = partial_ann.mapping

            if self.verbose and self.verbosity > 2:
                self.logger.info("Partial sequence coverage = " +
                                 str(seq_covered))
                self.logger.info("Partial sequence metho = " + method)

        added_feat = {}
        deleted_coords = {}
        for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

            # skip if partial annotation is provided
            # and the feat name is not one of the
            # missing features
            if partial_ann and feat_name not in partial_ann.refmissing:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Skipping " + feat_name +
                                     " - Already annotated")
                continue

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running seqsearch for " + feat_name)

            # Search for the reference feature sequence in the
            # input sequence. Record the coordinates if it's
            # found and if it's found in multiple spots. If it
            # is not found, then record that feature as missing.
            seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

            if len(seq_search) == 2:

                if self.verbose and self.verbosity > 0:
                    self.logger.info("Found exact match for " + feat_name)

                seq_covered -= len(str(feats[feat_name]))
                end = int(len(str(feats[feat_name])) + seq_search[1])

                if feat_name == 'three_prime_UTR' \
                        and len(str(in_seq.seq)) > end:
                    end = len(str(in_seq.seq))

                # If the feature is found and it's a five_prime_UTR then
                # the start should always be 0, so insertions at the
                # beginning of the sequence will be found.
                start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
                si = seq_search[1]+1 if seq_search[1] != 0 and \
                    feat_name != 'five_prime_UTR' else 0

                # Check if this feature has already been mapped
                mapcheck = set(
                    [0 if i in coordinates else 1 for i in range(si, end + 1)])

                # Don't map features if they are out of order
                skip = False
                if found_feats and len(found_feats) > 0:
                    for f in found_feats:
                        o1 = structures[locus][feat_name]
                        o2 = structures[locus][f]
                        loctyp = loctype(found_feats[f].location.start,
                                         found_feats[f].location.end, start,
                                         end)

                        if o1 < o2 and loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)
                        elif o2 < o1 and not loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)

                if 1 not in mapcheck and not skip:
                    for i in range(si, end + 1):
                        if i in coordinates:
                            if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                                deleted_coords.update({i: coordinates[i]})
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " + locus +
                                    " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                        added_feat.update({feat_name: feats[feat_name]})
                    if self.verbose and self.verbosity > 3:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " + str(end))

            elif (len(seq_search) > 2):
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Found " + str(len(seq_search)) +
                                     " matches for " + feat_name)

                new_seq = [seq_search[0]]
                for i in range(1, len(seq_search)):
                    tnp = seq_search[i] + 1
                    if seq_search[i] in coordinates or tnp in coordinates:
                        new_seq.append(seq_search[i])

                seq_search = new_seq
                if (partial_ann and feat_name == "exon_8" and run > 0):
                    missing_feats = sorted(list(partial_ann.missing.keys()))

                    # * HARD CODED LOGIC * #
                    # > exon8 in class I maps to multiple spots in a sequence,
                    #   often in the 3' UTR. These features need to be mapped
                    #   last to make sure it's not mapping exon8 incorrectly.
                    if (missing_feats == ['exon_8', 'three_prime_UTR']
                            and len(seq_search) <= 3):
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Resolving exon_8")

                        seq_covered -= len(str(feats[feat_name]))
                        end = int(len(str(feats[feat_name])) + seq_search[1])

                        # feat_name is exon_8 in this branch, so the
                        # five_prime_UTR special case (forcing start to 0)
                        # does not apply; start at the match position.
                        start = seq_search[1]
                        si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                        # check if this features has already been mapped
                        mapcheck = set([
                            0 if i in coordinates else 1
                            for i in range(si, end + 1)
                        ])

                        for i in range(si, end + 1):
                            if i in coordinates:
                                del coordinates[i]
                            else:
                                if self.verbose:
                                    self.logger.error(
                                        "seqsearch - should't be here " +
                                        locus + " - " + " - " + feat_name)
                            mapping[i] = feat_name

                        found_feats.update({
                            feat_name:
                            SeqFeature(FeatureLocation(ExactPosition(start),
                                                       ExactPosition(end),
                                                       strand=1),
                                       type=feat_name)
                        })

                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Coordinates | Start = " +
                                             str(start) + " - End = " +
                                             str(end))
                    else:
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Adding ambig feature " +
                                             feat_name)
                        feat_missing.update({feat_name: feats[feat_name]})
                        ambig_map.update(
                            {feat_name: seq_search[1:len(seq_search)]})
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("No match for " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})

        blocks = getblocks(coordinates)
        exact_matches = list(found_feats.keys())

        # * HARD CODED LOGIC * #
        # >
        #
        #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
        if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
                and 'exon_2' in feat_missing
                and (len(blocks) == 1 or len(blocks) == 2)):

            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    featname = "intron_3"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "exon_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class DRB1 II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # If it's a class II sequence and
        # exon_2 is an exact match
        # * HARD CODED LOGIC * #
        # > It's common for exon2 to be fully sequenced
        #   but intron_2 and intron_1 to be partially sequenced,
        #   which can make it hard to annotate those to features.
        #   If there are two missing blocks that is small enough
        #   and they are before and after exon2, then it's very
        #   very likely to be intron_2 and intron_1.
        if 'exon_2' in exact_matches and len(blocks) == 2 \
                and is_classII(locus) and seq_covered < 300:

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running search for class II sequence")

            r = True
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    x = b[0] - 1
                else:
                    x += 1
                f = mapping[x]
                if f != 'exon_2':
                    r = False
            if r:
                for b in blocks:
                    x = b[len(b) - 1]
                    if x == max(list(mapping.keys())):
                        featname = "intron_2"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    else:
                        featname = "intron_1"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        annotated_feats, mb, mapping = self._resolve_unmapped(
            blocks, feat_missing, ambig_map, mapping, found_feats, locus,
            seq_covered)

        # * HARD CODED LOGIC * #
        if (not mb and blocks and len(feat_missing.keys()) == 0
                and len(ambig_map.keys()) == 0):
            mb = blocks

        if mb:

            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                    and 'exon_8' in exact_matches:
                for i in deleted_coords:
                    mapping[i] = 1
                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)

                # Delete from found features
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']

                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            refmissing = [
                f for f in structures[locus] if f not in annotated_feats
            ]

            if self.verbose and self.verbosity > 1:
                self.logger.info("* Annotation not complete *")

            # Print out what features were missing by the ref
            if self.verbose and self.verbosity > 2:
                self.logger.info("Refseq was missing these features = " +
                                 ",".join(list(refmissing)))

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 1 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    blocks=mb,
                                    method=method,
                                    refmissing=refmissing,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)
        else:

            mb = None
            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                    and 'exon_8' in exact_matches \
                    and 'three_prime_UTR' in annotated_feats\
                    and 'three_prime_UTR' not in exact_matches:

                for i in deleted_coords:
                    mapping[i] = 1

                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']
                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            if self.verbose:
                self.logger.info("* No missing blocks after seq_search *")

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 0 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    method=method,
                                    blocks=mb,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)

        return annotation