Python Annotation примеры использования

Язык программирования: Python

Пространство имен/Пакет: seqann.models.annotation

Класс/Тип: Annotation

Примеров на hotexamples.com: 8

Python Annotation - 8 примеров найдено. Это лучшие примеры Python кода для seqann.models.annotation.Annotation, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Annotation(8)

aligned(1)

Основные методы

Annotation (8)

aligned (1)

Пример #1

Показать файл

Файл: reference_data.py Проект: mhalagan/SeqAnn

    def seqannotation(self, seqrecord, allele, loc):
        """
        Build the Annotation for a sequence that was found in the reference.

        :param seqrecord: Record whose features are extracted with
            ``get_features``.
        :param allele: Allele key into ``self.annoated_alignments[loc]``.
        :param loc: Locus key into ``self.annoated_alignments``.
        :return: Annotation created with ``method='match'`` and
            ``complete_annotation=True``. When ``self.alignments`` is truthy
            its ``aligned`` attribute maps each feature of the allele to the
            ``'Seq'`` entry of its stored alignment.
        :rtype: Annotation
        """
        annotation = Annotation(annotation=get_features(seqrecord),
                                method='match',
                                complete_annotation=True)

        # Nothing more to attach when alignments were not loaded.
        if not self.alignments:
            return annotation

        per_feature = self.annoated_alignments[loc][allele]
        annotation.aligned = {
            feat: entry['Seq'] for feat, entry in per_feature.items()
        }
        return annotation

Пример #2

Показать файл

 def test_002_gfe(self):
     """
     Check that ``GFE.get_gfe`` yields the expected GFE for every fixture.

     Each entry of ``self.expected['gfe']`` supplies a locus, a mapping of
     feature name to sequence string, and the expected GFE value. The
     returned features must all be ``Feature`` instances.
     """
     # Keep the GFE instance under its own name: the original code rebound
     # ``gfe`` to the value returned by get_gfe, so a second fixture entry
     # would have called ``.get_gfe`` on that returned value instead.
     gfe_maker = GFE()
     for ex in self.expected['gfe']:
         loc = ex['locus']
         ann = ex['annotation']
         exp = ex['gfe']
         annotation = {
             feat: SeqRecord(seq=Seq(ann[feat], generic_dna), id="002_gfe")
             for feat in ann
         }
         a = Annotation(annotation=annotation)
         features, gfe = gfe_maker.get_gfe(a, loc)
         for feat in features:
             self.assertIsInstance(feat, Feature)
         self.assertEqual(gfe, exp)

Пример #3

Показать файл

def gfe_from_allele(allele, gfe_maker):
    """
    Compute the GFE name and features for one allele record.

    :param allele: Record providing ``description`` and ``id``; its features
        are extracted with ``get_features``.
    :param gfe_maker: Object exposing ``get_gfe(annotation, locus)`` that
        returns a ``(features, gfe)`` pair.
    :return: ``{"name": gfe, "features": features}``
    :rtype: dict
    """
    # The locus is the text before "*" in the first comma-separated field
    # of the description.
    first_field = allele.description.split(",")[0]
    locus = first_field.split("*")[0]

    annotation = Annotation(annotation=get_features(allele),
                            method='match',
                            complete_annotation=True)

    # This process takes a long time
    logging.info(f"Getting GFE data for allele {allele.id}...")
    features, gfe = gfe_maker.get_gfe(annotation, locus)

    return {"name": gfe, "features": features}

Пример #4

Показать файл

Файл: build_gfedb.py Проект: mhalagan/gfe-db

def main():
    """Build the GFE graph CSV files from IMGT/HLA (and optionally KIR) data.

    Parses the command line, creates a ``pyGFE`` instance, walks the KIR dat
    file (only with ``--kir``) and one ``hla.<release>.dat`` file per
    requested release, calls ``build_graph`` for every accepted allele and
    finally writes seven CSV files (nodes and edges) into ``--outdir``.

    Kept in a separate function so the file can be imported into an
    interactive session without running the build, while still being
    callable for testing.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-k",
                        "--kir",
                        required=False,
                        help="Bool for KIR",
                        action='store_true')

    parser.add_argument("-a",
                        "--align",
                        required=False,
                        help="Bool for loading alignments",
                        action='store_true')

    parser.add_argument("-d",
                        "--debug",
                        required=False,
                        help="Bool for debugging",
                        action='store_true')

    parser.add_argument("-o",
                        "--outdir",
                        required=True,
                        help="Output directory",
                        type=str)

    parser.add_argument("-n",
                        "--number",
                        required=False,
                        help="Number of IMGT/DB releases",
                        default=1,
                        type=int)

    parser.add_argument("-r",
                        "--releases",
                        required=False,
                        help="IMGT/DB releases",
                        type=str)

    parser.add_argument("-v",
                        "--verbose",
                        help="Option for running in verbose",
                        action='store_true')

    # Downloaded dat files are stored relative to this script's directory.
    data_dir = os.path.dirname(__file__)
    args = parser.parse_args()

    outdir = args.outdir

    # NOTE(review): this initial value is always overwritten by the
    # ``if kir`` / ``else`` below.
    load_loci = hla_loci + kir_loci
    release_n = args.number
    releases = args.releases
    verbosity = 1

    align = False
    kir = False
    debug = False
    verbose = False

    if args.kir:
        kir = True

    if args.align:
        align = True

    if args.verbose:
        verbose = True

    if kir:
        load_loci = hla_loci + kir_loci
    else:
        load_loci = hla_loci

    # Debug mode overrides the other options: HLA-A only, KIR disabled,
    # verbose logging at verbosity 2 and a single release.
    if args.debug:
        logging.info("Running in debug mode")
        load_loci = ["HLA-A"]
        kir = False
        debug = True
        verbose = True
        verbosity = 2
        release_n = 1

    # Accumulators for the rows returned by build_graph; each becomes one
    # DataFrame / CSV file at the end.
    gfe_e = []
    seq_e = []
    seq_n = []
    cds_n = []
    grp_e = []
    trs_e = []
    allele_n = []

    # Use the comma-separated --releases list when given; otherwise take the
    # first ``release_n`` values of the 'Release' column from the first table
    # found at ``imgt_hla``.
    if releases:
        dbversions = [db for db in releases.split(",")]
    else:
        dbversions = pd.read_html(imgt_hla)[0]['Release'][0:release_n].tolist()

    # Get latest IMGT/KIR release
    # NOTE(review): this page is fetched even when KIR loading is disabled.
    kir_release = pd.read_html(imgt_kir)[0][0][1]

    gfe_maker = pyGFE(verbose=verbose,
                      verbosity=verbosity,
                      load_features=True,
                      store_features=True,
                      loci=load_loci)

    if kir:
        if verbose:
            logging.info("Adding KIR to GFE DB")

        kir_file = data_dir + '/../data/KIR.dat'

        # ``aligned`` only exists when --align was given; every later use is
        # guarded by ``if align``.
        if align:
            aligned = kir_alignments()

        # Downloading KIR
        if not os.path.isfile(kir_file):
            if verbose:
                logging.info("Downloading KIR dat file from " + kir_url)
            urllib.request.urlretrieve(kir_url, kir_file)

        kir_gen = SeqIO.parse(kir_file, "imgt")
        if verbose:
            logging.info("Finished parsing KIR dat file")

        # NOTE(review): ``i`` is incremented below but never read in the
        # KIR section.
        i = 0
        for allele in kir_gen:
            if hasattr(allele, 'seq'):
                # Locus = text before "*" in the first description field.
                loc = allele.description.split(",")[0].split("*")[0]
                if loc in kir_loci and len(str(allele.seq)) > 5:
                    if verbose:
                        logging.info("KIR = " +
                                     allele.description.split(",")[0] + " " +
                                     kir_release)

                    groups = []
                    complete_annotation = get_features(allele)
                    # Feature names containing "/" are treated as ambiguous.
                    ambigs = [
                        a for a in complete_annotation if re.search("/", a)
                    ]

                    aligned_seq = ''
                    if align:
                        if allele.description.split(",")[0] in aligned[loc]:
                            aligned_seq = aligned[loc][
                                allele.description.split(",")[0]]

                    if ambigs:
                        logging.info("AMBIGS " +
                                     allele.description.split(",")[0] + " " +
                                     kir_release)
                        annotations = []
                        # For every ambiguous name "X_a/b" build two
                        # candidate annotations: the unambiguous features
                        # plus the ambiguous sequence stored under "X_a",
                        # and the same stored under "X_b".
                        for ambig in ambigs:
                            logging.info("AMBIG = " + ambig)
                            aterm = ambig.split("/")[0].split("_")[0]
                            anno = {
                                a: complete_annotation[a]
                                for a in complete_annotation if a not in ambigs
                            }
                            anno.update({
                                ambig.split("/")[0]:
                                complete_annotation[ambig]
                            })
                            annotations.append(anno)

                            anno2 = {
                                a: complete_annotation[a]
                                for a in complete_annotation if a not in ambigs
                            }
                            anno2.update({
                                aterm + "_" + ambig.split("/")[1]:
                                complete_annotation[ambig]
                            })
                            annotations.append(anno2)

                        # Every candidate annotation contributes its own
                        # graph rows for this allele.
                        for annotation in annotations:
                            ann = Annotation(annotation=annotation,
                                             method='match',
                                             complete_annotation=True)

                            features, gfe = gfe_maker.get_gfe(ann, loc)
                            (allelenode, gfeedge, seq_nodes, cds_nodes,
                             seq_edges, trans_edge, grp_edges) = build_graph(
                                 groups, gfe, allele, features, kir_release,
                                 aligned_seq, '', '', "IMGT_KIR", align)

                            gfe_e += gfeedge
                            seq_e += seq_edges
                            seq_n += seq_nodes
                            allele_n += allelenode
                            grp_e += grp_edges
                            trs_e += trans_edge
                            cds_n += cds_nodes
                        i += 1

                    else:
                        ann = Annotation(annotation=complete_annotation,
                                         method='match',
                                         complete_annotation=True)
                        features, gfe = gfe_maker.get_gfe(ann, loc)

                        (allelenode, gfeedge, seq_nodes, cds_nodes, seq_edges,
                         trans_edge, grp_edges) = build_graph(
                             groups, gfe, allele, features, kir_release,
                             aligned_seq, '', '', "IMGT_KIR", align)

                        gfe_e += gfeedge
                        seq_e += seq_edges
                        seq_n += seq_nodes
                        allele_n += allelenode
                        grp_e += grp_edges
                        trs_e += trans_edge
                        cds_n += cds_nodes
                        i += 1

    # Loop through DB versions
    for dbversion in dbversions:

        # Release string with the dots removed; used for the alignments, ARD
        # and the dat file URL and name below.
        db_striped = ''.join(dbversion.split("."))

        if align:
            gen_aln, nuc_aln, prot_aln = hla_alignments(db_striped)

        ard = ARD(db_striped)

        dat_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' \
                  + db_striped + '/hla.dat'
        dat_file = data_dir + '/hla.' + str(db_striped) + ".dat"

        # Downloading DAT file
        if not os.path.isfile(dat_file):
            if verbose:
                logging.info("Downloading dat file from " + dat_url)
            urllib.request.urlretrieve(dat_url, dat_file)

        # Strip every non-ASCII byte from the dat file in place.
        # NOTE(review): the shell command is built by string concatenation
        # and ``db_striped`` can come from --releases; the exit status of
        # os.system is ignored.
        cmd = "perl -p -i -e 's/[^\\x00-\\x7F]//g' " + dat_file
        os.system(cmd)

        a_gen = SeqIO.parse(dat_file, "imgt")
        if verbose:
            logging.info("Finished parsing dat file")

        # Number of alleles loaded from this release so far.
        i = 0
        for allele in a_gen:
            if hasattr(allele, 'seq'):
                hla_name = allele.description.split(",")[0]
                loc = allele.description.split(",")[0].split("*")[0]
                if hla_name in skip_alleles:
                    logging.info("SKIPPING = " +
                                 allele.description.split(",")[0] + " " +
                                 dbversion)
                    continue

                # In debug mode, skip non-HLA-A records once more than 20
                # alleles have been loaded.
                if (debug and (loc != "HLA-A" and i > 20)):
                    continue

                if (loc in hla_loci
                        or loc == "DRB5") and (len(str(allele.seq)) > 5):
                    if verbose:
                        logging.info("HLA = " +
                                     allele.description.split(",")[0] + " " +
                                     dbversion)

                    # Allele name without the part before the first "-".
                    a_name = allele.description.split(",")[0].split("-")[1]
                    # One [reduced name, group] pair per ARD group whose
                    # reduction differs from the allele name; None entries
                    # are filtered out below.
                    groups = [["HLA-" + ard.redux(a_name, grp), grp]
                              if ard.redux(a_name, grp) != a_name else None
                              for grp in ard_groups]
                    seco = [[to_second(a_name), "2nd_FIELD"]]
                    groups = list(filter(None, groups)) + seco
                    complete_annotation = get_features(allele)
                    ann = Annotation(annotation=complete_annotation,
                                     method='match',
                                     complete_annotation=True)
                    features, gfe = gfe_maker.get_gfe(ann, loc)

                    # Aligned sequences default to '' and are only looked up
                    # in gen_aln / nuc_aln / prot_aln when --align was given.
                    aligned_gen = ''
                    aligned_nuc = ''
                    aligned_prot = ''

                    if align:
                        if allele.description.split(",")[0] in gen_aln[loc]:
                            aligned_gen = gen_aln[loc][
                                allele.description.split(",")[0]]

                        if allele.description.split(",")[0] in nuc_aln[loc]:
                            aligned_nuc = nuc_aln[loc][
                                allele.description.split(",")[0]]

                        if allele.description.split(",")[0] in prot_aln[loc]:
                            aligned_prot = prot_aln[loc][
                                allele.description.split(",")[0]]

                    (allelenode, gfeedge, seq_nodes, cds_nodes, seq_edges,
                     trans_edge, grp_edges) = build_graph(
                         groups, gfe, allele, features, dbversion, aligned_gen,
                         aligned_nuc, aligned_prot, "IMGT_HLA", align)

                    gfe_e += gfeedge
                    seq_e += seq_edges
                    seq_n += seq_nodes
                    allele_n += allelenode
                    grp_e += grp_edges
                    trs_e += trans_edge
                    cds_n += cds_nodes
                    i += 1
        if verbose:
            logging.info("Finished loading IMGT DB " + dbversion)
    if verbose:
        logging.info("Finished loading ALL DB versions")
    # Turn the accumulated rows into DataFrames; the column names are the
    # comma-separated header strings split into lists.
    gfe_df = pd.DataFrame(
        gfe_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgt_release,:TYPE".split(
            ","))
    seq_df = pd.DataFrame(
        seq_e,
        columns=
        ":START_ID(ALLELE),:END_ID(SEQUENCE),imgt_release,accession,:TYPE".
        split(","))
    seqn_df = pd.DataFrame(
        seq_n,
        columns=
        "sequenceId:ID(SEQUENCE),sequence,name,feature:LABEL,rank,length,seq:string[]"
        .split(","))
    allele_df = pd.DataFrame(
        allele_n,
        columns="alleleId:ID(ALLELE),name,alleletype:LABEL,locus".split(","))
    group_df = pd.DataFrame(
        grp_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgtdb,:TYPE".split(","))
    cdsn_df = pd.DataFrame(
        cds_n,
        columns="cdsId:ID(CDS),name,cdstype:LABEL,cds,protein".split(","))
    trs_df = pd.DataFrame(
        trs_e, columns=":START_ID(SEQUENCE),:END_ID(CDS),:TYPE".split(","))

    if verbose:
        gfe_es = str(len(gfe_df))
        seq_es = str(len(seq_df))
        seq_ns = str(len(seqn_df))
        all_ns = str(len(allele_df))
        grp_es = str(len(group_df))
        cds_ns = str(len(cdsn_df))
        # "CDS Edges" is the row count of trs_df (SEQUENCE -> CDS edges).
        cds_es = str(len(trs_df))
        logging.info("GFE Edges    = " + gfe_es)
        logging.info("Seq Edges    = " + seq_es)
        logging.info("Group Edges  = " + grp_es)
        logging.info("CDS Edges    = " + cds_es)
        logging.info("Seq Nodes    = " + seq_ns)
        logging.info("CDS Nodes    = " + cds_ns)
        logging.info("Allele Nodes = " + all_ns)

    # Write one CSV per DataFrame into the output directory.
    gfe_df.to_csv(outdir + "/gfe_edges.csv", header=True, index=False)
    seq_df.to_csv(outdir + "/seq_edges.csv", header=True, index=False)
    seqn_df.to_csv(outdir + "/sequence_nodes.csv", header=True, index=False)
    allele_df.to_csv(outdir + "/allele_nodes.csv", header=True, index=False)
    cdsn_df.to_csv(outdir + "/cds_nodes.csv", header=True, index=False)
    group_df.to_csv(outdir + "/group_edges.csv", header=True, index=False)
    trs_df.to_csv(outdir + "/cds_edges.csv", header=True, index=False)

    if verbose:
        logging.info("** Finshed build **")

Пример #5

Показать файл

def align_seqs(found_seqs,
               sequence,
               locus,
               start_pos,
               missing,
               annotated,
               cutoff=0.90,
               verbose=False,
               verbosity=0):
    """
    align_seqs - Aligns sequences with clustalo

    :param found_seqs: The reference sequence record (its ``id``, ``seq`` and features are used)
    :type found_seqs: SeqRecord
    :param sequence: The input consensus sequence.
    :type sequence: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: ``str``
    :param annotated: dictonary of the annotated features
    :type annotated: ``dict``
    :param start_pos: Where the reference sequence starts
    :type start_pos: ``int``
    :param missing: Mapping of the unmapped features (its keys are logged)
    :type missing: ``dict``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(annotation, insertions, deletions)``. The counts are ``0``
        whenever ``count_diffs`` rejects the alignment or no features were
        found.
    :rtype: ``tuple``
    """
    logger = logging.getLogger("Logger." + __name__)
    seqs = [found_seqs, sequence]

    if verbose and verbosity > 0:
        logger.info("found_seqs length = " + str(len(found_seqs)))
        logger.info("sequence length = " + str(len(sequence)))

    align = []

    # piping to clustalo failed
    # when sequences were over ~7k bp
    if len(sequence) > 7000:

        # Writing sequences out to fasta files..
        if verbose:
            logger.info("Sequence too large to use pipe")
        randid = randomid()
        input_fasta = str(randid) + ".fasta"
        output_clu = str(randid) + ".clu"
        SeqIO.write(seqs, input_fasta, "fasta")
        clustalomega_cline = ClustalOmegaCommandline(infile=input_fasta,
                                                     outfile=output_clu,
                                                     outfmt='clu',
                                                     wrap=20000,
                                                     verbose=True,
                                                     auto=True)
        clustalomega_cline()
        aligns = AlignIO.read(output_clu, "clustal")
        for aln in aligns:
            align.append(str(aln.seq))

        # Delete files
        # NOTE(review): the temporary files are left behind if clustalo or
        # the parse above raises.
        cleanup(randid)
    else:
        # Running clustalo by piping in sequences
        indata = flatten([[">" + str(s.id), str(s.seq)] for s in seqs])
        child = Popen([
            'clustalo', '--outfmt', 'clu', '--wrap=50000', '--auto', '-i', '-'
        ],
                      stdout=PIPE,
                      stderr=STDOUT,
                      stdin=PIPE)

        # communicate() feeds stdin, reads all output and waits for the
        # process to exit, so no extra wait()/terminate() is needed.
        output, _ = child.communicate(input="\n".join(indata).encode())

        for line in output.decode().split("\n"):
            # Keep only "<id> <aligned sequence>" rows: skip blank lines and
            # the CLUSTAL header. Raw string: "\w" in a plain literal is an
            # invalid escape sequence.
            if re.search(r"\w", line) and not re.search("CLUSTAL", line):
                alignment = re.findall(r"[\S']+", line)
                if len(alignment) == 2:
                    align.append(list(alignment[1]))

    # Report that at least one aligned row was collected
    if verbose and len(align) > 0:
        logger.info("* ClustalOmega alignment succeeded *")

    insers, dels = 0, 0
    all_features = []
    if len(align) == 2:
        infeats = get_seqfeat(seqs[0])
        diffs = count_diffs(align, infeats, sequence, locus, cutoff, verbose,
                            verbosity)
        # count_diffs returns an Annotation when the alignment is rejected,
        # otherwise the (insertions, deletions) pair.
        if isinstance(diffs, Annotation):
            if verbose:
                logger.info("Run alignment with " + found_seqs.id)
                logger.info("***********************")
            return diffs, 0, 0
        insers, dels = diffs[0], diffs[1]
        f = find_features(infeats, align[0], annotated, start_pos, cutoff)
        all_features.append(f)
    else:
        for i in range(0, len(align) - 2):
            infeats = get_seqfeat(seqs[i])
            f = find_features(infeats, align[i], annotated, start_pos, cutoff)
            all_features.append(f)

    if not all_features:
        if verbose:
            logger.info("***********************")
        return Annotation(complete_annotation=False), 0, 0

    if verbose:
        logger.info("-- Resolving features -- ")
        for f in all_features[0]:
            logger.info("Resolving -> " + f)

    # Last aligned row is the input sequence, first row is the reference.
    annotation = resolve_feats(all_features, align[len(align) - 1],
                               align[0], start_pos, locus, missing,
                               verbose, verbosity)
    if verbose:
        logger.info("Run alignment with " + found_seqs.id)
        logger.info("Missing features = " + ",".join(list(missing.keys())))
        logger.info("Number of features found = " + str(len(all_features)))
        logger.info("Features found = " +
                    ",".join(list(all_features[0].keys())))
        logger.info("Features annotated = " +
                    ",".join(list(annotation.annotation.keys())))
        logger.info("***********************")

    return annotation, insers, dels

Пример #6

Показать файл

def count_diffs(align,
                feats,
                inseq,
                locus,
                cutoff,
                verbose=False,
                verbosity=0):
    """
    count_diffs - Counts the number of mismatches, gaps, and insertions and then determines if those are within an acceptable range.

    :param align: The alignment; ``align[0]`` is the reference row and ``align[1]`` the input row. Both rows are indexed up to the longer length, so they must be equally long.
    :type align: ``List``
    :param feats: Dictonary of the features
    :type feats: ``dict``
    :param locus: The gene locus associated with the sequence (not used here).
    :type locus: ``str``
    :param inseq: The input sequence
    :type inseq: ``str``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(insertions, deletions)`` when the alignment is accepted,
        otherwise ``Annotation(complete_annotation=False)``.
    :rtype: ``tuple`` or ``Annotation``
    """
    logger = logging.getLogger("Logger." + __name__)

    ref_row, in_row = align[0], align[1]
    nfeats = len(feats)
    mm = 0
    insr = 0
    dels = 0
    gaps = 0
    match = 0
    in_gap = False
    l = max(len(ref_row), len(in_row))

    # Counting gaps, mismatches and insertions
    for i in range(l):
        ref_base, in_base = ref_row[i], in_row[i]
        if ref_base == "-" or in_base == "-":
            # "-" in the reference row is an insertion, "-" in the input
            # row a deletion.
            if ref_base == "-":
                insr += 1
            if in_base == "-":
                dels += 1
            # A run of consecutive gap columns counts as a single gap.
            if not in_gap:
                gaps += 1
                in_gap = True
        else:
            in_gap = False
            if ref_base != in_base:
                mm += 1
            else:
                match += 1

    # gper is only reported in the log below, so an empty feature dict must
    # not abort the whole call with a ZeroDivisionError.
    gper = gaps / nfeats if nfeats else 0.0
    delper = dels / l
    iper = insr / l
    mmper = mm / l
    mper = match / l
    mper2 = match / len(inseq)

    if verbose and verbosity > 0:
        logger.info("Features algined = " + ",".join(list(feats.keys())))
        logger.info('{:<22}{:<6d}'.format("Number of feats: ", nfeats))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of gaps: ", gaps,
                                                  gper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of deletions: ",
                                                  dels, delper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of insertions: ",
                                                  insr, iper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of mismatches: ", mm,
                                                  mmper))
        # Matches relative to the alignment length ...
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match,
                                                  mper))
        # ... and relative to the input sequence length.
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match,
                                                  mper2))
    indel = iper + delper

    # ** HARD CODED LOGIC ** #
    if len(inseq) > 6000 and mmper < .10 and mper2 > .80:
        if verbose:
            logger.info(
                "Alignment coverage high enough to complete annotation 11")
        return insr, dels

    # TODO: These numbers need to be fine tuned
    indel_mm = indel + mper2
    if (indel > 0.5 or mmper > 0.05) and mper2 < cutoff and indel_mm != 1:
        if verbose:
            logger.info(
                "Alignment coverage NOT high enough to return annotation")
        return Annotation(complete_annotation=False)

    if verbose:
        logger.info(
            "Alignment coverage high enough to complete annotation")
    return insr, dels

Пример #7

Показать файл

def resolve_feats(feat_list,
                  seqin,
                  seqref,
                  start,
                  locus,
                  missing,
                  verbose=False,
                  verbosity=0):
    """
    resolve_feats - Resolves features from alignments

    :param feat_list: List of the found features; only a single-element list is supported
    :type feat_list: ``List``
    :param seqin: The aligned input sequence (joined into one string)
    :type seqin: ``str``
    :param seqref: The aligned reference sequence (not used here)
    :param locus: The input locus
    :type locus: ``str``
    :param start: Where the sequence start in the alignment
    :type start: ``int``
    :param missing: Collection of the unmapped features (only membership is tested)
    :type missing: ``List``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :rtype: :ref:`ann`
    """
    structures = get_structures()
    logger = logging.getLogger("Logger." + __name__)
    seq = SeqRecord(seq=Seq("".join(seqin), SingleLetterAlphabet()))

    # More than one feature set cannot be resolved.
    if len(feat_list) > 1:
        if verbose:
            logger.error("resolve_feats error")
        return Annotation(complete_annotation=False)

    # ``coordinates`` keeps the positions not yet assigned to a feature;
    # ``mapping`` records which feature (or 1 for none) covers a position.
    n_positions = len(seq.seq) + 1
    coordinates = dict.fromkeys(range(n_positions), 1)
    mapping = dict.fromkeys(range(n_positions), 1)

    # Note: this is the caller's dict and it is updated in place below.
    features = feat_list[0]
    full_annotation = {}
    gap_total = 0
    first_shift_pending = True

    # Need to sort
    ordered = sorted(features.keys(), key=lambda name: structures[locus][name])
    for feat in ordered:
        f = features[feat]
        seqrec = f.extract(seq)

        # Strip alignment gaps and remember how many columns were removed
        # so far; later coordinates are shifted by that amount.
        raw = str(seqrec.seq)
        if "-" in raw:
            ungapped = raw.replace("-", "")
            seqrec.seq = Seq(ungapped, IUPAC.unambiguous_dna)
            gap_total += len(raw) - len(ungapped)

        # Only missing features that still have sequence get annotated.
        if feat not in missing or not seqrec.seq:
            continue

        # The first time gaps have been seen the start is not shifted;
        # afterwards both ends are shifted by the accumulated gap count.
        if first_shift_pending and gap_total > 0:
            sp = f.location.start + start
            first_shift_pending = False
        else:
            sp = f.location.start + start - gap_total
        ep = f.location.end + start - gap_total

        location = FeatureLocation(ExactPosition(sp),
                                   ExactPosition(ep),
                                   strand=1)
        featn = SeqFeature(location, type=f.type)

        features[feat] = featn
        full_annotation[feat] = seqrec

        for pos in range(featn.location.start, featn.location.end):
            coordinates.pop(pos, None)
            mapping[pos] = feat

    blocks = getblocks(coordinates)
    rmapping = {pos + start: owner for pos, owner in mapping.items()}

    # Print out what features were resolved
    # NOTE(review): "Failed to resolve" is also logged when verbose is set
    # but verbosity is 0, or when exactly one feature was resolved.
    if verbose:
        if verbosity > 0 and len(full_annotation) > 1:
            logger.info("Features resolved:")
            for f in full_annotation:
                logger.info(f)
        else:
            logger.info("Failed to resolve")

    if not full_annotation:
        if verbose:
            logger.info("Failed to align missing features")
        return Annotation(complete_annotation=False)

    return Annotation(annotation=full_annotation,
                      method="clustalo",
                      features=features,
                      mapping=rmapping,
                      blocks=blocks,
                      seq=seq)

Пример #8

Показать файл

Файл: seq_search.py Проект: pbashyal-nmdp/seq-ann

    def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
        """
        search_seqs - method for annotating a BioPython sequence without alignment

        Each feature sequence of the reference record is looked up in the
        input sequence with ``nt_search``. A single hit is recorded as an
        exact match, several hits are recorded as ambiguous, and no hit is
        recorded as missing. Whatever is left is handed over to
        ``self._resolve_unmapped``.

        :param seqrec: The reference sequence
        :type seqrec: SeqRecord
        :param in_seq: The input sequence (``in_seq.seq`` is read)
        :type in_seq: SeqRecord
        :param locus: The gene locus associated with the sequence.
        :type locus: str
        :param run: The number of runs that have been done
        :type run: int
        :param partial_ann: A partial annotation from a previous step.
            Its ``features`` dict is extended in place.
        :type partial_ann: :ref:`ann`
        :return: The annotation built from the features that were found;
            ``partial_ann`` itself when the reference adds nothing new
        :rtype: :ref:`ann`

        Example usage (``refseq`` is a reference SeqRecord loaded elsewhere):

            >>> from Bio.Seq import Seq
            >>> from Bio.SeqRecord import SeqRecord
            >>> from seqann.seq_search import SeqSearch
            >>> inseq = SeqRecord(Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC'))
            >>> sqsrch = SeqSearch()
            >>> ann = sqsrch.search_seqs(refseq, inseq, 'HLA-A')

        """

        def exact_feature(name, start, end):
            # Forward-strand feature with exact start/end positions.
            return SeqFeature(FeatureLocation(ExactPosition(start),
                                              ExactPosition(end),
                                              strand=1),
                              type=name)

        # The mapped features will be subtracted from seq_covered
        # so the final seq_covered number will reflect the remaining
        # number of base pairs that haven't been mapped.
        #
        # The coordinates and mapping will help determine what positions
        # in the sequence have been mapped and to what features. The
        # missing blocks variable will be generated using these.
        structures = get_structures()
        in_seq_str = str(in_seq.seq)
        seq_covered = len(in_seq.seq)
        n_positions = len(in_seq.seq) + 1
        # coordinates: positions that are still unmapped.
        # mapping: position -> feature name (1 means "not mapped yet").
        coordinates = dict.fromkeys(range(n_positions), 1)
        mapping = dict.fromkeys(range(n_positions), 1)

        ambig_map = {}
        found_feats = {}
        feat_missing = {}

        method = "nt_search" if not partial_ann else partial_ann.method

        # If the partial annotation is provided
        # then make the found_feats equal to
        # what has already been annotated
        feats = get_features(seqrec)
        if partial_ann:

            found_feats = partial_ann.features

            if self.verbose and self.verbosity > 4:
                self.logger.info("Found partial features:")
                for f in found_feats:
                    self.logger.info(f)

            # Skip references that only have features
            # that have already been annotated
            if all(f in found_feats for f in feats):
                if self.verbose:
                    self.logger.info("Skipping incomplete refseq")
                return partial_ann

            if self.verbose and self.verbosity > 1:
                self.logger.info("Using partial annotation | " + locus + " " +
                                 str(len(partial_ann.features)))

            # Only the positions inside the still-missing blocks are unmapped
            coordinates = {
                pos: 1 for block in partial_ann.blocks for pos in block
            }
            seq_covered = partial_ann.covered
            mapping = partial_ann.mapping

            if self.verbose and self.verbosity > 2:
                self.logger.info("Partial sequence coverage = " +
                                 str(seq_covered))
                self.logger.info("Partial sequence metho = " + method)

        # Coordinates/sequences removed for exon_8 and three_prime_UTR are
        # remembered so those two features can be unmapped again later on.
        added_feat = {}
        deleted_coords = {}
        for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

            # skip if partial annotation is provided
            # and the feat name is not one of the
            # missing features
            if partial_ann and feat_name not in partial_ann.refmissing:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Skipping " + feat_name +
                                     " - Already annotated")
                continue

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running seqsearch for " + feat_name)

            # Search for the reference feature sequence in the
            # input sequence. Record the coordinates if it's
            # found and if it's found in multiple spots. If it
            # is not found, then record that feature as missing.
            # The first element of the result is not a position; the
            # hit positions follow it.
            feat_seq = str(feats[feat_name])
            seq_search = nt_search(in_seq_str, feat_seq)

            if len(seq_search) == 2:

                if self.verbose and self.verbosity > 0:
                    self.logger.info("Found exact match for " + feat_name)

                hit = seq_search[1]

                # NOTE(review): seq_covered is reduced even when the feature
                # ends up not being mapped below (overlap / out of order).
                # Kept as-is; confirm whether that is intended.
                seq_covered -= len(feat_seq)
                end = int(len(feat_seq) + hit)

                # A 3' UTR match is stretched to the end of the input
                if feat_name == 'three_prime_UTR' \
                        and len(in_seq_str) > end:
                    end = len(in_seq_str)

                # If the feature is found and it's a five_prime_UTR then
                # the start should always be 0, so insertions at the
                # beginning of the sequence will be found.
                is_five_utr = feat_name == 'five_prime_UTR'
                start = 0 if is_five_utr else hit
                si = 0 if (hit == 0 or is_five_utr) else hit + 1

                # check if this feature has already been mapped
                unmapped = all(i in coordinates for i in range(si, end + 1))

                # Don't map features if they are out of order
                skip = False
                if found_feats:
                    o1 = structures[locus][feat_name]
                    for f in found_feats:
                        o2 = structures[locus][f]
                        loctyp = loctype(found_feats[f].location.start,
                                         found_feats[f].location.end, start,
                                         end)

                        if (o1 < o2 and loctyp) or (o2 < o1 and not loctyp):
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)

                if unmapped and not skip:
                    track_deleted = feat_name in ("exon_8", 'three_prime_UTR')
                    for i in range(si, end + 1):
                        # `unmapped` guarantees every position is present
                        previous = coordinates.pop(i)
                        if track_deleted:
                            deleted_coords[i] = previous
                        mapping[i] = feat_name

                    found_feats.update(
                        {feat_name: exact_feature(feat_name, start, end)})

                    if track_deleted:
                        added_feat.update({feat_name: feats[feat_name]})
                    if self.verbose and self.verbosity > 3:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " + str(end))

            elif len(seq_search) > 2:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Found " + str(len(seq_search)) +
                                     " matches for " + feat_name)

                # Keep only the hits that start in a still-unmapped region
                seq_search = [seq_search[0]] + [
                    hit for hit in seq_search[1:]
                    if hit in coordinates or hit + 1 in coordinates
                ]

                # * HARD CODED LOGIC * #
                # > exon8 in class I maps to multiple spots in a sequence,
                #   often in the 3' UTR. These features need to be mapped
                #   last to make sure it's not mapping exon8 incorrectly.
                resolve_exon8 = False
                if partial_ann and feat_name == "exon_8" and run > 0:
                    missing_feats = sorted(partial_ann.missing.keys())
                    # At least one hit must survive the filter above,
                    # otherwise seq_search[1] would raise an IndexError.
                    resolve_exon8 = (
                        missing_feats == ['exon_8', 'three_prime_UTR']
                        and 1 < len(seq_search) <= 3)

                if resolve_exon8:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Resolving exon_8")

                    # The first remaining hit is taken as the location
                    seq_covered -= len(feat_seq)
                    start = seq_search[1]
                    end = int(len(feat_seq) + start)
                    si = start + 1 if start != 0 else 0

                    for i in range(si, end + 1):
                        if i in coordinates:
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " +
                                    locus + " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update(
                        {feat_name: exact_feature(feat_name, start, end)})

                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " +
                                         str(end))
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update({feat_name: seq_search[1:]})
            else:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("No match for " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})

        blocks = getblocks(coordinates)
        exact_matches = list(found_feats.keys())

        # * HARD CODED LOGIC * #
        # >
        #
        #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
        #  (only active when the caller passes run == 99)
        if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
                and 'exon_2' in feat_missing
                and (len(blocks) == 1 or len(blocks) == 2)):

            last_pos = max(mapping)
            for b in blocks:
                if b[-1] == last_pos:
                    # Block reaching the end of the sequence
                    found_feats.update({
                        "intron_3": exact_feature("intron_3", b[0] - 1, b[-1])
                    })
                else:
                    found_feats.update(
                        {"exon_2": exact_feature("exon_2", b[0], b[-1])})
                    # NOTE(review): coverage is only reduced for exon_2,
                    # unlike the class II branch below - confirm.
                    seq_covered -= len(b)

            # Return only after every block was handled; returning from
            # inside the loop dropped the second block.
            if self.verbose and self.verbosity > 1:
                self.logger.info(
                    "Successfully annotated class DRB1 II sequence")

            return Annotation(features=found_feats,
                              covered=seq_covered,
                              seq=in_seq,
                              missing=feat_missing,
                              ambig=ambig_map,
                              method=method,
                              mapping=mapping,
                              exact_match=exact_matches)

        # If it's a class II sequence and
        # exon_2 is an exact match
        # * HARD CODED LOGIC * #
        # > It's common for exon2 to be fully sequenced
        #   but intron_2 and intron_1 to be partially sequenced,
        #   which can make it hard to annotate those to features.
        #   If there are two missing blocks that is small enough
        #   and they are before and after exon2, then it's very
        #   very likely to be intron_2 and intron_1.
        if 'exon_2' in exact_matches and len(blocks) == 2 \
                and is_classII(locus) and seq_covered < 300:

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running search for class II sequence")

            last_pos = max(mapping)

            # Both blocks have to touch exon_2: the position right before
            # the trailing block and right after the leading block.
            r = True
            for b in blocks:
                x = b[0] - 1 if b[-1] == last_pos else b[-1] + 1
                if mapping[x] != 'exon_2':
                    r = False
            if r:
                for b in blocks:
                    if b[-1] == last_pos:
                        found_feats.update({
                            "intron_2":
                            exact_feature("intron_2", b[0] - 1, b[-1])
                        })
                    else:
                        found_feats.update({
                            "intron_1": exact_feature("intron_1", b[0], b[-1])
                        })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        annotated_feats, mb, mapping = self._resolve_unmapped(
            blocks, feat_missing, ambig_map, mapping, found_feats, locus,
            seq_covered)

        def unmap_exon8():
            # Undo the exon_8 / three_prime_UTR mapping and return the
            # blocks that are unmapped afterwards.
            for i in deleted_coords:
                mapping[i] = 1
            coordinates.update(deleted_coords)
            remaining = getblocks(coordinates)
            feat_missing.update(added_feat)

            # Delete from found features
            exact_matches.remove('exon_8')
            del found_feats['exon_8']
            annotated_feats.pop('exon_8', None)
            found_feats.pop('three_prime_UTR', None)
            annotated_feats.pop('three_prime_UTR', None)
            return remaining

        def log_summary(level, exact_level):
            # Log the ambiguous / exact / annotated / missing feature names.
            if not self.verbose:
                return

            # Print out what features were ambig matches
            if self.verbosity > level and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(ambig_map))

            # Print out what features were exact matches
            if self.verbosity > exact_level and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(exact_matches))

            # Print out what features have been annotated
            if self.verbosity > level and len(annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(annotated_feats))

            # Print out what features are missing
            if self.verbosity > level and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(feat_missing))

        # * HARD CODED LOGIC * #
        if (not mb and blocks and len(feat_missing.keys()) == 0
                and len(ambig_map.keys()) == 0):
            mb = blocks

        if mb:

            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                    and 'exon_8' in exact_matches:
                mb = unmap_exon8()

            refmissing = [
                f for f in structures[locus] if f not in annotated_feats
            ]

            if self.verbose and self.verbosity > 1:
                self.logger.info("* Annotation not complete *")

            # Print out what features were missing by the ref
            if self.verbose and self.verbosity > 2:
                self.logger.info("Refseq was missing these features = " +
                                 ",".join(refmissing))

            log_summary(1, 2)

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    blocks=mb,
                                    method=method,
                                    refmissing=refmissing,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)
        else:

            mb = None
            # Unmap exon 8
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                    and 'exon_8' in exact_matches \
                    and 'three_prime_UTR' in annotated_feats \
                    and 'three_prime_UTR' not in exact_matches:
                mb = unmap_exon8()

            if self.verbose:
                self.logger.info("* No missing blocks after seq_search *")

            log_summary(0, 0)

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    method=method,
                                    blocks=mb,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)

        return annotation