def seqannotation(self, seqrecord, allele, loc):
    """
    Build a complete 'match' Annotation from a reference sequence record.

    :param seqrecord: The reference record whose features are extracted
    :param allele: Allele name used to look up the annotated alignment
    :param loc: Locus used to look up the annotated alignment
    :return: The Annotation from the found sequence
    :rtype: Annotation
    """
    result = Annotation(annotation=get_features(seqrecord),
                        method='match',
                        complete_annotation=True)

    # Without alignments loaded there is nothing more to attach.
    if not self.alignments:
        return result

    # Keep only the 'Seq' entry of every aligned feature for this allele.
    per_feature = self.annoated_alignments[loc][allele]
    result.aligned = {
        feat_name: per_feature[feat_name]['Seq']
        for feat_name in per_feature.keys()
    }
    return result
def test_002_gfe(self):
    """
    Check that every expected annotation yields the expected GFE notation.

    For each entry in ``self.expected['gfe']`` the feature sequences are
    wrapped in SeqRecords, turned into an Annotation, and passed to
    ``GFE.get_gfe``. Every returned feature must be a ``Feature`` and the
    returned GFE string must equal the expected one.
    """
    # Keep the GFE builder under its own name: the original rebound ``gfe``
    # to the returned GFE string, so the second loop iteration would have
    # called ``.get_gfe`` on a ``str``.
    gfe_maker = GFE()
    for ex in self.expected['gfe']:
        loc = ex['locus']
        ann = ex['annotation']
        exp = ex['gfe']

        annotation = {
            f: SeqRecord(seq=Seq(ann[f], generic_dna), id="002_gfe")
            for f in ann
        }
        a = Annotation(annotation=annotation)

        features, gfe_name = gfe_maker.get_gfe(a, loc)
        for feat in features:
            self.assertIsInstance(feat, Feature)
        self.assertEqual(gfe_name, exp)
def gfe_from_allele(allele, gfe_maker):
    """
    Compute the GFE name and feature list for one allele record.

    :param allele: Record exposing ``description`` and ``id``; the locus is
        the text of the description before the first ``,`` and ``*``
    :param gfe_maker: Object exposing ``get_gfe(annotation, locus)``
    :return: ``{"name": <gfe>, "features": <features>}``
    :rtype: ``dict``
    """
    allele_name = allele.description.partition(",")[0]
    locus = allele_name.partition("*")[0]

    ann = Annotation(annotation=get_features(allele),
                     method='match',
                     complete_annotation=True)

    # This process takes a long time
    logging.info(f"Getting GFE data for allele {allele.id}...")
    features, gfe = gfe_maker.get_gfe(ann, locus)

    return dict(name=gfe, features=features)
def main():
    """Build the GFE graph CSV files from IMGT/HLA (and optionally KIR) data.

    This is run if the file is directly executed, but not if it is imported
    as a module. Having it in a separate function allows importing the file
    into interactive python while still being able to execute it for testing.

    Command line options:
        -k/--kir       also load KIR alleles
        -a/--align     load alignments and attach them to the graph
        -d/--debug     debug mode (HLA-A only, one release, verbose)
        -o/--outdir    output directory for the CSV files (required)
        -n/--number    number of IMGT/HLA releases to load (default 1)
        -r/--releases  comma separated list of releases (overrides -n)
        -v/--verbose   verbose logging

    Output: seven CSV files (nodes and edges) written into ``outdir``.
    """
    # Local import so the block does not depend on the module's import list.
    import subprocess

    parser = argparse.ArgumentParser()

    parser.add_argument("-k", "--kir",
                        required=False,
                        help="Bool for KIR",
                        action='store_true')

    parser.add_argument("-a", "--align",
                        required=False,
                        help="Bool for loading alignments",
                        action='store_true')

    parser.add_argument("-d", "--debug",
                        required=False,
                        help="Bool for debugging",
                        action='store_true')

    parser.add_argument("-o", "--outdir",
                        required=True,
                        help="Output directory",
                        type=str)

    parser.add_argument("-n", "--number",
                        required=False,
                        help="Number of IMGT/DB releases",
                        default=1,
                        type=int)

    parser.add_argument("-r", "--releases",
                        required=False,
                        help="IMGT/DB releases",
                        type=str)

    parser.add_argument("-v", "--verbose",
                        help="Option for running in verbose",
                        action='store_true')

    data_dir = os.path.dirname(__file__)
    args = parser.parse_args()

    outdir = args.outdir
    release_n = args.number
    releases = args.releases
    verbosity = 1

    kir = bool(args.kir)
    align = bool(args.align)
    verbose = bool(args.verbose)
    debug = False

    load_loci = hla_loci + kir_loci if kir else hla_loci

    if args.debug:
        logging.info("Running in debug mode")
        load_loci = ["HLA-A"]
        kir = False
        debug = True
        verbose = True
        verbosity = 2
        release_n = 1

    # Fail early (before the long build) if the output directory cannot be
    # created, instead of failing at the first to_csv call.
    os.makedirs(outdir, exist_ok=True)

    # Accumulators for the node / edge rows of the graph
    gfe_e = []
    seq_e = []
    seq_n = []
    cds_n = []
    grp_e = []
    trs_e = []
    allele_n = []

    def record_graph(graph):
        """Append the rows returned by build_graph to the accumulators."""
        (allelenode, gfeedge, seq_nodes, cds_nodes,
         seq_edges, trans_edge, grp_edges) = graph
        gfe_e.extend(gfeedge)
        seq_e.extend(seq_edges)
        seq_n.extend(seq_nodes)
        allele_n.extend(allelenode)
        grp_e.extend(grp_edges)
        trs_e.extend(trans_edge)
        cds_n.extend(cds_nodes)

    # Use the requested releases, or the latest `release_n` IMGT/HLA releases
    if releases:
        dbversions = releases.split(",")
    else:
        dbversions = pd.read_html(imgt_hla)[0]['Release'][0:release_n].tolist()

    gfe_maker = pyGFE(verbose=verbose,
                      verbosity=verbosity,
                      load_features=True,
                      store_features=True,
                      loci=load_loci)

    if kir:
        # Get latest IMGT/KIR release. Only fetched when KIR is loaded, so
        # an HLA-only build does not depend on the KIR release page.
        kir_release = pd.read_html(imgt_kir)[0][0][1]

        if verbose:
            logging.info("Adding KIR to GFE DB")

        kir_file = data_dir + '/../data/KIR.dat'

        if align:
            aligned = kir_alignments()

        # Downloading KIR
        if not os.path.isfile(kir_file):
            if verbose:
                logging.info("Downloading KIR dat file from " + kir_url)
            urllib.request.urlretrieve(kir_url, kir_file)

        kir_gen = SeqIO.parse(kir_file, "imgt")
        if verbose:
            logging.info("Finished parsing KIR dat file")

        for allele in kir_gen:
            if not hasattr(allele, 'seq'):
                continue

            kir_name = allele.description.split(",")[0]
            loc = kir_name.split("*")[0]
            if not (loc in kir_loci and len(str(allele.seq)) > 5):
                continue

            if verbose:
                logging.info("KIR = " + kir_name + " " + kir_release)

            groups = []
            complete_annotation = get_features(allele)
            # Feature names containing "/" are ambiguous (two candidates)
            ambigs = [a for a in complete_annotation if re.search("/", a)]

            aligned_seq = ''
            if align:
                if kir_name in aligned[loc]:
                    aligned_seq = aligned[loc][kir_name]

            if ambigs:
                logging.info("AMBIGS " + kir_name + " " + kir_release)
                unambiguous = {a: complete_annotation[a]
                               for a in complete_annotation
                               if a not in ambigs}
                annotations = []
                for ambig in ambigs:
                    logging.info("AMBIG = " + ambig)
                    first = ambig.split("/")[0]
                    second = ambig.split("/")[1]
                    aterm = first.split("_")[0]

                    # One annotation per candidate name of the ambiguous
                    # feature, each with all unambiguous features.
                    anno = dict(unambiguous)
                    anno.update({first: complete_annotation[ambig]})
                    annotations.append(anno)

                    anno2 = dict(unambiguous)
                    anno2.update(
                        {aterm + "_" + second: complete_annotation[ambig]})
                    annotations.append(anno2)
            else:
                annotations = [complete_annotation]

            for annotation in annotations:
                ann = Annotation(annotation=annotation,
                                 method='match',
                                 complete_annotation=True)
                features, gfe = gfe_maker.get_gfe(ann, loc)
                record_graph(build_graph(groups, gfe, allele, features,
                                         kir_release, aligned_seq,
                                         '', '', "IMGT_KIR", align))

    # Loop through DB versions
    for dbversion in dbversions:

        db_striped = ''.join(dbversion.split("."))

        if align:
            gen_aln, nuc_aln, prot_aln = hla_alignments(db_striped)

        ard = ARD(db_striped)

        dat_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' \
            + db_striped + '/hla.dat'
        dat_file = data_dir + '/hla.' + str(db_striped) + ".dat"

        # Downloading DAT file
        if not os.path.isfile(dat_file):
            if verbose:
                logging.info("Downloading dat file from " + dat_url)
            urllib.request.urlretrieve(dat_url, dat_file)

        # Strip non-ASCII bytes in place. An argument list (no shell) keeps
        # paths with spaces or shell metacharacters safe. As with the former
        # os.system call, a failure here does not abort the build.
        try:
            subprocess.run(
                ["perl", "-p", "-i", "-e", "s/[^\\x00-\\x7F]//g", dat_file],
                check=False)
        except OSError as err:
            logging.warning("Could not run perl to clean %s: %s",
                            dat_file, err)

        a_gen = SeqIO.parse(dat_file, "imgt")
        if verbose:
            logging.info("Finished parsing dat file")

        i = 0
        for allele in a_gen:
            if not hasattr(allele, 'seq'):
                continue

            hla_name = allele.description.split(",")[0]
            loc = hla_name.split("*")[0]

            if hla_name in skip_alleles:
                logging.info("SKIPPING = " + hla_name + " " + dbversion)
                continue

            if (debug and (loc != "HLA-A" and i > 20)):
                continue

            if (loc in hla_loci or loc == "DRB5") \
                    and (len(str(allele.seq)) > 5):
                if verbose:
                    logging.info("HLA = " + hla_name + " " + dbversion)

                a_name = hla_name.split("-")[1]

                # Keep only the ARD groups that actually reduce the name
                groups = []
                for grp in ard_groups:
                    reduced = ard.redux(a_name, grp)
                    if reduced != a_name:
                        groups.append(["HLA-" + reduced, grp])
                groups.append([to_second(a_name), "2nd_FIELD"])

                complete_annotation = get_features(allele)
                ann = Annotation(annotation=complete_annotation,
                                 method='match',
                                 complete_annotation=True)
                features, gfe = gfe_maker.get_gfe(ann, loc)

                aligned_gen = ''
                aligned_nuc = ''
                aligned_prot = ''

                if align:
                    if hla_name in gen_aln[loc]:
                        aligned_gen = gen_aln[loc][hla_name]
                    if hla_name in nuc_aln[loc]:
                        aligned_nuc = nuc_aln[loc][hla_name]
                    if hla_name in prot_aln[loc]:
                        aligned_prot = prot_aln[loc][hla_name]

                record_graph(build_graph(groups, gfe, allele, features,
                                         dbversion, aligned_gen,
                                         aligned_nuc, aligned_prot,
                                         "IMGT_HLA", align))
                i += 1

        if verbose:
            logging.info("Finished loading IMGT DB " + dbversion)

    if verbose:
        logging.info("Finished loading ALL DB versions")

    gfe_df = pd.DataFrame(
        gfe_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgt_release,:TYPE"
        .split(","))
    seq_df = pd.DataFrame(
        seq_e,
        columns=":START_ID(ALLELE),:END_ID(SEQUENCE),imgt_release,accession,:TYPE"
        .split(","))
    seqn_df = pd.DataFrame(
        seq_n,
        columns="sequenceId:ID(SEQUENCE),sequence,name,feature:LABEL,rank,length,seq:string[]"
        .split(","))
    allele_df = pd.DataFrame(
        allele_n,
        columns="alleleId:ID(ALLELE),name,alleletype:LABEL,locus".split(","))
    group_df = pd.DataFrame(
        grp_e,
        columns=":START_ID(ALLELE),:END_ID(ALLELE),imgtdb,:TYPE".split(","))
    cdsn_df = pd.DataFrame(
        cds_n,
        columns="cdsId:ID(CDS),name,cdstype:LABEL,cds,protein".split(","))
    trs_df = pd.DataFrame(
        trs_e,
        columns=":START_ID(SEQUENCE),:END_ID(CDS),:TYPE".split(","))

    if verbose:
        logging.info("GFE Edges    = " + str(len(gfe_df)))
        logging.info("Seq Edges    = " + str(len(seq_df)))
        logging.info("Group Edges  = " + str(len(group_df)))
        logging.info("CDS Edges    = " + str(len(trs_df)))
        logging.info("Seq Nodes    = " + str(len(seqn_df)))
        logging.info("CDS Nodes    = " + str(len(cdsn_df)))
        logging.info("Allele Nodes = " + str(len(allele_df)))

    gfe_df.to_csv(outdir + "/gfe_edges.csv", header=True, index=False)
    seq_df.to_csv(outdir + "/seq_edges.csv", header=True, index=False)
    seqn_df.to_csv(outdir + "/sequence_nodes.csv", header=True, index=False)
    allele_df.to_csv(outdir + "/allele_nodes.csv", header=True, index=False)
    cdsn_df.to_csv(outdir + "/cds_nodes.csv", header=True, index=False)
    group_df.to_csv(outdir + "/group_edges.csv", header=True, index=False)
    trs_df.to_csv(outdir + "/cds_edges.csv", header=True, index=False)

    if verbose:
        logging.info("** Finshed build **")
def align_seqs(found_seqs, sequence, locus, start_pos, missing,
               annotated, cutoff=0.90, verbose=False, verbosity=0):
    """
    align_seqs - Aligns sequences with clustalo

    :param found_seqs: The reference sequence record to align against
    :type found_seqs: SeqRecord
    :param sequence: The input consensus sequence.
    :type sequence: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: ``str``
    :param start_pos: Where the reference sequence starts
    :type start_pos: ``int``
    :param missing: The unmapped features (keyed by feature name)
    :type missing: ``dict``
    :param annotated: dictonary of the annotated features
    :type annotated: ``dict``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(annotation, insertions, deletions)``; the annotation has
        ``complete_annotation=False`` when the alignment failed or was
        rejected by ``count_diffs``
    :rtype: :ref:`ann`
    """
    logger = logging.getLogger("Logger." + __name__)

    if verbose and verbosity > 0:
        logger.info("found_seqs length = " + str(len(found_seqs)))
        logger.info("sequence length = " + str(len(sequence)))

    seqs = [found_seqs, sequence]
    align = []

    # piping to clustalo failed
    # when sequences were over ~7k bp
    if len(sequence) > 7000:
        # Writing sequences out to fasta files..
        if verbose:
            logger.info("Sequence too large to use pipe")
        randid = randomid()
        input_fasta = str(randid) + ".fasta"
        output_clu = str(randid) + ".clu"
        SeqIO.write(seqs, input_fasta, "fasta")
        clustalomega_cline = ClustalOmegaCommandline(infile=input_fasta,
                                                     outfile=output_clu,
                                                     outfmt='clu',
                                                     wrap=20000,
                                                     verbose=True,
                                                     auto=True)
        clustalomega_cline()
        aligns = AlignIO.read(output_clu, "clustal")
        for aln in aligns:
            align.append(str(aln.seq))

        # Delete files
        cleanup(randid)
    else:
        # Running clustalo by piping in sequences
        indata = flatten([[">" + str(s.id), str(s.seq)] for s in seqs])
        child = Popen(['clustalo',
                       '--outfmt', 'clu',
                       '--wrap=50000',
                       '--auto',
                       '-i', '-'],
                      stdout=PIPE,
                      stderr=STDOUT,
                      stdin=PIPE)

        # communicate() feeds stdin, reads all output and waits for the
        # process to exit, so no extra wait()/terminate() is required.
        output, _ = child.communicate(input=str.encode("\n".join(indata)))

        for line in bytes.decode(output).split("\n"):
            # Raw string: "\w" in a plain literal is an invalid escape.
            if re.search(r"\w", line) and not re.search("CLUSTAL", line):
                alignment = re.findall(r"[\S']+", line)
                # Alignment rows have exactly two tokens: id and sequence
                if len(alignment) == 2:
                    align.append(list(alignment[1]))

    # Print out what blocks haven't been annotated
    if verbose and len(align) > 0:
        logger.info("* ClustalOmega alignment succeeded *")

    insers, dels = 0, 0
    all_features = []
    if len(align) - 2 == 0:
        infeats = get_seqfeat(seqs[0])
        diffs = count_diffs(align, infeats, sequence, locus, cutoff,
                            verbose, verbosity)
        # count_diffs returns an Annotation only when it rejects the
        # alignment; otherwise it returns (insertions, deletions).
        if isinstance(diffs, Annotation):
            if verbose:
                logger.info("Run alignment with " + found_seqs.id)
                logger.info("***********************")
            return diffs, 0, 0
        else:
            insers, dels = diffs[0], diffs[1]
        f = find_features(infeats, align[0], annotated, start_pos, cutoff)
        all_features.append(f)
    else:
        for i in range(0, len(align) - 2):
            infeats = get_seqfeat(seqs[i])
            f = find_features(infeats, align[i], annotated, start_pos, cutoff)
            all_features.append(f)

    if len(all_features) > 0:
        if verbose:
            logger.info("-- Resolving features -- ")
            for f in all_features[0]:
                logger.info("Resolving -> " + f)

        annotation = resolve_feats(all_features,
                                   align[len(align) - 1],
                                   align[0],
                                   start_pos,
                                   locus,
                                   missing,
                                   verbose,
                                   verbosity)
        if verbose:
            logger.info("Run alignment with " + found_seqs.id)
            logger.info("Missing features = " + ",".join(list(missing.keys())))
            logger.info("Number of features found = " + str(len(all_features)))
            logger.info("Features found = " + ",".join(list(all_features[0].keys())))
            # resolve_feats can return an Annotation built without an
            # ``annotation`` mapping; do not let logging crash on it.
            logger.info("Features annotated = " + ",".join(list((annotation.annotation or {}).keys())))
            logger.info("***********************")

        return annotation, insers, dels
    else:
        if verbose:
            logger.info("***********************")
        return Annotation(complete_annotation=False), 0, 0
def count_diffs(align, feats, inseq, locus, cutoff,
                verbose=False, verbosity=0):
    """
    count_diffs - Counts the number of mismatches, gaps, and insertions and then determines if those are within an acceptable range.

    ``align[0]`` is the aligned reference row and ``align[1]`` the aligned
    input row. A ``-`` in the reference row counts as an insertion, a ``-``
    in the input row as a deletion. Columns are compared pairwise; if the
    rows differ in length the columns beyond the shorter row are ignored
    (the percentages still use the longer length). Ratios whose denominator
    is zero (no features, empty alignment, empty input) are reported as 0.0
    instead of raising ``ZeroDivisionError``.

    :param align: The alignment
    :type align: ``List``
    :param feats: Dictonary of the features
    :type feats: ``dict``
    :param inseq: The input sequence
    :type inseq: ``str``
    :param locus: The gene locus associated with the sequence.
    :type locus: ``str``
    :param cutoff: The alignment cutoff
    :type cutoff: ``float``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: ``(insertions, deletions)`` when the alignment is acceptable,
        otherwise ``Annotation(complete_annotation=False)``
    :rtype: ``List``
    """
    def _ratio(count, total):
        # Guarded division: an empty denominator yields 0.0
        return count / total if total else 0.0

    nfeats = len(feats.keys())
    mm = 0
    insr = 0
    dels = 0
    gaps = 0
    match = 0
    lastb = ''
    l = max(len(align[0]), len(align[1]))

    # Counting gaps, mismatches and insertions
    for ref_base, in_base in zip(align[0], align[1]):
        if ref_base == "-" or in_base == "-":
            if ref_base == "-":
                insr += 1
                # A new gap starts only when the previous column was not
                # already part of a gap.
                if lastb != '-':
                    gaps += 1
                lastb = "-"
            if in_base == "-":
                dels += 1
                if lastb != '-':
                    gaps += 1
                lastb = "-"
        else:
            lastb = ''
            if ref_base != in_base:
                mm += 1
            else:
                match += 1

    gper = _ratio(gaps, nfeats)
    delper = _ratio(dels, l)
    iper = _ratio(insr, l)
    mmper = _ratio(mm, l)
    mper = _ratio(match, l)
    mper2 = _ratio(match, len(inseq))

    logger = logging.getLogger("Logger." + __name__)

    if verbose and verbosity > 0:
        logger.info("Features algined = " + ",".join(list(feats.keys())))
        logger.info('{:<22}{:<6d}'.format("Number of feats: ", nfeats))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of gaps: ", gaps, gper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of deletions: ", dels, delper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of insertions: ", insr, iper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of mismatches: ", mm, mmper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match, mper))
        logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match, mper2))

    indel = iper + delper

    # ** HARD CODED LOGIC ** #
    # Long inputs are accepted on a looser mismatch / coverage threshold.
    if len(inseq) > 6000 and mmper < .10 and mper2 > .80:
        if verbose:
            logger.info(
                "Alignment coverage high enough to complete annotation 11")
        return insr, dels

    # TODO: These numbers need to be fine tuned
    indel_mm = indel + mper2
    if (indel > 0.5 or mmper > 0.05) and mper2 < cutoff and indel_mm != 1:
        if verbose:
            logger.info(
                "Alignment coverage NOT high enough to return annotation")
        return Annotation(complete_annotation=False)

    if verbose:
        logger.info(
            "Alignment coverage high enough to complete annotation")
    return insr, dels
def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False, verbosity=0):
    """
    resolve_feats - Resolves features from alignments

    Extracts, from the aligned input sequence, the sequence of every
    feature that is listed in ``missing``, strips the alignment gaps
    (``-``) from it and shifts the feature coordinates by ``start`` and by
    the number of gap characters removed so far.

    :param feat_list: List of the found features (only a single entry is supported)
    :type feat_list: ``List``
    :param seqin: The aligned input sequence (joined with ``"".join``)
    :type seqin: ``str``
    :param seqref: The aligned reference sequence (not used in this function)
    :type seqref: ``str``
    :param start: Where the sequence start in the alignment
    :type start: ``int``
    :param locus: The input locus
    :type locus: ``str``
    :param missing: The unmapped features (only membership is tested)
    :type missing: ``List``
    :param verbose: Flag for running in verbose mode.
    :type verbose: ``bool``
    :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode.
    :type verbosity: ``int``
    :return: An annotation with method ``"clustalo"``, or
        ``Annotation(complete_annotation=False)`` when ``feat_list`` has
        more than one entry or nothing could be resolved
    :rtype: :ref:`ann`
    """
    structures = get_structures()
    logger = logging.getLogger("Logger." + __name__)
    seq = SeqRecord(seq=Seq("".join(seqin), SingleLetterAlphabet()))
    seq_covered = len(seq.seq)
    # Both dicts start with every position 0..len(seq) mapped to 1.
    # `coordinates` loses positions as features are resolved;
    # `mapping` gets the feature name written over resolved positions.
    coordinates = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(seq.seq) + 1)]))
    mapping = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(seq.seq) + 1)]))

    # Running total of gap characters removed from the features seen so far
    diff = 0
    if len(feat_list) > 1:
        # Only one feature dictionary can be resolved
        if verbose:
            logger.error("resolve_feats error")
        return Annotation(complete_annotation=False)
    else:
        features = {}
        full_annotation = {}
        features = feat_list[0]

        # Need to sort
        # Features are processed in the order given by structures[locus]
        feature_list = sorted(features.keys(),
                              key=lambda f: structures[locus][f])

        # True until the first feature is placed after gaps have been seen;
        # that feature keeps an unshifted start position.
        diff_f = True
        for feat in feature_list:

            if feat in missing:
                f = features[feat]
                seqrec = f.extract(seq)
                seq_covered -= len(seqrec.seq)

                # Remove alignment gaps and remember how many were removed
                if re.search("-", str(seqrec.seq)):
                    l1 = len(seqrec.seq)
                    newseq = re.sub(r'-', '', str(seqrec.seq))
                    seqrec.seq = Seq(newseq, IUPAC.unambiguous_dna)
                    tmdiff = l1 - len(newseq)
                    diff += tmdiff

                # Only features that still have sequence are annotated
                if seqrec.seq:
                    if diff_f and diff > 0:
                        sp = f.location.start + start
                        diff_f = False
                    else:
                        sp = f.location.start + start - diff

                    ep = f.location.end + start - diff
                    featn = SeqFeature(FeatureLocation(ExactPosition(sp),
                                                       ExactPosition(ep),
                                                       strand=1), type=f.type)

                    features.update({feat: featn})
                    full_annotation.update({feat: seqrec})

                    # Mark the positions of the resolved feature as mapped
                    # NOTE(review): featn coordinates already include
                    # `start`, while the dict keys run from 0 and rmapping
                    # below adds `start` again - confirm this is intended.
                    for i in range(featn.location.start, featn.location.end):
                        if i in coordinates:
                            del coordinates[i]
                            mapping[i] = feat
            else:
                # Feature is not missing: it is not annotated here, but the
                # gaps inside it still shift the following features.
                f = features[feat]
                seqrec = f.extract(seq)
                seq_covered -= len(seqrec.seq)
                if re.search("-", str(seqrec.seq)):
                    l1 = len(seqrec.seq)
                    newseq = re.sub(r'-', '', str(seqrec.seq))
                    seqrec.seq = Seq(newseq, IUPAC.unambiguous_dna)
                    tmdiff = l1 - len(newseq)
                    diff += tmdiff

        # Positions left in `coordinates` are grouped into blocks
        blocks = getblocks(coordinates)
        # Shift the mapping keys by the start offset
        rmapping = {k + start: mapping[k] for k in mapping.keys()}

        # Print out what features were resolved
        # NOTE(review): the else branch also logs "Failed to resolve" when
        # features were resolved but verbosity is 0 or only one feature was
        # resolved - confirm whether that message is intended.
        if verbose and verbosity > 0 and len(full_annotation.keys()) > 1:
            logger.info("Features resolved:")
            for f in full_annotation:
                logger.info(f)
        else:
            if verbose:
                logger.info("Failed to resolve")

        if not full_annotation or len(full_annotation) == 0:
            if verbose:
                logger.info("Failed to align missing features")
            return Annotation(complete_annotation=False)
        else:
            return Annotation(annotation=full_annotation,
                              method="clustalo",
                              features=features,
                              mapping=rmapping,
                              blocks=blocks,
                              seq=seq)
def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
    """
    search_seqs - method for annotating a BioPython sequence without alignment

    Every feature sequence of the reference record is searched for in the
    input sequence with ``nt_search``. A single hit is mapped as an exact
    match, several hits are recorded as ambiguous, and no hit marks the
    feature as missing. Unmapped positions are then handed to
    ``self._resolve_unmapped``.

    :param seqrec: The reference sequence
    :type seqrec: SeqRecord
    :param in_seq: The input sequence
    :type in_seq: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: str
    :param run: The number of runs that have been done
    :type run: int
    :param partial_ann: A partial annotation from a previous step
    :type partial_ann: :ref:`ann`
    :return: The annotation; when ``partial_ann`` is given and the
        reference has no feature that is not already found, ``partial_ann``
        is returned unchanged
    :rtype: :ref:`ann`

    Example usage:

        >>> from Bio.Seq import Seq
        >>> from seqann.seq_search import SeqSearch
        >>> inseq = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
        >>> sqsrch = SeqSearch()
        >>> ann = sqsrch.search_seqs(refseqs, inseq)

    """
    # Extract out the sequences and feature names
    # from the reference sequences

    # The mapped features will be subtracted from seq_covered
    # so the final seq_covered number will reflect the remaining
    # number of base pairs that haven't been mapped.
    #
    # The coordinates and mapping will help determine what positions
    # in the sequence have been mapped and to what features. The
    # missing blocks variable will be generated using these.
    structures = get_structures()
    seq_covered = len(in_seq.seq)
    # Both dicts start with every position 0..len(in_seq) mapped to 1
    coordinates = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))
    mapping = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))

    ambig_map = {}
    found_feats = {}
    feat_missing = {}

    method = "nt_search" if not partial_ann else partial_ann.method

    # If the partial annotation is provided
    # then make the found_feats equal to
    # what has already been annotated
    feats = get_features(seqrec)
    if partial_ann:

        # NOTE: this is the same dict object as partial_ann.features, so
        # later updates of found_feats also change the partial annotation.
        found_feats = partial_ann.features

        if self.verbose and self.verbosity > 4:
            self.logger.info("Found partial features:")
            for f in found_feats:
                self.logger.info(f)

        # Skip references that only have features
        # that have already been annotated
        if len([f for f in feats if f in found_feats]) == len(feats):
            if self.verbose:
                self.logger.info("Skipping incomplete refseq")
            return partial_ann

        if self.verbose and self.verbosity > 1:
            self.logger.info("Using partial annotation | "
                             + locus + " "
                             + str(len(partial_ann.features)))

        # Only the positions of the still-unmapped blocks remain available
        coordinates = dict(
            map(lambda l: [l, 1], [
                item for sublist in partial_ann.blocks for item in sublist
            ]))
        seq_covered = partial_ann.covered
        mapping = partial_ann.mapping

        if self.verbose and self.verbosity > 2:
            self.logger.info("Partial sequence coverage = "
                             + str(seq_covered))
            self.logger.info("Partial sequence metho = "
                             + method)

    # Bookkeeping that allows exon_8 / three_prime_UTR to be unmapped
    # again at the end of the method
    added_feat = {}
    deleted_coords = {}
    for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

        # skip if partial annotation is provided
        # and the feat name is not one of the
        # missing features
        if partial_ann and feat_name not in partial_ann.refmissing:
            if self.verbose and self.verbosity > 1:
                self.logger.info("Skipping " + feat_name
                                 + " - Already annotated")
            continue

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running seqsearch for " + feat_name)

        # Search for the reference feature sequence in the
        # input sequence. Record the coordinates if it's
        # found and if it's found in multiple spots. If it
        # is not found, then record that feature as missing.
        # The first element of the result is the search pattern, the
        # remaining elements are the hit positions.
        seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

        if len(seq_search) == 2:

            if self.verbose and self.verbosity > 0:
                self.logger.info("Found exact match for " + feat_name)

            seq_covered -= len(str(feats[feat_name]))
            end = int(len(str(feats[feat_name])) + seq_search[1])

            # A three_prime_UTR is extended to the end of the input sequence
            if feat_name == 'three_prime_UTR' \
                    and len(str(in_seq.seq)) > end:
                end = len(str(in_seq.seq))

            # If the feature is found and it's a five_prime_UTR then
            # the start should always be 0, so insertions at the
            # beginning of the sequence will be found.
            start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
            si = seq_search[1]+1 if seq_search[1] != 0 and \
                feat_name != 'five_prime_UTR' else 0

            # check if this features has already been mapped
            # (a 1 in the set means a position is no longer available)
            mapcheck = set(
                [0 if i in coordinates else 1 for i in range(si, end + 1)])

            # Dont map features if they are out of order
            skip = False
            if found_feats and len(found_feats) > 0:
                for f in found_feats:
                    o1 = structures[locus][feat_name]
                    o2 = structures[locus][f]
                    loctyp = loctype(found_feats[f].location.start,
                                     found_feats[f].location.end, start, end)

                    if o1 < o2 and loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for " + feat_name)
                    elif o2 < o1 and not loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for " + feat_name)

            # NOTE(review): seq_covered was already reduced above even when
            # the feature ends up not being mapped here - confirm intended.
            if 1 not in mapcheck and not skip:
                for i in range(si, end + 1):
                    if i in coordinates:
                        # Remember the removed positions of exon_8 and the
                        # three_prime_UTR so they can be restored later
                        if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                            deleted_coords.update({i: coordinates[i]})
                        del coordinates[i]
                    else:
                        if self.verbose:
                            self.logger.error(
                                "seqsearch - should't be here " + locus
                                + " - " + " - " + feat_name)
                    mapping[i] = feat_name

                found_feats.update({
                    feat_name:
                    SeqFeature(FeatureLocation(ExactPosition(start),
                                               ExactPosition(end),
                                               strand=1),
                               type=feat_name)
                })

                if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                    added_feat.update({feat_name: feats[feat_name]})

                if self.verbose and self.verbosity > 3:
                    self.logger.info("Coordinates | Start = " + str(start)
                                     + " - End = " + str(end))

        elif (len(seq_search) > 2):

            if self.verbose and self.verbosity > 1:
                self.logger.info("Found " + str(len(seq_search))
                                 + " matches for " + feat_name)

            # Keep only the hits whose position (or the next position)
            # has not been mapped yet
            new_seq = [seq_search[0]]
            for i in range(1, len(seq_search)):
                tnp = seq_search[i] + 1
                if seq_search[i] in coordinates or tnp in coordinates:
                    new_seq.append(seq_search[i])

            seq_search = new_seq
            if (partial_ann and feat_name == "exon_8" and run > 0):
                missing_feats = sorted(list(partial_ann.missing.keys()))

                # * HARD CODED LOGIC * #
                # > exon8 in class I maps to multiple spots in a sequence,
                #   often in the 3' UTR. These features need to be mapped
                #   last to make sure it's not mapping exon8 incorrectly.
                # NOTE(review): if the filter above removed every hit,
                # seq_search[1] below raises IndexError - confirm callers
                # cannot reach this state.
                if (missing_feats == ['exon_8', 'three_prime_UTR']
                        and len(seq_search) <= 3):
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Resolving exon_8")

                    seq_covered -= len(str(feats[feat_name]))
                    end = int(len(str(feats[feat_name])) + seq_search[1])

                    # The first remaining hit is used as the exon_8 position
                    start = seq_search[1]
                    si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                    # check if this features has already been mapped
                    # (the result is computed but not used in this branch)
                    mapcheck = set([
                        0 if i in coordinates else 1
                        for i in range(si, end + 1)
                    ])

                    for i in range(si, end + 1):
                        if i in coordinates:
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " + locus
                                    + " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Coordinates | Start = " + str(start)
                                         + " - End = " + str(end))
                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 0:
                    self.logger.info("Adding ambig feature " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})
                ambig_map.update(
                    {feat_name: seq_search[1:len(seq_search)]})
        else:
            if self.verbose and self.verbosity > 1:
                self.logger.info("No match for " + feat_name)
            feat_missing.update({feat_name: feats[feat_name]})

    # Group the positions that are still unmapped into blocks
    blocks = getblocks(coordinates)
    exact_matches = list(found_feats.keys())

    # * HARD CODED LOGIC * #
    # >
    #
    #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
    # The block ending at the last mapping position is labelled intron_3,
    # any other block is labelled exon_2.
    if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
            and 'exon_2' in feat_missing
            and (len(blocks) == 1 or len(blocks) == 2)):

        for b in blocks:
            x = b[len(b) - 1]
            if x == max(list(mapping.keys())):
                featname = "intron_3"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            else:
                featname = "exon_2"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            seq_covered -= len(b)

        if self.verbose and self.verbosity > 1:
            self.logger.info(
                "Successfully annotated class DRB1 II sequence")

        return Annotation(features=found_feats,
                          covered=seq_covered,
                          seq=in_seq,
                          missing=feat_missing,
                          ambig=ambig_map,
                          method=method,
                          mapping=mapping,
                          exact_match=exact_matches)

    # If it's a class II sequence and
    # exon_2 is an exact match
    # * HARD CODED LOGIC * #
    # > It's common for exon2 to be fully sequenced
    #   but intron_2 and intron_1 to be partially sequenced,
    #   which can make it hard to annotate those to features.
    #   If there are two missing blocks that is small enough
    #   and they are before and after exon2, then it's very
    #   very likely to be intron_2 and intron_1.
    if 'exon_2' in exact_matches and len(blocks) == 2 \
            and is_classII(locus) and seq_covered < 300:

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running search for class II sequence")

        # r stays True only if both blocks are directly next to exon_2
        r = True
        for b in blocks:
            x = b[len(b) - 1]
            if x == max(list(mapping.keys())):
                # Last block: look at the position just before it
                x = b[0] - 1
            else:
                # Other block: look at the position just after it
                x += 1
            f = mapping[x]
            if f != 'exon_2':
                r = False
        if r:
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    featname = "intron_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) - 1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "intron_1"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) - 1]),
                                                   strand=1),
                                   type=featname)
                    })
                seq_covered -= len(b)

            if self.verbose and self.verbosity > 1:
                self.logger.info(
                    "Successfully annotated class II sequence")

            return Annotation(features=found_feats,
                              covered=seq_covered,
                              seq=in_seq,
                              missing=feat_missing,
                              ambig=ambig_map,
                              method=method,
                              mapping=mapping,
                              exact_match=exact_matches)

    # Try to assign the remaining blocks to missing / ambiguous features;
    # mb holds the blocks that are still unmapped afterwards
    annotated_feats, mb, mapping = self._resolve_unmapped(
        blocks, feat_missing, ambig_map, mapping, found_feats, locus,
        seq_covered)

    # * HARD CODED LOGIC * #
    # Blocks remain although no feature is missing or ambiguous:
    # keep them as missing blocks
    if (not mb and blocks and len(feat_missing.keys()) == 0
            and len(ambig_map.keys()) == 0):
        mb = blocks

    if mb:

        # Unmap exon 8
        # Restores the positions taken by exon_8 / three_prime_UTR so they
        # can be mapped again in a later step
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                and 'exon_8' in exact_matches:
            for i in deleted_coords:
                mapping[i] = 1
            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)

            # Delete from found features
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']

            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        # Features of the locus structure that were not annotated
        refmissing = [
            f for f in structures[locus] if f not in annotated_feats
        ]

        if self.verbose and self.verbosity > 1:
            self.logger.info("* Annotation not complete *")

        # Print out what features were missing by the ref
        if self.verbose and self.verbosity > 2:
            self.logger.info("Refseq was missing these features = "
                             + ",".join(list(refmissing)))

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = "
                             + ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = "
                             + ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 1 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = "
                             + ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
            self.logger.info("Features missing = "
                             + ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                blocks=mb,
                                method=method,
                                refmissing=refmissing,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)
    else:

        mb = None
        # Unmap exon 8
        # Same restore as above, but only for short sequences where the
        # three_prime_UTR was annotated without being an exact match
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                and 'exon_8' in exact_matches \
                and 'three_prime_UTR' in annotated_feats\
                and 'three_prime_UTR' not in exact_matches:

            for i in deleted_coords:
                mapping[i] = 1

            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']

            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        if self.verbose:
            self.logger.info("* No missing blocks after seq_search *")

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = "
                             + ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = "
                             + ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 0 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = "
                             + ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
            self.logger.info("Features missing = "
                             + ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                method=method,
                                blocks=mb,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)

    return annotation