def td_contents(selfself, item, attr_list):
    """Render the allele-name cell for a genotype table row.

    The name is emphasised with ``<em>`` when it is not a key of the IMGT
    reference set for the submission's species, and with ``<strong>`` when the
    row has at least one inferred sequence. The synthetic 'Totals' row is
    always rendered bold.

    :param selfself: the column instance (first parameter name kept as found
        in the source; it is not used in the body)
    :param item: table row; ``sequence_id``, ``inferred_sequences`` and
        ``genotype_description.submission.species`` are read from it
    :param attr_list: unused here
    :return: ``Markup`` holding the HTML for the cell
    """
    if item.sequence_id == 'Totals':
        return Markup('<strong>' + item.sequence_id + '</strong>')

    imgt_ref = get_imgt_reference_genes()
    species = item.genotype_description.submission.species
    # A species with no IMGT reference set is treated as "nothing matches"
    # instead of raising KeyError while the table is being rendered.
    species_ref = imgt_ref[species] if species in imgt_ref else {}

    # The name may have been assigned by a submitter, so escape it before it
    # is embedded in markup that will be emitted verbatim.
    text = str(Markup.escape(item.sequence_id))

    if item.sequence_id not in species_ref:
        text = '<em>' + text + '</em>'
    if len(item.inferred_sequences) > 0:
        text = '<strong>' + text + '</strong>'
    return Markup(text)
def get(self, species, name):
    """ Returns the ungapped and gapped sequence for the given IMGT name

    Responds with ('Not found', 404) when the species, or the name within
    that species, is absent from the ungapped IMGT reference set.
    """
    ungapped_ref = get_imgt_reference_genes()
    gapped_ref = get_imgt_gapped_reference_genes()

    # Guard clause: unknown species or unknown gene name
    if species not in ungapped_ref or name not in ungapped_ref[species]:
        return 'Not found', 404

    return {
        'species': species,
        'imgt_name': name,
        'sequence': str(ungapped_ref[species][name]),
        'coding_seq_imgt': str(gapped_ref[species][name]),
    }
def get(self, species):
    """ Returns the set of IARC-affirmed sequences for the species

    Responds with ('Not found', 404) when the species has no IMGT reference
    set, and with an empty list when no Committee row exists for it.
    """
    if species not in get_imgt_reference_genes():
        return 'Not found', 404

    # Each row of a single-column query is a 1-tuple; unwrap the value
    committee_species = [row[0] for row in db.session.query(Committee.species).all()]
    if species not in committee_species:
        return []

    # Published descriptions with a non-'0' affirmation level for the species
    affirmed = db.session.query(GeneDescription).filter(
        GeneDescription.status == 'published',
        GeneDescription.affirmation_level != '0',
        GeneDescription.species == species).all()

    return self.descs_to_airr(affirmed)
def download_sequences(species, format, exc):
    """Build a file download of the published, affirmed sequences of a species.

    :param species: species name; must match a Committee.species value
    :param format: 'gapped', 'ungapped' (both FASTA) or 'airr' (JSON)
    :param exc: when equal to 'non' and the species has an IMGT reference
        set, only descriptions whose imgt_name is the empty string are kept
    :return: an attachment Response, or a redirect to '/' (with a flashed
        message) when the request is invalid or nothing is left to download
    """
    if format not in ('gapped', 'ungapped', 'airr'):
        flash('Invalid format')
        return redirect('/')

    # Each row of a single-column query is a 1-tuple; unwrap the value
    known_species = [row[0] for row in db.session.query(Committee.species).all()]
    if species not in known_species:
        flash('Invalid species')
        return redirect('/')

    results = db.session.query(GeneDescription).filter(
        GeneDescription.status == 'published',
        GeneDescription.affirmation_level != '0',
        GeneDescription.species == species).all()

    imgt_ref = get_imgt_reference_genes()
    if exc == 'non' and species in imgt_ref:
        results = [desc for desc in results if desc.imgt_name == '']

    if len(results) < 1:
        flash('No sequences to download')
        return redirect('/')

    if format == 'airr':
        records = [vars(AIRRAlleleDescription(desc)) for desc in results]
        dl = json.dumps(records, default=str, indent=4)
        ext = 'json'
    else:
        dl = descs_to_fasta(results, format)
        ext = 'fa'

    filename = 'affirmed_germlines_%s_%s.%s' % (species, format, ext)
    disposition = "attachment; filename=%s" % filename
    return Response(dl, mimetype="application/octet-stream",
                    headers={"Content-disposition": disposition})
def get(self):
    """ Returns the species for which sequences are available

    The list is taken from the keys of the IMGT reference gene set.
    """
    species_names = [name for name in get_imgt_reference_genes().keys()]
    return {'species': species_names}
def generate_stats(form):
    """Compute per-gene usage statistics over affirmed inferences.

    :param form: submitted form; the ``.data`` of its species, locus,
        sequence_type, rare_genes, very_rare_genes, freq_threshold,
        rare_threshold, very_rare_threshold, allelic_threshold and
        assigned_unmutated_threshold fields is read. ``errors`` is set on
        rare_genes / very_rare_genes when a listed gene matches no key of the
        species' IMGT reference set.
    :return: ``(count, stats, raw_csv)`` where ``count`` is the number of
        genotype descriptions considered, ``stats`` is a list of dicts with
        keys 'occurrences', 'unmutated_freq' (mean, rounded to 2 places) and
        'gene', and ``raw_csv`` is a StringIO with one CSV row per gene and
        one column per genotype. ``(0, None, None)`` is returned when the
        species has no reference set, when a listed gene is unknown, or when
        no genotype descriptions are found.
    """
    species = form.species.data
    locus = form.locus.data
    sequence_type = form.sequence_type.data

    imgt_ref = get_imgt_reference_genes()
    if species not in imgt_ref:
        return (0, None, None)
    species_ref = imgt_ref[species]

    def split_genes(field_data):
        # Drop empty tokens (blank field, trailing or doubled commas). An empty
        # string is a substring of every gene name, so keeping it would make
        # every gene pick up the rare / very rare threshold below.
        return [g for g in (field_data or '').replace(' ', '').split(',') if g]

    def gene_match(gene, ref):
        # A listed gene matches when it is a substring of any reference name
        return any(gene in k for k in ref.keys())

    rare_genes = split_genes(form.rare_genes.data)
    rare_missing = [g for g in rare_genes if not gene_match(g, species_ref)]
    if len(rare_missing) > 0:
        form.rare_genes.errors = ['Gene(s) %s not found in IMGT reference database' % ', '.join(rare_missing)]

    very_rare_genes = split_genes(form.very_rare_genes.data)
    very_rare_missing = [g for g in very_rare_genes if not gene_match(g, species_ref)]
    if len(very_rare_missing) > 0:
        form.very_rare_genes.errors = ['Gene(s) %s not found in IMGT reference database' % ', '.join(very_rare_missing)]

    if len(rare_missing) > 0 or len(very_rare_missing) > 0:
        return (0, None, None)

    # Reference genes for this locus whose 4th character equals the sequence
    # type; the slice avoids IndexError on names shorter than four characters.
    ref = [gene for gene in species_ref.keys()
           if locus in gene and gene[3:4] == sequence_type]

    # Calculate thresholds for each reference gene; a very rare match takes
    # precedence over a rare match, which takes precedence over the default
    gene_thresh = {}
    for gene in species_ref.keys():
        threshold = form.freq_threshold.data
        if any(rg in gene for rg in rare_genes):
            threshold = form.rare_threshold.data
        if any(rg in gene for rg in very_rare_genes):
            threshold = form.very_rare_threshold.data
        gene_thresh[gene] = threshold

    # Get unique list of genotype descriptions that underlie affirmed inferences
    genotype_descriptions = []
    seqs = db.session.query(GeneDescription).filter(
        GeneDescription.status == 'published',
        GeneDescription.species == species,
        GeneDescription.locus == locus,
        GeneDescription.sequence_type == sequence_type,
        GeneDescription.affirmation_level != '0').all()

    for seq in seqs:
        for genotype in seq.inferred_sequences:
            if genotype.genotype_description not in genotype_descriptions:
                genotype_descriptions.append(genotype.genotype_description)

    if len(genotype_descriptions) == 0:
        return (0, None, None)

    def ordered_by_name(entries):
        # One sort on the first three parse_name() components; this gives the
        # same order as three successive stable sorts on [2], [1], then [0].
        def sort_key(entry):
            parsed = parse_name(entry[0])
            return (parsed[0], parsed[1], parsed[2])
        return OrderedDict(sorted(entries.items(), key=sort_key))

    # Initialise stats ('/OR' names are excluded from both tables)
    reported = [name for name in ref if '/OR' not in name]
    stats = ordered_by_name(
        {name: {'occurrences': 0, 'unmutated_freq': [], 'gene': name} for name in reported})
    raw = ordered_by_name({name: {'gene': name} for name in reported})

    # Compose stats
    allelic_threshold = form.allelic_threshold.data
    assigned_unmutated_threshold = form.assigned_unmutated_threshold.data
    gen_names = ['gene']

    for desc in genotype_descriptions:
        gen_name = "%s/%s" % (desc.submission.submission_id, desc.genotype_name)
        gen_names.append(gen_name)

        for gen in desc.genotypes:
            if gen.sequence_id in stats \
                    and (gen.allelic_percentage is None or gen.allelic_percentage == 0
                         or gen.allelic_percentage >= allelic_threshold) \
                    and (gen.assigned_unmutated_frequency is None
                         or gen.assigned_unmutated_frequency >= assigned_unmutated_threshold) \
                    and (gen.unmutated_frequency is not None
                         and gen.unmutated_frequency >= gene_thresh[gen.sequence_id]):
                stats[gen.sequence_id]['occurrences'] += 1
                stats[gen.sequence_id]['unmutated_freq'].append(gen.unmutated_frequency)

            # Unlike the test above, the raw table does not special-case an
            # allelic percentage of 0 (kept as found).
            if gen.sequence_id in raw and (gen.allelic_percentage is None
                                           or gen.allelic_percentage >= allelic_threshold):
                raw[gen.sequence_id][gen_name] = gen.unmutated_frequency

    # Replace each list of frequencies by its mean (0 when the list is empty)
    for stat in stats.values():
        freqs = stat['unmutated_freq']
        stat['unmutated_freq'] = round(sum(freqs) / max(len(freqs), 1), 2)

    ret = list(stats.values())

    ro = StringIO()
    writer = csv.DictWriter(ro, fieldnames=gen_names)
    writer.writeheader()
    for gene in raw:
        writer.writerow(raw[gene])

    return (len(genotype_descriptions), ret, ro)
def setup_gv_table(desc):
    """Build the full genotype table and the novel-allele table for a genotype.

    :param desc: genotype description; ``genotypes``, ``locus`` and
        ``inferred_sequences`` are read from it
    :return: ``(table, inferred_table)`` - the full table with a 'Totals' row
        appended to its items, and a table holding only the rows whose allele
        is absent from the species' IMGT reference set or is one of the
        description's inferred sequences
    """
    allele_tooltip = 'Identifier of the allele (either IMGT, or the name assigned by the submitter to an inferred gene)'

    table = make_Genotype_full_table(desc.genotypes, desc.locus, False, classes=['table-bordered'])
    table._cols['sequence_id'] = GenTitleCol('Allele name', tooltip=allele_tooltip)
    table.rotate_header = True
    table.add_column('nt_sequence', SeqCol('Sequence', tooltip="Click to view or download sequence"))
    table.table_id = 'genotype_table'

    # Add totals row
    totals = Genotype()
    totals.sequence_id = 'Totals'
    totals.sequences = 0
    totals.unmutated_sequences = 0
    lh_seqs = 0

    for gen in desc.genotypes:
        try:
            if gen.haplotyping_ratio and ':' in gen.haplotyping_ratio:
                lh = int(gen.haplotyping_ratio.split(':')[0])
                totals.sequences += gen.sequences
                totals.unmutated_sequences += gen.unmutated_sequences
                # Weight this row's percentage by this row's own sequence
                # count. Using the running total (as before) counted earlier
                # rows again on every iteration.
                lh_seqs += lh * gen.sequences / 100
        except Exception:
            # Best effort: a row with a malformed ratio or missing counts is
            # left out of the totals instead of failing the whole page.
            continue

    # Nothing accumulated means there is nothing to divide by
    if totals.sequences > 0:
        if lh_seqs > 0:
            lh_prop = round(100 * lh_seqs / totals.sequences)
            totals.haplotyping_ratio = "%d:%d" % (lh_prop, (100 - lh_prop))
        totals.assigned_unmutated_frequency = round(
            100 * totals.unmutated_sequences / totals.sequences, 2)
    table.items.append(totals)

    inferred_seqs = [inf.sequence_details.sequence_id for inf in desc.inferred_sequences]
    imgt_ref = get_imgt_reference_genes()

    novel = []
    for item in desc.genotypes:
        # Test for the totals row first: the species lookup below is only
        # attempted for other rows.
        if item.sequence_id == 'Totals':
            continue
        species = item.genotype_description.submission.species
        # A species with no reference set is treated as "nothing matches"
        species_ref = imgt_ref[species] if species in imgt_ref else {}
        if item.sequence_id not in species_ref or item.sequence_id in inferred_seqs:
            novel.append(item)

    inferred_table = make_Genotype_novel_table(novel, False, classes=['table-bordered'])
    inferred_table._cols['sequence_id'] = GenTitleCol('Allele name', tooltip=allele_tooltip)
    inferred_table.add_column('nt_sequence', SeqCol('Sequence', tooltip="Click to view or download sequence"))
    inferred_table.rotate_header = True

    return (table, inferred_table)
def td_contents(self, item, attr_list):
    """Render the sequence cell: a view button plus QA indicator buttons.

    For a row whose allele name is in the species' IMGT reference set, the
    sequence is compared with the reference and a tick or cross button is
    shown (the cross carries an alignment for display). For other rows the
    sequence is compared against its ``closest_reference`` and informational
    buttons are added for: low match/indels, amino acids not listed in the
    reference codon usage for the family, differences inside runs of a
    repeated base, and G/C changes in RGYW/WRCY hotspots. For 'Human'
    submissions the sequence is also looked up in the IGPDB and VDJbase sets.

    :param item: table row; ``nt_sequence``, ``nt_sequence_gapped``,
        ``sequence_id``, ``closest_reference`` and ``genotype_description``
        are read from it
    :param attr_list: unused here
    :return: concatenated HTML for the cell, or '' when the row has no
        sequence
    """
    if not item.nt_sequence:  # e.g. for totals column
        return ''

    def info_button(title):
        # Shared markup of the blue "information" buttons; only the tooltip
        # text differs between them.
        return '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="%s"><span class="glyphicon glyphicon-info-sign"></span> </button>' % (title,)

    imgt_ref = get_imgt_reference_genes()
    imgt_ref_gapped = get_imgt_gapped_reference_genes()
    ref_codon_usage = get_reference_v_codon_usage()

    species = item.genotype_description.submission.species
    locus = item.genotype_description.locus
    seq_lower = item.nt_sequence.lower()

    # A species with no reference set is treated as "nothing matches" instead
    # of raising KeyError while the table is being rendered.
    species_ref = imgt_ref[species] if species in imgt_ref else {}
    in_reference = item.sequence_id in species_ref

    # Agreement with the reference sequence of the same name
    if in_reference:
        ref_seq = species_ref[item.sequence_id]
        if seq_lower == ref_seq:
            icon = 'glyphicon-ok'
            colour = 'text-info'
            aln_text = ' data-toggle="tooltip" title="Agrees with Reference"'
        else:
            icon = 'glyphicon-remove'
            colour = 'text-danger'
            alignments = pairwise2.align.globalms(
                seq_lower, ref_seq, 2, -1, -2, -1, one_alignment_only=True)
            alignment = format_aln(format_alignment(*alignments[0]),
                                   item.sequence_id, 'Reference', 50)
            fasta_seqs = format_fasta_sequence(item.sequence_id, seq_lower, 50) \
                + format_fasta_sequence('Reference', ref_seq, 50)
            # Formatting through Markup escapes the interpolated values, so a
            # quote or angle bracket in them cannot break out of the attribute.
            aln_text = Markup(
                ' id="btn_view_check" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="modal" title="Differs from Reference (click to view)"'
            ) % (alignment, item.sequence_id, fasta_seqs)

        bt_check = '<button type="button" class="btn btn-xs %s icon_back" %s><span class="glyphicon %s"></span> </button>' \
            % (colour, aln_text, icon)
    else:
        bt_check = ''

    # IGPDB: match when either sequence contains the other
    bt_igpdb = ''
    if species == 'Human':
        igpdb_genes = get_igpdb_ref()
        for k, v in igpdb_genes.items():
            if seq_lower in v or v in seq_lower:
                bt_igpdb = info_button('Sequence matches IGPDB gene %s' % k)
                break

    # VDJbase: same containment test, only for names not in the reference
    bt_vdjbase = ''
    if species == 'Human' and not in_reference:
        vdjbase_ref = get_vdjbase_ref()
        vdjbase_species = species.replace('Human_TCR', 'Human')
        if vdjbase_species in vdjbase_ref and locus in vdjbase_ref[vdjbase_species]:
            vdjbase_genes = vdjbase_ref[vdjbase_species][locus]
            for vdjbase_name, (vdjbase_seq, vdjbase_count) in vdjbase_genes.items():
                if seq_lower in vdjbase_seq or vdjbase_seq in seq_lower:
                    bt_vdjbase = '<button type="button" name="vdjbasebtn" id="vdjbasebtn" class="btn btn-xs text-info icon_back" onclick="window.open(%s)" data-toggle="tooltip" title="Sequence matches VDJbase gene %s (found in %s subjects). Click to view in VDJbase."><span class="glyphicon glyphicon-info-sign"></span> </button>' % \
                        (Markup("'%sgenerep/%s/%s/%s'" % (app.config['VDJBASE_URL'], vdjbase_species, locus, vdjbase_name)),
                         vdjbase_name, vdjbase_count)
                    break

    bt_indels = ''
    bt_imgt = ''
    bt_codon_usage = ''
    bt_runs = ''
    bt_hotspots = ''
    bt_ref_found = ''
    annots = []

    if not in_reference:
        if item.closest_reference not in species_ref:
            bt_ref_found = info_button('Nearest reference not found in IMGT reference set')
        else:
            # Exact match with a reference sequence under another name
            for k, v in species_ref.items():
                if seq_lower == v:
                    bt_imgt = info_button('Sequence matches IMGT gene %s' % k)
                    break

            # QA Checks

            # Alignment issues: count position-wise differences over the
            # common length of the two sequences
            ref_nt = species_ref[item.closest_reference].upper()
            seq_nt = item.nt_sequence.upper()

            mismatch = sum(1 for (r, s) in zip(ref_nt, seq_nt) if r != s)
            aligned = mismatch <= 20
            if not aligned:
                bt_indels = info_button('Sequence has indels/low match when compared to reference sequence')

            if aligned:
                # Check for unusual AAs at each position
                codon_key = locus + 'V'
                if item.genotype_description.sequence_type == 'V' \
                        and species in ref_codon_usage \
                        and codon_key in ref_codon_usage[species]:
                    try:
                        q_codons = []
                        ref_aa_gapped = imgt_ref_gapped[species][item.closest_reference].upper().translate(gap='.')
                        seq_aa = Seq(item.nt_sequence.upper()).translate()
                        seq_aa_gapped = gap_sequence(seq_aa, ref_aa_gapped)
                        family = find_family(item.closest_reference)

                        for i in range(min(len(ref_aa_gapped), len(seq_aa_gapped))):
                            pair = (ref_aa_gapped[i], seq_aa_gapped[i])
                            if pair[0] != pair[1] and '*' not in pair and '.' not in pair:
                                # Usage table is looked up only on a difference,
                                # with a 1-based position
                                if seq_aa_gapped[i] not in ref_codon_usage[species][codon_key][family][i + 1]:
                                    q_codons.append("%s%d" % (seq_aa_gapped[i], i + 1))
                                    # Ungapped residue index -> nucleotide offset
                                    j = len(seq_aa_gapped[:i].replace('.', ''))
                                    annots.append((3 * j, 3, '%s%d previously unreported in this family' % (seq_aa_gapped[i], i + 1)))

                        if len(q_codons) > 0:
                            bt_codon_usage = info_button('Amino Acid(s) previously unreported in this family: %s' % ", ".join(q_codons))
                    except Exception as exc:
                        # Report the failure in the cell rather than failing
                        # the page; the exception class is shown as before.
                        bt_codon_usage = info_button('Error translating sequence: %s' % type(exc))

                # Check for lengthened strings of the same base
                seq_qpos = [m.start() for m in re.finditer('(.)\\1+\\1+\\1+', str(seq_nt))]
                q_runs = []

                # walk up each identified repeat of 4nt or more, flag any differences
                for p in seq_qpos:
                    rep_c = seq_nt[p]
                    i = p
                    while i < len(seq_nt) and i < len(ref_nt) and seq_nt[i] == rep_c:
                        if ref_nt[i] != rep_c:
                            q_runs.append("%d" % find_gapped_index(i, species, item.closest_reference))
                            annots.append((i, 1, 'Possible repeated read error'))
                            break
                        i += 1

                if len(q_runs) > 0:
                    bt_runs = info_button('Possible repeated read errors at IMGT position(s) %s' % ", ".join(q_runs))

                # Check for RGYW/WRCY hotspot change. Positions come from the
                # reference, so bound-check them against the submitted
                # sequence, which may be shorter.
                q_hotspots = []

                ref_qpos = [m.start() for m in re.finditer('[AG][G][CT][AT]', str(ref_nt))]
                for p in ref_qpos:
                    if p + 1 < len(seq_nt) and seq_nt[p + 1] == 'C':
                        q_hotspots.append("%d" % find_gapped_index(p + 1, species, item.closest_reference))
                        annots.append((p + 1, 1, 'G/C SNP in RGYW hotspot'))

                ref_qpos = [m.start() for m in re.finditer('[AT][AG][C][CT]', str(ref_nt))]
                for p in ref_qpos:
                    if p + 2 < len(seq_nt) and seq_nt[p + 2] == 'G':
                        q_hotspots.append("%d" % find_gapped_index(p + 2, species, item.closest_reference))
                        annots.append((p + 2, 1, 'C/G SNP in WRCY hotspot'))

                if len(q_hotspots) > 0:
                    bt_hotspots = info_button('G/C SNP in RGYW/WRCY hotspot at IMGT position(s) %s' % ", ".join(q_hotspots))

    bt_view = popup_seq_button(item.sequence_id, item.nt_sequence, item.nt_sequence_gapped, annots=annots)

    return bt_view + bt_check + bt_imgt + bt_igpdb + bt_vdjbase + bt_indels + bt_codon_usage + bt_runs + bt_hotspots + bt_ref_found