def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the gene frequency report for the selected repertoire samples.

    Per-sample gene usage frequencies are read from ``GenesDistribution``,
    grouped by gene, written to a tab-separated file and passed to
    ``run_rscript`` with ``GENE_FREQUENCY_PLOT``.

    :param format: requested output format; must be 'pdf' or 'html'
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples, grouped by ``collate_samples``
    :param params: report parameters; reads 'single_sample', 'calculate_by'
        and (optionally) 'sort_order'
    :return: whatever ``send_report`` returns for the generated file
    :raises BadRequest: if no samples are selected, the format is not
        supported, or the script leaves no (non-empty) output file
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ('pdf', 'html'):
        raise BadRequest('Invalid format requested')

    single_sample_filter = 1 if params['single_sample'] == 'One Selected Sample' else 0
    calc_by_clone = 1 if params['calculate_by'] == 'Number of Clones' else 0

    chain, samples_by_dataset = collate_samples(rep_samples)

    # gene name -> list of per-sample frequencies, each rounded to 2 decimal places
    genes_frequencies = defaultdict(list)

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = (
                session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)
                .filter(Sample.sample_name.in_(sample_chunk))
                .filter(Sample.sample_group >= single_sample_filter)
                .all()
            )
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            frequencies = (
                session.query(GenesDistribution.sample_id, Gene.name, GenesDistribution.frequency)
                .join(Gene)
                .join(Sample)
                .filter(GenesDistribution.count_by_clones == calc_by_clone)
                .filter(Gene.name.in_(wanted_genes))
                .filter(Sample.sample_name.in_(sample_list))
                .all()
            )

            for _, gene_name, frequency in frequencies:
                genes_frequencies[gene_name].append(round(float(frequency), 2))

    # Build the frame in one go: DataFrame.append() no longer exists in
    # pandas >= 2.0 and copied the whole frame on every call.
    labels = ['GENE', 'FREQ']
    rows = [
        {'GENE': gene, 'FREQ': ",".join(str(x) for x in usages)}
        for gene, usages in genes_frequencies.items()
    ]
    genes_frequencies_df = pd.DataFrame(rows, columns=labels)

    input_path = make_output_file('tab')
    genes_frequencies_df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file(format)
    attachment_filename = '%s_gene_frequency.pdf' % species

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    cmd_line = [
        "-i", input_path,
        "-o", output_path,
        "-t", 'T' if format == 'html' else 'F',
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(GENE_FREQUENCY_PLOT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)

    raise BadRequest('No output from report')
def _allele_count(gene, allele, counts):
    """Return how many subjects are listed for gene/allele in ``counts`` (0 if absent)."""
    if gene not in counts or allele not in counts[gene][0]:
        return 0
    return len(counts[gene][0][allele])


def _best_ranked(gene, allele, counts, slot, ranking):
    """Return the last item of ``ranking`` listed for gene/allele in ``counts[gene][slot]``, or ''."""
    if gene not in counts or allele not in counts[gene][slot]:
        return ''
    best = ''
    for candidate in ranking:
        if candidate in counts[gene][slot][allele]:
            best = candidate
    return best


def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the allele usage table across repertoire and genomic samples.

    For every gene known to the selected datasets, lists each allele seen in
    the reference set, the repertoire samples or the genomic samples, with
    the number of distinct subjects carrying it in each source, and writes
    the rows with ``write_csv``.

    :param format: requested output format; only 'xls' is accepted (the file
        that is sent is a csv)
    :param species: species key into ``vdjbase_dbs`` and ``genomic_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: selected genomic samples (dicts with 'dataset' and 'identifier')
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples (dicts with 'dataset' and 'sample_name')
    :param params: report parameters; reads 'sort_order', 'novel_alleles' and
        'ambiguous_alleles' (plus whatever ``apply_rep_filter_params`` reads)
    :return: whatever ``send_report`` returns for the csv file
    :raises BadRequest: if the format is not 'xls'
    """
    if format != 'xls':
        raise BadRequest('Invalid format requested')

    # A missing 'sort_order' falls back to locus order rather than raising.
    alphabetic = params.get('sort_order') == 'Alphabetic'

    rep_samples_by_dataset = {}
    for rep_sample in rep_samples:
        rep_samples_by_dataset.setdefault(rep_sample['dataset'], []).append(rep_sample['sample_name'])

    imgt_refs = {}          # non-novel allele name -> ungapped sequence
    gene_order = {}         # gene name -> sort key
    sequences = {}          # upper-cased allele name -> ungapped, lower-cased sequence
    all_wanted_genes = set()

    for dataset in rep_samples_by_dataset:
        session = vdjbase_dbs[species][dataset].session

        for ref in session.query(Allele).all():
            if ref.novel == 0 and ref.name not in imgt_refs:
                imgt_refs[ref.name] = ref.seq.replace('.', '')
            # Test the key we actually store under, so the first sequence
            # seen for a name is kept regardless of the name's case.
            seq_key = ref.name.upper()
            if seq_key not in sequences:
                sequences[seq_key] = ref.seq.replace('.', '').lower()

        for gene in session.query(Gene).all():
            if gene.name not in gene_order:
                gene_order[gene.name] = gene.alpha_order if alphabetic else gene.locus_order

    # gene -> [ {allele: [patient names]}, [patient names for the gene] ]
    rep_counts = {}

    for dataset in rep_samples_by_dataset:
        session = vdjbase_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(rep_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name).filter(Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = (
                session.query(AllelesSample.patient_id, Gene.name, Allele.name, Sample.sample_name,
                              Patient.patient_name)
                .filter(Sample.id == AllelesSample.sample_id)
                .filter(Allele.id == AllelesSample.allele_id)
                .filter(Gene.id == Allele.gene_id)
                .filter(Patient.id == AllelesSample.patient_id)
                .filter(Sample.sample_name.in_(sample_list))
                .filter(Gene.name.in_(wanted_genes))
            )

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            appearances.extend(app_query.all())

        for _, gene, allele_name, _, patient_name in appearances:
            allele = allele_name.split('*', 1)[1].upper()
            if gene not in rep_counts:
                rep_counts[gene] = [{}, []]
            by_allele, by_gene = rep_counts[gene]
            carriers = by_allele.setdefault(allele, [])
            if patient_name not in carriers:
                carriers.append(patient_name)
            if patient_name not in by_gene:
                by_gene.append(patient_name)

    gen_samples_by_dataset = {}
    for gen_sample in genomic_samples:
        gen_samples_by_dataset.setdefault(gen_sample['dataset'], []).append(gen_sample['identifier'])

    for dataset in gen_samples_by_dataset:
        session = genomic_dbs[species][dataset].session

        for ref in session.query(GenomicSequence).all():
            if ref.novel == 0 and ref.name not in imgt_refs:
                imgt_refs[ref.name] = ref.sequence.replace('.', '')
            seq_key = ref.name.upper()
            if seq_key not in sequences:
                sequences[seq_key] = ref.sequence.replace('.', '').lower()

        for gene in session.query(GenomicGene).all():
            if gene.name not in gene_order:
                gene_order[gene.name] = gene.alpha_order if alphabetic else gene.locus_order

    # gene -> [ {allele: [subjects]}, [subjects for the gene],
    #           {allele: [platforms]}, {allele: [capture probes]} ]
    gen_counts = {}

    for dataset in gen_samples_by_dataset:
        session = genomic_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(gen_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(GenomicSubject.identifier) \
                .filter(GenomicSubject.identifier.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = (
                session.query(GenomicSubject.id, GenomicSubject.identifier,
                              GenomicSubject.sequencing_platform, GenomicSubject.capture_probes,
                              GenomicSequence.name, GenomicGene.name)
                .filter(GenomicSubject.id == GenomicSubjectSequence.subject_id)
                .filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id)
                .filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION']))
                .filter(GenomicGene.id == GenomicSequence.gene_id)
                .filter(GenomicSubject.identifier.in_(sample_list))
                .filter(GenomicGene.name.in_(wanted_genes))
            )

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            appearances.extend(app_query.all())

        for _, patient_name, platform, probes, allele_name, gene in appearances:
            allele = allele_name.split('*', 1)[1].upper()
            if gene not in gen_counts:
                gen_counts[gene] = [{}, [], {}, {}]
            by_allele, by_gene, platforms, probe_sets = gen_counts[gene]
            if allele not in by_allele:
                by_allele[allele] = []
                platforms[allele] = []
                probe_sets[allele] = []
            if patient_name not in by_allele[allele]:
                by_allele[allele].append(patient_name)
            if patient_name not in by_gene:
                by_gene.append(patient_name)
            if platform and platform not in platforms[allele]:
                platforms[allele].append(platform)
            if probes and probes not in probe_sets[allele]:
                probe_sets[allele].append(probes)

    # Same layout as rep_counts, with a single placeholder entry per allele
    # so that a reference allele counts as 1.
    imgt_counts = {}

    for ref in imgt_refs:
        # maxsplit=1 matches how allele names are split everywhere else here
        gene, allele = ref.upper().split('*', 1)
        if gene in all_wanted_genes:
            if gene not in imgt_counts:
                imgt_counts[gene] = [{}, [1]]
            if allele not in imgt_counts[gene][0] and allele != 'DEL':
                imgt_counts[gene][0][allele] = [1]

    genes_in_order = [name for name, _ in sorted(gene_order.items(), key=lambda x: x[1])]

    results = []
    for gene in genes_in_order:
        # Assemble the alleles to list for this gene: reference-style names
        # first, then names containing '_', each group sorted.
        ref_alleles = set()
        novel_alleles = set()
        for counts in (imgt_counts, rep_counts, gen_counts):
            if gene in counts:
                for allele in counts[gene][0]:
                    (novel_alleles if '_' in allele else ref_alleles).add(allele)

        for allele in sorted(ref_alleles) + sorted(novel_alleles):
            results.append({
                'Allele': f'{gene}*{allele}',
                'IMGT': _allele_count(gene, allele, imgt_counts),
                'AIRR-Seq': _allele_count(gene, allele, rep_counts),
                'Genomic': _allele_count(gene, allele, gen_counts),
                'Best platform': _best_ranked(gene, allele, gen_counts, 2, ['RS', 'SEQUEL', 'SEQUELII']),
                'Best probes': _best_ranked(gene, allele, gen_counts, 3, ['V2', 'V3']),
                # An allele with no stored sequence gets an empty cell
                # instead of aborting the whole report with a KeyError.
                'Sequence': sequences.get(f'{gene}*{allele}'.upper(), ''),
            })

    output_path = make_output_file('csv')
    write_csv(output_path, results)
    return send_report(output_path, 'csv', f'{species}_allele_usage.csv')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the allele appearance report (xls workbook, optionally rendered to pdf).

    Counts, per gene and per allele, the distinct subjects in which the
    allele appears across the selected repertoire and genomic samples. Genes
    with more than one allele each become a row passed to ``write_gene``;
    genes with exactly one allele are pooled into a final
    'Single allele genes' row.

    :param format: 'xls' to receive the workbook, 'pdf' to receive the
        output of ``APPEARANCE_SCRIPT``
    :param species: species key into ``vdjbase_dbs`` and ``genomic_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: selected genomic samples (dicts with 'dataset' and 'identifier')
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples (dicts with 'dataset' and 'sample_name')
    :param params: report parameters; reads 'sort_order', 'novel_alleles' and 'ambiguous_alleles'
    :return: whatever ``send_report`` returns
    :raises BadRequest: on an unsupported format or when the script produces no output
    """
    if format not in ('pdf', 'xls'):
        raise BadRequest('Invalid format requested')

    # gene -> [ {allele: [subject names]}, [subject names for the gene], sort key ]
    tallies = {}

    def record(subject, gene, allele_name, locus_order, alpha_order):
        """Note that ``subject`` carries ``allele_name`` of ``gene``."""
        allele = allele_name.split('*', 1)[1].upper()
        if gene not in tallies:
            sort_key = alpha_order if params['sort_order'] == 'Alphabetic' else locus_order
            tallies[gene] = [{}, [], sort_key]
        per_allele, per_gene = tallies[gene][0], tallies[gene][1]
        if allele not in per_allele:
            per_allele[allele] = []
        if subject not in per_allele[allele]:
            per_allele[allele].append(subject)
        if subject not in per_gene:
            per_gene.append(subject)

    # ---- repertoire samples ----
    rep_samples_by_dataset = {}
    for entry in rep_samples:
        rep_samples_by_dataset.setdefault(entry['dataset'], []).append(entry['sample_name'])

    for dataset, sample_names in rep_samples_by_dataset.items():
        session = vdjbase_dbs[species][dataset].session
        found = []

        for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
            candidates = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            candidates, wanted_genes = apply_rep_filter_params(params, candidates, session)
            names = [c[0] for c in candidates]

            app_query = session.query(AllelesSample.patient_id, Patient.patient_name, Gene.name, Allele.name,
                                      Sample.sample_name, Gene.locus_order, Gene.alpha_order)
            app_query = app_query.filter(Sample.id == AllelesSample.sample_id)
            app_query = app_query.filter(Allele.id == AllelesSample.allele_id)
            app_query = app_query.filter(Gene.id == Allele.gene_id)
            app_query = app_query.filter(Patient.id == AllelesSample.patient_id)
            app_query = app_query.filter(Sample.sample_name.in_(names))
            app_query = app_query.filter(Gene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            found.extend(app_query.all())

        for _, patient_name, gene, allele_name, _, locus_order, alpha_order in found:
            record(patient_name, gene, allele_name, locus_order, alpha_order)

    # ---- genomic samples ----
    gen_samples_by_dataset = {}
    for entry in genomic_samples:
        gen_samples_by_dataset.setdefault(entry['dataset'], []).append(entry['identifier'])

    for dataset, identifiers in gen_samples_by_dataset.items():
        session = genomic_dbs[species][dataset].session
        found = []

        for sample_chunk in chunk_list(identifiers, SAMPLE_CHUNKS):
            candidates = session.query(GenomicSubject.identifier) \
                .filter(GenomicSubject.identifier.in_(sample_chunk)) \
                .all()
            candidates, wanted_genes = apply_rep_filter_params(params, candidates, session)
            names = [c[0] for c in candidates]

            app_query = session.query(GenomicSubject.identifier, GenomicGene.name, GenomicSequence.name,
                                      GenomicGene.locus_order, GenomicGene.alpha_order)
            app_query = app_query.filter(GenomicSubject.id == GenomicSubjectSequence.subject_id)
            app_query = app_query.filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id)
            app_query = app_query.filter(GenomicGene.id == GenomicSequence.gene_id)
            app_query = app_query.filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION']))
            app_query = app_query.filter(GenomicSubject.identifier.in_(names))
            app_query = app_query.filter(GenomicGene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            found.extend(app_query.all())

        for subject, gene, allele_name, locus_order, alpha_order in found:
            record(subject, gene, allele_name, locus_order, alpha_order)

    # ---- convert to rows: [gene, [allele names], [allele appearances], gene appearances, sort key] ----
    single_alleles = []
    multi_alleles = []

    for gene, (alleles, total, order) in tallies.items():
        allele_names = sorted(alleles)
        row = [gene, allele_names, [len(alleles[a]) for a in allele_names], len(total), order]
        target = multi_alleles if len(alleles) > 1 else single_alleles
        target.append(row)

    multi_alleles = [row[:4] for row in sorted(multi_alleles, key=lambda r: r[4])]
    single_alleles = sorted(single_alleles, key=lambda r: r[4])

    pooled = ['Single allele genes', [], []]
    for gene, allele_names, appearances, _, _ in single_alleles:
        pooled[1].append(gene + '\n' + allele_names[0])
        pooled[2].append(appearances[0])
    multi_alleles.append(pooled)

    input_path = make_output_file('xls')
    output_path = make_output_file('pdf')

    book = xlwt.Workbook()
    for row in multi_alleles:
        # the pooled row is skipped when there are no single-allele genes
        if len(row[1]) > 0:
            write_gene(book, row)
    book.save(input_path)

    if format == 'xls':
        return send_report(input_path, format, '%s_allele_appearance.xls' % species)

    cmd_line = ["-i", input_path, "-o", output_path]
    script_ok = run_rscript(APPEARANCE_SCRIPT, cmd_line)

    if script_ok and os.path.isfile(output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, '%s_allele_appearance.pdf' % species)

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the genotype heatmap for the selected repertoire samples.

    Reads each selected sample's genotype file, translates pipeline allele
    and gene names with the dataset's primer translations, keeps the wanted
    genes, concatenates everything into one tab-separated file and passes it
    to ``run_rscript`` with ``HEATMAP_GENOTYPE_SCRIPT``.

    :param format: requested output format; must be 'pdf' or 'html'
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples, grouped by ``collate_samples``
    :param params: report parameters; reads 'f_kdiff' (blank or missing is
        treated as 0) and, optionally, 'sort_order'
    :return: whatever ``send_report`` returns for the generated file
    :raises BadRequest: if no samples are selected, the format is not
        supported, no genotype records survive filtering, or the script
        leaves no (non-empty) output file
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    html = (format == 'html')
    chain, samples_by_dataset = collate_samples(rep_samples)
    genotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        sample_list = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list.extend(
                session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)
                .filter(Sample.sample_name.in_(sample_chunk))
                .all()
            )

        sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)

        if len(wanted_genes) == 0:
            continue

        for name, genotype_file, _ in sample_list:
            # Samples with no recorded genotype file are skipped, like
            # samples whose file is missing on disk.
            if not genotype_file:
                continue

            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset,
                                       genotype_file.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                continue

            genotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            genotype = trans_df(genotype)

            # translate pipeline allele names to VDJbase allele names
            for col in ['alleles', 'GENOTYPED_ALLELES']:
                genotype[col] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(genotype['gene'], genotype[col])
                ]

            genotype['gene'] = [translate_primer_genes(x, gene_subs) for x in genotype['gene']]
            genotype = genotype[genotype.gene.isin(wanted_genes)]

            # prefix with the dataset name when samples come from several datasets
            subject_name = name if len(samples_by_dataset) == 1 else dataset + '_' + name

            if 'subject' not in genotype.columns.values:
                genotype.insert(0, 'subject', subject_name)
            else:
                genotype.subject = subject_name

            # DataFrame.append() was removed in pandas 2.0; pd.concat() with
            # default arguments is what it did internally.
            genotypes = pd.concat([genotypes, genotype])[genotype.columns.tolist()]

    if len(genotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    geno_path = make_output_file('csv')
    genotypes.to_csv(geno_path, sep='\t')

    if format == 'pdf':
        attachment_filename = '%s_genotype.pdf' % species
    else:
        attachment_filename = None

    # A blank or absent threshold is passed to the script as 0, as the other
    # reports do, rather than as an empty argument.
    kdiff = params.get('f_kdiff') or 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", geno_path,
        "-o", output_path,
        "-t", file_type,
        "-k", str(kdiff),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_GENOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the haplotype heatmap (pdf) for the selected repertoire samples.

    Collects the haplotype file recorded for each selected sample and the
    requested anchor gene, translates pipeline allele and gene names with the
    dataset's primer translations, keeps the wanted genes, concatenates the
    result into one tab-separated file and passes it to ``run_rscript`` with
    ``HEATMAP_HAPLOTYPE_SCRIPT``.

    :param format: requested output format; only 'pdf' is accepted
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples, grouped by ``collate_samples``
    :param params: report parameters; reads 'haplo_gene', 'f_kdiff' (blank or
        missing is treated as 0) and, optionally, 'sort_order'. Not modified.
    :return: whatever ``send_report`` returns for the generated pdf
    :raises BadRequest: if no samples are selected, the format is not 'pdf',
        a recorded haplotype file is missing, no records survive filtering,
        or the script leaves no (non-empty) output file
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'pdf':
        raise BadRequest('Invalid format requested')

    chain, samples_by_dataset = collate_samples(rep_samples)
    haplotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        haplos = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            haplo_query = (
                session.query(Sample.sample_name, HaplotypesFile.file)
                .filter(Sample.sample_name.in_(sample_list))
                .join(SamplesHaplotype, Sample.id == SamplesHaplotype.samples_id)
                .filter(SamplesHaplotype.haplotypes_file_id == HaplotypesFile.id)
                .filter(HaplotypesFile.by_gene == params['haplo_gene'])
            )
            haplos.extend(haplo_query.all())

        for name, filename in haplos:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset, filename.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                raise BadRequest('Haplotype file %s is missing.' % (sample_path))

            haplotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            haplotype = trans_df(haplotype)
            # prefix with the dataset name when samples come from several datasets
            haplotype['subject'] = name if len(samples_by_dataset) == 1 else dataset + '_' + name

            # translate pipeline allele names to VDJbase allele names
            # (applied to the columns at positions 2, 3 and 4)
            col_names = list(haplotype.columns.values)
            for i in (2, 3, 4):
                haplotype[col_names[i]] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(haplotype['gene'], haplotype[col_names[i]])
                ]

            haplotype['gene'] = [translate_primer_genes(x, gene_subs) for x in haplotype['gene']]
            haplotype = haplotype[haplotype.gene.isin(wanted_genes)]

            haplotypes = pd.concat([haplotypes, haplotype], keys=None,
                                   ignore_index=True)[haplotype.columns.tolist()]

    if len(haplotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    haplo_path = make_output_file('tsv')
    haplotypes.to_csv(haplo_path, sep='\t', index=False)
    attachment_filename = '%s_haplotype_heatmap.pdf' % species

    # Default a blank/absent threshold to 0 in a local, leaving the caller's
    # params dict untouched.
    kdiff = params.get('f_kdiff') or 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    # format is always 'pdf' here (checked above), so the output is always a pdf
    output_path = make_output_file('pdf')
    cmd_line = [
        "-i", haplo_path,
        "-o", output_path,
        "-k", str(kdiff),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_HAPLOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the allele usage plot: number of distinct alleles seen per gene.

    For every wanted gene, collects the ids of the alleles found in the
    selected samples (subject to the kdiff, novel and ambiguous allele
    filters), writes ``(GENE, COUNT)`` rows to a tab-separated file and passes
    it to ``run_rscript`` with ``ALLELE_USAGE_SCRIPT``.

    :param format: requested output format; only 'html' is accepted
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples, grouped by ``collate_samples``
    :param params: report parameters; reads 'f_kdiff' (blank or missing is
        treated as 0), 'ambiguous_alleles', 'novel_alleles' and, optionally,
        'sort_order'
    :return: whatever ``send_report`` returns for the generated html file
    :raises BadRequest: if no samples are selected, the format is not 'html',
        ambiguous alleles are requested across several datasets, or the
        script leaves no (non-empty) output file
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    kdiff = float(params.get('f_kdiff') or 0)
    chain, samples_by_dataset = collate_samples(rep_samples)

    if len(samples_by_dataset) > 1 and params['ambiguous_alleles'] != 'Exclude':
        raise BadRequest('Ambiguous alleles cannot be processed across multiple datasets')

    include_ambiguous = params['ambiguous_alleles'] != 'Exclude'

    # Format we need to produce is [(gene_name, allele count), ...]
    # gene name -> set of allele ids, in order of first appearance
    gene_allele_counts = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        # Gather the alleles of each gene over ALL sample chunks before
        # de-duplicating, so the result does not depend on how the samples
        # happen to be split into chunks.
        dataset_alleles = {}

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            query = (
                session.query(Gene.name, Allele.id, Gene.type)
                .join(Allele)
                .join(AllelesSample)
                .join(Sample)
                .join(Patient, Patient.id == Sample.patient_id)
                .filter(Gene.name.in_(wanted_genes))
                .filter(Allele.name.notlike('%Del%'))
                .filter(Allele.name.notlike('%OR%'))
                .filter(Sample.sample_name.in_(sample_list))
                .filter(AllelesSample.kdiff >= kdiff)
            )

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                query = query.order_by(Gene.locus_order, Patient.id, Allele.id)
            else:
                query = query.order_by(Gene.alpha_order, Patient.id, Allele.id)

            if params['novel_alleles'] == 'Exclude':
                query = query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == 1)

            for gene_name, allele_id, _ in query.all():
                dataset_alleles.setdefault(gene_name, set()).add(allele_id)

        for gene_name, gene_allele_ids in dataset_alleles.items():
            # If the set holds both members of an AllelesPattern pairing
            # (pattern_id and allele_in_p_id), drop the pattern_id entry so
            # that the pair is only counted once.
            if include_ambiguous:
                id_list = list(gene_allele_ids)
                patterns = session.query(AllelesPattern.pattern_id) \
                    .filter(AllelesPattern.allele_in_p_id.in_(id_list)) \
                    .filter(AllelesPattern.pattern_id.in_(id_list)) \
                    .all()
                gene_allele_ids = gene_allele_ids - {pattern[0] for pattern in patterns}

            gene_allele_counts.setdefault(gene_name, set()).update(gene_allele_ids)

    listed_allele_count = [(gene, len(alleles)) for gene, alleles in gene_allele_counts.items()]

    labels = ['GENE', 'COUNT']
    input_path = make_output_file('tab')
    df = pd.DataFrame(listed_allele_count, columns=labels)
    df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = ["-i", input_path, "-o", output_path, "-c", chain]

    if run_rscript(ALLELE_USAGE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format)

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce one of the repertoire download files, chosen by ``params['type']``.

    * 'Sample info'  - csv of the ``sample_info_filters`` fields for each selected sample
    * 'Sample files' - zip of each sample's directory plus its igsnper plot, if any
    * 'Ungapped' / 'Gapped' - fasta of the sequences returned by ``find_sequences``
    * 'Gene info'    - csv of the ``sequence_filters`` fields for those sequences

    :param format: not inspected by this report
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples (dicts with 'dataset' and 'sample_name')
    :param params: report parameters; 'type' selects the download
    :return: whatever ``send_report`` returns for the generated file
    :raises BadRequest: if no samples are selected or 'type' matches none of the above
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    def group_by_dataset():
        """Return {dataset: [sample names]} for the selected samples."""
        grouped = {}
        for entry in rep_samples:
            if entry['dataset'] not in grouped:
                grouped[entry['dataset']] = []
            grouped[entry['dataset']].append(entry['sample_name'])
        return grouped

    if 'Sample info' in params['type']:
        samples_by_dataset = group_by_dataset()

        # only filters backed by a model can be queried
        selected = [(label, spec) for label, spec in sample_info_filters.items() if spec['model'] is not None]
        attribute_query = [spec['field'] for _, spec in selected]
        headers = [label for label, _ in selected]

        rows = []
        for dataset, names in samples_by_dataset.items():
            session = vdjbase_dbs[species][dataset].session

            for sample_chunk in chunk_list(names, SAMPLE_CHUNKS):
                found = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                    .filter(Sample.sample_name.in_(sample_chunk)) \
                    .all()
                found_names = [rec[0] for rec in found]

                info_query = session.query(*attribute_query)
                info_query = info_query.join(GenoDetection, GenoDetection.id == Sample.geno_detection_id)
                info_query = info_query.join(Patient, Patient.id == Sample.patient_id)
                info_query = info_query.join(SeqProtocol)
                info_query = info_query.join(TissuePro)
                info_query = info_query.join(Study, Sample.study_id == Study.id)
                info_query = info_query.filter(Sample.sample_name.in_(found_names))
                rows.extend(info_query.all())

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.writer(fo, dialect='excel')
            writer.writerow(headers)
            writer.writerows(rows)

        return send_report(outfile, 'csv', attachment_filename='sample_info.csv')

    if 'Sample files' in params['type']:
        outfile = make_output_file('zip')
        with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED) as fo:
            samples_by_dataset = group_by_dataset()
            species_root = os.path.join(VDJBASE_SAMPLE_PATH, species)

            # several samples can share a directory or plot: add each only once
            seen_files = set()
            seen_dirs = set()

            for dataset, names in samples_by_dataset.items():
                print('adding dataset')
                session = vdjbase_dbs[species][dataset].session

                for sample_chunk in chunk_list(names, SAMPLE_CHUNKS):
                    paths = session.query(Sample.genotype, Sample.igsnper_plot_path) \
                        .filter(Sample.sample_name.in_(sample_chunk)) \
                        .all()

                    for genotype_path, plot_path in paths:
                        if genotype_path is not None and len(genotype_path) > 0:
                            relative_dir = os.path.dirname(genotype_path.replace('samples/', ''))
                            sample_dir = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset, relative_dir)
                            if sample_dir not in seen_dirs:
                                zipdir(sample_dir, fo, species_root)  # sample files
                                seen_dirs.add(sample_dir)

                        if plot_path is not None and len(plot_path) > 0:
                            igsnper_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset, plot_path)
                            if igsnper_path not in seen_files:
                                fo.write(igsnper_path, arcname=igsnper_path.replace(species_root, ''))
                                seen_files.add(igsnper_path)

        return send_report(outfile, 'zip', attachment_filename='sample_data.zip')

    if 'Ungapped' in params['type'] or 'Gapped' in params['type']:
        required_cols = ['name', 'seq', 'dataset']
        seqs = find_sequences(params, rep_samples, species, required_cols)
        keep_gaps = 'Gapped' in params['type']

        recs = []
        for seq in seqs:
            record_id = '%s|%s|%s' % (seq['name'], species, seq['dataset'])
            residues = seq['seq'] if keep_gaps else seq['seq'].replace('.', '')
            recs.append(SeqRecord(Seq(residues), id=record_id, description=''))

        outfile = make_output_file('fasta')
        SeqIO.write(recs, outfile, "fasta")
        return send_report(outfile, 'fasta', attachment_filename='%s_sequences.fasta' % species)

    if 'Gene info' in params['type']:
        headers = [label for label, spec in sequence_filters.items() if spec['model'] is not None]
        headers.append('dataset')
        rows = find_sequences(params, rep_samples, species, headers)

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.DictWriter(fo, dialect='excel', fieldnames=headers)
            writer.writeheader()
            for row in rows:
                writer.writerow(row)

        return send_report(outfile, 'csv', attachment_filename='sequence_info.csv')

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the heterozygosity plot: per gene, how many patients carry one allele vs several.

    For every wanted gene and patient, collects the ids of the alleles found
    in the patient's selected samples. A patient with more than one allele of
    a gene is added to that gene's 'HT' count, a patient with exactly one to
    its 'HM' count. The ``(GENE, HM, HT)`` rows are written to a tab-separated
    file and passed to ``run_rscript`` with ``HETEROZYGOSITY_SCRIPT``.

    :param format: requested output format; only 'html' is accepted
    :param species: species key into ``vdjbase_dbs``
    :param genomic_datasets: not used by this report
    :param genomic_samples: not used by this report
    :param rep_datasets: not used by this report
    :param rep_samples: selected repertoire samples, grouped by ``collate_samples``
    :param params: report parameters; reads 'f_kdiff' (blank or missing is
        treated as 0), 'ambiguous_alleles' and, optionally, 'sort_order'
    :return: whatever ``send_report`` returns for the generated html file
    :raises BadRequest: if no samples are selected, the format is not 'html',
        or the script leaves no (non-empty) output file
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    kdiff = float(params.get('f_kdiff') or 0)
    chain, samples_by_dataset = collate_samples(rep_samples)
    include_ambiguous = params['ambiguous_alleles'] != 'Exclude'

    # Format we need to produce is [(gene_name, homozygous count, heterozygous count), ...]
    # gene name -> (gene name, HM, HT), in order of first appearance
    gene_hetrozygous_dis = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        # (gene name, patient id) -> set of allele ids. Collected over ALL
        # sample chunks first, so that a patient whose samples land in
        # different chunks is still counted once per gene.
        patient_alleles = {}

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            query = (
                session.query(Gene.name, Patient.id, Allele.id, Sample.sample_name, Gene.locus_order,
                              AllelesSample.kdiff, Allele.name)
                .join(Allele, Gene.id == Allele.gene_id)
                .join(AllelesSample, Allele.id == AllelesSample.allele_id)
                .join(Sample, Sample.id == AllelesSample.sample_id)
                .join(Patient, Patient.id == Sample.patient_id)
                .filter(Gene.name.in_(wanted_genes))
                .filter(Allele.name.notlike('%Del%'))
                .filter(Allele.name.notlike('%OR%'))
                .filter(Sample.sample_name.in_(sample_list))
                .filter(AllelesSample.kdiff >= kdiff)
            )

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                query = query.order_by(Gene.locus_order, Patient.id, Allele.id)
            else:
                query = query.order_by(Gene.alpha_order, Patient.id, Allele.id)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == True)

            for rec in query.all():
                patient_alleles.setdefault((rec[0], rec[1]), set()).add(rec[2])

        for (target_gene, _), patient_allele_ids in patient_alleles.items():
            # If the set holds both members of an AllelesPattern pairing
            # (pattern_id and allele_in_p_id), drop the pattern_id entry so
            # that the pair is only counted once.
            if include_ambiguous:
                id_list = list(patient_allele_ids)
                patterns = session.query(AllelesPattern.pattern_id) \
                    .filter(AllelesPattern.allele_in_p_id.in_(id_list)) \
                    .filter(AllelesPattern.pattern_id.in_(id_list)) \
                    .all()
                patient_allele_ids = patient_allele_ids - {pattern[0] for pattern in patterns}

            _, homozygous, heterozygous = gene_hetrozygous_dis.get(target_gene, (target_gene, 0, 0))

            if len(patient_allele_ids) > 1:
                heterozygous += 1
            elif len(patient_allele_ids) > 0:
                homozygous += 1

            gene_hetrozygous_dis[target_gene] = (target_gene, homozygous, heterozygous)

    haplo_path = make_output_file('tab')
    labels = ['GENE', 'HM', 'HT']
    df = pd.DataFrame(list(gene_hetrozygous_dis.values()), columns=labels)
    df.to_csv(haplo_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = [
        "-i", haplo_path,
        "-o", output_path,
        "-c", chain,
    ]

    if run_rscript(HETEROZYGOSITY_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format)

    raise BadRequest('No output from report')