def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the gene-frequency report for the selected repertoire samples.

    Per-gene frequencies are read from each dataset's database, written to a
    tab-separated file with one row per gene (``GENE`` plus a comma-joined
    ``FREQ`` list) and handed to the R script ``GENE_FREQUENCY_PLOT``.

    Args:
        format: Requested output format; must be ``'pdf'`` or ``'html'``.
        species: Key into ``vdjbase_dbs`` and prefix of the attachment name.
        genomic_datasets: Not referenced by this report.
        genomic_samples: Not referenced by this report.
        rep_datasets: Not referenced by this report.
        rep_samples: Selected repertoire samples; must not be empty.
        params: Report parameters. ``single_sample`` and ``calculate_by`` are
            read directly, ``sort_order`` is optional, and the whole mapping is
            also passed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: If no samples were selected, the format is unsupported, or
            the R script left no non-empty output file.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ('pdf', 'html'):
        raise BadRequest('Invalid format requested')

    single_sample_filter = 1 if params['single_sample'] == 'One Selected Sample' else 0
    calc_by_clone = 1 if params['calculate_by'] == 'Number of Clones' else 0
    chain, samples_by_dataset = collate_samples(rep_samples)

    # gene name -> list of rounded frequencies, one entry per matching record
    genes_frequencies = defaultdict(list)

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        # Samples are processed in chunks of SAMPLE_CHUNKS names per IN (...) query.
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            # NOTE(review): sample_group >= 1 presumably restricts to one
            # sample per subject - confirm against the schema.
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .filter(Sample.sample_group >= single_sample_filter)\
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            frequencies = session.query(GenesDistribution.sample_id, Gene.name, GenesDistribution.frequency)\
                .join(Gene)\
                .join(Sample)\
                .filter(GenesDistribution.count_by_clones == calc_by_clone)\
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .all()

            for _sample_id, gene_name, frequency in frequencies:
                genes_frequencies[gene_name].append(round(float(frequency), 2))

    # Build the frame in one step: DataFrame.append copied the whole frame on
    # every row and no longer exists in pandas 2.x.
    labels = ['GENE', 'FREQ']
    rows = [
        {'GENE': gene, 'FREQ': ",".join(str(x) for x in usages)}
        for gene, usages in genes_frequencies.items()
    ]
    genes_frequencies_df = pd.DataFrame(rows, columns=labels)

    input_path = make_output_file('tab')
    genes_frequencies_df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file(format)
    # NOTE(review): the '.pdf' attachment name is also passed for html output -
    # confirm that send_report ignores it in that case.
    attachment_filename = '%s_gene_frequency.pdf' % species

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    cmd_line = [
        "-i", input_path,
        "-o", output_path,
        "-t", 'T' if format == 'html' else 'F',
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(GENE_FREQUENCY_PLOT, cmd_line) and os.path.isfile(output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the genotype heatmap report for the selected repertoire samples.

    Each selected sample's genotype file is read from disk, its allele and
    gene names are translated, the rows are restricted to the wanted genes and
    tagged with a subject name, and the combined table is handed to the R
    script ``HEATMAP_GENOTYPE_SCRIPT``.

    Args:
        format: Requested output format; must be ``'pdf'`` or ``'html'``.
        species: Key into ``vdjbase_dbs``, path component of the sample files
            and prefix of the pdf attachment name.
        genomic_datasets: Not referenced by this report.
        genomic_samples: Not referenced by this report.
        rep_datasets: Not referenced by this report.
        rep_samples: Selected repertoire samples; must not be empty.
        params: Report parameters. ``f_kdiff`` is required, ``sort_order`` is
            optional, and the whole mapping is also passed to
            ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: If no samples were selected, the format is unsupported, no
            genotype rows survive the filters, or the R script left no
            non-empty output file.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    html = (format == 'html')
    chain, samples_by_dataset = collate_samples(rep_samples)
    genotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        # Fetch the sample records in chunks of SAMPLE_CHUNKS names per query.
        sample_list = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list.extend(
                session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)
                .filter(Sample.sample_name.in_(sample_chunk))
                .all()
            )

        sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)

        if len(wanted_genes) == 0:
            continue

        for name, genotype_file, _patient_id in sample_list:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset, genotype_file.replace('samples/', ''))

            # Samples whose genotype file is not on disk are silently skipped.
            if not os.path.isfile(sample_path):
                continue

            genotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            genotype = trans_df(genotype)

            # translate pipeline allele names to VDJbase allele names
            for col in ['alleles', 'GENOTYPED_ALLELES']:
                genotype[col] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(genotype['gene'], genotype[col])
                ]

            genotype['gene'] = [translate_primer_genes(x, gene_subs) for x in genotype['gene']]
            genotype = genotype[genotype.gene.isin(wanted_genes)]

            # Prefix with the dataset name only when several datasets are mixed.
            subject_name = name if len(samples_by_dataset) == 1 else dataset + '_' + name

            if 'subject' not in genotype.columns.values:
                genotype.insert(0, 'subject', subject_name)
            else:
                genotype.subject = subject_name

            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0. The accumulated table is re-projected onto the
            # current file's columns after every step, as before.
            genotypes = pd.concat([genotypes, genotype])[genotype.columns.tolist()]

    if len(genotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    geno_path = make_output_file('csv')
    genotypes.to_csv(geno_path, sep='\t')

    if format == 'pdf':
        attachment_filename = '%s_genotype.pdf' % species
    else:
        attachment_filename = None

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", geno_path,
        "-o", output_path,
        "-t", file_type,
        "-k", str(params['f_kdiff']),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_GENOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the allele-usage report: distinct allele count per gene.

    Allele records for the selected samples are collected per dataset, the
    allele ids are gathered gene by gene, and the resulting ``GENE``/``COUNT``
    table is handed to the R script ``ALLELE_USAGE_SCRIPT``.

    Args:
        format: Requested output format; must be ``'html'``.
        species: Key into ``vdjbase_dbs``.
        genomic_datasets: Not referenced by this report.
        genomic_samples: Not referenced by this report.
        rep_datasets: Not referenced by this report.
        rep_samples: Selected repertoire samples; must not be empty.
        params: Report parameters. ``novel_alleles`` and ``ambiguous_alleles``
            are read directly, ``f_kdiff`` and ``sort_order`` are optional, and
            the whole mapping is also passed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: If no samples were selected, the format is unsupported,
            ambiguous alleles are requested across more than one dataset, or
            the R script left no non-empty output file.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    # A missing or blank f_kdiff means no threshold.
    kdiff = 0
    if 'f_kdiff' in params and params['f_kdiff'] != '':
        kdiff = float(params['f_kdiff'])

    chain, samples_by_dataset = collate_samples(rep_samples)

    if len(samples_by_dataset) > 1 and params['ambiguous_alleles'] != 'Exclude':
        raise BadRequest('Ambiguous alleles cannot be processed across multiple datasets')

    # gene name -> set of allele ids seen for that gene
    gene_allele_counts = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_rows = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .all()
            sample_rows, wanted_genes = apply_rep_filter_params(params, sample_rows, session)
            sample_names = [row[0] for row in sample_rows]

            query = session.query(Gene.name, Allele.id, Gene.type) \
                .join(Allele) \
                .join(AllelesSample) \
                .join(Sample) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_names)) \
                .filter(AllelesSample.kdiff >= kdiff)

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                gene_order = Gene.locus_order
            else:
                gene_order = Gene.alpha_order
            query = query.order_by(gene_order, Patient.id, Allele.id)

            if params['novel_alleles'] == 'Exclude':
                query = query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == 1)

            allele_recs.extend(query.all())

        # Split the records into runs of consecutive rows sharing a gene name.
        # Each chunk's result is ordered by gene, so a gene can start a new run
        # in a later chunk; the runs are merged per gene below.
        gene_runs = []
        for gene_name, allele_id, _gene_type in allele_recs:
            if gene_runs and gene_runs[-1][0] == gene_name:
                gene_runs[-1][1].add(allele_id)
            else:
                gene_runs.append((gene_name, {allele_id}))

        for gene_name, allele_ids in gene_runs:
            # When both members of an AllelesPattern row (pattern_id and
            # allele_in_p_id) are present for the gene, drop the pattern_id
            # so the pair is only counted once.
            if params['ambiguous_alleles'] != 'Exclude':
                pattern_rows = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(allele_ids))\
                    .filter(AllelesPattern.pattern_id.in_(allele_ids))\
                    .all()

                if pattern_rows:
                    allele_ids = allele_ids - {row[0] for row in pattern_rows}

            gene_allele_counts.setdefault(gene_name, set()).update(allele_ids)

    listed_allele_count = [(gene, len(alleles)) for gene, alleles in gene_allele_counts.items()]

    input_path = make_output_file('tab')
    df = pd.DataFrame(listed_allele_count, columns=['GENE', 'COUNT'])
    df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = ["-i", input_path, "-o", output_path, "-c", chain]

    report_ok = (
        run_rscript(ALLELE_USAGE_SCRIPT, cmd_line)
        and os.path.isfile(output_path)
        and os.path.getsize(output_path) != 0
    )
    if not report_ok:
        raise BadRequest('No output from report')

    return send_report(output_path, format)
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the haplotype heatmap report (pdf only) for the selected samples.

    The haplotype files recorded for the selected samples and the requested
    anchor gene (``params['haplo_gene']``) are read from disk, their allele
    and gene names are translated, the rows are restricted to the wanted
    genes, and the combined table is handed to the R script
    ``HEATMAP_HAPLOTYPE_SCRIPT``.

    Args:
        format: Requested output format; must be ``'pdf'``.
        species: Key into ``vdjbase_dbs``, path component of the sample files
            and prefix of the attachment name.
        genomic_datasets: Not referenced by this report.
        genomic_samples: Not referenced by this report.
        rep_datasets: Not referenced by this report.
        rep_samples: Selected repertoire samples; must not be empty.
        params: Report parameters. ``haplo_gene`` is required; ``f_kdiff`` and
            ``sort_order`` are optional. The mapping is also passed to
            ``apply_rep_filter_params`` and is not modified here.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: If no samples were selected, the format is unsupported, a
            recorded haplotype file is missing on disk, no haplotype rows
            survive the filters, or the R script left no non-empty output file.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'pdf':
        raise BadRequest('Invalid format requested')

    chain, samples_by_dataset = collate_samples(rep_samples)
    haplotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)
        haplos = []

        # Look up the haplotype files in chunks of SAMPLE_CHUNKS sample names.
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            haplo_query = session.query(Sample.sample_name, HaplotypesFile.file)\
                .filter(Sample.sample_name.in_(sample_list))\
                .join(SamplesHaplotype, Sample.id == SamplesHaplotype.samples_id)\
                .filter(SamplesHaplotype.haplotypes_file_id == HaplotypesFile.id)\
                .filter(HaplotypesFile.by_gene == params['haplo_gene'])
            haplos.extend(haplo_query.all())

        for name, filename in haplos:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset, filename.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                raise BadRequest('Haplotype file %s is missing.' % (sample_path))

            haplotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            haplotype = trans_df(haplotype)
            # Prefix with the dataset name only when several datasets are mixed.
            haplotype['subject'] = name if len(samples_by_dataset) == 1 else dataset + '_' + name

            # translate pipeline allele names to VDJbase allele names
            # NOTE(review): positions 2-4 are presumably the allele columns of
            # the haplotype file - confirm against the file layout.
            col_names = list(haplotype.columns.values)
            for i in (2, 3, 4):
                haplotype[col_names[i]] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(haplotype['gene'], haplotype[col_names[i]])
                ]

            haplotype['gene'] = [translate_primer_genes(x, gene_subs) for x in haplotype['gene']]
            haplotype = haplotype[haplotype.gene.isin(wanted_genes)]

            # The accumulated table is re-projected onto the current file's
            # columns after every step.
            haplotypes = pd.concat([haplotypes, haplotype], keys=None, ignore_index=True)[haplotype.columns.tolist()]

    if len(haplotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    haplo_path = make_output_file('tsv')
    haplotypes.to_csv(haplo_path, sep='\t', index=False)
    attachment_filename = '%s_haplotype_heatmap.pdf' % species

    # A missing, blank or otherwise falsy f_kdiff defaults to 0. The value is
    # held in a local so the caller's params mapping is left untouched.
    kdiff = params['f_kdiff'] if 'f_kdiff' in params and params['f_kdiff'] else 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(), locus_order=locus_order)

    # Only pdf reaches this point (see the format check above), so the output
    # file is always a pdf.
    output_path = make_output_file('pdf')
    cmd_line = [
        "-i", haplo_path,
        "-o", output_path,
        "-k", str(kdiff),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_HAPLOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the heterozygosity report: per-gene single/multi-allele counts.

    For every gene, each patient's set of allele ids is collected; a patient
    with more than one allele id increments the gene's ``HT`` count, and a
    patient with exactly one increments ``HM``. The ``GENE``/``HM``/``HT``
    table is handed to the R script ``HETEROZYGOSITY_SCRIPT``.

    Args:
        format: Requested output format; must be ``'html'``.
        species: Key into ``vdjbase_dbs``.
        genomic_datasets: Not referenced by this report.
        genomic_samples: Not referenced by this report.
        rep_datasets: Not referenced by this report.
        rep_samples: Selected repertoire samples; must not be empty.
        params: Report parameters. ``ambiguous_alleles`` is read directly,
            ``f_kdiff`` and ``sort_order`` are optional, and the whole mapping
            is also passed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: If no samples were selected, the format is unsupported, or
            the R script left no non-empty output file.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    # A missing or blank f_kdiff means no threshold.
    kdiff = 0
    if 'f_kdiff' in params and params['f_kdiff'] != '':
        kdiff = float(params['f_kdiff'])

    chain, samples_by_dataset = collate_samples(rep_samples)

    # gene name -> (gene name, HM count, HT count)
    gene_hetrozygous_dis = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_sample_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_rows = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .all()
            sample_rows, wanted_genes = apply_rep_filter_params(params, sample_rows, session)
            sample_names = [row[0] for row in sample_rows]

            query = session.query(Gene.name, Patient.id, Allele.id, Sample.sample_name, Gene.locus_order, AllelesSample.kdiff, Allele.name) \
                .join(Allele, Gene.id == Allele.gene_id) \
                .join(AllelesSample, Allele.id == AllelesSample.allele_id) \
                .join(Sample, Sample.id == AllelesSample.sample_id) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_names)) \
                .filter(AllelesSample.kdiff >= kdiff)

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                gene_order = Gene.locus_order
            else:
                gene_order = Gene.alpha_order
            query = query.order_by(gene_order, Patient.id, Allele.id)

            if params['ambiguous_alleles'] == 'Exclude':
                # '== True' builds the SQL comparison; it is not a Python truth test.
                query = query.filter(Allele.is_single_allele == True)  # noqa: E712

            allele_sample_recs.extend(query.all())

        # The rows are ordered by gene, then patient, so consecutive rows that
        # share both belong to one patient's alleles for that gene. Split the
        # rows into such runs.
        patient_runs = []
        for rec in allele_sample_recs:
            gene_name, patient_id, allele_id = rec[0], rec[1], rec[2]
            if patient_runs and patient_runs[-1][0] == gene_name and patient_runs[-1][1] == patient_id:
                patient_runs[-1][2].add(allele_id)
            else:
                patient_runs.append((gene_name, patient_id, {allele_id}))

        for gene_name, _patient_id, allele_ids in patient_runs:
            # When both members of an AllelesPattern row (pattern_id and
            # allele_in_p_id) are present for the patient, drop the pattern_id
            # so the pair is only counted once.
            if params['ambiguous_alleles'] != 'Exclude':
                pattern_rows = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(allele_ids))\
                    .filter(AllelesPattern.pattern_id.in_(allele_ids))\
                    .all()

                if pattern_rows:
                    allele_ids = allele_ids - {row[0] for row in pattern_rows}

            # Every gene seen gets an entry, even if neither count increases.
            _, single_count, multi_count = gene_hetrozygous_dis.get(gene_name, (gene_name, 0, 0))
            if len(allele_ids) > 1:
                multi_count += 1
            elif len(allele_ids) > 0:
                single_count += 1
            gene_hetrozygous_dis[gene_name] = (gene_name, single_count, multi_count)

    haplo_path = make_output_file('tab')
    df = pd.DataFrame(gene_hetrozygous_dis.values(), columns=['GENE', 'HM', 'HT'])
    df.to_csv(haplo_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = [
        "-i", haplo_path,
        "-o", output_path,
        "-c", chain,
    ]

    report_ok = (
        run_rscript(HETEROZYGOSITY_SCRIPT, cmd_line)
        and os.path.isfile(output_path)
        and os.path.getsize(output_path) != 0
    )
    if not report_ok:
        raise BadRequest('No output from report')

    return send_report(output_path, format)