Example #1
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    single_sample_filter = (
        1 if params['single_sample'] == 'One Selected Sample' else 0)
    calc_by_clone = 1 if params['calculate_by'] == 'Number of Clones' else 0
    chain, samples_by_dataset = collate_samples(rep_samples)

    # Collect, for each gene, the list of per-sample frequencies: {gene_name: [freq, ...]}

    genes_frequencies = defaultdict(list)

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .filter(Sample.sample_group >= single_sample_filter)\
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            frequencies = session.query(GenesDistribution.sample_id, Gene.name, GenesDistribution.frequency)\
                .join(Gene)\
                .join(Sample)\
                .filter(GenesDistribution.count_by_clones == calc_by_clone)\
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .all()

            for frequency in frequencies:
                genes_frequencies[frequency[1]].append(
                    round(float(frequency[2]), 2))

    labels = ['GENE', 'FREQ']
    # One row per gene: FREQ is the comma-joined list of per-sample frequencies
    genes_frequencies_df = pd.DataFrame(
        [{
            'GENE': gene,
            'FREQ': ",".join(str(x) for x in usages)
        } for gene, usages in genes_frequencies.items()],
        columns=labels)

    input_path = make_output_file('tab')
    genes_frequencies_df.to_csv(input_path, sep='\t', index=False)

    output_path = make_output_file(format)
    attachment_filename = '%s_gene_frequency.pdf' % species

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)

    cmd_line = [
        "-i", input_path, "-o", output_path, "-t",
        'T' if format == 'html' else 'F', "-c", chain, "-g", gene_order_file
    ]

    if run_rscript(GENE_FREQUENCY_PLOT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
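
All five examples walk the selected samples in fixed-size chunks through chunk_list and SAMPLE_CHUNKS so that the IN clauses passed to the database stay a manageable size. The helper itself is not shown on this page; a minimal sketch of what such a utility might look like (an assumption, the project's own chunk_list may differ):

def chunk_list(items, chunk_size):
    # Yield successive slices of at most chunk_size elements.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]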
Example #2
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    html = (format == 'html')
    chain, samples_by_dataset = collate_samples(rep_samples)
    genotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        sample_list = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list.extend(
                session.query(Sample.sample_name, Sample.genotype,
                              Sample.patient_id).filter(
                                  Sample.sample_name.in_(sample_chunk)).all())

        sample_list, wanted_genes = apply_rep_filter_params(
            params, sample_list, session)

        if len(wanted_genes) > 0:
            for (name, genotype, patient_id) in sample_list:
                sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species,
                                           dataset,
                                           genotype.replace('samples/', ''))

                if not os.path.isfile(sample_path):
                    continue

                genotype = pd.read_csv(sample_path, sep='\t', dtype=str)

                genotype = trans_df(genotype)

                # translate pipeline allele names to VDJbase allele names
                for col in ['alleles', 'GENOTYPED_ALLELES']:
                    genotype[col] = [
                        translate_primer_alleles(x, y, primer_trans)
                        for x, y in zip(genotype['gene'], genotype[col])
                    ]

                genotype['gene'] = [
                    translate_primer_genes(x, gene_subs)
                    for x in genotype['gene']
                ]
                genotype = genotype[genotype.gene.isin(wanted_genes)]

                subject_name = (name if len(samples_by_dataset) == 1
                                else dataset + '_' + name)

                if 'subject' not in genotype.columns.values:
                    genotype.insert(0, 'subject', subject_name)
                else:
                    genotype.subject = subject_name

                genotypes = pd.concat([genotypes, genotype])[
                    genotype.columns.tolist()]

    if len(genotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    geno_path = make_output_file('csv')
    genotypes.to_csv(geno_path, sep='\t')

    if format == 'pdf':
        attachment_filename = '%s_genotype.pdf' % species
    else:
        attachment_filename = None

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)

    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", geno_path, "-o", output_path, "-t", file_type, "-k",
        str(params['f_kdiff']), "-c", chain, "-g", gene_order_file
    ]

    if run_rscript(HEATMAP_GENOTYPE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
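
Each report hands its argument list to run_rscript and then checks that the output file exists and is non-empty before calling send_report. The helper is not defined in these examples; a plausible sketch, assuming it simply shells out to Rscript via subprocess (the project's real implementation may add logging and error handling):

import subprocess

def run_rscript(script_path, cmd_line):
    # Hypothetical sketch: run the report script with its arguments and
    # report success when Rscript exits with status 0.
    result = subprocess.run(['Rscript', script_path] + cmd_line)
    return result.returncode == 0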
Example #3
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    kdiff = (float(params['f_kdiff'])
             if 'f_kdiff' in params and params['f_kdiff'] != '' else 0)
    chain, samples_by_dataset = collate_samples(rep_samples)

    if (len(samples_by_dataset) > 1
            and params['ambiguous_alleles'] != 'Exclude'):
        raise BadRequest(
            'Ambiguous alleles cannot be processed across multiple datasets')

    # Format we need to produce is [(gene_name, number of distinct alleles), ...]

    gene_allele_counts = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            query = session.query(Gene.name, Allele.id, Gene.type) \
                .join(Allele) \
                .join(AllelesSample) \
                .join(Sample) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .filter(AllelesSample.kdiff >= kdiff)

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                query = query.order_by(Gene.locus_order, Patient.id, Allele.id)
            else:
                query = query.order_by(Gene.alpha_order, Patient.id, Allele.id)

            if params['novel_alleles'] == 'Exclude':
                query = query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == 1)

            allele_recs.extend(query.all())

        i = 0
        while i < len(allele_recs):
            (gene_name, allele_id, gene_type) = allele_recs[i]
            gene_allele_ids = []

            while i < len(allele_recs):
                if allele_recs[i][0] != gene_name:
                    break

                allele_id = allele_recs[i][1]
                gene_allele_ids.append(allele_id)
                i += 1

            gene_allele_ids = set(gene_allele_ids)

            # If we have both an unambiguous allele and an ambiguous allele containing that unambiguous one,
            # drop the unambiguous one because it is already counted

            if params['ambiguous_alleles'] != 'Exclude':
                patterns = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(gene_allele_ids))\
                    .filter(AllelesPattern.pattern_id.in_(gene_allele_ids))\
                    .all()

                if patterns is not None and len(patterns) > 0:
                    patterns = set([pattern[0] for pattern in patterns])
                    gene_allele_ids = gene_allele_ids - patterns

            if gene_name not in gene_allele_counts:
                gene_allele_counts[gene_name] = gene_allele_ids
            else:
                gene_allele_counts[gene_name] |= gene_allele_ids

    listed_allele_count = []
    for gene, alleles in gene_allele_counts.items():
        listed_allele_count.append((gene, len(alleles)))

    labels = ['GENE', 'COUNT']
    input_path = make_output_file('tab')
    df = pd.DataFrame(listed_allele_count, columns=labels)
    df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = ["-i", input_path, "-o", output_path, "-c", chain]

    if run_rscript(ALLELE_USAGE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format)
    else:
        raise BadRequest('No output from report')
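
make_output_file appears in every example as the source of input and output paths keyed by file extension. Its definition is not shown here; one way such a helper could look, assuming it creates a uniquely named file under an output directory and returns the path (the directory name and behaviour are assumptions, not the project's code):

import os
import tempfile

OUTPUT_PATH = '/tmp/reports'  # assumed location; the real project configures this elsewhere

def make_output_file(extension):
    # Create a unique, empty file with the requested extension and return its path.
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    handle, path = tempfile.mkstemp(suffix='.' + extension, dir=OUTPUT_PATH)
    os.close(handle)
    return path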
Example #4
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'pdf':
        raise BadRequest('Invalid format requested')

    html = (format == 'html')  # always False here: only 'pdf' passes the check above

    chain, samples_by_dataset = collate_samples(rep_samples)
    haplotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        haplos = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]
            haplo_query = session.query(Sample.sample_name, HaplotypesFile.file)\
                .filter(Sample.sample_name.in_(sample_list))\
                .join(SamplesHaplotype, Sample.id == SamplesHaplotype.samples_id)\
                .filter(SamplesHaplotype.haplotypes_file_id == HaplotypesFile.id)\
                .filter(HaplotypesFile.by_gene == params['haplo_gene'])
            haplos.extend(haplo_query.all())

        for name, filename in haplos:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset,
                                       filename.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                raise BadRequest('Haplotype file %s is missing.' %
                                 (sample_path))

            haplotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            haplotype = trans_df(haplotype)
            haplotype['subject'] = (name if len(samples_by_dataset) == 1
                                    else dataset + '_' + name)

            # translate pipeline allele names to VDJbase allele names

            col_names = list(haplotype.columns.values)
            for i in (2, 3, 4):
                haplotype[col_names[i]] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(haplotype['gene'], haplotype[col_names[i]])
                ]

            haplotype['gene'] = [
                translate_primer_genes(x, gene_subs) for x in haplotype['gene']
            ]
            haplotype = haplotype[haplotype.gene.isin(wanted_genes)]

            haplotypes = pd.concat(
                [haplotypes, haplotype], keys=None,
                ignore_index=True)[haplotype.columns.tolist()]

    if len(haplotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    haplo_path = make_output_file('tsv')
    haplotypes.to_csv(haplo_path, sep='\t', index=False)
    attachment_filename = '%s_haplotype_heatmap.pdf' % species

    if not params.get('f_kdiff'):
        params['f_kdiff'] = 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)
    output_path = make_output_file('html' if html else 'pdf')
    cmd_line = [
        "-i", haplo_path, "-o", output_path, "-k",
        str(params['f_kdiff']), "-c", chain, "-g", gene_order_file
    ]

    if run_rscript(HEATMAP_HAPLOTYPE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
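
Examples #2 and #4 rewrite allele and gene columns by zipping the gene column against each allele column and passing both through the primer translation tables returned by find_primer_translations. The pattern can be illustrated on a toy frame; the lookup function and table below are stand-ins for translate_primer_alleles, not the project's actual code:

import pandas as pd

def demo_translate(gene, value, table):
    # Stand-in: look the (gene, value) pair up and fall back to the original value.
    return table.get((gene, value), value)

toy = pd.DataFrame({'gene': ['IGHV1-2', 'IGHV1-3'], 'alleles': ['01', '02']})
trans_table = {('IGHV1-2', '01'): '01_G123A'}  # assumed shape of a translation table
toy['alleles'] = [
    demo_translate(g, a, trans_table)
    for g, a in zip(toy['gene'], toy['alleles'])
]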
Example #5
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    kdiff = (float(params['f_kdiff'])
             if 'f_kdiff' in params and params['f_kdiff'] != '' else 0)
    chain, samples_by_dataset = collate_samples(rep_samples)

    # Format we need to produce is [(gene_name, homozygous count, heterozygous count), ...]

    gene_hetrozygous_dis = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_sample_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            query = session.query(Gene.name, Patient.id, Allele.id, Sample.sample_name, Gene.locus_order, AllelesSample.kdiff, Allele.name) \
                .join(Allele, Gene.id == Allele.gene_id) \
                .join(AllelesSample, Allele.id == AllelesSample.allele_id) \
                .join(Sample, Sample.id == AllelesSample.sample_id) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .filter(AllelesSample.kdiff >= kdiff)

            if 'sort_order' in params and params['sort_order'] == 'Locus':
                query = query.order_by(Gene.locus_order, Patient.id, Allele.id)
            else:
                query = query.order_by(Gene.alpha_order, Patient.id, Allele.id)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == True)

            allele_sample_recs.extend(query.all())

        # As the result is indexed, run over each gene in turn, count the number of alleles found in each patient, update h_counts accordingly

        i = 0
        target_gene = ''

        while i < len(allele_sample_recs):
            target_gene = allele_sample_recs[i][0]
            h_counts = [0, 0]

            while i < len(allele_sample_recs):
                if allele_sample_recs[i][0] != target_gene:
                    break

                target_patient = allele_sample_recs[i][1]
                patient_allele_ids = []

                while i < len(allele_sample_recs):
                    if (allele_sample_recs[i][0] != target_gene
                            or allele_sample_recs[i][1] != target_patient):
                        break

                    patient_allele_ids.append(allele_sample_recs[i][2])
                    i += 1

                patient_allele_ids = set(patient_allele_ids)

                # If we have both an unambiguous allele and an ambiguous allele containing that unambiguous one,
                # drop the unambiguous one because it is already counted

                if params['ambiguous_alleles'] != 'Exclude':
                    patterns = session.query(AllelesPattern.pattern_id)\
                        .filter(AllelesPattern.allele_in_p_id.in_(patient_allele_ids))\
                        .filter(AllelesPattern.pattern_id.in_(patient_allele_ids))\
                        .all()

                    if patterns is not None and len(patterns) > 0:
                        patterns = set([pattern[0] for pattern in patterns])
                        patient_allele_ids = patient_allele_ids - patterns

                if len(patient_allele_ids) > 1:
                    h_counts[1] += 1
                elif len(patient_allele_ids) > 0:
                    h_counts[0] += 1

            if target_gene not in gene_hetrozygous_dis:
                gene_hetrozygous_dis[target_gene] = (target_gene, h_counts[0],
                                                     h_counts[1])
            else:
                gene_hetrozygous_dis[target_gene] = (
                    target_gene,
                    gene_hetrozygous_dis[target_gene][1] + h_counts[0],
                    gene_hetrozygous_dis[target_gene][2] + h_counts[1])

    haplo_path = make_output_file('tab')
    labels = ['GENE', 'HM', 'HT']
    df = pd.DataFrame(gene_hetrozygous_dis.values(), columns=labels)
    df.to_csv(haplo_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = [
        "-i",
        haplo_path,
        "-o",
        output_path,
        "-c",
        chain,
    ]

    if run_rscript(HETEROZYGOSITY_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format)
    else:
        raise BadRequest('No output from report')
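
The nested index-walking loops in Examples #3 and #5 rely on the query's ORDER BY to keep records for the same gene (and, in #5, the same patient) adjacent. The same grouping can be written more compactly with itertools.groupby; a sketch of the homozygous/heterozygous tally from Example #5 above, leaving out the ambiguous-allele adjustment for brevity:

from itertools import groupby

def tally_zygosity(allele_sample_recs):
    # Records are (gene_name, patient_id, allele_id, ...) tuples, already sorted
    # by gene and patient as in the ORDER BY clauses above.
    counts = {}
    for gene, gene_recs in groupby(allele_sample_recs, key=lambda r: r[0]):
        homo, hetero = 0, 0
        for _, patient_recs in groupby(gene_recs, key=lambda r: r[1]):
            allele_ids = {rec[2] for rec in patient_recs}
            if len(allele_ids) > 1:
                hetero += 1
            elif allele_ids:
                homo += 1
        counts[gene] = (gene, homo, hetero)
    return counts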