def get_region_counts(BAMorBED, ROIs, chr_prefix=None, chrom_size=300000000):
    """
        Direct ROIs to the BAM, BEDgraph, BED, Wiggle or VCF file reader.

        The reader is chosen by substring match on the lower-cased file
        name, tested in this order: 'bam', 'bedgraph', 'bed', 'wig', 'vcf'.
        'bedgraph' is tested before 'bed' because it contains 'bed'.
        Wiggle files work but are very slow.

        NOTE(review): the match is made against the whole path, so a
        directory name containing e.g. 'bam' also selects the BAM reader.

        Arguments:
            - BAMorBED: path of the counts file
            - ROIs: regions of interest, passed unchanged to the reader
            - chr_prefix: passed unchanged to the reader
            - chrom_size: only passed to the Wiggle and VCF readers

        Returns the reader's 4-tuple:
        (filled_ROIs, num_tags, num_bases, mapped_tags).
    """
    rr = RunRecord('get_region_counts')
    # Lower-case once rather than in every branch
    fn_lower = BAMorBED.lower()

    if 'bam' in fn_lower:
        result = read_BAM(BAMorBED, ROIs, chr_prefix)
    elif 'bedgraph' in fn_lower:
        result = read_BEDgraph(BAMorBED, ROIs, chr_prefix)
    elif 'bed' in fn_lower:
        result = read_BED(BAMorBED, ROIs, chr_prefix)
    elif 'wig' in fn_lower:
        result = read_wiggle(BAMorBED, ROIs, chr_prefix, chrom_size)
    elif 'vcf' in fn_lower:
        result = read_vcf(BAMorBED, ROIs, chr_prefix, chrom_size)
    else:
        # dieOnCritical is expected to abort the run here
        rr.dieOnCritical(
            'File not recognised as BAM, BEDgraph, BED, WIG or VCF',
            BAMorBED)

    filled_ROIs, num_tags, num_bases, mapped_tags = result

    rr.addInfo('Number of read tags counted', num_tags)
    rr.addInfo('Number of total bases counted', num_bases)
    rr.addInfo('Number of mapped tags in experiment', mapped_tags)

    return filled_ROIs, num_tags, num_bases, mapped_tags
def gene_expr_diff_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', sig_label='', pval_label='',
        allow_probeset_many_gene=False, validate=True):
    """
        Read an expression-difference file into a Table.

        Works like gene_expr_to_table() with two extra columns, named by
        sig_label and pval_label. Validation and multi-mapped probeset
        removal only run when the file contains probesets.

        Returns a Table with DIFF_HEADER as header and one
        [gene, probes, exp, sig, pval] row per gene.
    """
    rr = RunRecord('gene_expr_diff_to_table')
    rr.addInfo('Reading expression diff file', data_path)

    parsed = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label,
            probeset_label=probeset_label, exp_label=exp_label,
            sig_label=sig_label, pval_label=pval_label, is_diff=True)
    genes, probes, exp, sig, pval, probes_present = parsed

    if probes_present and validate:
        # genes whose probes and scores are mismatched get dropped
        cleaned = _validate_probes_scores(genes, probes, exp, sig, pval)
        genes, probes, exp, sig, pval = cleaned

    if probes_present and not allow_probeset_many_gene:
        # a probeset is only allowed to map to a single gene
        cleaned = _remove_multimapped_probesets(genes, probes, exp,
                sig, pval)
        genes, probes, exp, sig, pval = cleaned

    rows = [list(record) for record in
            zip(genes, probes, exp, sig, pval)]
    return Table(header=DIFF_HEADER, rows=rows)
def load_expr(self, expr_study, db_path, include_targets=None,
        exclude_targets=None):
    """
        Load ranked expression records for one study from a ChippyDB.

        Arguments:
            - expr_study: study label; the sample name is the text before
              the first ' : ' separator
            - db_path: passed to db_query.make_session
            - include_targets, exclude_targets: passed unchanged to
              db_query.get_genes_by_ranked_expr

        Resets self.expr_genes and fills it with one ExprGene per
        protein-coding gene returned, ranked by mean score.
    """
    rr = RunRecord('load_expr')

    sample_name = expr_study.split(' : ')[0]
    session = db_query.make_session(db_path)

    self.expr_genes = []
    # Single-argument print(): same output on Python 2, valid on Python 3
    print('Querying sample from ChippyDB %s' % sample_name)

    sample_genes = db_query.get_genes_by_ranked_expr(session, sample_name,
            biotype='protein_coding', data_path=None, rank_by='mean',
            include_targets=include_targets,
            exclude_targets=exclude_targets)

    for gene in sample_genes:
        gene_record = ExprGene(gene.MeanScore, gene.Rank, gene.ensembl_id,
                sample_name)
        self.expr_genes.append(gene_record)

    rr.addInfo('genes found in ' + sample_name, len(sample_genes))
def _groupNoGeneCounts(self):
    """
        Return one PlotLine per set of counts, without any grouping.

        When the counts function is stdev, each set of counts is first
        converted to z-scores, unless its standard deviation is zero.
        Called by asPlotLines().
    """
    rr = RunRecord('_groupNoGeneCounts')
    collection = self.data_collection

    plot_lines = []
    per_gene = zip(collection.counts, collection.ranks, collection.labels)
    for gene_counts, rank, label in per_gene:
        if self.counts_func == stdev:
            spread = gene_counts.std()
            if spread > 0:
                gene_counts = (gene_counts - gene_counts.mean()) / spread
        line = PlotLine(gene_counts, rank, label,
                study=self.collection_label)
        plot_lines.append(line)

    # An empty collection is a critical failure
    if len(plot_lines) == 0:
        rr.dieOnCritical('No data in collection', 'Failure')

    # A lone line is labelled with the collection name
    if len(plot_lines) == 1:
        plot_lines[0].label = [self.collection_label]

    return plot_lines
def _groupNGeneCounts(self, group_size, p=0.0):
    """
        Return one PlotLine per group of group_size genes.

        Falls back to _groupAllGeneCounts(), with a warning, when the
        collection yields no groups. Called by asPlotLines().
    """
    rr = RunRecord('_groupNGeneCounts')

    grouped = self.data_collection.iterTransformedGroups(
            group_size=group_size, counts_func=self.counts_func, p=p)
    plot_lines = [PlotLine(counts, rank=rank, label=label,
            study=self.collection_label, stderr=stderr)
            for counts, rank, label, stderr in grouped]

    if plot_lines:
        return plot_lines

    # Nothing came back, so plot everything as a single line instead
    rr.addWarning('Defaulting to ALL features. Not enough ' +
            'features for group of size', group_size)
    return self._groupAllGeneCounts()
def get_collection(session, sample_name, feature_type, BAMorBED,
        chr_prefix, window_upstream, window_downstream,
        multitest_signif_val, collection_fn, overwrite, tab_delimited,
        include_targets=None, exclude_targets=None, bedgraph=False,
        BED_windows=False, chrom_size=300000000, no_overlap=True):
    """
        Build a collection of counts and expression for feature_type in
        the given sample genes and write it to collection_fn.

        '.chp' is appended to collection_fn when missing. When that file
        already exists and overwrite is false, nothing is computed and a
        notice is printed. With bedgraph / BED_windows set, the companion
        file names '<stem>.bedgraph' / '<stem>_regions.BED' are passed to
        counts_for_genes.

        Returns None. Dies if counts_for_genes returns None.
    """
    rr = RunRecord('get_collection')

    if not collection_fn.endswith('.chp'):
        collection_fn += '.chp'  # ChipPy data file

    if os.path.exists(collection_fn) and not overwrite:
        # Single-argument print(): same output on Python 2, valid on 3
        print('Existing output at %s' % collection_fn)
        return

    # collection_fn always ends in '.chp' here, so stripping that suffix
    # equals dropping the last dot-separated component
    stem = collection_fn[:-len('.chp')]
    bedgraph_fn = stem + '.bedgraph' if bedgraph else None
    BED_windows_fn = stem + '_regions.BED' if BED_windows else None

    data_collection = counts_for_genes(session, sample_name, feature_type,
            BAMorBED, chr_prefix, window_upstream, window_downstream,
            include_targets, exclude_targets, bedgraph_fn,
            multitest_signif_val=multitest_signif_val,
            BED_windows_fn=BED_windows_fn, chrom_size=chrom_size,
            no_overlap=no_overlap)

    if data_collection is not None:
        data_collection.writeToFile(collection_fn, as_table=tab_delimited,
                compress_file=True)
    else:
        rr.dieOnCritical('No data collection was returned', 'Failed')
def asPlotLines(self, group_size, group_location, p=0.0):
    """
        Return the list of PlotLine objects for this study.

        group_size is the string 'all' (one line for everything), the int
        1 (one line per gene) or another int (one line per group of that
        many genes). group_location 'top', 'middle' or 'bottom' keeps a
        single line after sorting by rank; any other value keeps all.
        'p' is the Chebyshev cut-off, applied when greater than zero.
    """
    rr = RunRecord('asPlotLines')
    if p > 0.0:
        rr.addInfo('Applying per-line Chebyshev filtering', p)

    size_type = type(group_size)
    if size_type is str and group_size.lower() == 'all':
        plot_lines = self._groupAllGeneCounts()
    elif size_type is int and group_size == 1:
        plot_lines = self._groupNoGeneCounts()
    elif size_type is int:
        plot_lines = self._groupNGeneCounts(group_size, p=p)
    else:
        rr.dieOnCritical('group_size, wrong type or value',
                [type(group_size), group_size])

    location = group_location.lower()
    if location != 'all':
        rr.addInfo('grouping genes from location', group_location)
        plot_lines.sort(key=lambda x: x.rank)
        # index of the single line to keep for each named location
        pick = {
            'top': 0,
            'middle': int(len(plot_lines) / 2),
            'bottom': -1,
        }
        if location in pick:
            plot_lines = [plot_lines[pick[location]]]

    rr.addInfo('Plottable lines from study', len(plot_lines))
    return plot_lines
def get_genes_by_ranked_diff(session, sample_name, multitest_signif_val=None,
        biotype='protein_coding', chrom=None, data_path=None,
        include_targets=None, exclude_targets=None, rank_by='mean'):
    """
        Return all ranked genes from a sample difference experiment.

        Each gene gets its Scores set from the diff record's fold_changes.
        The genes are optionally restricted by target gene sets, then
        ranked from highest to lowest score with Rank starting at 1.

        Arguments:
            - include_targets: if given and it matches any target genes,
              keep only those genes
            - exclude_targets: if given and it matches any target genes,
              drop those genes
            - rank_by: 'mean' ranks on MeanScore, 'max' on MaxScore
              (case-insensitive); anything else is a critical failure
            - remaining arguments are passed to get_diff_entries
    """
    rr = RunRecord('get_genes_by_ranked_diff')
    records = get_diff_entries(session, sample_name=sample_name,
            biotype=biotype, data_path=data_path, chrom=chrom,
            multitest_signif_val=multitest_signif_val)

    genes = []
    for expressed_diff in records:
        gene = expressed_diff.gene
        gene.Scores = expressed_diff.fold_changes
        genes.append(gene)

    # keep only those genes in the include target gene sets if provided
    if include_targets is not None:
        include_genes = get_targetgene_entries(session, include_targets)
        if len(include_genes) > 0:
            include_gene_ids = set(
                    tg.gene.ensembl_id for tg in include_genes)
            genes = [gene for gene in genes
                    if gene.ensembl_id in include_gene_ids]

    # keep only those genes not in the exclude target gene sets if provided
    if exclude_targets is not None:
        exclude_genes = get_targetgene_entries(session, exclude_targets)
        if len(exclude_genes) > 0:
            exclude_gene_ids = set(
                    tg.gene.ensembl_id for tg in exclude_genes)
            genes = [gene for gene in genes
                    if gene.ensembl_id not in exclude_gene_ids]

    # choose the score used for ranking
    method = rank_by.lower()
    if method == 'mean':
        score_of = lambda g: g.MeanScore
    elif method == 'max':
        score_of = lambda g: g.MaxScore
    else:
        rr.dieOnCritical('Ranking method not possible', method)

    # Highest score first. Sorting on the score alone means tied genes
    # are never compared with each other; ties keep their input order.
    ranked = sorted(genes, key=score_of, reverse=True)
    for rank, gene in enumerate(ranked):
        gene.Rank = rank + 1

    return ranked
def normaliseByRPM(self):
    """
        Scale every gene's counts to reads per million mapped tags.

        The promoter, coding, feature and total counts of each entry in
        self.counts_genes are multiplied in place by
        1000000.0 / self.mapped_tags.
    """
    rr = RunRecord('normaliseByRPM')
    per_million = 1000000.0 / self.mapped_tags
    rr.addInfo('normalising by RPMs', per_million)

    count_fields = ('promoter_counts', 'coding_counts',
            'feature_counts', 'total_counts')
    for counts_gene in self.counts_genes:
        for field in count_fields:
            # spelled-out form of "counts_gene.<field> *= per_million"
            scaled = getattr(counts_gene, field)
            scaled *= per_million
            setattr(counts_gene, field, scaled)
def set_up_series_plots_dir(plot_filename):
    """
        Create the directory that will hold a series of plots.

        The directory is '<stem>-series' inside the directory returned by
        dirname_or_default(plot_filename), where <stem> is the base name
        of plot_filename without its extension.

        Returns the path of the created directory.
    """
    rr = RunRecord('set_up_series_plot_dir')
    save_dir = dirname_or_default(plot_filename)

    # splitext leaves a name without any '.' intact, whereas slicing at
    # rfind('.') == -1 would chop off its last character
    stem = os.path.splitext(os.path.basename(plot_filename))[0]
    plot_series_dir = os.path.join(save_dir, '%s-series' % stem)

    create_path(plot_series_dir)
    rr.addInfo('Plotting as a series to', plot_series_dir)
    return plot_series_dir
def gene_expr_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', allow_probeset_many_gene=False,
        validate=True):
    """
        Returns a cogent table object built from a gene expression file.

        Deals with a simple delimited representation of gene expression
        data which may have come from either micro-array or mRNA-seq
        experiments. Micro-array data has probeset information for each
        gene and a score to match each probe. RNA-seq data has no probes,
        just a single score per gene; in that case a fake probe of the
        form 'P' + a unique integer is created for each gene.

        Probeset ids and expression scores are separated by the pipe
        -- | -- character. They are converted to tuples of ints and
        floats respectively.

        Arguments:
            - probeset_label: name of column containing probesets
            - exp_label: name of column containing expression scores
            - stable_id_label: name of column containing Ensembl stable IDs
            - allow_probeset_many_gene: whether one probeset can map to
              multiple genes. If not, probes and scores that multi-map
              are removed.
            - validate: checks that stable IDs are unique in the file and
              that each row has as many probesets as expression scores.
              Failing gene entries are removed.
    """
    rr = RunRecord('geneExprDataToTable')
    rr.addInfo('Reading expression data', data_path)

    parsed = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label,
            probeset_label=probeset_label, exp_label=exp_label)
    genes, probes, exp, probes_present = parsed

    if probes_present and validate:
        # genes whose probes and scores are mismatched get dropped
        genes, probes, exp = _validate_probes_scores(genes, probes, exp)

    if probes_present and not allow_probeset_many_gene:
        # a probeset is only allowed to map to a single gene
        genes, probes, exp = _remove_multimapped_probesets(genes, probes,
                exp)

    rows = [list(record) for record in zip(genes, probes, exp)]
    return Table(header=EXPR_HEADER, rows=rows)
def main():
    """
        Drop all records of one sample from the expression DB.

        Records the command line, parses the script arguments, and logs
        whether db_query.drop_sample_records succeeded, then displays the
        run record.
    """
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Drop Expression Data')
    session = db_query.make_session(args.db_path)

    removed = db_query.drop_sample_records(session, args.sample)
    message = 'Removing ' + args.sample
    if removed:
        rr.addInfo(message, 'Success')
    else:
        rr.addWarning(message, 'Failure')

    rr.display()
def _get_keep_indices(data, filtered=None):
    """
        Return the list of row indices of data to keep.

        Arguments:
            - data: object with a shape attribute, indexable by row
            - filtered: optional callable taking one row; rows for which
              it returns a true value are kept. With None all rows are
              kept.

        Dies if no index remains.
    """
    rr = RunRecord('_get_keep_indices')
    n_rows = data.shape[0]

    if filtered is None:
        # list() so a real list is returned on Python 3 as well
        keep = list(range(n_rows))
    else:
        keep = [i for i in range(n_rows) if filtered(data[i])]

    if not keep:
        rr.dieOnCritical('No remaining data after filtering', 'Failure')

    return keep
def _check_expr_headers(header_row, stable_id_label='', probeset_label='',
        exp_label=''):
    """
        Locate the standard expression columns in header_row.

        The stable ID and expression score columns are mandatory; a
        missing one is a critical failure. The probeset column is
        optional: when absent a warning is logged, probe_col is -1 and
        probes_present is False.

        Returns (gene_col, probe_col, exp_col, probes_present).
    """
    rr = RunRecord('_check_expr_headers')

    def column_of(label):
        """ position of label in header_row, or None if absent """
        try:
            return header_row.index(label)
        except ValueError:
            return None

    # dieOnCritical is expected to abort the run for the mandatory columns
    gene_col = column_of(stable_id_label)
    if gene_col is None:
        rr.dieOnCritical('Stable ID column header not found in',
                header_row)

    exp_col = column_of(exp_label)
    if exp_col is None:
        rr.dieOnCritical('Expression score column header not found in',
                header_row)

    probe_col = column_of(probeset_label)
    probes_present = probe_col is not None
    if not probes_present:
        rr.addWarning('Probeset column header not found in', header_row)
        probe_col = -1

    return gene_col, probe_col, exp_col, probes_present
def safe_line_division(sample_line, dividing_line):
    """
        Divide sample_line by dividing_line in place, tolerating zeros.

        If the division raises ZeroDivisionError, every zero in the
        divisor is replaced by the smallest value of dividing_line lying
        strictly between 0 and 1 (or by 1 when there is none) and the
        division is retried. The substitution is made on a copy, so
        dividing_line itself is left untouched.

        Returns sample_line.
    """
    rr = RunRecord('safe_line_division')
    try:
        sample_line /= dividing_line
    except ZeroDivisionError:
        # 0 counts at a base position
        min_count = 1
        for c in dividing_line:
            if 0 < c < min_count:
                min_count = c

        # Actually substitute the zeros; retrying with the unmodified
        # divisor would only raise the same error again.
        patched = [min_count if c == 0 else c for c in dividing_line]
        # NOTE(review): if the failed in-place division had already
        # updated part of sample_line, those entries get divided a second
        # time here -- confirm against the array types callers pass in.
        sample_line /= patched
        rr.addWarning('Zero counts value seen. Setting zeros to',
                min_count)

    return sample_line
def load_counts(self, collection):
    """
        Load gene entries from a ChipPy collection file.

        collection is the path of a gzip-compressed numpy file whose
        content converts (via tolist()) to a mapping with 'counts' and
        'labels' entries. self.counts_genes is reset and then filled with
        one CountsGene per (count, label) pair.

        Dies if the path is not a file or the file cannot be read.
    """
    rr = RunRecord('load_counts')
    # Single-argument print(): same output on Python 2, valid on Python 3
    print('Loading counts collection file %s' % collection)
    self.counts_genes = []

    if not os.path.isfile(collection):
        rr.dieOnCritical('unrecognised collection file', collection)
    else:
        try:
            handle = gzip.GzipFile(collection, 'rb')
            try:
                d = numpy.load(handle).tolist()
            finally:
                # always release the file handle, even on a failed read
                handle.close()
        except IOError:
            rr.dieOnCritical('file found but could not be read',
                    collection)
        else:
            counts = d['counts']
            labels = d['labels']
            for count, label in zip(counts, labels):
                gene_record = CountsGene(count, str(label), collection)
                self.counts_genes.append(gene_record)
            rr.addInfo('genes found in ' + collection, len(labels))
def get_chroms(session):
    """
        Return the list of chromosome names stored in the DB.

        session may be None (a one-item placeholder list is returned), a
        str (passed to make_session) or a session object. The names are
        stored as one ',' separated string; an empty list is returned,
        and an error logged, when no Chroms record exists.
    """
    if session is None:
        return ['No connection to DB']
    if type(session) is str:
        session = make_session(session)

    rr = RunRecord('get_chroms')
    try:
        record = session.query(Chroms).one()
    except NoResultFound:
        rr.addError('Chroms found', None)
        return []

    return record.chromStr.split(',')
def set_counts_function(counts_metric):
    """
        Return the feature counting function named by counts_metric.

        'mean' selects column_mean, 'frequency' selects column_sum and
        'stdev' selects stdev (case-insensitive). Any other value is a
        critical failure.
    """
    rr = RunRecord('set_counts_function')

    # metric name -> (function, name written to the run record)
    metrics = {
        'mean': (column_mean, 'column_mean'),
        'frequency': (column_sum, 'column_sum'),
        'stdev': (stdev, 'stdev'),
    }

    choice = metrics.get(counts_metric.lower())
    if choice is None:
        # dieOnCritical is expected to abort the run here
        rr.dieOnCritical('Invalid count metric', counts_metric)

    counts_func, func_name = choice
    rr.addInfo('Counts metric set to', func_name)
    return counts_func
def __call__(self, x_array, plot_lines=None, clean=False, xlabel=None,
        ylabel=None, title=None, plot_CI=False, ui=None):
    """
        Draw every plot line onto a freshly created figure.

        Arguments:
            - x_array: x values shared by all lines
            - plot_lines: lines to draw; each needs counts, color, study
              and rank, plus stderr when plot_CI is set
            - clean: stored on self.clean
            - xlabel, ylabel, title: passed to getFigureAndAxes
            - plot_CI: shade counts +/- 1.96 * stderr around each line
            - ui: optional progress object providing series(); when None
              the lines are iterated directly

        Sets self.fig and self.ax. Returns None.
    """
    rr = RunRecord('PlottableSingle__call__')

    self.setAxes(plot_lines, plot_CI=plot_CI, test_run=False)
    self.checkYAxisScale(plot_lines, plot_CI=plot_CI)

    self.fig, self.ax = self.getFigureAndAxes(title=title,
            xlabel=xlabel, ylabel=ylabel)

    self.clean = clean

    ordered = enumerate(sorted(plot_lines,
            key=lambda line: (line.study, line.rank), reverse=True))
    if ui is not None:
        # ui defaults to None, so only wrap when a progress UI was given
        ordered = ui.series(ordered, noun='Applying lines to plot')

    for _index, line in ordered:
        self.ax.plot(x_array, line.counts, color=line.color,
                linewidth=self.linewidth)

        # Show confidence interval around each line
        if plot_CI:
            # set shading alpha from the fourth colour component
            alpha = line.color[3]
            if alpha is None:
                alpha = 0.9
            upper = 1.96 * line.stderr + line.counts
            lower = -1.96 * line.stderr + line.counts
            self.ax.fill_between(x_array, upper, lower,
                    alpha=alpha/2.5, color=line.color)
def add_data(session, name, description, path, expr_table,
        gene_id_heading='gene', probeset_heading='probeset',
        expr_heading='exp', sample_type=sample_types['abs_expr'],
        reffile1=None, reffile2=None):
    """
        A unified interface for adding data to the DB.

        Creates the sample (or reuses an existing sample that has no
        data yet) and then loads expr_table according to sample_type:
        absolute expression, expression difference or target genes.

        Arguments:
            - name, description: passed to add_sample
            - path: reference file path of the data
            - expr_table: the data table to load
            - gene_id_heading: column holding gene ids (target genes only)
            - probeset_heading, expr_heading: not used by this function
            - sample_type: looked up in sample_types to pick the loader
            - reffile1, reffile2: the two absolute-measure files, required
              for expression differences

        Returns False when the sample already has data loaded. Otherwise
        returns the result of add_expression_study for absolute
        expression, or the result of add_sample for the other types.
    """
    rr = RunRecord('add_data')

    success = add_sample(session, name, description)
    if not success:
        # Check if any sample exists without data
        existing_data, existing_type = check_existing_data(session, name)
        if existing_data > 0:
            rr.addError(name + ' already has data loaded', existing_data)
            rr.addError('data of type', existing_type)
            return False
        else:
            rr.addInfo('now loading data for existing sample', name)

    # either sample was created or existed with no data, so load data now
    kind = sample_types[sample_type]
    if kind == sample_types['abs_expr']:
        success = add_expression_study(session, name, path, expr_table)
    elif kind == sample_types['diff_expr']:
        # diff between two files, check we got the related files
        assert reffile1 is not None and reffile2 is not None, \
            'To enter differences in gene expression you must specify ' \
            'the 2 files that contain the absolute measures.'
        add_expression_diff_study(session, name, path, expr_table,
                reffile1, reffile2)
    elif kind == sample_types['target_genes']:
        add_target_genes(session, name, path, expr_table,
                ensembl_id_label=gene_id_heading)
    else:
        rr.dieOnCritical('Unknown sample type', sample_type)

    # NOTE(review): for the diff and target-gene branches 'success' is
    # still the add_sample() result, so False is returned when data was
    # loaded into a pre-existing empty sample -- confirm this is intended.
    return success
def filteredChebyshevUpper(self, p=0.05, axis=None):
    """
        Return a new RegionCollection excluding records with excessive
        reads, using a one-sided Chebyshev's inequality.

        Arguments:
            - p: probability in [0, 1], converted to the cut-off k by
              chebyshev_upper; out-of-range values are a critical failure
            - axis: None normalises each row's maximum by the mean and
              sample standard deviation of all counts and keeps rows
              below k; otherwise counts are normalised along axis and
              rows whose values are all below k are kept

        The new collection's info records 'filteredChebyshevUpper': p.
    """
    rr = RunRecord('filteredChebyshevUpper')
    if not (0 <= p <= 1):
        rr.dieOnCritical('Probability argument', 'Invalid')

    k = chebyshev_upper(p)

    if axis is None:
        # only bother computing normalised score for max of each row
        row_max = self.counts.max(axis=1)
        mean = self.counts.mean()
        stdev_ = self.counts.std(ddof=1)
        # out-of-place arithmetic, so integer counts are promoted to
        # float instead of failing on an in-place float update
        scores = (row_max - mean) / stdev_
        indices = scores < k
        data = self.counts[indices]

        labels = self.labels[indices] if self.labels is not None else None
        ranks = self.ranks[indices] if self.ranks is not None else None

        new = self.__class__(counts=data, ranks=ranks, labels=labels,
                info=self.info)
    else:
        data = normalised_data(self.counts, axis=axis)
        func = lambda x: (x < k).all()
        indices = _get_keep_indices(data, filtered=func)
        new = self.take(indices)

    if self.info is None:
        info = {'filteredChebyshevUpper': p}
    else:
        info = self.info.copy()
        info['filteredChebyshevUpper'] = p

    if new.info:
        # new.info may be the very dict held by self.info (it is passed
        # to the constructor above), so merge into a copy rather than
        # updating it in place
        merged = new.info.copy()
        merged.update(info)
        new.info = merged
    else:
        new.info = info

    return new
def _groupAllGeneCounts(self):
    """
        Combine the counts of all genes into one PlotLine.

        Called by asPlotLines() or _groupNGeneCounts(). Returns a
        single-item list. The line has rank 0 and is labelled with the
        collection label. Dies if the transformed counts are empty.
    """
    rr = RunRecord('_groupAllGeneCounts')

    counts, _ranks, se = self.data_collection.transformed(
            counts_func=self.counts_func)
    if len(counts) == 0:
        rr.dieOnCritical('No counts data in', 'Study._groupAllGeneCounts')

    # rank means nothing for 'all' genes; the label is the collection name
    name = self.collection_label
    return [PlotLine(counts, 0, name, study=name, stderr=se)]
def filteredByLabel(self, labels):
    """
        Return a new collection holding only the rows whose label is in
        labels. A single str is treated as a one-item list. Dies if this
        collection has no labels.
    """
    rr = RunRecord('filteredByLabel')

    if self.labels is None:
        rr.dieOnCritical('No labels', 'Failure')

    wanted = [labels] if type(labels) == str else labels

    # collect matching row positions and let self.take build the result
    n_rows = self.counts.shape[0]
    indices = [row for row in range(n_rows) if self.labels[row] in wanted]
    return self.take(indices)
def populateLogTable(self):
    """
        Show the last 30 ChipPy log messages in the log table widget.

        The table is emptied first. Nothing more happens when the message
        table cannot be fetched (RuntimeError) or is None.
    """
    rr = RunRecord()
    log_table = self.log_table
    log_table.setRowCount(0)

    try:
        messages = rr.getMessageTable(last_n_lines=30, include_date=True)
    except RuntimeError:
        return
    if messages is None:
        return

    for row_index, row in enumerate(messages):
        # grow the widget by one row, then fill in its cells
        log_table.setRowCount(log_table.rowCount() + 1)
        for col_index, text in enumerate(row):
            cell = QTableWidgetItem(QString(text))
            log_table.setItem(row_index, col_index, cell)
def _get_targetgene_query(session, sample_name=None,
        biotype='protein_coding'):
    """
        Return a query over TargetGene records joined to Gene.

        With a sample_name that matches a sample, only that sample's
        records are selected. With no name, or a name matching no sample
        (an error is logged), all samples are used. A true biotype
        restricts the genes to that biotype.
    """
    rr = RunRecord('get_targets')

    sample = None
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.addError('Using all samples, as no sample matches name',
                    sample_name)

    query = session.query(TargetGene).join(Gene)
    if sample is not None:
        query = query.filter(TargetGene.sample_id==sample.sample_id)

    if biotype:
        query = query.filter(Gene.biotype == biotype)

    return query
def _auto_y_lims(self, minY, maxY, rounding=True, test_run=False):
    """
        Work out y-axis limits from the minimum and maximum y values.

        Without rounding the limits are (minY, maxY). With rounding:
            - maxY below 1: both limits are rounded outwards at the
              decimal place of maxY's leading digit
            - maxY equal to 0 or 1: limits are (0.0, 1.0)
            - maxY above 1: the ceiling is rounded up at the leading
              digit and the floor is floor(minY)
            - negative maxY is a critical failure

        With test_run the inputs and results are written to the run
        record. Returns the tuple (y_floor, y_ceiling).
    """
    rr = RunRecord('_auto_y_lims')

    limits = (minY, maxY)

    if rounding:
        # Round min/max to tidy values, scaled for fractional counts
        if maxY > 0:
            magnitude = log10(maxY)
            if magnitude < 0:
                scale = 10**(0 - int(floor(magnitude)))
                limits = (float(floor(minY * scale) / scale),
                        float(ceil(maxY * scale) / scale))
            elif magnitude == 0:
                limits = (0.0, 1.0)
            else:
                # round the ceiling up at its leading digit
                unit = 10**(ceil(magnitude) - 1)
                limits = (floor(minY), ceil(maxY / unit) * unit)
        elif maxY == 0:
            limits = (0.0, 1.0)
        else:
            rr.dieOnCritical('Negative max y-axis value', maxY)

    y_floor, y_ceiling = limits

    if test_run:
        rr.addInfo('Y-axis min', minY)
        rr.addInfo('Y-axis max', maxY)
        rr.addInfo('Y-axis auto floor', y_floor)
        rr.addInfo('Y-axis auto ceiling', y_ceiling)

    return (y_floor, y_ceiling)
def add_target_genes(session, sample_name, data_path, table,
        ensembl_id_label='ENSEMBL'):
    """
        Add TargetGene instances to the database from table.

        A new ReferenceFile for data_path is created and attached to the
        sample, and one TargetGene is created for every Gene whose
        Ensembl ID appears in the table. Everything is committed in one
        go.

        Arguments:
            - sample_name: name of an existing sample
            - data_path: the reference file path
            - table: the table listing the target genes
            - ensembl_id_label: label of the column containing Ensembl
              Stable IDs

        Dies (after a rollback) if the sample does not exist, and dies if
        a reference file named data_path was already loaded.
    """
    rr = RunRecord('add_target_genes')

    sample = _one(session.query(Sample).filter_by(name=sample_name))
    if not sample:
        session.rollback()
        rr.dieOnCritical('querying for sample', 'Failed')

    reffile = session.query(ReferenceFile).filter_by(name=data_path).all()
    if len(reffile) == 0:
        reffile = ReferenceFile(data_path, today)
        reffile.sample = sample
    else:
        # Don't overwrite anything, exit instead
        rr.dieOnCritical('File already loaded', data_path)

    targets = []
    ensembl_ids = table.getRawData(ensembl_id_label)
    # query the genes in chunks of 100 ids
    for id_chunk in _chunk_id_list(ensembl_ids, 100):
        genes = session.query(Gene).filter(
                Gene.ensembl_id.in_(id_chunk)).all()
        for gene in genes:
            target = TargetGene()
            target.gene = gene
            target.reference_file = reffile
            target.sample = sample
            targets.append(target)

    rr.addInfo('Added target genes from', data_path)
    # count the target genes only, not the reference file record
    rr.addInfo('No. genes added', len(targets))
    session.add_all([reffile] + targets)
    session.commit()
    return
def applyBinning(self, bin_width):
    """
        Replace each run of bin_width counts by that run's sum divided by
        bin_width.

        The output array has the same size as the input, giving a
        normalised score per base. bin_width must be an integer factor of
        the window size, otherwise this is a critical failure. A false or
        non-positive bin_width leaves the counts untouched.
    """
    rr = RunRecord('apply_binning')

    if not (bin_width and bin_width > 0):
        return

    n_positions = len(self.counts)
    if n_positions % bin_width:
        rr.dieOnCritical('Bin width is not an integer ' +
                'factor of window size', bin_width)

    binned = numpy.array(self.counts)
    for start in range(0, n_positions, bin_width):
        stop = start + bin_width
        bin_sum = sum(self.counts[pos] for pos in range(start, stop))
        # every position of the bin gets the same value
        binned[start:stop] = bin_sum/bin_width

    self.counts = binned
def _get_expression_query(session, sample_name=None,
        biotype='protein_coding', chrom=None, data_path=None):
    """
        Return a query over the Expression table joined to Gene.

        Every argument that is not None narrows the query:
            - sample_name: only records of that sample; an unknown name
              is a critical failure
            - data_path: only records loaded from the reference file with
              that name; an unknown path is a critical failure
            - chrom: only genes on that chromosome
            - biotype: only genes of that biotype
    """
    rr = RunRecord('_get_expression_query')
    query = session.query(Expression)

    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.dieOnCritical('Unknown sample name', sample_name)
        query = query.filter(Expression.sample_id == sample.sample_id)

    if data_path is not None:
        reffile_id = _one(session.query(ReferenceFile.reffile_id).
                filter(ReferenceFile.name==data_path))
        # Test the lookup result rather than data_path, which is known to
        # be set here; otherwise a miss crashes on the subscript below.
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        # used to reconstruct the origin of a sample
        query = query.filter(Expression.reffile_id == reffile_id)

    query = query.join(Gene)

    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)

    if biotype is not None:
        query = query.filter(Gene.biotype == biotype)

    return query
def _get_diff_query(session, sample_name=None, biotype='protein_coding',
        multitest_signif_val=None, chrom=None, data_path=None):
    """
        Return a query over the ExpressionDiff table joined to Gene.

        The query is narrowed by each argument that is supplied:
            - sample_name: only records of that sample; an unknown name
              is a critical failure
            - data_path: only records loaded from the reference file with
              that name; an unknown path is a critical failure
            - multitest_signif_val: only records whose multitest_signif
              equals this value
            - chrom: only genes on that chromosome
            - biotype: only genes of that biotype (skipped when false)
    """
    rr = RunRecord('_get_diff_query')
    query = session.query(ExpressionDiff)

    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if not sample:
            rr.dieOnCritical('No sample with name', sample_name)
        query = query.filter(ExpressionDiff.sample_id == sample.sample_id)

    if data_path is not None:
        reffile_id = _one(session.query(ReferenceFile.reffile_id).
                filter(ReferenceFile.name==data_path))
        # Test the lookup result rather than data_path, which is known to
        # be set here; otherwise a miss crashes on the subscript below.
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        # Filter on the table being queried; Expression.reffile_id would
        # pull the unrelated Expression table into the query.
        # NOTE(review): assumes ExpressionDiff has a reffile_id column
        # like Expression does -- confirm against the schema.
        query = query.filter(ExpressionDiff.reffile_id == reffile_id)

    if multitest_signif_val is not None:
        query = query.filter(
                ExpressionDiff.multitest_signif == multitest_signif_val)

    query = query.join(Gene)

    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)

    if biotype:
        query = query.filter(Gene.biotype == biotype)

    return query