def create_requirements_file(project_name, project_dir, requirements=None, overwrite=False):
    """
    Create a requirements.txt file with pip requirements.

    Parameters
    ----------
    project_name : :obj:`str`
        Name of the project. Not used by this function.
    project_dir : :obj:`str`
        Directory in which the "requirements.txt" file is written.
    requirements : :obj:`list`, optional
        Requirement strings, written one per line.

        Defaults to the non-extra requirements declared by the "ngs-toolkit"
        package on PyPI, plus ``ngs_toolkit`` pinned to the running version.
    overwrite : :obj:`bool`, optional
        Whether an already existing "requirements.txt" should be replaced.

        Defaults to :obj:`False`.
    """
    def get_current_requirements():
        import requests

        package_name = "ngs-toolkit"
        url = "https://pypi.python.org/pypi/" + str(package_name) + "/json"
        data = requests.get(url).json()
        # `requires_dist` may be null in the JSON response; treat as "no requirements"
        declared = data["info"]["requires_dist"] or []
        current = [
            x.replace(r" ", "").replace("(", "").replace(")", "")
            for x in declared
            if "extra" not in x
        ]
        current.append("ngs_toolkit=={}".format(ngs_toolkit.__version__))
        return current

    requirements_file = os.path.join(project_dir, "requirements.txt")
    # Decide whether to skip before (possibly) querying PyPI over the network
    if os.path.exists(requirements_file) and not overwrite:
        _LOGGER.warning("'requirements.txt' file already existing, skipping.")
        return

    if requirements is None:
        requirements = get_current_requirements()

    requirements_filecontent = "\n".join(requirements)

    # write requirements file (line buffered)
    with open(requirements_file, "w", 1) as handle:
        handle.write(textwrap.dedent(requirements_filecontent) + "\n")
def warn_or_raise(exception, permissive=False):
    """
    Log the message of an exception and raise it unless `permissive`.

    Parameters
    ----------
    exception : :obj:`Exception`
        Exception instance to report. Its first argument is used as
        the log message.
    permissive : :obj:`bool`, optional
        If :obj:`True`, the message is logged as a warning and nothing is raised.
        Otherwise the message is logged as an error and `exception` is raised.

        Defaults to :obj:`False`.

    Raises
    ------
    Exception
        The given `exception`, if not `permissive`.
    """
    from ngs_toolkit import _LOGGER

    # An exception created without arguments has an empty `args` tuple;
    # fall back to its representation rather than failing with IndexError.
    msg = exception.args[0] if exception.args else repr(exception)
    if permissive:
        _LOGGER.warning(msg)
    else:
        _LOGGER.error(msg)
        raise exception
def count_reads_in_intervals(bam, intervals, permissive=True):
    """
    Count total number of reads in a iterable holding strings
    representing genomic intervals of the form ``"chrom:start-end"``.

    Please make sure both ``intervals`` and ``bam`` file are
    zero- or one-indexed.

    Parameters
    ----------
    bam : :obj:`str`
        Path to BAM file.
    intervals : :obj:`list`
        List of strings with genomic coordinates in format
        ``"chrom:start-end"``.
    permissive : :obj:`bool`, optional
        Whether intervals for which counting raises a ValueError should be
        skipped (and tallied in a final warning) rather than raising.

        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`dict`
        Dict of read counts for each interval.
        Intervals skipped in permissive mode are not present.

    Raises
    ------
    ValueError
        If not `permissive` and counting fails for an interval.
    """
    import pysam
    from ngs_toolkit import _LOGGER

    counts = dict()
    errors: int = 0

    handle = pysam.AlignmentFile(bam, mode="rb")
    try:
        for interval in intervals:
            try:
                counts[interval] = handle.count(region=interval)
            except ValueError:
                if not permissive:
                    raise
                errors += 1
    finally:
        # Close the file also when an error propagates in non-permissive mode
        handle.close()

    if errors > 0:
        _LOGGER.warning("There have been %i errors. Beware.", errors)
    return counts
def get_this_file_or_timestamped(file, permissive=True):
    """
    Get a path to an existing timestamped file based on an non-timestamped path.

    A timestamped file is one where ``.YYYY-MM-DD-HH:MM:SS.`` follows the
    part of `file` before its last dot.

    Parameters
    ----------
    file : :obj:`str`
        File name of analysis output to record.
    permissive : :obj:`bool`
        Whether failure to find timestamped file should return the original
        file or raise a IndexError.

    Returns
    -------
    :obj:`str`
        Path to the last timestamped file in natural sort order, or `file`
        itself if none is found and `permissive`.

    Raises
    ----------
    IndexError
        If not `permissive` and can't find timestamped file.
    """
    from glob import escape, glob
    import re
    from ngs_toolkit.utils import sorted_nicely
    from ngs_toolkit import _LOGGER

    split = file.split(".")
    body = ".".join(split[:-1])
    end = split[-1]

    # The path is user data, not a pattern: escape it for both glob and regex
    # so that characters such as "[", "+" or "(" in directories are matched literally.
    candidates = sorted_nicely(glob(escape(body) + "*" + escape(end)))
    timestamped = re.compile(re.escape(body) + r"\.\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.")
    res = [x for x in candidates if timestamped.search(x)]

    if not res:
        if permissive:
            return file
        msg = "Could not remove timestamp from file path."
        msg += " Probabably it does not exist."
        _LOGGER.error(msg)
        raise IndexError(msg)

    if len(res) > 1:
        _LOGGER.warning(
            "Could not get unequivocal timestamped file for '{}'.".format(file)
            + " Returning latest: '{}'.".format(res[-1])
        )
    # get newest file
    return res[-1]
def collect_esat_output(self, samples=None, permissive=True):
    """
    Collect gene expression (read counts, gene-level) output from ESAT
    into expression matrix for `samples`.

    Parameters
    ----------
    samples : :obj:`list`, optional
        Samples to collect output for. Each must have the ``sample_root``,
        ``genome`` and ``name`` attributes.

        Defaults to all samples in the analysis (``samples`` attribute).
    permissive : :obj:`bool`, optional
        Whether samples with a missing ESAT output file should be skipped
        with a warning rather than raising.

        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Counts with gene symbols as (sorted) index and one column per sample.
        :obj:`None` if `permissive` and no sample could be read.

    Raises
    ------
    IOError
        If not `permissive` and a file is missing or no sample could be read.
    """
    if samples is None:
        samples = self.samples

    expr = None
    for sample in samples:
        counts_file = os.path.join(
            sample.sample_root,
            "ESAT_{}".format(sample.genome),
            sample.name + ".gene.txt",
        )
        try:
            c = pd.read_csv(counts_file, sep="\t")
        except IOError:
            if permissive:
                # Report the path that was actually tried
                _LOGGER.warning("Sample '%s' is missing file: %s", sample.name, counts_file)
                continue
            else:
                raise
        # extract only gene ID and counts
        c = c[["Symbol", "Exp1"]]
        c = c.rename(columns={
            "Symbol": "gene_symbol",
            "Exp1": sample.name
        }).set_index("gene_symbol")
        # Append (left join: genes are those of the first sample read)
        expr = c if expr is None else expr.join(c)

    if expr is None:
        msg = "No sample had a valid expression file!"
        if permissive:
            _LOGGER.warning(msg)
            return None
        _LOGGER.error(msg)
        raise IOError(msg)
    return expr.sort_index()
def __init__(
        self,
        name=None,
        from_pep=False,
        from_pickle=False,
        root_dir=None,
        data_dir="data",
        results_dir="results",
        prj=None,
        samples=None,
        **kwargs
):
    """
    Set ChIP-seq specific defaults, run the parent initializer and
    prefill peak calling comparisons if a comparison table is available.

    All arguments are passed unchanged to the parent class initializer.
    """
    # Attributes are only set when not already existing, so that other
    # classes can inherit from this one and provide their own values.
    chipseq_defaults = (
        ("data_type", "ChIP-seq"),
        ("__data_type__", "ChIP-seq"),
        ("var_unit_name", "region"),
        ("quantity", "binding"),
        ("norm_units", "RPM"),
    )
    for attribute, value in chipseq_defaults:
        if hasattr(self, attribute):
            continue
        setattr(self, attribute, value)

    parent_arguments = dict(
        name=name,
        from_pep=from_pep,
        from_pickle=from_pickle,
        root_dir=root_dir,
        data_dir=data_dir,
        results_dir=results_dir,
        prj=prj,
        samples=samples,
    )
    super(ChIPSeqAnalysis, self).__init__(**parent_arguments, **kwargs)

    if not hasattr(self, "comparison_table"):
        _LOGGER.warning(
            "No comparison table was given. Will not prefill peak calling comparisons.")
        return
    self.set_comparisons()
# Detect whether running on the development branch.
# Sources, in order of precedence: GitHub Actions, Travis CI, local git checkout.
# `DEV` is always defined afterwards (False if nothing could be detected).
import subprocess

DEV = False
if "GITHUB_REF" in os.environ:
    # GitHub Actions provides the fully-formed ref (e.g. "refs/heads/dev")
    DEV = os.environ["GITHUB_REF"] in ("dev", "refs/heads/dev")
elif "TRAVIS_BRANCH" in os.environ:
    DEV = os.environ["TRAVIS_BRANCH"] == "dev"
else:
    try:
        o = subprocess.check_output("git status".split(" "))
        DEV = "dev" in o.decode().split("\n")[0]
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: not inside a git repository;
        # OSError: the `git` executable could not be run.
        msg = "Could not detect whether on a development branch."
        _LOGGER.warning(msg)

# Test-specific options
# # Note:
# # The DESeq2 1.24.0 version in Debian archives
# # differs from the DESeq2 1.24.0 version in bioconductor version 3.9
# # If estimateDispersions with default fitType="parametric" fails,
# # (as often happens with the quickly generated synthetic data from tests),
# # it tries to use local fit using the locfit package, but in Debian
# # version this is not a valid choice of fit, causing failure.
# # Due to this, and since I'm using Debian packages for faster testing
# # I'm manually setting fitType="mean" for testing only.
Analysis.differential_analysis = partialmethod(
    Analysis.differential_analysis, deseq_kwargs={"fitType": "mean"})

# This is a part of the "example" config that is required for some analysis
def calculate_peak_support(
        self, samples=None, region_type="summits", peak_type="filtered", permissive=True,
        comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
    """
    Calculate a measure of support for each region in peak set
    (i.e. ratio of samples containing a peak overlapping region in union set of peaks).

    Requires the ``sites`` and ``comparisons`` attributes to be set.
    Writes "<name>_peaks.binary_overlap_support.csv" (overlap counts) and
    "<name>_peaks.support.csv" (counts plus support ratio) to ``results_dir``.

    Parameters
    ----------
    samples: :obj:`list`
        Not used. Provided for compatibility with ATACSeqAnalysis class.
    region_type: :obj:`str`
        Not used. Provided for compatibility with ATACSeqAnalysis class.
    peak_type : :obj:`str`, optional
        Which set of peak calls of each comparison to use
        (a key of ``comparisons[name]["peak_calls"]``).

        Defaults to "filtered".
    permissive: :obj:`bool`
        Whether peak files that cannot be intersected should be skipped
        with a warning rather than raising.

        Defaults to :obj:`True`.
    comparison_table : :obj:`pandas.DataFrame`, optional
        DataFrame with signal/background combinations used to call peaks

        Defaults to analysis' own `comparison_table`.
    peak_dir : :obj:`str`, optional
        Path to peaks output directory.

        Defaults to {analysis.results_dir}/chipseq_peaks

    Attributes
    ----------
    support : :obj:`pandas.DataFrame`
        DataFrame with the number of overlapping peaks per consensus region
        for each (comparison, peak caller) and a "support" column with the
        fraction of those having at least one overlap.
    """
    import pybedtools
    from tqdm import tqdm
    from ngs_toolkit.utils import bed_to_index

    # NOTE: `comparison_table` and `peak_dir` are resolved but not used further below;
    # peak files are taken from the `comparisons` attribute.
    if comparison_table is None:
        comparison_table = self.comparison_table

    peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

    # get index
    index = bed_to_index(self.sites.to_dataframe())

    # calculate support (number of samples overlaping each merged peak)
    support = pd.DataFrame(index=index)
    for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
        for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
            try:
                sample_support = self.sites.intersect(peak_file, wa=True, c=True).to_dataframe()
            except (
                ValueError,
                pybedtools.MalformedBedLineError,
                pybedtools.helpers.BEDToolsError,
            ):
                # Arguments are passed separately so logging can format both placeholders
                _LOGGER.warning(
                    "Peaks for comparison %s (%s) not found!", name, peak_file)
                if permissive:
                    continue
                else:
                    raise
            sample_support.index = index
            # fourth column holds the overlap count added by `c=True`
            support[(name, peak_caller)] = sample_support.iloc[:, 3]

    # Make multiindex labeling comparisons and peak type
    support.columns = pd.MultiIndex.from_tuples(
        support.columns, names=["comparison", "peak_caller"]
    )
    support.to_csv(
        os.path.join(
            self.results_dir, self.name + "_peaks.binary_overlap_support.csv"
        ),
        index=True,
    )

    # divide sum (of unique overlaps) by total to get support value between 0 and 1
    support["support"] = support.astype(bool).sum(axis=1) / float(support.shape[1])

    # save
    support.to_csv(
        os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True
    )

    self.support = support
def get_consensus_sites(
        self,
        samples=None,
        region_type="summits",
        peak_type="filtered",
        extension=250,
        blacklist_bed=None,
        filter_chroms=True,
        permissive=False,
        save=True,
        assign=True,
        **kwargs):
    """
    Get consensus (union) of enriched sites (peaks) across all comparisons.
    There are two modes possible, defined by the value of ``region_type``:

     * peaks: simple union of all sites;
     * summits: peak summits are extended by ``extension`` and a union is made.

    For ChIP-seq, the ``comparison_table`` keyword argument or a
    ``comparison_table`` attribute set is required. Peaks/summits will be
    aggregated for the peaks called in each sample comparison.

    Parameters
    ----------
    samples : :obj:`list`
        Not used by this implementation.
    region_type : :obj:`str`
        The type of region to use to create the consensus region set
        - one of "summits" or "peaks".
        If "summits", peak summits will be extended by ``extension``
        before union.
        If "peaks", sample peaks will be used with no modification prior to
        union.

        Default is "summits".
    peak_type : :obj:`str`
        Which set of peak calls of each comparison to use
        (a key of ``comparisons[name]["peak_calls"]``).

        Default is "filtered".
    extension : :obj:`int`
        Amount to extend peaks summits by in both directions.

        Default is 250.
    blacklist_bed : {:obj:`False`, :obj:`str`, :class:`pybedtools.BedTool`}
        Either :obj:`False` or a path to a BED file (or a BedTool) with
        genomic positions to exclude from consensus peak set.

        Default is to use a blacklist file for the analysis ``genome``.
    filter_chroms : {:obj:`list`, :obj:`str`}
        A list of chromosomes to filter out or
        a string with a pattern to match to exclude chromosomes.
        Uses Pandas string methods :class:`pandas.Series.str.match`.
        Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_"
        character and a "chrM" chromosome.

        Default is not to filter anything.
    permissive : :obj:`bool`
        Whether comparisons for which the input file does not exist
        should be simply skipped or an error thrown.

        Default is :obj:`False`.
    save : :obj:`bool`
        Whether to write the consensus set to
        "<results_dir>/<name>.peak_set.bed".

        Default is :obj:`True`.
    assign : :obj:`bool`
        Whether to set the result as the ``sites`` attribute.

        Default is :obj:`True`.
    comparison_table : :obj:`pandas.DataFrame`, optional
        DataFrame with signal/background combinations used to call peaks.
        Part of kwargs. If given, comparisons are set up again from it.

        Defaults to analysis own ``comparison_table``.
    peak_dir : :obj:`str`, optional
        Path to peaks output directory. Part of kwargs.
        Only used if ``comparison_table`` is given.

        Defaults to "{analysis.results_dir}/chipseq_peaks".

    Returns
    -------
    :class:`pybedtools.BedTool`
        Bedtool with consensus sites.

    Raises
    ------
    ValueError
        If `region_type` is not one of "summits" or "peaks".
    AttributeError
        If no blacklist is given and the analysis lacks
        `organism` or `genome` to retrieve one.

    Attributes
    ----------
    sites : :class:`pybedtools.BedTool`
        Bedtool with consensus sites.
    """
    import re
    import tempfile

    import pybedtools
    from tqdm import tqdm

    from ngs_toolkit.general import get_blacklist_annotations

    # Only (re)build comparisons when a table was explicitly passed;
    # `peak_dir` is forwarded only if given so `set_comparisons` applies its default.
    if "comparison_table" in kwargs:
        extra = {"peak_dir": kwargs["peak_dir"]} if "peak_dir" in kwargs else {}
        self.set_comparisons(kwargs["comparison_table"], **extra)

    if region_type not in ["summits", "peaks"]:
        msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
        _LOGGER.error(msg)
        raise ValueError(msg)

    if blacklist_bed is None:
        _LOGGER.info("Blacklist file not provided. Downloading...")
        try:
            blacklist_bed = get_blacklist_annotations(self.organism, self.genome)
        except AttributeError:
            msg = "Blacklist file was not provided and cannot"
            msg += " get one without analysis having `organism` and `genome` set."
            _LOGGER.error(msg)
            raise AttributeError(msg)

    # Simply concatenate all peaks in one file
    with tempfile.NamedTemporaryFile(mode="w") as handle:
        for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):
            for peak_caller, peak_file in comp['peak_calls'][peak_type].items():
                try:
                    if region_type == "summits":
                        # TODO: check if homer has summits and they match this pattern
                        summit = re.sub("_peaks.narrowPeak", "_summits.bed", peak_file)
                        file = pybedtools.BedTool(summit).slop(
                            b=extension, genome=comp['genome']).fn
                    else:
                        file = peak_file
                    with open(file, 'r') as peaks:
                        handle.writelines(peaks)
                except (ValueError, FileNotFoundError):
                    _LOGGER.warning(
                        "Input file for comparison %s (%s) not found!", name, peak_file)
                    if not permissive:
                        raise
                    # permissive: nothing is written for this comparison/peak caller

        # Make sure everything is on disk before bedtools reads the file
        handle.flush()

        # Merge overlaping peaks across comparisons
        sites = pybedtools.BedTool(handle.name).sort().merge()

    # Filter
    # # remove blacklist regions
    if blacklist_bed is not False:
        if isinstance(blacklist_bed, pybedtools.BedTool):
            blacklist = blacklist_bed
        else:
            blacklist = pybedtools.BedTool(blacklist_bed)
        sites = sites.intersect(v=True, b=blacklist)

    # # filter requested chromosomes
    if filter_chroms is not None:
        if isinstance(filter_chroms, list):
            sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
        elif isinstance(filter_chroms, str):
            s = sites.to_dataframe()
            sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)])

    # Save and assign
    if save:
        output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
        sites.saveas(output_file)
        sites = pybedtools.BedTool(output_file)
    if assign:
        self.sites = sites
    return sites
def summarize_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        filtered=True,
        permissive=True,
):
    """
    Count the peaks called for each comparison and peak caller
    in the analysis' ``comparisons`` attribute.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        Comparison table with the following required columns:
        "comparison_name", "sample_name", "comparison_side", "sample_group".
        Only validated; peak files are taken from the ``comparisons`` attribute.

        Defaults to analysis' own `comparison_table`.
    output_dir : :obj:`str`
        Not used. Kept for signature compatibility.
    filtered : :obj:`bool`
        Whether to count the "filtered" peak calls rather than the "original" ones.
        For original calls of Homer, peak files are first converted to BED.

        Defaults to :obj:`True`.
    permissive: :obj:`bool`
        Whether missing peak files should be reported with a warning
        (and counted as NaN) rather than raising.

        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        One row per comparison and peak caller with columns
        "comparison_name", "peak_caller" and "peak_counts".
        Missing files have NaN counts and empty files 0.

    Raises
    ----------
    AssertionError
        If `comparison_table` is missing required columns.
    IOError
        If not `permissive` and a peak file does not exist.
    """
    from ngs_toolkit.utils import homer_peaks_to_bed

    if comparison_table is None:
        comparison_table = self.comparison_table

    req_columns = [
        "comparison_name",
        "sample_name",
        "comparison_side",
        "sample_group",
    ]
    msg = "Comparison table is missing some of the following columns: '{}'.".format(
        ",".join(req_columns)
    )
    if not all([col in comparison_table.columns for col in req_columns]):
        _LOGGER.error(msg)
        raise AssertionError(msg)

    # For each comparison, count called peaks
    peak_type = "filtered" if filtered else "original"
    peak_counts = list()
    error = "Peak files for comparison '%s' with '%s' parameters don't exist."
    for name, comp in self.comparisons.items():
        _LOGGER.info(name)

        for peak_caller, file in comp['peak_calls'][peak_type].items():
            if "homer" in peak_caller and not filtered:
                try:
                    homer_peaks_to_bed(file, file.replace("narrowPeak", "bed"))
                except IOError:
                    if permissive:
                        _LOGGER.warning(error, name, peak_caller)
                        peak_counts.append([name, peak_caller, np.nan])
                        continue
                    else:
                        raise
                except pd.errors.EmptyDataError:
                    # Empty file: zero peaks, nothing more to read
                    peak_counts.append([name, peak_caller, 0.0])
                    continue
                file = file.replace("narrowPeak", "bed")
            try:
                df = pd.read_csv(file, sep="\t")
            except IOError:
                if permissive:
                    _LOGGER.warning(error, name, peak_caller)
                    peak_counts.append([name, peak_caller, np.nan])
                    continue
                else:
                    raise
            except pd.errors.EmptyDataError:
                # Empty file: zero peaks; `df` would otherwise be stale or undefined
                peak_counts.append([name, peak_caller, 0.0])
                continue
            peak_counts.append([name, peak_caller, df.shape[0]])
    peak_counts = pd.DataFrame(peak_counts, columns=["comparison_name", "peak_caller", "peak_counts"])

    return peak_counts  # .fillna(0)
def call_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        permissive=True,
        overwrite=True,
        distributed=True,
):
    """
    Call peaks for ChIP-seq samples using an annotation of which samples
    belong in each comparison and which samples represent signal or background.

    Peaks are called with MACS2 and Homer for every entry of the
    analysis' ``comparisons`` attribute.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`
        Comparison table with the following required columns:
        "comparison_name", "sample_name", "comparison_side", "sample_group".

        Defaults to analysis' own `comparison_table`.
    output_dir : :obj:`str`
        Parent directory where peaks will be created.

        Will be created if does not exist.
    permissive: :obj:`bool`
        Not used by this implementation.

        Default is :obj:`True`.
    overwrite: :obj:`bool`
        Whether peaks should be called again for comparisons whose
        original peak files already exist.

        Default is :obj:`True`.
    distributed: :obj:`bool`
        Whether peak calling should be run in serial or in distributed mode as jobs.
        If :obj:`False`, the commands returned by the peak calling functions
        are run one after the other.

        Default is :obj:`True`.

    Raises
    ----------
    AssertionError
        If `comparison_table` is missing required columns.
    """
    import subprocess

    from ngs_toolkit.utils import (
        macs2_call_chipseq_peak, homer_call_chipseq_peak_job,
        filter_kwargs_by_callable
    )
    from tqdm import tqdm

    if comparison_table is None:
        comparison_table = self.comparison_table

    req_columns = [
        "comparison_name",
        "sample_name",
        "comparison_side",
        "sample_group",
    ]
    msg = "Comparison table is missing some of the following columns: '{}'.".format(
        ",".join(req_columns)
    )
    if not all([col in comparison_table.columns for col in req_columns]):
        _LOGGER.error(msg)
        raise AssertionError(msg)

    # Complement default `output_dir`
    if "{results_dir}" in output_dir:
        output_dir = os.path.abspath(
            output_dir.format(results_dir=self.results_dir)
        )
    os.makedirs(output_dir, exist_ok=True)

    # For each comparison
    for name, comp in tqdm(self.comparisons.items(), total=len(self.comparisons), desc="Comparison"):

        _LOGGER.info(
            "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format(
                name,
                [s.name for s in comp['signal_samples']],
                [s.name for s in comp['control_samples']],
            )
        )
        # Call peaks
        cmds = list()
        bkws = filter_kwargs_by_callable(comp, macs2_call_chipseq_peak)
        kwargs = {
            "name": name, "distributed": distributed, **bkws}
        if overwrite:
            cmds += [macs2_call_chipseq_peak(**kwargs), homer_call_chipseq_peak_job(**kwargs)]
        else:
            if not os.path.exists(comp['peak_calls']['original']['macs']):
                cmds += [macs2_call_chipseq_peak(**kwargs)]
            if not os.path.exists(comp['peak_calls']['original']['homer_factor']):
                cmds += [homer_call_chipseq_peak_job(**kwargs)]
            if not cmds:
                # Only report skipping when neither peak caller needs to run
                _LOGGER.warning("Peak files for comparison '%s' already exist. Skipping.", name)
        if not distributed:
            for cmd in cmds:
                _LOGGER.info(
                    "Calling peaks for comparison '%s' with command: '%s'.\n", name, cmd)
                subprocess.call(cmd.split(" "))
def set_comparisons(self, comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
    """
    Set up an attribute containing information about the
    sample comparisons necessary for peak calling.

    Structure:

        * comparison_name:
            * signal_samples
            * control_samples
            * output_dir
            * prefix
            * genome
            * peak_calls
                * original
                    * macs
                    * homer_factor
                    * homer_histone
                * filtered
                    * macs
                    * homer_factor
                    * homer_histone

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`, optional
        Comparison table with peak comparisons. Must have the columns
        "comparison_type", "comparison_name", "comparison_side",
        "sample_name" and "comparison_genome". Only rows with
        ``comparison_type`` equal to "peaks" are used.

        Defaults to the analysis' own ``comparison_table``.
    peak_dir : :obj:`str`, optional
        Directory with peak calls.

        Defaults to "{results_dir}/chipseq_peaks".

    Returns
    -------
    :obj:`dict`
        The dictionary with the attributes.

    Attributes
    ----------
    comparisons : :obj:`dict`
        The dictionary with the attributes.

    Raises
    ------
    ValueError
        If a comparison does not have exactly two sides.
    AssertionError
        If a comparison does not have exactly one genome.
    """
    if comparison_table is None:
        comparison_table = self.comparison_table

    comparison_names = (
        comparison_table.loc[
            comparison_table['comparison_type'] == 'peaks',
            "comparison_name"]
        .drop_duplicates().sort_values()).tolist()
    if not comparison_names:
        _LOGGER.warning("Could not find any comparisons of type 'peaks'.")

    peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

    self.comparisons = dict()
    for name in comparison_names:
        _LOGGER.info("Setting comparison '%s' up", name)
        in_comparison = comparison_table["comparison_name"] == name

        # Every comparison must have exactly two sides
        if len(set(comparison_table.query("comparison_name == '{}'".format(name))["comparison_side"])) != 2:
            error = "Comparison '{}' does not contain two sides.".format(name)
            _LOGGER.error(error)
            raise ValueError(error)

        # Get the sample names of samples in each side
        # (side 1 is signal, anything below 1 is background)
        pos_names = comparison_table.loc[
            in_comparison & (comparison_table["comparison_side"] == 1),
            "sample_name"].tolist()
        neg_names = comparison_table.loc[
            in_comparison & (comparison_table["comparison_side"] < 1),
            "sample_name"].tolist()

        co = dict()
        co['signal_samples'] = [s for s in self.samples if s.name in pos_names]
        co['control_samples'] = [s for s in self.samples if s.name in neg_names]

        # Additional info
        co['output_dir'] = os.path.join(peak_dir, name)
        co['prefix'] = os.path.join(co['output_dir'], name)
        g = comparison_table.query(
            "comparison_name == '{}'".format(name)
        )['comparison_genome'].drop_duplicates().squeeze()
        # `squeeze` only yields a string if there is exactly one genome
        if not isinstance(g, str):
            msg = "Could not determine genome of comparison '%s'." % name
            _LOGGER.error(msg)
            raise AssertionError(msg)
        co['genome'] = g

        # resulting files
        res = dict()
        res['macs'] = co['prefix'] + "_peaks.narrowPeak"
        res["homer_factor"] = co['prefix'] + "_homer_peaks.factor.narrowPeak"
        res["homer_histone"] = co['prefix'] + "_homer_peaks.histone.narrowPeak"
        co['peak_calls'] = dict()
        co['peak_calls']["original"] = res
        co['peak_calls']["filtered"] = {
            k: v.replace(".narrowPeak", ".filtered.bed")
            for k, v in res.items()}

        self.comparisons[name] = co
    return self.comparisons
def collect_bitseq_output(self, samples=None, permissive=True, expression_type="counts"):
    """
    Collect gene expression (read counts, transcript-level) output from Bitseq
    into expression matrix for `samples`.

    Parameters
    ----------
    samples : :obj:`list`, optional
        Samples to collect output for. Each must have the ``sample_root``,
        ``transcriptome`` and ``name`` attributes.

        Defaults to all samples in the analysis (``samples`` attribute).
    permissive : :obj:`bool`, optional
        Whether samples with missing files should be skipped
        with a warning rather than raising.

        Defaults to :obj:`True`.
    expression_type : :obj:`str`, optional
        Type of expression values to collect. Only "counts" is supported.

        Defaults to "counts".

    Returns
    -------
    :obj:`pandas.DataFrame`
        Integer matrix indexed by "ensembl_gene_id" and "ensembl_transcript_id"
        with one column per sample.
        :obj:`None` if `permissive` and no sample could be read.

    Raises
    ------
    NotImplementedError
        If `expression_type` is not "counts".
    IOError
        If not `permissive` and a file is missing or no sample could be read.
    """
    # TODO: drop support for legacy pipeline output and assume one input file with all required columns
    # TODO: add support for RPKM
    if samples is None:
        samples = self.samples

    if expression_type != "counts":
        msg = "`expression_type` must be 'counts'!"
        _LOGGER.error(msg)
        raise NotImplementedError(msg)

    expr = list()
    for sample in samples:
        _LOGGER.debug(
            "Reading transcriptome files for sample '{}'.".format(
                sample.name))
        bitseq_dir = os.path.join(
            sample.sample_root,
            "bowtie1_{}".format(sample.transcriptome),
            "bitSeq",
        )
        tr_file = os.path.join(bitseq_dir, sample.name + ".tr")
        counts_file = os.path.join(bitseq_dir, sample.name + ".counts")

        # read the "tr" file of one sample to get indexes
        try:
            tr = pd.read_csv(
                tr_file,
                sep=" ",
                header=None,
                skiprows=1,
                names=[
                    "ensembl_gene_id", "ensembl_transcript_id", "v1", "v2"
                ],
            )
        except IOError:
            msg = "Could not open file '{}'' is missing.".format(tr_file)
            if permissive:
                _LOGGER.warning(msg)
                continue
            else:
                raise

        # read the "counts" file of one sample to get indexes
        try:
            e = pd.read_csv(counts_file, sep=" ")
        except IOError:
            msg = "Could not open file '{}'' is missing.".format(
                counts_file)
            if permissive:
                _LOGGER.warning(msg)
                continue
            else:
                raise

        # annotate count rows with gene/transcript identifiers (joined by row position)
        e = tr.drop(["v1", "v2"], axis=1).join(e)
        e.loc[:, "sample_name"] = sample.name
        expr.append(e)

    if len(expr) == 0:
        msg = "No sample had a valid expression file!"
        if permissive:
            _LOGGER.warning(msg)
            return
        else:
            _LOGGER.error(msg)
            raise IOError(msg)

    # `astype` takes no `downcast` argument; a plain integer cast is what is needed
    expr = (pd.concat(expr, axis=0, sort=False).melt(id_vars=[
        "ensembl_gene_id", "ensembl_transcript_id", "sample_name"
    ]).pivot_table(
        index=["ensembl_gene_id", "ensembl_transcript_id"],
        columns="sample_name",
        values="value",
        fill_value=0,
    ).astype(int))

    return expr
def plot_features(
        analysis=None,
        knockout_genes=None,
        matrix="matrix_norm",
        samples=None,
        differential_results=None,
        output_dir=None,
        output_prefix="knockout_expression",
):
    """
    Plot expression of genes in samples or sample groups.

    Parameters
    ----------
    analysis : :class:`ngs_toolkit.RNASeqAnalysis`, optional
        Analysis object.

        Not required if `matrix` is given.
    knockout_genes : :obj:`list`, optional
        List of perturbed genes to plot.

        Defaults to the set of `knockout` attributes in the analysis'
        samples if `analysis` is given. Otherwise must be given.
    matrix : {:obj:`str`, :obj:`pandas.DataFrame`}, optional
        Matrix with expression values to use. If `analysis` is given it is
        resolved through ``analysis.get_matrix``; otherwise it must be a
        DataFrame indexed by gene.

        Defaults to "matrix_norm"
    samples : :obj:`list`, optional
        Samples to restrict to. Passed to ``analysis.get_matrix``;
        not used if `analysis` is not given.

        Defaults to :obj:`None`.
    differential_results : :obj:`pandas.DataFrame`, optional
        Differential results indexed by gene with "comparison_name",
        "padj" and "log2FoldChange" columns. If not available, or only one
        comparison exists, only expression values are plotted.

        Defaults to the ``differential_results`` attribute of `analysis`.
    output_dir : :obj:`str`, optional
        Directory for output files.

        Defaults to the ``results_dir`` of `analysis` if given,
        otherwise the current directory.
    output_prefix : str, optional
        Prefix for output files.

        Defaults to "knockout_expression"

    Raises
    ------
    AssertionError
        If neither `analysis` nor `matrix` is given, or `analysis` is not
        given and either `knockout_genes` is missing or `matrix` is not a DataFrame.
    KeyError
        If `knockout_genes` is not given and samples in `analysis`
        lack a `knockout` attribute.
    """
    from ngs_toolkit.graphics import clustermap_varieties

    if (analysis is None) and (matrix is None):
        raise AssertionError("One of `analysis` or `matrix` must be provided.")

    msg = "If an `analysis` object is not provided, you must provide a list of `knockout_genes`."
    if (analysis is None) and (knockout_genes is None):
        raise AssertionError(msg)
    elif (analysis is not None) and (knockout_genes is None):
        msg = "If `knockout_genes` is not given, Samples in `analysis` must have a `knockout` attribute."
        try:
            knockout_genes = list(set([s.knockout for s in analysis.samples]))
        except (AttributeError, KeyError) as e:
            # a missing attribute may surface as either exception type
            raise KeyError(msg) from e

    if analysis is not None:
        matrix = analysis.get_matrix(matrix=matrix, samples=samples)
    elif isinstance(matrix, str):
        # a matrix name can only be resolved through an analysis object
        raise AssertionError(
            "If an `analysis` object is not provided, `matrix` must be a DataFrame.")

    if output_dir is None:
        if analysis is not None:
            output_dir = analysis.results_dir
        else:
            output_dir = os.path.curdir

    knockout_genes = sorted(knockout_genes)

    missing = [k for k in knockout_genes if k not in matrix.index]
    msg = "Some `knockout_genes` were not found in the expression matrix: '%s'"
    if len(missing) > 0:
        _LOGGER.warning(msg % ", ".join(missing))
    knockout_genes = [k for k in knockout_genes if k in matrix.index]

    ko = matrix.loc[knockout_genes, :]
    msg = "None of the `knockout_genes` were found in the expression matrix.\nCannot proceed."
    if ko.empty:
        _LOGGER.warning(msg)
        return

    # expression values
    clustermap_varieties(ko, output_dir=output_dir, output_prefix=output_prefix)

    # p-values and fold-changes for knockout genes
    if differential_results is None:
        differential_results = getattr(analysis, "differential_results", None)
    if differential_results is None:
        return

    if len(differential_results["comparison_name"].unique()) <= 1:
        msg = "Could not plot values per comparison as only one found!"
        _LOGGER.warning(msg)
        return

    # p-values
    p_table = pd.pivot_table(
        differential_results.loc[knockout_genes, :].reset_index(),
        index="comparison_name",
        columns="index",
        values="padj",
    )
    p_table.index.name = "Knockout gene"
    p_table.columns.name = "Gene"
    p_table = -np.log10(p_table.loc[:, knockout_genes].dropna())
    # cap infinite values (p == 0) at the largest finite value
    p_table = p_table.replace(np.inf, p_table[p_table != np.inf].max().max())
    p_table = p_table.replace(-np.inf, 0)

    clustermap_varieties(
        p_table,
        output_dir=output_dir,
        output_prefix=output_prefix + ".p_value",
        quantity="-log10(FDR p-value)",
    )
    clustermap_varieties(
        p_table,
        output_dir=output_dir,
        output_prefix=output_prefix + ".p_value.thresholded",
        steps=["base", "sorted"],
        quantity="-log10(FDR p-value)",
        vmax=1.3 * 5,
    )

    # logfoldchanges
    fc_table = pd.pivot_table(
        differential_results.loc[knockout_genes, :].reset_index(),
        index="comparison_name",
        columns="index",
        values="log2FoldChange",
    )
    fc_table.index.name = "Knockout gene"
    fc_table.columns.name = "Gene"
    fc_table = fc_table.loc[:, knockout_genes].dropna()

    # NOTE(review): unlike the p-value prefixes, these have no "." separator
    # before "log_fc"; kept as is since it determines output file names.
    clustermap_varieties(
        fc_table,
        output_dir=output_dir,
        output_prefix=output_prefix + "log_fc",
        steps=["base", "sorted"],
        quantity="log2(fold-change)",
    )
    clustermap_varieties(
        fc_table,
        output_dir=output_dir,
        output_prefix=output_prefix + "log_fc.thresholded",
        steps=["base", "sorted"],
        quantity="log2(fold-change)",
        vmin=-2,
        vmax=2,
    )
def _copy_cnv_profile_plots(
        self,
        output_dir="{results_dir}/cnv_profiles",
        output_prefix="log2_profile",
        resolutions=None,
        samples=None,
        permissive=True,
):
    """
    Convenience to copy output plots from runnning several samples independently
    to a given directory.

    For every resolution and sample, the single file matching the sample's
    ``log2_read_counts`` path is copied to
    "<output_dir>/<sample name>.<resolution>.<output_prefix>.pdf".

    Parameters
    ----------
    output_dir : :obj:`str`, optional
        Directory to copy to.

        Defaults to "{results_dir}/cnv_profiles".
    output_prefix : :obj:`str`, optional
        Prefix for copied files.

        Defaults to "log2_profile".
    resolutions : :obj:`list`, optional
        Resolutions of analysis.

        Defaults to resolutions in Analysis object.
    samples : :obj:`list`, optional
        Samples to restrict analysis to.

        Defaults to samples in Analysis object.
    permissive: :obj:`bool`, optional
        Whether missing files, or files that cannot be copied,
        should be skipped with a warning instead of raising an error.

        Defaults to :obj:`True.`

    Raises
    ------
    OSError
        If not `permissive` and a file is not (uniquely) found or cannot be copied.
    """
    from tqdm import tqdm
    from glob import glob
    from shutil import copyfile

    if resolutions is None:
        resolutions = self.resolutions

    if samples is None:
        samples = self.samples

    output_dir = self._format_string_with_attributes(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    for resolution in tqdm(resolutions, total=len(resolutions), desc="Resolution"):
        for sample in tqdm(samples, total=len(samples), desc="Sample"):
            # Read log2 file
            if not hasattr(sample, "log2_read_counts"):
                sample.log2_read_counts = os.path.join(
                    self.data_dir,
                    sample.sample_root,
                    sample.name + "_{resolution}",
                    "CNAprofiles",
                    "log2_read_counts.igv",
                )
            if "{resolution}" in sample.log2_read_counts:
                input_file = sample.log2_read_counts.format(
                    resolution=resolution)
            else:
                # Path without placeholder: use as is instead of leaving
                # `input_file` undefined or pointing to the previous sample's file
                input_file = sample.log2_read_counts

            f = glob(input_file)
            if len(f) == 1:
                f = f[0]
            else:
                msg = "Sample '{}' does not have a PDF file!".format(
                    sample.name)
                if permissive:
                    _LOGGER.warning(msg)
                    continue
                else:
                    raise OSError(msg)

            d = os.path.join(
                output_dir,
                sample.name + "." + resolution + "." + output_prefix + ".pdf")
            try:
                copyfile(f, d)
            except OSError:
                msg = "Could not copy file '{}' to '{}'!".format(f, d)
                if permissive:
                    _LOGGER.warning(msg)
                else:
                    raise OSError(msg)
def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        resolutions=None,
        prefix="{results_dir}/{name}",
        permissive=True,
):
    """
    Load the output files of the major functions of the Analysis.

    Each loaded table is stored in a dictionary attribute of the analysis,
    keyed by resolution.

    Parameters
    ----------
    output_map : :obj:`dict`
        Dictionary with {attribute_name: {resolution: (file_path, kwargs)}}
        to load the files.
        The kwargs in the tuple will be passed to :meth:`pandas.read_csv`.

        Defaults to the required to read the keys in ``only_these_keys``.
    only_these_keys : :obj:`list`, optional
        Iterable of analysis attributes to load up.
        Attributes in the default `output_map`:

            * "matrix_raw"
            * "matrix_norm"
            * "segmentation"
            * "segmentation_annot"

        Defaults to all keys of `output_map`.
    resolutions: :obj:`list`
        List of resolution strings to get data for.

        Defaults to value of ``resolutions`` attribute of Analysis.
    prefix : :obj:`str`, optional
        String prefix of files to load.
        Variables in curly braces will be formated with attributes of analysis.

        Defaults to "{results_dir}/{name}".
    permissive : :obj:`bool`, optional
        Whether an error should be ignored if reading a file causes IOError.

        Default is :obj:`True`.

    Attributes
    ----------
    dict
        Dictionaries of :obj:`pandas.DataFrame` by resolution, available as
        the attributes named by the keys of `output_map`.

    Raises
    ----------
    IOError
        If not permissive and a file is not found
    """
    from ngs_toolkit.utils import fix_dataframe_header

    prefix = self._format_string_with_attributes(prefix)

    if resolutions is None:
        resolutions = self.resolutions

    if output_map is None:
        # matrices are read with their first column as index
        kwargs = {"index_col": 0}
        defaults = (
            ("matrix_raw", ".{}.matrix_raw.csv", True),
            ("matrix_norm", ".{}.matrix_norm.csv", True),
            ("segmentation", ".{}.segmentation.csv", False),
            ("segmentation_annot", ".{}.segmentation.annotated.csv", False),
        )
        output_map = dict()
        for attribute, template, indexed in defaults:
            output_map[attribute] = {
                r: (prefix + template.format(r), kwargs if indexed else {})
                for r in resolutions
            }

    if only_these_keys is None:
        only_these_keys = list(output_map.keys())

    output_map = {
        key: value for key, value in output_map.items()
        if key in only_these_keys
    }

    for name, per_resolution in output_map.items():
        for resolution, (file, kwargs) in per_resolution.items():
            file = file.format(resolution)
            _LOGGER.info(
                "Loading '{}' analysis attribute for resolution '{}'.".
                format(name, resolution))
            # make sure there is a container to hold the tables of this attribute
            if not hasattr(self, name):
                setattr(self, name, {resolution: None})
            try:
                table = pd.read_csv(file, **kwargs)
                container = getattr(self, name)
                container[resolution] = table
                # Fix possible multiindex for matrix_norm
                if name == "matrix_norm":
                    container[resolution] = fix_dataframe_header(
                        container[resolution])
            except IOError as e:
                if permissive:
                    _LOGGER.warning(e)
                else:
                    raise e