def main(args=None):
    """Extract the GO-PCA signature matrix and write it to a tab-delimited file."""
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    gopca_file = args.gopca_file
    output_file = args.output_file

    # configure root logger
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read the GO-PCA result once and extract the signature matrix
    sig_matrix = util.read_gopca_result(gopca_file)

    sig_labels = [sig.get_label(include_id=False)
                  for sig in sig_matrix.signatures]
    matrix = ExpMatrix(genes=sig_labels, samples=sig_matrix.samples,
                       X=sig_matrix.X)
    matrix.index.name = 'Signatures'

    # temporarily silence the expression module's logger while writing
    exp_logger = logging.getLogger(expression.__name__)
    exp_logger.setLevel(logging.WARNING)
    matrix.write_tsv(output_file)
    exp_logger.setLevel(logging.NOTSET)

    logger.info('Wrote %d x %d signature matrix to "%s".',
                matrix.p, matrix.n, output_file)

    return 0
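A minimal usage sketch for reading back the file written by main() above. The output file name is a placeholder, and the import path assumes the genometools package layout used throughout this code.

from genometools.expression import ExpMatrix

# read the signature matrix TSV written by main() above
# ('gopca_signatures.tsv' is a hypothetical file name)
sig_matrix = ExpMatrix.read_tsv('gopca_signatures.tsv')
print('%d signatures x %d samples' % (sig_matrix.p, sig_matrix.n))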
def from_signatures(cls, signatures, standardize=False, center=True,
                    use_median=True, cluster_signatures=True,
                    signature_cluster_metric='correlation',
                    cluster_samples=True,
                    sample_cluster_metric='euclidean',
                    cluster_method='average'):
    """Generate a GO-PCA signature matrix from individual signatures.

    The GO-PCA signature matrix contains the expression levels of all
    signatures (rows) generated, across all samples (columns) in the
    analysis. See the documentation of the `GOPCASignature` class for
    details on how signature expression levels are calculated.

    Parameters
    ----------
    signatures: Iterable of `GOPCASignature`
        The signatures generated.
    standardize: bool, optional
        Whether to standardize the expression levels of each signature.
        [False]
    center: bool, optional
        Whether to center the expression levels of each signature. [True]
    use_median: bool, optional
        Whether to use the median (instead of the mean) for centering.
        [True]
    cluster_signatures: bool, optional
        Whether to cluster the signatures (rows). [True]
    signature_cluster_metric: str, optional
        The distance metric used for clustering the signatures.
        ['correlation']
    cluster_samples: bool, optional
        Whether to cluster the samples (columns). [True]
    sample_cluster_metric: str, optional
        The distance metric used for clustering the samples. ['euclidean']
    cluster_method: str, optional
        The linkage method used for hierarchical clustering. ['average']

    Returns
    -------
    `GOPCASignatureMatrix`
        The signature matrix.
    """
    assert isinstance(signatures, Iterable)
    assert isinstance(standardize, bool)
    assert isinstance(center, bool)
    assert isinstance(use_median, bool)
    assert isinstance(cluster_signatures, bool)
    assert isinstance(cluster_samples, bool)

    ### generate the expression matrix
    matrix = ExpMatrix(
        pd.concat(
            [sig.get_expression(standardize=standardize, center=center,
                                use_median=use_median)
             for sig in signatures],
            axis=1).T)
    matrix.genes.name = 'Signatures'
    matrix.samples.name = 'Samples'

    # a matrix with a single signature cannot be clustered
    if matrix.p == 1:
        cluster_signatures = False
        cluster_samples = False

    ### clustering
    if cluster_signatures:
        # cluster signatures (rows)
        matrix = cluster.cluster_genes(
            matrix, metric=signature_cluster_metric, method=cluster_method)

    if cluster_samples:
        # cluster samples (columns)
        matrix = cluster.cluster_samples(
            matrix, metric=sample_cluster_metric, method=cluster_method)

    return cls(matrix)
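A hedged usage sketch for from_signatures(). Here `run` is assumed to be a GO-PCA result object exposing a `signatures` attribute (as referenced elsewhere in this code); the keyword values shown are the documented defaults.

# build a signature matrix with signatures (rows) and samples (columns)
# clustered using the default metrics
sig_matrix = GOPCASignatureMatrix.from_signatures(
    run.signatures,
    standardize=False, center=True, use_median=True,
    cluster_signatures=True, cluster_samples=True)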
def plot_read_count_distribution(barcode_count_file, output_file,
                                 xaxis_label=('# mapped reads '
                                              '(log<sub>10</sub>-scale)')):
    """Plot a histogram of the distribution of mapped reads per barcode."""
    matrix = ExpMatrix.read_tsv(barcode_count_file)
    x = np.float64(matrix.values.ravel())
    num_total_reads = int(np.sum(x))

    # clip counts at 1 to avoid log10(0)
    x[x < 1] = 1
    x = np.log10(x)

    data = [go.Histogram(x=x, nbinsx=100)]

    layout = go.Layout(
        title='Total number of mapped reads: %d' % num_total_reads,
        font=dict(
            size=20,
            family='serif',
        ),
        xaxis=dict(
            title=xaxis_label,
        ),
        yaxis=dict(
            title='# barcodes',
            type='log',
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    plot(fig, filename=output_file, show_link=False, auto_open=False)
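A short usage sketch for the function above. Both file paths are placeholders; the input format matches the per-barcode count TSV produced by the pipeline later in this section.

# histogram of mapped reads per barcode, written as a standalone HTML file
plot_read_count_distribution(
    'barcode_counts_mapped.tsv',      # per-barcode read counts (TSV)
    'mapped_reads_histogram.html')    # plotly output (HTML)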
def test_sparse(tmpdir, my_matrix):
    """Test reading/writing of sparse text format."""
    output_file = tmpdir.join('expression_matrix.mtx').strpath
    my_matrix.write_sparse(output_file)
    other = ExpMatrix.read_sparse(output_file)
    assert other is not my_matrix
    assert other == my_matrix
def test_download(my_expression_file, my_gene_ontology_file,
                  my_fly_gene_set_file):
    """Test if required data files were downloaded successfully."""
    # expression file
    print(my_expression_file)
    assert os.path.isfile(my_expression_file)
    matrix = ExpMatrix.read_tsv(my_expression_file)
    assert isinstance(matrix, ExpMatrix)
    assert matrix.hash == 'aa7cc5e6e04d34e65058f059bcdfe5ea'

    # gene ontology file
    print(my_gene_ontology_file)
    assert os.path.isfile(my_gene_ontology_file)
    # hash not stable?
    #ontology = GeneOntology.read_obo(my_gene_ontology_file)
    #assert isinstance(ontology, GeneOntology)
    #assert ontology.hash == '978546899cfb0196ac2005d4b177725f'

    # gene set file
    print(my_fly_gene_set_file)
    assert os.path.isfile(my_fly_gene_set_file)
    gene_sets = GeneSetCollection.read_tsv(my_fly_gene_set_file)
    assert isinstance(gene_sets, GeneSetCollection)
    assert gene_sets.hash == '78b4b27e9658560a8e5993154d3228fa'
def test_tsv(tmpdir, my_matrix):
    output_file = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(output_file)
    # hash check disabled:
    # data = open(str(path), mode='rb').read()
    # h = hashlib.md5(data).hexdigest()
    # assert h == 'd34bf3d376eb613e4fea894f7c9d601f'
    other = ExpMatrix.read_tsv(output_file)
    assert other is not my_matrix
    assert other == my_matrix
def my_matrix_filtered(my_expression_file):
    matrix = ExpMatrix.read_tsv(my_expression_file)
    matrix_filtered = filter_variance(matrix, 8000)
    return matrix_filtered
def main(args=None): """Run GO-PCA and store the result in a `pickle` file. Parameters ---------- args: argparse.Namespace object, optional The argument values. If not specified, the values will be obtained by parsing the command line arguments using the `argparse` module. Returns ------- int Exit code (0 if no error occurred). Raises ------ SystemError If the version of the Python interpreter is not >= 2.7. """ vinfo = sys.version_info if not (vinfo >= (2, 7)): raise SystemError('Python interpreter version >= 2.7 required, ' 'found %d.%d instead.' % (vinfo.major, vinfo.minor)) if args is None: # read arguments from the command line parser = get_argument_parser() # parse first with default options, in case "--help" is specified # ("--help" causes the program to exit at this point) args = parser.parse_args() # now remove the defaults and parse again # (removing the defaults is important so that we know which values # were specified by the user) no_defaults = dict([p, None] for p in GOPCA.get_param_defaults()) no_defaults2 = dict([p, None] for p in GOPCAParams.get_param_defaults()) no_defaults.update(no_defaults2) parser.set_defaults(**no_defaults) args = parser.parse_args() # reporting options log_file = args.log_file quiet = args.quiet verbose = args.verbose # test if we can write to log_file? # configure root logger logger = util.get_logger(log_file=log_file, quiet=quiet) # check if required parameters were specified passed = True if args.expression_file is None: logger.error('No expression file specified!') passed = False if args.gene_set_file is None: logger.error('No gene set file specified!') passed = False if args.output_file is None: logger.error('No output file specified!') passed = False if not passed: logger.error('Not all required parameters were specified.') return 1 # generate configuration if args.config_file is not None: # read parameter values from config file params = GOPCAParams.read_ini(args.config_file) else: # start with default configuration params = GOPCAParams() # overwrite parameters specified on the command line for p in GOPCAParams.get_param_defaults(): v = getattr(args, p) if v is not None: logger.debug('Parameter "%s" specified on command line!', p) params.set_param(p, v) global_params = GOPCA.get_param_defaults() for k in list(global_params.keys()): v = getattr(args, k) if v is not None: logger.debug('Parameter "%s" specified on command line!', p) global_params[k] = v # read expression file matrix = ExpMatrix.read_tsv(args.expression_file) logger.info( 'Expression matrix size: ' + '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n) if args.sel_var_genes > 0: # filter genes by variance matrix = matrix.filter_variance(args.sel_var_genes) # read gene set file gene_sets = GeneSetCollection.read_tsv(args.gene_set_file) print(args.gene_set_file, gene_sets) # read ontology file (if supplied) gene_ontology = None if args.gene_ontology_file is not None: p_logger = logging.getLogger(genometools.__name__) p_logger.setLevel(logging.ERROR) gene_ontology = GeneOntology.read_obo( args.gene_ontology_file, part_of_cc_only=params.go_part_of_cc_only) p_logger.setLevel(logging.NOTSET) M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology, verbose=verbose, **global_params) run = M.run() if run is None: logger.error('GO-PCA run failed!') return 1 # write run to pickle file logger.info('Storing GO-PCA run in file "%s"...', args.output_file) run.write_pickle(args.output_file) return 0
def rma(cdf_file, sample_cel_files, pm_probes_only=True, bg_correct=True,
        quantile_normalize=True, medianpolish=True):
    """Perform RMA on a set of samples.

    Parameters
    ----------
    cdf_file: str
        The path of the Brainarray CDF file to use.
        Note: Brainarray CDF files can be downloaded from
        http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/genomic_curated_CDF.asp
    sample_cel_files: collections.OrderedDict (str => str)
        An ordered dictionary where each key/value-pair corresponds to a
        sample. The *key* is the sample name, and the *value* is the
        (absolute) path of the corresponding CEL file. The CEL files can be
        gzip'ed.
    pm_probes_only: bool, optional
        Whether or not to only use PM (perfect match) probes and ignore all
        MM (mismatch) probes. [True]
    bg_correct: bool, optional
        Whether or not to apply background correction. [True]
    quantile_normalize: bool, optional
        Whether or not to apply quantile normalization. [True]
    medianpolish: bool, optional
        Whether or not to apply medianpolish. [True]

    Returns
    -------
    genes: tuple of str
        The list of gene names.
    samples: tuple of str
        The list of sample names.
    X: np.ndarray (ndim = 2, dtype = np.float32)
        The expression matrix (genes-by-samples).

    Examples
    --------
    >>> from collections import OrderedDict
    >>> import pyaffy
    >>> cdf_file = '/path/to/brainarray/cdf/HGU133Plus2_Hs_ENTREZG.cdf'
    >>> sample_cel_files = OrderedDict([
    ...     ['Sample 1', '/path/to/sample_1.CEL.gz'],
    ...     ['Sample 2', '/path/to/sample_2.CEL.gz'],
    ... ])
    >>> genes, samples, X = pyaffy.rma(cdf_file, sample_cel_files)
    """
    ### checks
    assert isinstance(cdf_file, (str, _oldstr))
    assert os.path.isfile(cdf_file), \
        'CDF file "%s" does not exist!' % (cdf_file)
    assert isinstance(sample_cel_files, collections.OrderedDict)
    for sample, cel_file in sample_cel_files.items():
        assert isinstance(sample, (str, _oldstr))
        assert isinstance(cel_file, (str, _oldstr))
        assert os.path.isfile(cel_file), \
            'CEL file "%s" does not exist!' % (cel_file)
    assert isinstance(pm_probes_only, bool)
    assert isinstance(bg_correct, bool)
    assert isinstance(quantile_normalize, bool)
    assert isinstance(medianpolish, bool)

    t00 = time.time()

    ### read CDF data
    logger.info('Parsing CDF file.')
    t0 = time.time()

    # parse the CDF file
    probe_type = 'pm'
    if not pm_probes_only:
        probe_type = 'all'
    name, num_rows, num_cols, pm_probesets = \
        parse_cdf(cdf_file, probe_type=probe_type)

    # concatenate indices of all PM probes into one long vector
    pm_sel = np.concatenate(list(pm_probesets.values()))

    t1 = time.time()
    logger.info('CDF file parsing time: %.2f s', t1 - t0)
    logger.info('CDF array design name: %s', name)
    logger.info('CDF rows / columns: %d x %d', num_rows, num_cols)

    ### read CEL data
    logger.info('Parsing CEL files...')
    t0 = time.time()
    p = pm_sel.size
    n = len(sample_cel_files)
    Y = np.empty((p, n), dtype=np.float32)
    samples = []
    sub_logger = logging.getLogger(celparser.__name__)
    sub_logger.setLevel(logging.WARNING)
    for j, (sample, cel_file) in enumerate(sample_cel_files.items()):
        logger.debug('Parsing CEL file for sample "%s": %s',
                     sample, cel_file)
        samples.append(sample)
        y = parse_cel(cel_file)
        Y[:, j] = y[pm_sel]
    sub_logger.setLevel(logging.NOTSET)
    t1 = time.time()
    logger.info('CEL files parsing time: %.1f s.', t1 - t0)

    ### background correction
    if bg_correct:
        logger.info('Performing background correction...')
        t0 = time.time()
        Y = rma_bg_correct(Y)
        t1 = time.time()
        logger.info('Background correction time: %.1f s.', t1 - t0)
    else:
        logger.info('Skipping background correction.')

    matrix = ExpMatrix(genes=pm_sel, samples=samples, X=Y)

    ### quantile normalization
    if quantile_normalize:
        logger.info('Performing quantile normalization...')
        t0 = time.time()
        matrix = qnorm(matrix)
        t1 = time.time()
        logger.info('Quantile normalization time: %.1f s.', t1 - t0)
    else:
        logger.info('Skipping quantile normalization.')

    ### convert intensities to log2-scale
    Y = np.log2(matrix.values)

    ### probeset summarization (with or without median polish)
    method = 'with'
    if not medianpolish:
        method = 'without'
    logger.info('Summarizing probeset intensities (%s medianpolish)...',
                method)
    t0 = time.time()
    p = len(pm_probesets)
    n = Y.shape[1]
    X = np.empty((p, n), dtype=np.float32)
    cur = 0
    num_converged = 0
    genes = []
    for i, (gene_id, probes) in enumerate(pm_probesets.items()):
        genes.append(gene_id)
        if medianpolish:
            X_sub = Y[cur:(cur + probes.size), :]
            _, row_eff, col_eff, global_eff, converged, num_iter = \
                medpolish(X_sub, copy=False)
            X[i, :] = col_eff + global_eff
            if converged:
                num_converged += 1
        else:
            # simply use median across probes
            X[i, :] = np.median(Y[cur:(cur + probes.size), :], axis=0)
        cur += probes.size
    t1 = time.time()
    logger.info('Probeset summarization time: %.2f s.', t1 - t0)
    if medianpolish:
        logger.debug('Converged: %d / %d (%.1f%%)', num_converged, p,
                     100 * (num_converged / float(p)))

    ### report total time
    t11 = time.time()
    logger.info('Total RMA time: %.1f s.', t11 - t00)

    ### sort alphabetically by gene name
    a = np.lexsort([genes])
    genes = [genes[i] for i in a]
    X = X[a, :]

    return genes, samples, X
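The arrays returned by rma() plug directly into the ExpMatrix constructor used throughout this code; a minimal sketch building on the Examples section of the docstring (the output path is a placeholder).

genes, samples, X = rma(cdf_file, sample_cel_files)

# wrap the result in an ExpMatrix and write it to a tab-delimited file
matrix = ExpMatrix(genes=genes, samples=samples, X=X)
matrix.write_tsv('rma_expression.tsv')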
def get_heatmap(self, sig_matrix=None, standardize=False, center=True,
                use_median=True, include_id=False, include_stats=True,
                include_pval=True, cluster_genes=True,
                gene_cluster_metric='correlation', cluster_samples=True,
                sample_cluster_metric='euclidean', cluster_method='average',
                colorbar_label=None, **kwargs):
    """Generate a heatmap of the signature gene matrix."""
    # TODO: Finish docstring
    assert isinstance(cluster_genes, bool)
    assert isinstance(cluster_samples, bool)
    assert isinstance(gene_cluster_metric, (str, _oldstr))
    assert isinstance(sample_cluster_metric, (str, _oldstr))
    assert isinstance(cluster_method, (str, _oldstr))

    from . import GOPCASignatureMatrix
    if sig_matrix is not None:
        assert isinstance(sig_matrix, GOPCASignatureMatrix)

    matrix = self.matrix.copy()

    if standardize:
        matrix.standardize_genes(inplace=True)
        cb_default_label = ('Standardized expression<br>'
                            '(based on log<sub>2</sub>-scale)')
    elif center:
        matrix.center_genes(use_median=use_median, inplace=True)
        cb_default_label = 'Centered expression<br>(log<sub>2</sub>-scale)'
    else:
        cb_default_label = 'Expression<br>(log<sub>2</sub>-scale)'

    # only apply the default label if the user did not supply one
    if colorbar_label is None:
        colorbar_label = cb_default_label

    # clustering
    if sig_matrix is not None:
        # user has provided a GOPCASignatureMatrix instance
        # => make sure its samples match the signature's
        logger.info('Ordering samples to match order in signature matrix.')
        assert set(sig_matrix.samples) == set(self.samples.values)
        # re-arrange samples according to clustering of signature matrix
        matrix = matrix.loc[:, sig_matrix.samples]

    elif cluster_samples:
        # cluster samples (only if no signature matrix is provided)
        matrix = cluster.cluster_samples(
            matrix, metric=sample_cluster_metric, method=cluster_method)

    if cluster_genes:
        # cluster genes
        matrix = cluster.cluster_genes(
            matrix, metric=gene_cluster_metric, method=cluster_method)

    # add a "Signature"-labeled row to the top,
    # which represents the signature expression vector
    title = self.get_label(include_id=include_id,
                           include_stats=include_stats,
                           include_pval=include_pval)
    mean = np.mean(matrix.X, axis=0)
    header_row = ExpMatrix(genes=['<b>Signature</b>'],
                           samples=matrix.samples,
                           X=np.atleast_2d(mean))
    combined_matrix = pd.concat([header_row, matrix], axis=0)

    heatmap = ExpHeatmap(combined_matrix, title=title,
                         colorbar_label=colorbar_label, **kwargs)
    return heatmap
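A hedged usage sketch for the method above: `sig` is assumed to be a GOPCASignature and `sig_matrix` a GOPCASignatureMatrix from the same run, so that the heatmap's sample order follows the signature matrix clustering.

heatmap = sig.get_heatmap(
    sig_matrix=sig_matrix,   # reuse the signature matrix sample order
    center=True, use_median=True,
    cluster_genes=True)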
def __init__(self, *args, **kwargs):
    # delegate to the parent constructor
    # (__init__ must return None, so do not return its result)
    ExpMatrix.__init__(self, *args, **kwargs)
def run_pipeline_old(config_file):
    """inDrop pipeline."""
    t0 = time.time()

    conf, errors = config.read_config(config_file)

    input_ = conf['input']
    output = conf['output']
    params = conf['parameters']
    pipeline = conf['pipeline']

    output_dir = output['output_dir']

    barcode1_file = resource_filename(
        'singlecell', 'data/indrop/gel_barcode1_list.txt')
    barcode2_file = resource_filename(
        'singlecell', 'data/indrop/gel_barcode2_list.txt')

    if not util.is_empty_dir(output_dir):
        if output['allow_nonempty_output_dir']:
            _LOGGER.info('Note: Output directory exists and is not empty.')
        else:
            _LOGGER.error(
                'Output directory is not empty! Either specify an empty '
                '(or non-existent) output directory, or specify '
                '"allow_nonempty_output_dir: yes" in the configuration file.')
            return 1

    # create a timestamp for this run
    timestamp = time.strftime('%Y-%m-%d_%H-%M-%S')

    # create output directory, if necessary
    misc.make_sure_dir_exists(output_dir)

    # create results directory
    results_dir = os.path.join(output_dir, 'results')
    misc.make_sure_dir_exists(results_dir)

    # add file handler to the _LOGGER
    pipeline_log_file = os.path.join(results_dir, 'pipeline_log.txt')
    file_handler = logging.FileHandler(pipeline_log_file)
    log_fmt = '[%(asctime)s] %(levelname)s: %(message)s'
    log_datefmt = '%Y-%m-%d %H:%M:%S'
    formatter = logging.Formatter(log_fmt, log_datefmt)
    file_handler.setFormatter(formatter)
    _LOGGER.addHandler(file_handler)

    _LOGGER.info('This is the inDrop pipeline of SingleCell v%s', __version__)
    _LOGGER.info('Pipeline run timestamp: %s', timestamp)
    if params['use_docker']:
        _LOGGER.info('Running with docker.')
    else:
        _LOGGER.info('Running without docker.')

    # create plot directory
    plot_dir = os.path.join(results_dir, 'qc_plots')
    misc.make_sure_dir_exists(plot_dir)

    # copy configuration file to results directory
    output_config_file = os.path.join(
        results_dir, 'pipeline_config_%s.yaml' % timestamp)
    _LOGGER.info('Copying configuration file to "%s"', output_config_file)
    shutil.copyfile(config_file, output_config_file)

    ### process reads
    processed_read_dir = os.path.join(output_dir, 'processed_reads')
    misc.make_sure_dir_exists(processed_read_dir)
    process_read_file = os.path.join(
        processed_read_dir, 'processed_reads.fastq')
    process_count_file = os.path.join(results_dir, 'barcode_counts_reads.tsv')
    if pipeline['skip_read_processing']:
        _LOGGER.info('Skipping read processing step!')
    else:
        _LOGGER.info('Processing reads...')
        reads.process_reads(
            input_['barcode_read_file'], input_['mrna_read_file'],
            barcode1_file, barcode2_file,
            process_read_file, process_count_file,
            max_reads=params['max_reads'])
        _LOGGER.info('Finished processing reads.')

    ### mapping reads with STAR
    barcode_counts_mapped_file = os.path.join(
        results_dir, 'barcode_counts_mapped.tsv')
    map_script_file = os.path.join(results_dir, 'map_with_star.sh')
    map_log_file = os.path.join(results_dir, 'mapping_log.txt')
    mapping_dir = os.path.join(output_dir, 'aligned_reads')
    alignment_file = os.path.join(
        mapping_dir, 'Aligned.sortedByCoord.out.bam')
    if pipeline['skip_mapping']:
        _LOGGER.info('Skipping read mapping step!')
    else:
        # mapping
        _LOGGER.info('Mapping reads with STAR...')
        star_params = conf['STAR']
        mapping.map_with_star(
            process_read_file, input_['star_index_dir'],
            map_script_file, map_log_file, mapping_dir,
            num_threads=params['num_threads'],
            compressed=False,
            use_docker=params['use_docker'],
            **star_params)
        _LOGGER.info('Finished mapping reads.')

        # count mapped reads for each barcode
        _LOGGER.info('Counting mapped reads for each barcode...')
        barcodes.count_mapped_reads(
            alignment_file, barcode1_file, barcode2_file,
            barcode_counts_mapped_file)
        _LOGGER.info('Finished counting mapped reads for each barcode.')

    ### generate intermediate files (chromosome lengths; protein-coding genes)
    chromlen_file = os.path.join(results_dir, 'chromosome_lengths.tsv')
    gene_file = os.path.join(results_dir, 'genes.tsv')
    if (not pipeline['skip_aligned_read_processing']) or \
            (not pipeline['skip_expression_quantification']):
        logger = logging.getLogger('genometools.ensembl')
        logger.setLevel(logging.ERROR)

        # generate file containing chromosome lengths
        _LOGGER.info('Extracting chromosome lengths...')
        chromlen = ensembl.get_chromosome_lengths(input_['genome_file'])
        chromlen.to_csv(chromlen_file, sep='\t', header=True)
        _LOGGER.info('Finished extracting chromosome lengths.')

        # generate file containing protein-coding genes
        _LOGGER.info('Extracting list of protein-coding genes from Ensembl '
                     'GTF file...')
        protein_coding_genes = ensembl.get_protein_coding_genes(
            input_['genome_annotation_file'])
        _LOGGER.info('Finished extracting list of protein-coding genes.')

        if params['include_lincRNA_genes']:
            # extract lincRNA genes
            _LOGGER.info('Extracting list of lincRNA genes from Ensembl GTF '
                         'file...')
            linc_rna_genes = ensembl.get_linc_rna_genes(
                input_['genome_annotation_file'])
            _LOGGER.info('Finished extracting list of lincRNA genes.')

            # exclude lincRNA genes whose gene name clashes with that of a
            # protein-coding gene
            sel = ~linc_rna_genes['name'].isin(
                set(protein_coding_genes['name']))
            linc_rna_genes = linc_rna_genes.loc[sel]
            genes = pd.concat([protein_coding_genes, linc_rna_genes])
        else:
            genes = protein_coding_genes
        genes.to_csv(gene_file, sep='\t', index=False)

    ### process aligned reads
    read_info_dir = os.path.join(output_dir, 'read_info')
    if not pipeline['skip_aligned_read_processing']:
        _LOGGER.info('Processing aligned reads...')
        misc.make_sure_dir_exists(read_info_dir)
        aligned_reads.process_aligned_reads(
            alignment_file, chromlen_file, gene_file,
            input_['genome_annotation_file'], read_info_dir,
            num_jobs=params['num_threads'])
        _LOGGER.info('Finished processing of aligned reads.')
    else:
        _LOGGER.info('Skipping processing of aligned reads!')

    ### quantify gene and transcript expression
    num_cells = params['num_cells']
    gene_expression_file = os.path.join(results_dir, 'gene_expression.mtx')
    transcript_expression_file = os.path.join(
        results_dir, 'transcript_expression.mtx')
    dense_gene_expression_file = None
    if output['generate_dense_expression_matrix']:
        dense_gene_expression_file = os.path.join(
            results_dir, 'gene_expression.tsv')
    if not pipeline['skip_expression_quantification']:
        _LOGGER.info('Quantifying expression for top %d cells...', num_cells)
        expression.quantify_expression(
            barcode_counts_mapped_file, chromlen_file, read_info_dir,
            gene_file, input_['genome_annotation_file'], num_cells,
            gene_expression_file, transcript_expression_file,
            min_umi_qual=params['min_umi_qual'],
            cell_prefix=output['cell_prefix'],
            dense_gene_expression_output_file=dense_gene_expression_file)
        _LOGGER.info('Finished expression quantification.')
    else:
        _LOGGER.info('Skipping expression quantification!')

    ### QC plots
    if pipeline['skip_qc_plot_generation']:
        _LOGGER.info('Skipping the generation of QC plots!')
    else:
        _LOGGER.info('Generating QC plots...')

        _LOGGER.info('Plotting distribution of mapped reads per barcode...')
        mapped_count_histogram_file = os.path.join(
            plot_dir, 'mapped_reads_histogram.html')
        barcodes.plot_read_count_distribution(
            barcode_counts_mapped_file, mapped_count_histogram_file)

        _LOGGER.info('Plotting distribution of transcripts per cell...')
        output_file = os.path.join(plot_dir, 'transcripts_per_cell.html')
        matrix = ExpMatrix.read_sparse(gene_expression_file) \
            .astype(np.float64)
        fig = qc.plot_cell_transcript_distribution(
            matrix, output['experiment_name'])
        plot(fig, filename=output_file, show_link=False, auto_open=False)

        _LOGGER.info('Plotting fraction of ribosomal and mitochondrial '
                     'gene expression...')
        output_file = os.path.join(plot_dir, 'mito_ribo_expression.html')
        # reuse the gene expression matrix read above
        fig = qc.plot_transcriptome_components(
            matrix, species=params['species'],
            name=output['experiment_name'],
            width=950, height=800, font_size=16, font_family='serif')
        plot(fig, filename=output_file, show_link=False, auto_open=False)

        _LOGGER.info('Plotting saturation...')
        output_file = os.path.join(plot_dir, 'saturation.html')
        matrix = ExpMatrix.read_sparse(transcript_expression_file) \
            .astype(np.float64)
        fig = qc.plot_saturation(matrix)
        plot(fig, filename=output_file, show_link=False, auto_open=False)

    #_LOGGER.removeHandler(file_handler)

    t1 = time.time()
    t = t1 - t0
    _LOGGER.info('Pipeline run finished in %.1f s (%.1f min)!', t, t / 60)
    return 0
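For reference, a minimal sketch of the configuration structure run_pipeline_old() expects, shown as the equivalent Python dict (the actual file is YAML, parsed by config.read_config). Only keys referenced in the function above are included, and all values are placeholders.

# minimal configuration sketch; every value here is a placeholder
conf = {
    'input': {
        'barcode_read_file': 'R1.fastq.gz',
        'mrna_read_file': 'R2.fastq.gz',
        'star_index_dir': '/path/to/star_index',
        'genome_file': '/path/to/genome.fa.gz',
        'genome_annotation_file': '/path/to/annotation.gtf.gz',
    },
    'output': {
        'output_dir': '/path/to/output',
        'allow_nonempty_output_dir': False,
        'experiment_name': 'my_experiment',
        'cell_prefix': 'cell_',
        'generate_dense_expression_matrix': False,
    },
    'parameters': {
        'max_reads': 1000000,
        'num_threads': 8,
        'use_docker': False,
        'num_cells': 1000,
        'min_umi_qual': 20,
        'species': 'human',
        'include_lincRNA_genes': False,
    },
    'pipeline': {
        'skip_read_processing': False,
        'skip_mapping': False,
        'skip_aligned_read_processing': False,
        'skip_expression_quantification': False,
        'skip_qc_plot_generation': False,
    },
    'STAR': {},   # extra keyword arguments passed to mapping.map_with_star
}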
def main(args=None):
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # read data
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez = matrix.genes

    if strip_affy_suffix:
        # remove "_at" suffix from Entrez IDs
        entrez = [e[:-3] for e in entrez]
    logger.debug(str(entrez[:3]))

    # check that Entrez IDs are unique
    assert len(entrez) == len(set(entrez))

    # convert Entrez IDs to gene names
    f = 0
    genes = []
    X = []
    for i, e in enumerate(entrez):
        try:
            g = e2g[e]
        except KeyError:
            f += 1
        else:
            genes.append(g)
            X.append(matrix.X[i, :])
    # make sure no two Entrez IDs map to the same gene symbol
    assert len(genes) == len(set(genes))

    if f > 0:
        logger.warning(
            'Failed to convert %d / %d Entrez IDs '
            'to gene symbols (%.1f%%).',
            f, matrix.p, 100 * (f / float(matrix.p)))

    # filter for known protein-coding genes
    X = np.float64(X)
    p = X.shape[0]
    logger.debug(str(X.shape))

    sel = np.zeros(p, dtype=np.bool_)
    for i in range(p):
        if genes[i] in genome:
            sel[i] = True
    sel = np.nonzero(sel)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    f = p - sel.size
    if f > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%).',
            f, p, 100 * (f / float(p)))

    # generate new matrix (this automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s',
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def my_matrix():
    genes = ['a', 'b', 'c', 'd', 'e', 'f']
    samples = ['s1', 's2', 's3']
    X = np.arange(18, dtype=np.float64).reshape(6, 3)
    matrix = ExpMatrix(genes=genes, samples=samples, X=X)
    return matrix
def main(args=None): """Run GO-PCA and store the result in a `pickle` file. Parameters ---------- args: argparse.Namespace object, optional The argument values. If not specified, the values will be obtained by parsing the command line arguments using the `argparse` module. Returns ------- int Exit code (0 if no error occurred). Raises ------ SystemError If the version of the Python interpreter is not >= 2.7. """ vinfo = sys.version_info if not (vinfo >= (2, 7)): raise SystemError('Python interpreter version >= 2.7 required, ' 'found %d.%d instead.' % (vinfo.major, vinfo.minor)) if args is None: # read arguments from the command line parser = get_argument_parser() # parse first with default options, in case "--help" is specified # ("--help" causes the program to exit at this point) args = parser.parse_args() # now remove the defaults and parse again # (removing the defaults is important so that we know which values # were specified by the user) no_defaults = dict([p, None] for p in GOPCA.get_param_defaults()) no_defaults2 = dict([p, None] for p in GOPCAParams.get_param_defaults()) no_defaults.update(no_defaults2) parser.set_defaults(**no_defaults) args = parser.parse_args() # reporting options log_file = args.log_file quiet = args.quiet verbose = args.verbose # test if we can write to log_file? # configure root logger logger = util.get_logger(log_file=log_file, quiet=quiet) # check if required parameters were specified passed = True if args.expression_file is None: logger.error('No expression file specified!') passed = False if args.gene_set_file is None: logger.error('No gene set file specified!') passed = False if args.output_file is None: logger.error('No output file specified!') passed = False if not passed: logger.error('Not all required parameters were specified.') return 1 # generate configuration if args.config_file is not None: # read parameter values from config file params = GOPCAParams.read_ini(args.config_file) else: # start with default configuration params = GOPCAParams() # overwrite parameters specified on the command line for p in GOPCAParams.get_param_defaults(): v = getattr(args, p) if v is not None: logger.debug('Parameter "%s" specified on command line!', p) params.set_param(p, v) global_params = GOPCA.get_param_defaults() for k in list(global_params.keys()): v = getattr(args, k) if v is not None: logger.debug('Parameter "%s" specified on command line!', p) global_params[k] = v # read expression file matrix = ExpMatrix.read_tsv(args.expression_file) logger.info('Expression matrix size: ' + '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n) if args.sel_var_genes > 0: # filter genes by variance matrix = matrix.filter_variance(args.sel_var_genes) # read gene set file gene_sets = GeneSetCollection.read_tsv(args.gene_set_file) print(args.gene_set_file, gene_sets) # read ontology file (if supplied) gene_ontology = None if args.gene_ontology_file is not None: p_logger = logging.getLogger(genometools.__name__) p_logger.setLevel(logging.ERROR) gene_ontology = GeneOntology.read_obo( args.gene_ontology_file, part_of_cc_only=params.go_part_of_cc_only) p_logger.setLevel(logging.NOTSET) M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology, verbose=verbose, **global_params) run = M.run() if run is None: logger.error('GO-PCA run failed!') return 1 # write run to pickle file logger.info('Storing GO-PCA run in file "%s"...', args.output_file) run.write_pickle(args.output_file) return 0
def my_matrix(my_gene_names, my_samples, my_X):
    matrix = ExpMatrix(genes=my_gene_names, samples=my_samples, X=my_X)
    return matrix