def create_new_header(infile, mappings, outfile):
    """Write a copy of a BigWig file whose chromosomes use UCSC names.

    ``mappings`` translates chromosome names found in ``infile`` into
    their UCSC counterparts. Chromosomes without a mapping are left out
    of ``outfile`` and reported. When every chromosome name is already
    among the mapping targets, or when none of them can be mapped,
    ``infile`` is renamed to ``outfile`` and the process exits with
    status 0.
    """
    with pyBigWig.open(infile) as bw:
        chrom_sizes = bw.chroms()

        # All names are UCSC names already: nothing to convert. This is
        # normal behavior, hence the zero exit status.
        if set(chrom_sizes).issubset(mappings.values()):
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = [
            (mappings[name], size)
            for name, size in chrom_sizes.items()
            if name in mappings
        ]

        if not hdr:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            print(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        unmapped_count = 0
        with pyBigWig.open(outfile, 'w') as bw_output:
            bw_output.addHeader(hdr)

            for name, size in chrom_sizes.items():
                intervals = bw.intervals(name, 0, size)

                if name not in mappings:
                    unmapped_count += 1
                    print('UCSC chromosome/conting mapping for {} is missing'.format(name))
                    continue

                if not intervals:
                    # Mapped chromosome without any interval: nothing to copy.
                    continue

                starts = [interval[0] for interval in intervals]
                stops = [interval[1] for interval in intervals]
                scores = [interval[2] for interval in intervals]
                bw_output.addEntries(
                    [mappings[name]] * len(intervals),
                    starts,
                    ends=stops,
                    values=scores,
                )

        if unmapped_count > 0:
            print(warning("UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                          "This sequence(s) will not be included in the bigWig file.".format(unmapped_count)))
def main():
    """Plot expected versus measured ERCC spike-in expression per sample.

    Samples whose ERCC spike-ins all have zero expression are not
    plotted. Their warnings are collected and printed at the end; when
    no sample at all could be plotted, a single summary warning is
    printed instead.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    expected = get_expected(args.spikeins_mix, log2=True)

    any_sample_plotted = False
    deferred_messages = []

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        measured_zero = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        measured_nonzero = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)

        merged_zero = merge_expected_measured(expected, measured_zero)
        merged_nonzero = merge_expected_measured(expected, measured_nonzero)

        # Keep only the ERCC spike-ins; without any of them there is
        # nothing to plot for this sample.
        ercc_nonzero = merged_nonzero.iloc[merged_nonzero.index.str.startswith('ERCC'), :]
        if ercc_nonzero.empty:
            deferred_messages.append(
                'All ERCC spike-ins have zero expression in sample {}'.format(sample_name)
            )
            continue

        any_sample_plotted = True
        ercc_expected = expected.iloc[expected.index.str.startswith('ERCC')]
        ercc_zero = merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :]
        plot_histogram_scatter(
            expected=ercc_expected,
            zero=ercc_zero,
            nonzero=ercc_nonzero,
            spikein_type='ERCC',
            sample_name=sample_name,
            exp_type=exp_type,
        )

    if not any_sample_plotted:
        # All samples have zero expression for all spike-ins: print one
        # warning that says so instead of one warning per sample.
        print(warning('All ERCC spike-ins in all samples have zero expression.'))
        return

    for message in deferred_messages:
        print(warning(message))
def main():
    """Map the input genes to the target database and run the enrichment tool.

    Reads one feature ID per line from ``args.feature_ids``, checks that
    the knowledge base knows them, maps them from ``args.source_db`` to
    ``args.target_db`` when the two differ, and passes the resulting IDs
    to the external ``processor`` program. Its standard output is stored
    in ``terms.json``.

    Exits with status 1 when no gene is found in the knowledge base or
    when no mapping is returned.
    """
    args = parse_arguments()
    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        # Blank lines are not feature IDs; keeping them would make the
        # "not all features could be mapped" check below fire spuriously.
        genes = [line.strip() for line in gene_file if line.strip()]

    org_features = res.feature.filter(source=args.source_db, species=args.species, feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        # ``exit`` is only injected by the ``site`` module; raise directly.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            raise SystemExit(1)

        # Set lookup keeps the loop linear in the number of mappings.
        requested = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id not in requested:
                continue
            if m.source_id in mappings:
                # Only the first mapping of a source ID is kept.
                print(warning("Mapping {} returned multiple times.".format(m)))
            else:
                mappings[m.source_id] = m.target_id

        # Compare unique IDs so duplicated input lines do not raise a
        # false alarm.
        if len(requested) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = list(mappings.values())

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        # The child process opens the file by name, so the content must
        # be on disk before it starts.
        input_genes.flush()

        process = Popen(
            ['processor', str(args.pval), str(args.min_genes), args.obo, args.gaf, input_genes.name],
            stdout=PIPE,
            stderr=DEVNULL,
        )
        out, _ = process.communicate()

    with open('terms.json', 'w') as f:
        f.write(out.decode("UTF-8"))
def iterate_snpeff_file(file_handle, filename):
    """Yield one entry per ALT allele from SnpSift extractFields rows.

    A row may list several comma-separated ALT values together with the
    matching AF (and AD) values. Such rows are split so that every
    yielded entry carries exactly one ALT/AF/AD value. Rows whose ALT,
    AF and AD counts disagree are reported and skipped; ``filename`` is
    only used in that report.
    """
    for record in file_handle:
        alt_alleles = record["ALT"].strip().split(",")
        allele_freqs = record["AF"].strip().split(",")

        # Lofreq output has no AD column. Fall back to a string of empty
        # fields with one slot for REF plus one per ALT allele so the
        # length check below still holds.
        empty_depths = "," * len(alt_alleles)
        depth_field = record.get("GEN[0].AD", empty_depths)
        # The first AD value belongs to the REF allele; drop it.
        alt_depths = depth_field.strip().split(",")[1:]

        if len(alt_alleles) != len(allele_freqs) or len(allele_freqs) != len(alt_depths):
            print(warning("Inconsistency for entry {} in file {}. Skipping this entry."
                          .format(record, os.path.basename(filename))))
            continue

        if len(alt_depths) == 1:
            # Single allele: annotate the row in place and pass it on.
            record["AD"] = alt_depths[0]
            yield record
            continue

        for allele, freq, depth in zip(alt_alleles, allele_freqs, alt_depths):
            entry = copy.deepcopy(record)
            entry["ALT"] = allele
            entry["AF"] = freq
            if depth:
                entry["AD"] = depth
            yield entry
def iterate_snpeff_file(file_handle, filename):
    """Iterate over SnpSift extractFields rows, one ALT allele per entry.

    Rows holding several comma-separated ALT/AF/AD values are expanded
    into one entry per allele. A row with mismatching numbers of ALT, AF
    and AD values is reported (using the base name of ``filename``) and
    skipped.
    """
    for line in file_handle:
        alt_values = line['ALT'].strip().split(',')
        af_values = line['AF'].strip().split(',')

        # Without an AD column (Lofreq data) pretend that every allele,
        # REF included, has an empty AD value.
        fallback_ad = ','.join('' for _ in range(len(alt_values) + 1))
        # Position 0 is the AD of the REF allele, the rest are ALT alleles.
        ad_values = line.get('GEN[0].AD', fallback_ad).strip().split(',')[1:]

        sizes = {len(alt_values), len(af_values), len(ad_values)}
        if len(sizes) != 1:
            print(warning('Inconsistency for entry {} in file {}. Skipping this entry.'.format(
                line, os.path.basename(filename))))
            continue

        if len(ad_values) == 1:
            line['AD'] = ad_values[0]
            yield line
        else:
            for alt_value, af_value, ad_value in zip(alt_values, af_values, ad_values):
                single = copy.deepcopy(line)
                single['ALT'] = alt_value
                single['AF'] = af_value
                if ad_value:
                    single['AD'] = ad_value
                yield single
def test_string(self):
    # A plain string must be wrapped into a ``process_log`` command
    # that carries the text under the ``warning`` key.
    actual = warning('Some warning')
    self.assertEqual(
        actual,
        {
            'type': 'COMMAND',
            'type_data': 'process_log',
            'data': {'warning': 'Some warning'},
        },
    )
def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=[]):
    """Run PCA on the samples (columns) of an expression table.

    Returns a dictionary with the first two coordinates of every sample,
    the top factors and explained variance ratio of every component, and
    the requested gene labels that are missing from ``expressions``.
    With fewer than two genes or fewer than two samples no PCA is fitted
    and zero-filled placeholders are returned instead. A warning is
    printed when ``expressions`` is empty.
    """
    if not gene_labels:
        gene_labels = expressions.index

    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    n_genes, n_samples = expressions.shape[0], expressions.shape[1]
    if n_genes >= 2 and n_samples >= 2:
        model = PCA(n_components=n_components, whiten=True)
        projected = model.fit_transform(expressions.transpose())

        coordinates = []
        for point in projected:
            if len(point) > 1:
                coordinates.append(point[:2].tolist())
            else:
                # A single component only: pad the second coordinate.
                coordinates.append([point[0], 0.0])

        all_components = [
            component_top_factors(component, gene_labels)
            for component in model.components_
        ]

        ratios = model.explained_variance_ratio_
        if np.isnan(ratios).any():
            all_explained_variance_ratios = [0.0 for _ in ratios]
        else:
            all_explained_variance_ratios = ratios.tolist()
    else:
        # Not enough data for PCA: place every sample at the origin.
        coordinates = [[0.0, 0.0] for _ in range(n_samples)]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]

    if expressions.empty:
        print(warning('Gene selection and filtering resulted in no genes. Please select different samples or genes.'))

    return {
        'coordinates': coordinates,
        'all_components': all_components,
        'all_explained_variance_ratios': all_explained_variance_ratios,
        'skipped_gene_labels': skipped_gene_labels,
        'warning': None
    }
def main():
    """Plot expected versus measured ERCC spike-in expression per sample.

    A sample in which every ERCC spike-in has zero expression is not
    plotted; a warning is printed for it instead.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    expected = get_expected(args.spikeins_mix, log2=True)

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        measured_zero = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        measured_nonzero = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)

        merged_zero = merge_expected_measured(expected, measured_zero)
        merged_nonzero = merge_expected_measured(expected, measured_nonzero)

        # Restrict everything to ERCC spike-ins before plotting.
        nonzero_ercc = merged_nonzero.iloc[merged_nonzero.index.str.startswith('ERCC'), :]
        if nonzero_ercc.empty:
            print(warning('All ERCC spike-ins have zero expression in sample {}'.format(sample_name)))
            continue

        expected_ercc = expected.iloc[expected.index.str.startswith('ERCC')]
        zero_ercc = merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :]
        plot_histogram_scatter(
            expected=expected_ercc,
            zero=zero_ercc,
            nonzero=nonzero_ercc,
            spikein_type='ERCC',
            sample_name=sample_name,
            exp_type=exp_type,
        )
def get_clustering(expressions, distance_metric='euclidean', linkage_method='average',
                   ordering_method=None, n_keep=None, n_trials=1000):
    """Compute linkage, order, and produce a dendrogram.

    Samples are the columns of ``expressions``. With fewer than two
    samples an empty linkage and the trivial leaf order are returned.

    :param expressions: table with one column per sample
    :param distance_metric: metric passed to ``pdist``
    :param linkage_method: method passed to ``linkage``
    :param ordering_method: ``None``, ``'knn'``, ``'optimal'`` or ``'sa'``
    :param n_keep: forwarded to ``optimal``
    :param n_trials: forwarded to ``simulated_annealing``
    :return: tuple ``(linkage, dendrogram)``
    :raises ValueError: when distances, linkage or the dendrogram cannot
        be computed, or when ``ordering_method`` is unknown
    """
    if len(expressions.columns) < 2:
        return np.array([]), {'leaves': list(range(len(expressions.columns)))}

    try:
        distances = pdist(np.transpose(np.array(expressions)), metric=distance_metric)
    except Exception as err:
        msg = 'Cannot compute distances between samples.'
        print(error(msg))
        raise ValueError(msg) from err

    if np.isnan(distances).any():
        distances = np.nan_to_num(distances, copy=False)
        # The message has to be printed to be reported; building it
        # without printing drops the warning.
        print(warning('Distances between some samples were undefined and were set to zero.'))

    try:
        link = linkage(y=distances, method=linkage_method)
    except Exception as err:
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg) from err

    if ordering_method:
        if ordering_method == 'knn':
            link = knn(link, distances)
        elif ordering_method == 'optimal':
            link = optimal(link, distances, n_keep)
        elif ordering_method == 'sa':
            link = simulated_annealing(link, distances, n_trials)
        else:
            msg = 'Unknown ordering method {}'.format(ordering_method)
            print(error(msg))
            raise ValueError(msg)

    try:
        dend = dendrogram(link, no_plot=True)
    except Exception as err:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg) from err

    return link, dend
def create_new_header(infile, mappings, outfile):
    """Produce a BigWig file whose header and entries use UCSC names.

    Chromosome names of ``infile`` are translated through ``mappings``.
    If all names are already mapping targets, or if none of them has a
    mapping, ``infile`` is renamed to ``outfile`` and the process exits
    with status 0. Otherwise the mapped chromosomes are copied into
    ``outfile`` and the unmapped ones are reported.
    """
    with pyBigWig.open(infile) as bw:
        chromosomes = bw.chroms()

        already_ucsc = set(chromosomes.keys()).issubset(mappings.values())
        if already_ucsc:
            # Nothing to translate. Exit with status 0 since this is
            # normal behavior.
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = []
        for chrom, length in chromosomes.items():
            if chrom in mappings:
                hdr.append((mappings[chrom], length))

        if len(hdr) == 0:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            send_message(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        missing = 0
        with pyBigWig.open(outfile, "w") as bw_output:
            bw_output.addHeader(hdr)

            for chrom, length in chromosomes.items():
                entries = bw.intervals(chrom, 0, length)
                is_mapped = chrom in mappings

                if is_mapped and entries:
                    bw_output.addEntries(
                        [mappings[chrom]] * len(entries),
                        [entry[0] for entry in entries],
                        ends=[entry[1] for entry in entries],
                        values=[entry[2] for entry in entries],
                    )
                elif not is_mapped:
                    missing += 1
                    print("UCSC chromosome/conting mapping for {} is missing".format(chrom))

        if missing > 0:
            summary = (
                "UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                "This sequence(s) will not be included in the bigWig file."
            ).format(missing)
            send_message(warning(summary))
def validate_inputs(args):
    """Validate the parsed command line arguments.

    Sends a warning when the samples do not all share one expression
    type (the run continues regardless).

    :raises AssertionError: when the numbers of sample names, expression
        files and expression types differ
    """
    # All expression types are expected to be equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        # Sort so the message does not depend on set iteration order.
        msg = msg.format(", ".join(sorted(exp_type_set)))
        send_message(warning(msg))

    # The same number of sample names, expression files and expression
    # types must be given. Raised explicitly (instead of via ``assert``)
    # so the check is not stripped when Python runs with ``-O``.
    counts = (len(args.sample_names), len(args.sample_exps), len(args.exp_types))
    if len(set(counts)) != 1:
        raise AssertionError(
            "Expected the same number of sample names, expression files and "
            "expression types, got {}, {} and {}.".format(*counts)
        )
def validate_inputs(args):
    """Validate the parsed command line arguments.

    Prints a warning when the samples do not all share one expression
    type (the run continues regardless).

    :raises AssertionError: when the numbers of sample names, expression
        files and expression types differ
    """
    # All expression types are expected to be equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        # Sort so the message does not depend on set iteration order.
        msg = msg.format(', '.join(sorted(exp_type_set)))
        print(warning(msg))

    # The same number of sample names, expression files and expression
    # types must be given. Raised explicitly (instead of via ``assert``)
    # so the check is not stripped when Python runs with ``-O``.
    counts = (len(args.sample_names), len(args.sample_exps), len(args.exp_types))
    if len(set(counts)) != 1:
        raise AssertionError(
            'Expected the same number of sample names, expression files and '
            'expression types, got {}, {} and {}.'.format(*counts)
        )
def parse_mappings(species, infile, outfile):
    """Collect the chromosome name mappings available for ``species``.

    Every mapping file registered for the species in ``MAPPINGS_FILES``
    is parsed and merged into a single dictionary; later files override
    earlier ones on duplicate keys. For a species without registered
    mapping files a warning is printed, ``infile`` is renamed to
    ``outfile`` and the process exits with status 0.
    """
    if species not in MAPPINGS_FILES:
        # No prepared mapping file: hand the BigWig file back under the
        # output name, with a warning, and exit normally.
        msg = 'Chromosome mappings for Species "{}" are not supported.'.format(species)
        print(warning(msg))
        os.rename(infile, outfile)
        sys.exit(0)

    combined = {}
    for basename in MAPPINGS_FILES[species]:
        combined.update(parse_mapping_file(os.path.join(MAPPINGS_DIR, basename)))

    return combined
def parse_mappings(species, infile, outfile):
    """Build the chromosome name mapping table for ``species``.

    The mapping files listed for the species in ``MAPPINGS_FILES`` are
    read from ``MAPPINGS_DIR`` and merged in order. When the species has
    no entry there, a warning is sent, ``infile`` is renamed to
    ``outfile`` and the process exits with status 0.
    """
    supported = species in MAPPINGS_FILES
    if not supported:
        # Unsupported species is not an error: return the BigWig file
        # under the output name together with a warning.
        send_message(warning('Chromosome mappings for Species "{}" are not supported.'.format(species)))
        os.rename(infile, outfile)
        sys.exit(0)

    mapping_paths = [os.path.join(MAPPINGS_DIR, basename) for basename in MAPPINGS_FILES[species]]

    merged = dict()
    for path in mapping_paths:
        merged.update(parse_mapping_file(path))
    return merged
def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=[]):
    """Compute PCA of the samples (columns) of an expression table.

    :param expressions: table with genes as rows and samples as columns
    :param n_components: number of principal components to fit
    :param gene_labels: gene labels to report on; when empty, the index
        of ``expressions`` is used
    :return: dictionary with ``coordinates`` (first two coordinates of
        every sample), ``all_components``,
        ``all_explained_variance_ratios``, ``skipped_gene_labels``
        (requested labels missing from ``expressions``) and ``warning``

    With fewer than two genes or fewer than two samples no PCA is fitted
    and placeholder values are returned. A warning is printed when
    ``expressions`` is empty.
    """
    if not gene_labels:
        gene_labels = expressions.index

    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    if expressions.shape[0] < 2 or expressions.shape[1] < 2:
        # One placeholder coordinate per sample. Samples are columns, so
        # the count is ``shape[1]``; ``len(expressions)`` would count
        # genes (rows) instead.
        coordinates = [[0.0, 0.0] for _ in range(expressions.shape[1])]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]
    else:
        pca = PCA(n_components=n_components, whiten=True)
        pca_expressions = pca.fit_transform(expressions.transpose())

        # Pad with 0.0 when only a single component was computed.
        coordinates = [
            t[:2].tolist() if len(t) > 1 else [t[0], 0.0]
            for t in pca_expressions
        ]
        all_components = [
            component_top_factors(component, gene_labels)
            for component in pca.components_
        ]
        if np.isnan(pca.explained_variance_ratio_).any():
            all_explained_variance_ratios = [0.0 for _ in pca.explained_variance_ratio_]
        else:
            all_explained_variance_ratios = pca.explained_variance_ratio_.tolist()

    result = {
        'coordinates': coordinates,
        'all_components': all_components,
        'all_explained_variance_ratios': all_explained_variance_ratios,
        'skipped_gene_labels': skipped_gene_labels,
        'warning': None
    }

    if expressions.empty:
        print(warning('Gene selection and filtering resulted in no genes. Please select different samples or genes.'))

    return result
def main():
    """Create gene set files from a differential expression table.

    Gene sets are built from ``args.dge_file`` using the ``logfc`` and
    ``fdr`` arguments. Every non-empty set is saved into
    ``args.out_dir``; for an empty one a warning is sent instead.
    """
    args = parse_arguments()

    gene_sets = create_gene_sets(args.dge_file, args.logfc, args.fdr)
    fname_prefix = generate_name(args.analysis_name, args.tool, args.logfc, args.fdr)

    out_dir = Path(args.out_dir)
    # Create missing parent directories as well, and tolerate the
    # directory appearing between a check and the creation.
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, data in gene_sets.items():
        if data.empty:
            send_message(warning(f"No {name}-regulated genes. Gene set was not created."))
        else:
            save_genes(data, out_dir / f"{fname_prefix}_{name}.tab.gz")
def main():
    """Draw the ERCC spike-in histogram-scatter figure for every sample.

    Samples without any non-zero ERCC spike-in are skipped with a
    printed warning.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    spikeins_mix = args.spikeins_mix
    expected = get_expected(spikeins_mix, log2=True)

    samples = zip(args.sample_names, args.sample_exps)
    for sample_name, sample_exp in samples:
        measured_zero = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        measured_nonzero = get_measured(sample_exp, sample_name, exp_type, only_nonzero=True, log2=True)
        merged_zero = merge_expected_measured(expected, measured_zero)
        merged_nonzero = merge_expected_measured(expected, measured_nonzero)

        # Only ERCC spike-ins take part in the figure.
        is_ercc_nonzero = merged_nonzero.index.str.startswith('ERCC')
        ercc_nonzero_rows = merged_nonzero.iloc[is_ercc_nonzero, :]

        if not ercc_nonzero_rows.empty:
            plot_histogram_scatter(
                expected=expected.iloc[expected.index.str.startswith('ERCC')],
                zero=merged_zero.iloc[merged_zero.index.str.startswith('ERCC'), :],
                nonzero=ercc_nonzero_rows,
                spikein_type='ERCC',
                sample_name=sample_name,
                exp_type=exp_type,
            )
        else:
            print(warning('All ERCC spike-ins have zero expression in sample {}'.format(sample_name)))
def main():
    """Deduplicate a gene set file and store it as JSON and gzipped text.

    Reads one gene per line from ``args.geneset_file`` (empty lines are
    skipped), removes duplicates and sorts the genes. The result is
    written to ``args.output_json`` as ``{"genes": [...]}`` and to
    ``args.output_file`` as gzip-compressed, newline-separated text. A
    warning is sent when duplicates were removed.
    """
    args = parse_arguments()

    # Text mode already translates all newline conventions; the legacy
    # "U" mode flag raises ValueError on Python 3.11 and newer.
    with open(args.geneset_file, "r") as infile:
        stripped = (line.strip() for line in infile)
        # Skip empty lines in the input gene set file.
        genes = [gene for gene in stripped if gene]

    geneset = sorted(set(genes))

    if len(genes) != len(geneset):
        send_message(warning("Removed duplicated genes."))

    with open(args.output_json, "w") as json_out:
        json.dump({"genes": geneset}, json_out, separators=(",", ":"), allow_nan=False)

    with gzip.open(args.output_file, "wb") as file_out:
        file_out.write("\n".join(geneset).encode("utf-8"))
# Command line interface: the VCF file to inspect and the summary file
# that the findings are appended to.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_file', help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument('summary', help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    # Report the failure as a process error, then abort.
    proc_error = 'Input VCF file does not exist or could not be correctly opened.'
    print(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
# Header records as a key -> value lookup (the last record wins on
# duplicate keys).
header_records = dict((record.key, record.value) for record in vcf_header.records)

with open(args.summary, "a") as out_file:
    if 'reference' in header_records:
        fasta_name = os.path.basename(header_records['reference'])
    else:
        # No "reference" record in the header: leave the name blank.
        fasta_name = ''
        print(warning('Reference sequence (FASTA) name could not be recognized from the VCF header.'))

    out_file.write('\nReference (genome) sequence:\n{}\n'.format(fasta_name))
    sample_lines = '\n'.join(vcf_header.samples)
    out_file.write('\nSamples:\n{}'.format(sample_lines))
def main():
    """Merge expression files and annotate features with gene symbols.

    The main expression file is parsed, its feature IDs are looked up in
    the knowledge base (in chunks of ``CHUNK_SIZE``) to obtain gene
    symbols, and the additional normalized expression files are merged
    in on ``FEATURE_ID``. The resulting table is written to
    ``<output_name>.txt.gz`` (tab-separated, gzip) and
    ``<output_name>.json``.

    Exits with status 1 when the numbers of additional expression files
    and types differ, or when the expression types are not unique.
    """
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(error('The number of additional expression files must match the number of specified '
                        'expressions types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(error('The union of the main expression type ({}) and additional normalized expression types {} '
                        'does not contain unique items.'.format(args.expressions_type,
                                                                 args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [
        input_features[i:i + CHUNK_SIZE]
        for i in range(0, len(input_features), CHUNK_SIZE)
    ]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db, species=args.species, feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Count the input feature IDs that are missing from the mapping.
    # Counting every null cell of the whole table would also count
    # missing expression values and overstate the number.
    unmapped = sum(f_id not in feature_dict for f_id in input_features)
    if unmapped:
        print(warning('{} feature(s) could not be mapped to the associated feature symbols.'
                      .format(unmapped)))

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions, args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type

    # Replace NaN values with empty string. A new frame is built instead
    # of filling a column selection in place.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz', header=True, index=False, sep='\t', compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
def warning(self, *args):
    """Report a warning built from ``args`` joined with single spaces."""
    text = ' '.join(map(str, args))
    # TODO: Use the protocol to report progress.
    print(resolwe_runtime_utils.warning(text))
import pandas as pd
from resolwe_runtime_utils import send_message, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-bed", "--bed_file", required=True, help="All splice junctions in BED12 format")
parser.add_argument("-sj", "--novel_sj", required=True, help="Table of annotated novel splice junctions")

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    # An empty BED file cannot be parsed; pass it through unchanged
    # under the output name and stop.
    bed_is_empty = os.path.getsize(bed_file) == 0
    if bed_is_empty:
        send_message(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    bed = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
    novel_sj = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)

    # Keep the BED rows whose fourth column (position 3) matches a name
    # listed in the novel junction table.
    is_novel = bed[3].isin(novel_sj["name"])
    bed_novel_sj = bed[is_novel]
    bed_novel_sj.to_csv("novel_sj.bed", sep="\t", index=False, header=False)
# NOTE(review): the original nesting of the ``matrix.shape[1] == 0``
# check is not recoverable from the collapsed source; it is assumed to
# be a top-level statement. Confirm against the original file.
matrix = np.array(matrix)
matrix_sum = np.sum(matrix, axis=0)  # sum of expressions for each gene
genes_zero = np.where(matrix_sum < 0.1)[0]

if args.filter:
    # Drop the genes (columns) whose summed expression is below 0.1.
    matrix = np.delete(matrix, genes_zero, axis=1)

if matrix.shape[1] == 0:
    raise ValueError("Expressions of all selected genes are 0")

distance = distance_map[args.dstfunc.lower()]
cluster = linkage(matrix, method=args.linkage.lower(), metric=distance)

# The third linkage column holds the merge distances.
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample names are keyed by their 1-based position.
sample_ids = {
    position: {'id': sample_id}
    for position, sample_id in enumerate(args.sampleids, start=1)
}

output = {
    'cluster': {
        'linkage': cluster.tolist(),
        'samples_names': sample_ids,
        'order': dend['leaves'],
    }
}
print(json.dumps(output, separators=(',', ':')))
def main():
    """Compute sample hierarchical clustering.

    Validates the arguments, loads the expressions of the genes shared
    by all samples, optionally transforms them and removes samples with
    constant expression, clusters the remaining samples and writes the
    sample IDs, linkage and leaf order to ``args.output``.

    Invalid input is reported through ``set_error``. Samples and genes
    that had to be left out are reported as warnings listing at most
    three names, followed by ``", ..."`` when there are more.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = "Select at least two samples to compute hierarchical clustering of samples."
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = ("Select at least two genes to compute hierarchical clustering of samples with "
               "correlation distance metric or use Euclidean distance metric.")
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files, gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        only_gene = get_gene_names(list(expressions.index), args.source, args.species)[0]
        if not args.gene_labels:
            msg = ("The selected samples contain only one common gene ({}). At least two common "
                   "genes are required to compute hierarchical clustering of samples with "
                   "correlation distance metric. Select a different set of samples or use Euclidean "
                   "distance metric.".format(only_gene))
        else:
            msg = ("Only one of the selected genes ({}) is present in all samples but at least two "
                   "such genes are required to compute hierarchical clustering of samples with "
                   "correlation distance metric. Select more genes or use Euclidean distance "
                   "metric.".format(only_gene))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches`` is indexed by sample position below: truthy for
        # the samples that were kept.
        expressions, matches = remove_const_samples(expressions)

        if len(expressions.columns) == 0:
            msg = ("All of the selected samples have constant expression across genes. Hierarchical "
                   "clustering of samples cannot be computed.")
            set_error(msg)

        if len(expressions.columns) == 1:
            sample_name = [name for i, name in enumerate(args.sample_names) if matches[i]][0]
            msg = ("Only one of the selected samples ({}) has a non-constant expression across "
                   "genes. However, hierarchical clustering of samples cannot be computed with "
                   "just one sample.".format(sample_name))
            set_error(msg)

        removed = [name for i, name in enumerate(args.sample_names) if not matches[i]]
        suffix = "" if len(removed) <= 3 else ", ..."
        if removed:
            msg = ("{} of the selected samples ({}) have constant expression across genes. "
                   "Those samples are excluded from the computation of hierarchical clustering of "
                   "samples with correlation distance "
                   "metric.".format(len(removed), ", ".join(removed[:3]) + suffix))
            send_message(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        # Only the first three names are looked up; the suffix signals
        # that the list was cut short, as in the removed-samples message.
        suffix = "" if len(excluded) <= 3 else ", ..."
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        listed_names = ", ".join(excluded_names) + suffix

        if len(excluded) == 1:
            if not args.gene_labels:
                msg = ("Gene {} is present in some but not all of the selected samples. This "
                       "gene is excluded from the computation of hierarchical clustering of "
                       "samples.".format(listed_names))
            else:
                msg = ("{} of the selected genes ({}) is missing in at least one of the selected "
                       "samples. This gene is excluded from the computation of hierarchical "
                       "clustering of samples.".format(len(excluded), listed_names))
        else:
            if not args.gene_labels:
                msg = ("{} genes ({}) are present in some but not all of the selected samples. Those "
                       "genes are excluded from the computation of hierarchical clustering of "
                       "samples.".format(len(excluded), listed_names))
            else:
                msg = ("{} of the selected genes ({}) are missing in at least one of the selected "
                       "samples. Those genes are excluded from the computation of hierarchical "
                       "clustering of samples.".format(len(excluded), listed_names))
        send_message(warning(msg))

    link, dend = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the samples that took part in the clustering.
    sample_ids = [sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]]
    result = {
        "sample_ids": {i: {"id": sample_id} for i, sample_id in enumerate(sample_ids)},
        "linkage": link.tolist(),
        "order": dend["leaves"],
    }
    output_json(result, args.output)
import pandas as pd
from resolwe_runtime_utils import warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-bed", "--bed_file", required=True, help="All splice junctions in BED12 format")
parser.add_argument("-sj", "--novel_sj", required=True, help="Table of annotated novel splice junctions")

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    if not os.path.getsize(bed_file):
        # Nothing to filter: hand the empty file over as the result.
        print(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    bed = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
    novel_sj = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)

    # Select the junctions whose name (fourth column, position 3) is
    # present in the table of novel junctions.
    novel_names = novel_sj["name"]
    bed_novel_sj = bed.loc[bed[3].isin(novel_names)]
    bed_novel_sj.to_csv("novel_sj.bed", sep="\t", index=False, header=False)
"""Filter novel splice junctions in BED12 format.""" import argparse import os import sys import pandas as pd from resolwe_runtime_utils import warning parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('-bed', '--bed_file', required=True, help="All splice junctions in BED12 format") parser.add_argument('-sj', '--novel_sj', required=True, help="Table of annotated novel splice junctions") if __name__ == "__main__": args = parser.parse_args() bed_file = args.bed_file if os.path.getsize(bed_file) == 0: print(warning('Bed file has no entries.')) os.rename(bed_file, 'novel_sj.bed') sys.exit(0) bed = pd.read_csv(args.bed_file, delimiter='\t', header=None, dtype=str) novel_sj = pd.read_csv(args.novel_sj, delimiter='\t', dtype=str) bed_novel_sj = bed[bed[3].isin(novel_sj["name"])] bed_novel_sj.to_csv('novel_sj.bed', sep='\t', index=False, header=False)
def test_string(self):
    # A plain string is serialized as a one-key JSON object.
    serialized = warning('Some warning')
    self.assertEqual(serialized, '{"proc.warning": "Some warning"}')
def main():
    """Compute sample hierarchical clustering.

    Steps, in order:

    1. Parse the arguments and validate their consistency.
    2. Load the expressions and transform them (optional log2 / z-score).
    3. Optionally drop samples with constant expression.
    4. Warn about samples and genes that were left out.
    5. Compute the clustering and write it as JSON to ``args.output``.

    NOTE(review): the checks below rely on ``set_error`` (defined elsewhere)
    aborting the run after reporting the message -- confirm.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = 'The number of sample files does not match the number of sample IDs.'
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = 'Select at least two samples to compute hierarchical clustering of samples.'
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != 'euclidean':
        msg = ('Select at least two genes to compute hierarchical clustering of samples with '
               'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    # ``excluded`` lists the genes that were dropped while loading the expressions.
    expressions, excluded = get_expressions(fnames=args.sample_files, gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        only_gene = get_gene_names(list(expressions.index), args.source, args.species)[0]
        if not args.gene_labels:
            msg = ('The selected samples contain only one common gene ({}). At least two common '
                   'genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select a different set of samples or use Euclidean '
                   'distance metric.'.format(only_gene))
        else:
            msg = ('Only one of the selected genes ({}) is present in all samples but at least two '
                   'such genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select more genes or use Euclidean distance '
                   'metric.'.format(only_gene))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches[i]`` is truthy when the i-th input sample was kept.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = ('All of the selected samples have constant expression across genes. Hierarchical '
                   'clustering of samples cannot be computed.')
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [name for i, name in enumerate(args.sample_names) if matches[i]][0]
            msg = ('Only one of the selected samples ({}) has a non-constant expression across '
                   'genes. However, hierarchical clustering of samples cannot be computed with '
                   'just one sample.'.format(sample_name))
            set_error(msg)
        removed = [name for i, name in enumerate(args.sample_names) if not matches[i]]
        # At most three names are listed; the suffix marks a truncated list.
        suffix = '' if len(removed) <= 3 else ', ...'
        if removed:
            msg = ('{} of the selected samples ({}) have constant expression across genes. '
                   'Those samples are excluded from the computation of hierarchical clustering of '
                   'samples with correlation distance '
                   'metric.'.format(len(removed), ', '.join(removed[:3]) + suffix))
            print(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        # At most three names are listed; the suffix marks a truncated list.
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        listed_names = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # Only the gene name is interpolated here; passing the count first
                # would render the message as "Gene 1 is present ...".
                msg = ('Gene {} is present in some but not all of the selected samples. This '
                       'gene is excluded from the computation of hierarchical clustering of '
                       'samples.'.format(listed_names))
            else:
                msg = ('{} of the selected genes ({}) is missing in at least one of the selected '
                       'samples. This gene is excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), listed_names))
        else:
            if not args.gene_labels:
                msg = ('{} genes ({}) are present in some but not all of the selected samples. Those '
                       'genes are excluded from the computation of hierarchical clustering of '
                       'samples.'.format(len(excluded), listed_names))
            else:
                msg = ('{} of the selected genes ({}) are missing in at least one of the selected '
                       'samples. Those genes are excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), listed_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order
    )

    # Report only the samples that took part in the clustering, renumbered from 0.
    sample_ids = [sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]]
    result = {
        'sample_ids': {i: {'id': sample_id} for i, sample_id in enumerate(sample_ids)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
# Optionally drop the gene columns listed in ``genes_zero``; fail when no gene is left.
if args.filter:
    matrix = np.delete(matrix, genes_zero, axis=1)

    if matrix.shape[1] == 0:
        msg = "Expressions of selected genes are 0. Please select additional genes."
        print(error(msg))
        raise ValueError(msg)

# Cluster with the linkage method and distance function chosen on the command line.
distance = distance_map[args.dstfunc]
cluster = linkage(matrix, method=args.linkage, metric=distance)

# Column 2 of the linkage matrix is summed; a (near) zero total triggers a warning.
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample IDs keyed by their 1-based position in ``args.sampleids``.
sample_ids = {
    position: {'id': int(sample_id)}
    for position, sample_id in enumerate(args.sampleids, start=1)
}

output = {
    'cluster': {
        'linkage': cluster.tolist(),
        'samples_names': sample_ids,
        'order': dend['leaves'],
    },
}
def main():
    """Invoke when run directly as a program.

    Read feature IDs from ``args.feature_ids`` and check that the knowledge
    base knows them. Translate them to ``args.target_db`` IDs when the source
    and target databases differ. Run the external ``processor`` executable
    on the IDs and write its standard output to ``terms.json``.

    Exits with status 1 when no feature is fetched or no mapping is returned.
    """
    # Local import: sys.exit() is the programmatic API, whereas the bare
    # ``exit`` helper is injected by the ``site`` module for interactive use.
    import sys

    args = parse_arguments()
    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db, species=args.species, feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        sys.exit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            sys.exit(1)

        # Build the set once so the membership test in the loop is O(1).
        requested = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id not in requested:
                continue
            if m.source_id not in mappings:
                # The first mapping returned for a source ID wins.
                mappings[m.source_id] = m.target_id
            else:
                print(warning("Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    # The processor reads the space-separated IDs from a temporary file that
    # exists only while the ``with`` block is open.
    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen(
            ['processor', str(args.pval), str(args.min_genes), args.obo, args.gaf, input_genes.name],
            stdout=PIPE,
            stderr=DEVNULL,
        )
        # NOTE(review): the processor's exit status is not checked; a failed
        # run still produces a (possibly empty) terms.json.
        out, _ = process.communicate()

    with open('terms.json', 'w') as f:
        f.write(out.decode("UTF-8"))
def main():
    """Compute gene hierarchical clustering.

    Steps, in order:

    1. Parse the arguments and validate their consistency.
    2. Load the expressions and transform them (optional log2 / z-score).
    3. Optionally drop genes with constant expression.
    4. Warn about genes that were left out.
    5. Compute the clustering and write it as JSON to ``args.output``.

    NOTE(review): the checks below rely on ``set_error`` (defined elsewhere)
    aborting the run after reporting the message -- confirm.
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.gene_labels) == 1:
        msg = 'Select at least two genes to compute hierarchical clustering of genes.'
        set_error(msg)

    if len(args.sample_files) == 1 and args.distance_metric != 'euclidean':
        msg = ('Select at least two samples to compute hierarchical clustering of genes with '
               'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    # ``excluded`` lists the genes that were dropped while loading the expressions.
    expressions, excluded = get_expressions(fnames=args.sample_files, gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        only_gene = get_gene_names(list(expressions.index), args.source, args.species)[0]
        if not args.gene_labels:
            msg = ('The selected samples contain only one common gene ({}). At least two common '
                   'genes are required to compute hierarchical clustering of genes with '
                   'correlation distance metric. Select a different set of samples or use Euclidean '
                   'distance metric.'.format(only_gene))
        else:
            msg = ('Only one of the selected genes ({}) is present in all samples but at least two '
                   'such genes are required to compute hierarchical clustering of genes with '
                   'correlation distance metric. Select more genes or use Euclidean distance '
                   'metric.'.format(only_gene))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # Remember the gene IDs before filtering: ``expressions`` is rebound to
        # the filtered table below, so its index no longer contains the removed genes.
        genes_before = list(expressions.index)
        # Assumes ``matches`` holds one keep-flag per gene of the unfiltered
        # table, in row order -- TODO confirm against remove_const_genes.
        expressions, matches = remove_const_genes(expressions)
        if len(expressions.index) == 0:
            msg = ('All of the selected genes have constant expression across samples. '
                   'Hierarchical clustering of genes cannot be computed.')
            set_error(msg)
        if len(expressions.index) == 1:
            gene_names = get_gene_names(list(expressions.index), args.source, args.species)
            msg = ('Only one of the selected genes ({}) has a non-constant expression across '
                   'samples. However, hierarchical clustering of genes cannot be computed with '
                   'just one gene.'.format(gene_names[0]))
            set_error(msg)
        removed = [gene for gene, keep in zip(genes_before, matches) if not keep]
        # At most three names are listed; the suffix marks a truncated list.
        suffix = '' if len(removed) <= 3 else ', ...'
        if removed:
            removed_names = get_gene_names(removed[:3], args.source, args.species)
            msg = ('{} of the selected genes ({}) have constant expression across samples. '
                   'Those genes are excluded from the computation of hierarchical clustering of '
                   'genes with correlation distance '
                   'metric.'.format(len(removed), ', '.join(removed_names) + suffix))
            print(warning(msg))

    if excluded:
        # At most three names are listed; the suffix marks a truncated list.
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        listed_names = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # Only the gene name is interpolated here; passing the count first
                # would render the message as "Gene 1 is present ...".
                msg = ('Gene {} is present in some but not all of the selected samples. This '
                       'gene is excluded from the computation of hierarchical clustering of '
                       'genes.'.format(listed_names))
            else:
                msg = ('{} of the selected genes ({}) is missing in at least one of the selected '
                       'samples. This gene is excluded from the computation of hierarchical '
                       'clustering of genes.'.format(len(excluded), listed_names))
        else:
            if not args.gene_labels:
                msg = ('{} genes ({}) are present in some but not all of the selected samples. Those '
                       'genes are excluded from the computation of hierarchical clustering of '
                       'genes.'.format(len(excluded), listed_names))
            else:
                msg = ('{} of the selected genes ({}) are missing in at least one of the selected '
                       'samples. Those genes are excluded from the computation of hierarchical '
                       'clustering of genes.'.format(len(excluded), listed_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Genes that took part in the clustering, keyed by their row position.
    result = {
        'gene_symbols': {i: {'gene': gene} for i, gene in enumerate(expressions.index)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
def main():
    """Invoke when run directly as a program.

    Parse the main expression file and attach a gene symbol to every feature
    ID using the knowledge base. Optionally merge additional normalized
    expression files on ``FEATURE_ID``. Write the table to
    ``<output_name>.txt.gz`` (tab-separated, gzip) and ``<output_name>.json``.

    Exits with status 1 when the additional expression files and types do
    not pair up, or when the expression types are not unique.
    """
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(error('The number of additional expression files must match the number of specified '
                        'expressions types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(error('The union of the main expression type ({}) and additional normalized expression types {} '
                        'does not contain unique items.'.format(args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [input_features[i:i + CHUNK_SIZE] for i in range(0, len(input_features), CHUNK_SIZE)]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db, species=args.species, feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Count the input feature IDs that have no entry in the mapping. The same
    # predicate decides whether to warn and what number to report, so missing
    # values in other columns cannot inflate the count.
    unmapped_count = sum(1 for f_id in input_features if f_id not in feature_dict)
    if unmapped_count:
        print(warning('{} feature(s) could not be mapped to the associated feature symbols.'.format(
            unmapped_count)))

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions, args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type

    # Replace NaN values with empty string; a new frame is built instead of
    # filling the column subset in place.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz', header=True, index=False, sep='\t', compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
# Column-wise totals: sum of expressions for each gene.
matrix_sum = np.sum(matrix, axis=0)
# Indices of the genes whose summed expression is below 0.1.
genes_zero = np.where(matrix_sum < 0.1)[0]

if args.filter:
    matrix = np.delete(matrix, genes_zero, axis=1)

    if matrix.shape[1] == 0:
        msg = "Expressions of selected genes are 0. Please select additional genes."
        print(error(msg))
        raise ValueError(msg)

# Cluster with the linkage method and distance function chosen on the command line.
distance = distance_map[args.dstfunc]
cluster = linkage(matrix, method=args.linkage, metric=distance)

# Column 2 of the linkage matrix is summed; a (near) zero total triggers a warning.
distance_sum = cluster[:, 2].sum()
if distance_sum < 0.1:
    print(warning('All sample distances are 0.'))

dend = dendrogram(cluster, no_plot=True)

# Sample IDs keyed by their 1-based position in ``args.sampleids``.
sample_ids = {
    position: {'id': int(sample_id)}
    for position, sample_id in enumerate(args.sampleids, start=1)
}

output = {
    'cluster': {
        'linkage': cluster.tolist(),
        'samples_names': sample_ids,
        'order': dend['leaves'],
    },
}
# Compact JSON (no spaces after separators) on standard output.
print(json.dumps(output, separators=(',', ':')))
import pandas as pd
from resolwe_runtime_utils import warning

# Two required options: the BED12 file and the table of novel junctions.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-bed", "--bed_file",
    required=True,
    help="All splice junctions in BED12 format",
)
parser.add_argument(
    "-sj", "--novel_sj",
    required=True,
    help="Table of annotated novel splice junctions",
)

if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    if os.path.getsize(bed_file) > 0:
        # Tab-separated inputs, all fields read as strings; the BED file has no header row.
        bed_table = pd.read_csv(bed_file, sep="\t", header=None, dtype=str)
        novel_table = pd.read_csv(args.novel_sj, sep="\t", dtype=str)

        # Rows are kept when their column 3 value is listed in the "name"
        # column of the novel junction table.
        keep_row = bed_table[3].isin(novel_table["name"])
        bed_table[keep_row].to_csv("novel_sj.bed", sep="\t", index=False, header=False)
    else:
        # Zero-byte input: warn, reuse the input file as the output and exit with status 0.
        print(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)