def transform(expressions, log2=False, const=1.0, normalization=None, ddof=1):
    """Compute log2 and normalize expression values.

    Parameters:
    - expressions: data frame of expression values; Z-scores are computed
      separately for every column
    - log2: apply the log2(x + const) transformation before normalization
    - const: an additive constant used in computation of log2
    - normalization: None or 'z-score'
    - ddof: degrees of freedom used in computation of z-scores

    Returns the transformed data frame (the input object itself when no
    transformation is requested).

    Raises ValueError when a transformation fails or when the normalization
    type is unknown. Every failure is also reported with ``error``.
    """
    if log2:
        try:
            expressions = expressions.applymap(lambda x: np.log2(x + const))
        except Exception as exc:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit are not turned into ValueError.
            msg = 'Cannot apply log2 to expression values.'
            print(error(msg))
            raise ValueError(msg) from exc

    if normalization:
        if normalization == 'z-score':
            try:
                expressions = expressions.apply(lambda x: zscore(x, ddof=ddof), axis=0)
            except Exception as exc:
                msg = 'Cannot compute Z-scores.'
                print(error(msg))
                raise ValueError(msg) from exc
        else:
            msg = 'Unknown normalization type {}.'.format(normalization)
            print(error(msg))
            raise ValueError(msg)

    return expressions
def main():
    """Invoke when run directly as a program.

    Validate the tab-delimited master file named on the command line and
    report every problem found with ``error``:

    - a row that does not have exactly 12 columns,
    - an 11th or 12th column that is not accepted by ``check_dna_sequence``,
    - an amplicon name (4th column) that occurs more than once.
    """
    args = parse_arguments()

    amplicon_names = set()

    with open(args.master_file, newline="") as masterfile:
        reader = csv.reader(masterfile, delimiter="\t")
        for row in reader:
            if len(row) != 12:
                print(
                    error(
                        "Uploaded master file must contain exactly 12 columns."
                    ))
                if len(row) < 12:
                    # The columns inspected below do not exist in a short
                    # row; indexing them would abort the validation with an
                    # IndexError right after the problem was reported.
                    continue

            if not check_dna_sequence(row[10]):
                print(error("11th column must contain a DNA sequence."))
            if not check_dna_sequence(row[11]):
                print(error("12th column must contain a DNA sequence."))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                print(
                    error(
                        "Amplicon names must be unique. Amplicon {} is seen multiple times."
                        .format(amp_name)))
def start(self, inputs):
    """Run the process from start to finish.

    :param inputs: An instance of `Inputs` describing the process inputs
    :return: An instance of `Outputs` describing the process outputs
    """
    failure_message = "Exception while running process"

    self.logger.info("Process is starting")
    outputs = Outputs(self._meta.outputs)
    self.logger.info("Process is running")

    try:
        self.run(inputs.freeze(), outputs)
        frozen_outputs = outputs.freeze()
        return frozen_outputs
    except Exception as exc:
        # Ordinary failures: report the exception text itself.
        self.logger.exception(failure_message)
        print(resolwe_runtime_utils.error(str(exc)))
        raise
    except BaseException:  # noqa
        # Anything else (e.g. interrupts): report a generic message.
        self.logger.exception(failure_message)
        print(resolwe_runtime_utils.error(failure_message))
        raise
    finally:
        self.logger.info("Process has finished")
def main():
    """Invoke when run directly as a program.

    Read feature IDs from ``args.feature_ids``, check that the knowledge
    base returns features for them, map them to the target database when it
    differs from the source database, run the external ``processor``
    program on the resulting IDs and write its standard output to
    ``terms.json``.

    Exits with status 1 (after reporting with ``error``) when no features
    are fetched or when no mapping is returned.
    """
    args = parse_arguments()
    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db, species=args.species, feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        # Same effect as exit(1), without relying on the helper that the
        # ``site`` module injects into builtins.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            raise SystemExit(1)

        # Set lookup instead of scanning the whole gene list per mapping.
        requested = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id in requested:
                if m.source_id not in mappings:
                    mappings[m.source_id] = m.target_id
                else:
                    print(warning("Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        # Make sure the IDs are on disk before the child process reads them.
        input_genes.flush()
        process = Popen(['processor',
                         str(args.pval),
                         str(args.min_genes),
                         args.obo,
                         args.gaf,
                         input_genes.name],
                        stdout=PIPE,
                        stderr=DEVNULL
                        )
        # stderr is discarded (DEVNULL), so only stdout is of interest.
        out, _ = process.communicate()

    with open('terms.json', 'w') as f:
        f.write(out.decode("UTF-8"))
def parse_expression_file(exp_file, exp_type):
    """Load a gzipped, tab-separated expression file into a Pandas dataframe.

    The ``Gene`` / ``Transcript`` column is renamed to ``FEATURE_ID`` and the
    ``Expression`` column to ``exp_type``. If any other column header is
    present, an error is reported and the program exits with status 1.
    Rows with missing values are removed from the result.
    """
    allowed_columns = ("Gene", "Transcript", "Expression")
    new_names = {
        "Gene": "FEATURE_ID",
        "Transcript": "FEATURE_ID",
        "Expression": exp_type,
    }

    with gzip.open(exp_file) as handle:
        frame = pd.read_csv(handle, sep='\t')

    headers = frame.columns.values
    if any(label not in allowed_columns for label in headers):
        print(error('Invalid column headers {} in file {}.'.format(headers, exp_file)))
        sys.exit(1)

    frame = frame.rename(index=str, columns=new_names)
    # Feature IDs are always handled as strings
    frame['FEATURE_ID'] = frame['FEATURE_ID'].astype('str')
    # Drop rows that have a missing value
    return frame.dropna()
def test_string(self):
    # A plain string is expected to be wrapped into a process_log command.
    self.assertEqual(
        error('Some error'),
        {
            'type': 'COMMAND',
            'type_data': 'process_log',
            'data': {'error': 'Some error'},
        },
    )
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    mappability = parse_mapability_file(args.mappability)
    expression = parse_expression_file(args.counts)

    # Every feature with an expression value needs a mappability value.
    unmatched = expression.index.difference(mappability.index)
    if not unmatched.empty:
        message = (
            "Feature ID {} is not present in the mappability file. "
            "Make sure that the expressions and mappability file are "
            "derived from the same annotations (GTF/GFF) file."
        ).format(unmatched[0])
        send_message(error(message))
        sys.exit(1)

    normalized = 10**9 * expression / expression.sum() / mappability
    # Features with zero mappability get an expression of zero.
    normalized[mappability == 0] = 0.0

    # Output only the features of the expression file, in their order.
    output = normalized.loc[expression.index]
    output.to_csv(
        args.output,
        index_label="Gene",
        header=["Expression"],
        sep="\t",
        compression="gzip",
    )
def get_clustering(expressions, distance_metric='euclidean', linkage_method='average',
                   ordering_method=None, n_keep=None, n_trials=1000):
    """Compute linkage, order, and produce a dendrogram.

    Parameters:
    - expressions: data frame whose columns are the items to cluster
    - distance_metric: metric passed to ``pdist``
    - linkage_method: method passed to ``linkage``
    - ordering_method: None, 'knn', 'optimal' or 'sa'
    - n_keep: passed to ``optimal`` when ordering_method is 'optimal'
    - n_trials: passed to ``simulated_annealing`` when ordering_method is 'sa'

    Returns a ``(linkage, dendrogram)`` tuple. With fewer than two columns
    no clustering is computed: the linkage is an empty array and the
    dendrogram only lists the leaves.

    Raises ValueError when distances, linkage or the dendrogram cannot be
    computed, or when the ordering method is unknown. Every failure is also
    reported with ``error``.
    """
    if len(expressions.columns) < 2:
        return np.array([]), {'leaves': list(range(len(expressions.columns)))}

    try:
        distances = pdist(np.transpose(np.array(expressions)), metric=distance_metric)
    except Exception as exc:
        # Catch Exception rather than everything, so that
        # KeyboardInterrupt and SystemExit are not turned into ValueError.
        msg = 'Cannot compute distances between samples.'
        print(error(msg))
        raise ValueError(msg) from exc

    if np.isnan(distances).any():
        distances = np.nan_to_num(distances, copy=False)
        # The warning has to be printed to be reported; building the
        # message alone has no effect.
        print(warning(
            'Distances between some samples were undefined and were set to zero.'
        ))

    try:
        link = linkage(y=distances, method=linkage_method)
    except Exception as exc:
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg) from exc

    if ordering_method:
        if ordering_method == 'knn':
            link = knn(link, distances)
        elif ordering_method == 'optimal':
            link = optimal(link, distances, n_keep)
        elif ordering_method == 'sa':
            link = simulated_annealing(link, distances, n_trials)
        else:
            msg = 'Unknown ordering method {}'.format(ordering_method)
            print(error(msg))
            raise ValueError(msg)

    try:
        dend = dendrogram(link, no_plot=True)
    except Exception as exc:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg) from exc

    return link, dend
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)

    required_keys = ('expected_format', 'compatible_fragment_ratio')
    if not all(key in data for key in required_keys):
        print(error("Cannot parse library type information file."))
        return

    print(save('strandedness', data['expected_format']))
    ratio = round(data['compatible_fragment_ratio'], 2)
    print(save('fragment_ratio', str(ratio)))
def get_clustering(expressions, distance_metric='euclidean', linkage_method='average', order=False):
    """Compute linkage, order, and produce a dendrogram.

    Parameters:
    - expressions: data frame whose columns are the items to cluster
    - distance_metric: metric used by ``linkage`` to compute distances
      between the columns
    - linkage_method: method passed to ``linkage``
    - order: passed to ``linkage`` as ``optimal_ordering``

    Returns a ``(linkage, dendrogram)`` tuple.

    Raises ValueError when the linkage or the dendrogram cannot be computed.
    Every failure is also reported with ``error``.
    """
    try:
        # The metric has to be forwarded explicitly; otherwise linkage falls
        # back to its own default and ``distance_metric`` has no effect.
        link = linkage(
            y=expressions.transpose(),
            method=linkage_method,
            metric=distance_metric,
            optimal_ordering=order,
        )
    except Exception as exc:
        # Catch Exception rather than everything, so that
        # KeyboardInterrupt and SystemExit are not turned into ValueError.
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg) from exc

    try:
        dend = dendrogram(link, no_plot=True)
    except Exception as exc:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg) from exc

    return link, dend
def main():
    """Invoke when run directly as a program.

    Validate the tab-delimited master file named on the command line and
    report every problem found with ``error``:

    - a row that does not have exactly 12 columns,
    - an 11th or 12th column that is not accepted by ``check_dna_sequence``,
    - an amplicon name (4th column) that occurs more than once.
    """
    args = parse_arguments()

    amplicon_names = set()

    with open(args.master_file, newline='') as masterfile:
        reader = csv.reader(masterfile, delimiter='\t')
        for row in reader:
            n_columns = len(row)
            if n_columns != 12:
                print(error('Uploaded master file must contain exactly 12 columns.'))
            if n_columns < 12:
                # A short row lacks the columns checked below; indexing
                # them would stop the validation with an IndexError.
                continue

            if not check_dna_sequence(row[10]):
                print(error('11th column must contain a DNA sequence.'))
            if not check_dna_sequence(row[11]):
                print(error('12th column must contain a DNA sequence.'))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                print(error('Amplicon names must be unique. Amplicon {} is seen multiple times.'.format(amp_name)))
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)

    # Both entries are needed to describe the library type.
    has_format = "expected_format" in data
    has_ratio = has_format and "compatible_fragment_ratio" in data

    if has_ratio:
        strandedness = data["expected_format"]
        fragment_ratio = str(round(data["compatible_fragment_ratio"], 2))
        send_message(save("strandedness", strandedness))
        send_message(save("fragment_ratio", fragment_ratio))
    else:
        send_message(error("Cannot parse library type information file."))
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)

    if 'expected_format' not in data or 'compatible_fragment_ratio' not in data:
        # Without both entries the file cannot be interpreted.
        print(error("Cannot parse library type information file."))
    else:
        rounded_ratio = round(data['compatible_fragment_ratio'], 2)
        print(save('strandedness', data['expected_format']))
        print(save('fragment_ratio', str(rounded_ratio)))
def parse_mapability_file(mapability_file):
    """Parse mapability file to a Pandas Series.

    Only the ``gene_id`` and ``coverage`` columns of the tab-separated file
    are read. The result is indexed by ``gene_id`` (as strings) and holds
    ``coverage`` as floats, with missing values removed.

    If the file cannot be read or parsed (ValueError or OSError), the
    failure is reported with ``error`` and the program exits with status 1.
    """
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep='\t',
            usecols=['gene_id', 'coverage'],
            index_col='gene_id',
            dtype={
                'gene_id': str,
                'coverage': float,
            },
        )
        # read_csv no longer accepts ``squeeze=True`` (removed in pandas
        # 2.0); squeezing the column axis gives the same Series.
        return mappability.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        # Name the file that was actually passed in; the handler used to
        # reference an undefined variable and failed with NameError.
        print(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
def parse_mapability_file(mapability_file):
    """Parse mapability file to a Pandas Series.

    Only the ``gene_id`` and ``coverage`` columns of the tab-separated file
    are read. The result is indexed by ``gene_id`` (as strings) and holds
    ``coverage`` as floats, with missing values removed.

    If the file cannot be read or parsed (ValueError or OSError), the
    failure is sent with ``send_message(error(...))`` and the program exits
    with status 1.
    """
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep="\t",
            usecols=["gene_id", "coverage"],
            index_col="gene_id",
            dtype={
                "gene_id": str,
                "coverage": float,
            },
        )
        # read_csv no longer accepts ``squeeze=True`` (removed in pandas
        # 2.0); squeezing the column axis gives the same Series.
        return mappability.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series.

    Only the ``Gene`` and ``Expression`` columns of the gzipped,
    tab-separated file are read. The result is indexed by ``Gene`` (as
    strings) and holds ``Expression`` as floats, with missing values
    removed.

    If the file cannot be read or parsed (ValueError or OSError), the
    failure is sent with ``send_message(error(...))`` and the program exits
    with status 1.
    """
    try:
        expression = pd.read_csv(
            exp_file,
            sep="\t",
            compression="gzip",
            usecols=["Gene", "Expression"],
            index_col="Gene",
            dtype={
                "Gene": str,
                "Expression": float,
            },
        )
        # read_csv no longer accepts ``squeeze=True`` (removed in pandas
        # 2.0); squeezing the column axis gives the same Series.
        return expression.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
def parse_expression_file(exp_file, exp_type):
    """Read a gzipped expression table and normalize its column names.

    ``Gene`` and ``Transcript`` become ``FEATURE_ID`` and ``Expression``
    becomes ``exp_type``. Any other column header is reported with ``error``
    and ends the program with exit status 1. Rows containing missing values
    are not returned.
    """
    ALLOWED_COLUMNS = ["Gene", "Transcript", "Expression"]

    with gzip.open(exp_file) as exp:
        table = pd.read_csv(exp, sep='\t')

    unexpected = [label for label in table.columns.values if label not in ALLOWED_COLUMNS]
    if unexpected:
        message = 'Invalid column headers {} in file {}.'.format(table.columns.values, exp_file)
        print(error(message))
        sys.exit(1)

    column_names = {"Gene": "FEATURE_ID", "Transcript": "FEATURE_ID", "Expression": exp_type}
    table = table.rename(index=str, columns=column_names)

    # Feature IDs are kept as strings
    feature_ids = table['FEATURE_ID'].astype('str')
    table['FEATURE_ID'] = feature_ids

    # Rows with missing values are left out of the result
    table = table.dropna()
    return table
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series.

    Only the ``Gene`` and ``Expression`` columns of the gzipped,
    tab-separated file are read. The result is indexed by ``Gene`` (as
    strings) and holds ``Expression`` as floats, with missing values
    removed.

    If the file cannot be read or parsed (ValueError or OSError), the
    failure is reported with ``error`` and the program exits with status 1.
    """
    try:
        expression = pd.read_csv(
            exp_file,
            sep='\t',
            compression='gzip',
            usecols=['Gene', 'Expression'],
            index_col='Gene',
            dtype={
                'Gene': str,
                'Expression': float,
            },
        )
        # read_csv no longer accepts ``squeeze=True`` (removed in pandas
        # 2.0); squeezing the column axis gives the same Series.
        return expression.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        print(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
def transform(expressions, log2=False, const=1.0, z_score=False, ddof=1):
    """Compute log2 and normalize expression values.

    Parameters:
    - log2: use log2(x+const) transformation
    - const: an additive constant used in computation of log2
    - z_score: use Z-score normalization
    - ddof: degrees of freedom used in computation of Z-score

    A ValueError is raised (and reported with ``error``) when the log2
    transformation produces missing values. Z-scores are computed per row;
    undefined Z-scores are replaced with 0.0.
    """
    if log2:
        expressions = expressions.applymap(lambda value: np.log2(value + const))
        has_missing = expressions.isnull().values.any()
        if has_missing:
            msg = 'Cannot apply log2 to expression values.'
            print(error(msg))
            raise ValueError(msg)

    if not z_score:
        return expressions

    def row_zscore(row):
        return zscore(row, ddof=ddof)

    normalized = expressions.apply(row_zscore, axis=1, result_type='broadcast')
    return normalized.fillna(value=0.0)
def main():
    """Invoke when run directly as a program.

    Read feature IDs from ``args.feature_ids``, check that the knowledge
    base returns features for them, map them to the target database when it
    differs from the source database, run the external ``processor``
    program on the resulting IDs and write its standard output to
    ``terms.json``.

    Exits with status 1 (after reporting with ``error``) when no features
    are fetched or when no mapping is returned.
    """
    args = parse_arguments()
    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        # Same effect as exit(1), without relying on the helper that the
        # ``site`` module injects into builtins.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            raise SystemExit(1)

        # Set lookup instead of scanning the whole gene list per mapping.
        gene_set = set(genes)
        mappings = {}
        for m in mapping_res:
            if m.source_id not in gene_set:
                continue
            if m.source_id not in mappings:
                mappings[m.source_id] = m.target_id
            else:
                print(
                    warning(
                        "Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        # Make sure the IDs are on disk before the child process reads them.
        input_genes.flush()
        process = Popen([
            'processor',
            str(args.pval),
            str(args.min_genes), args.obo, args.gaf, input_genes.name
        ],
                        stdout=PIPE,
                        stderr=DEVNULL)
        # stderr is discarded (DEVNULL), so only stdout is of interest.
        out, _ = process.communicate()

    with open('terms.json', 'w') as f:
        f.write(out.decode("UTF-8"))
def main():
    """Invoke when run directly as a program.

    Build a table with one row per feature of the main expression file:
    feature ID, gene symbol fetched from the knowledge base, the main
    expression value and one column per additional normalized expression
    file. The table is written to ``<output_name>.txt.gz`` (tab-separated,
    gzipped) and to ``<output_name>.json``.

    Exits with status 1 (after reporting with ``error``) when the number of
    additional expression files differs from the number of additional
    expression types, or when the expression types are not unique.
    """
    args = parse_arguments()

    norm_files = args.norm_expressions or []
    norm_types = args.norm_expressions_type or []

    # Compare the counts even when one of the two options is missing:
    # types without files would otherwise fail later with a KeyError on a
    # column that was never merged in, and files without types would be
    # silently ignored.
    if len(norm_files) != len(norm_types):
        print(error('The number of additional expression files must match the number of specified '
                    'expressions types.'))
        sys.exit(1)

    if norm_types:
        exp_types = [args.expressions_type] + norm_types
        if len(exp_types) != len(set(exp_types)):
            print(error('The union of the main expression type ({}) and additional normalized expression types {} '
                        'does not contain unique items.'.format(args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [input_features[i:i + CHUNK_SIZE] for i in range(0, len(input_features), CHUNK_SIZE)]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db, species=args.species, feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Check if all of the input feature IDs could be mapped to the gene symbols
    if not all(f_id in feature_dict for f_id in input_features):
        print(warning('{} feature(s) could not be mapped to the associated feature symbols.'.format(
            df.isnull().values.sum()))
        )

    # Merge additional expression files with the original data frame
    for exp_file, exp_type in zip(norm_files, norm_types):
        exp_df = parse_expression_file(exp_file, exp_type)
        df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type] + norm_types

    # Replace NaN values with empty string. A new frame is built instead of
    # filling the column subset in place, which pandas may flag as a write
    # to a copy of a slice.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz', header=True, index=False, sep='\t', compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
help='clustering linkage function') parser.add_argument('--filter', help="Filter genes with low expression", action="store_true") args = parser.parse_args() distance_map = { 'spearman': lambda x, y: 1 - spearmanr(x, y).correlation, 'pearson': lambda x, y: 1 - np.corrcoef(x, y)[0][1], 'euclidean': 'euclidean' } if args.dstfunc not in distance_map: msg = "Invalid distance function {}".format(args.dstfunc) print(error(msg)) raise ValueError(msg) if args.linkage not in ['average', 'single', 'complete']: msg = "Invalid clustering linkage function {}".format(args.linkage) print(error(msg)) raise ValueError(msg) if not args.sampleids or len(args.sampleids) != len(args.sample_files): msg = "Number of sample ids must match the number of files" print(error(msg)) raise ValueError(msg) # read data matrix = [] gene_subset = set(args.genes) if args.genes else None
def test_string(self):
    # A plain string is expected to be reported under the "proc.error" key.
    expected = '{"proc.error": "Some error"}'
    self.assertEqual(error('Some error'), expected)
def set_error(msg):
    """Report ``msg`` with ``error`` and raise it as a ValueError."""
    report = error(msg)
    print(report)
    raise ValueError(msg)
def main():
    """Invoke when run directly as a program.

    Build a table with one row per feature of the main expression file:
    feature ID, gene symbol fetched from the knowledge base, the main
    expression value and one column per additional normalized expression
    file. The table is written to ``<output_name>.txt.gz`` (tab-separated,
    gzipped) and to ``<output_name>.json``.

    Exits with status 1 (after reporting with ``error``) when the number of
    additional expression files differs from the number of additional
    expression types, or when the expression types are not unique.
    """
    args = parse_arguments()

    extra_files = args.norm_expressions or []
    extra_types = args.norm_expressions_type or []

    # Compare the counts even when one of the two options is missing:
    # types without files would otherwise fail later with a KeyError on a
    # column that was never merged in, and files without types would be
    # silently ignored.
    if len(extra_files) != len(extra_types):
        print(
            error(
                'The number of additional expression files must match the number of specified '
                'expressions types.'))
        sys.exit(1)

    if extra_types:
        exp_types = [args.expressions_type] + extra_types
        if len(exp_types) != len(set(exp_types)):
            print(
                error(
                    'The union of the main expression type ({}) and additional normalized expression types {} '
                    'does not contain unique items.'.format(
                        args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks of at most CHUNK_SIZE elements
    features_sublists = [
        input_features[i:i + CHUNK_SIZE]
        for i in range(0, len(input_features), CHUNK_SIZE)
    ]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Check if all of the input feature IDs could be mapped to the gene symbols
    if not all(f_id in feature_dict for f_id in input_features):
        print(
            warning(
                '{} feature(s) could not be mapped to the associated feature symbols.'
                .format(df.isnull().values.sum())))

    # Merge additional expression files with the original data frame
    for exp_file, exp_type in zip(extra_files, extra_types):
        exp_df = parse_expression_file(exp_file, exp_type)
        df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type] + extra_types

    # Replace NaN values with empty string. A new frame is built instead of
    # filling the column subset in place, which pandas may flag as a write
    # to a copy of a slice.
    df = df[columns].fillna('')

    # Write to file
    df.to_csv(args.output_name + '.txt.gz',
              header=True,
              index=False,
              sep='\t',
              compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
import argparse

from pysam import VariantFile
from resolwe_runtime_utils import error, warning

# Command-line interface: the VCF file to read and the summary file to append to.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_file', help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument('summary', help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    # Report the failure first, then re-raise the original message as ValueError.
    proc_error = 'Input VCF file does not exist or could not be correctly opened.'
    print(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
# Map every header record key to its value.
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        # Keep only the file name of the 'reference' header record.
        fasta_name = os.path.basename(header_records['reference'])
    except KeyError:
        # The header has no 'reference' record: continue with an empty name
        # and print a warning.
        fasta_name = ''
        print(
            warning(
                'Reference sequence (FASTA) name could not be recognized from the VCF header.'
            ))
""" import argparse import pandas as pd from pandas.errors import EmptyDataError from resolwe_runtime_utils import error parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("-f", "--bed_file", help="Bed file.") args = parser.parse_args() try: df = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str) except EmptyDataError: print( error( f"The input BED file {args.bed_file} is empty. Your analysis might " f"have failed to identify regions of interest (peaks, junctions, etc.)." )) else: df.iloc[:, 4] = pd.to_numeric(df.iloc[:, 4]).round().astype(int) df.iloc[:, 4] = df.iloc[:, 4].clip(upper=1000) # if strand column exist replace '?' with '.' if len(df.columns) >= 6: df.iloc[:, 5] = df.iloc[:, 5].replace("?", ".") output_name = "_".join(["corrected", args.bed_file]) df.to_csv(output_name, sep="\t", index=False, header=False)
def error(self, *args):
    """Log error message."""
    # All arguments are converted to strings and joined with single spaces.
    message = ' '.join(map(str, args))
    # TODO: Use the protocol to report progress.
    print(resolwe_runtime_utils.error(message))
#!/usr/bin/env python3
"""Check if sample names are unique."""
import argparse

from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(description="Check if sample names are unique")
parser.add_argument("samples", help="All samples")
args = parser.parse_args()

# All sample names arrive as one comma-separated argument.
samples = args.samples.split(",")

# A set keeps each name once, so it is smaller whenever a name is repeated.
if len(set(samples)) < len(samples):
    send_message(error("Sample names must be unique."))
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, send_message, warning

# Command-line interface: the VCF file to read and the summary file to append to.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("vcf_file", help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument("summary", help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    # Send the failure first, then re-raise the original message as ValueError.
    proc_error = "Input VCF file does not exist or could not be correctly opened."
    send_message(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
# Map every header record key to its value.
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        # Keep only the file name of the "reference" header record.
        fasta_name = os.path.basename(header_records["reference"])
    except KeyError:
        # The header has no "reference" record: continue with an empty name
        # and send a warning.
        fasta_name = ""
        send_message(
            warning(
                "Reference sequence (FASTA) name could not be recognized from the VCF header."
            ))
break if args.c: x_axis = data.iloc[:, 8][::-1] y_axis = data.iloc[:, 6] - data.iloc[:, 7] else: x_axis = data.iloc[:, 7][::-1] y_axis = data.iloc[:, 6] n_sup_enh, rows = data[data.isSuper == 1].shape chr_pos = data.CHROM.map(str) + ":" + data.START.map( str) + "-" + data.STOP.map(str) if len(x_axis) != len(y_axis): send_message(error("Scatter plot error. len(x_axis) != len(y_axis)")) if len(labels) > 0 and len(labels) != len(x_axis): send_message(error("Scatter plot error. len(labels) != len(x_axis)")) data = { "points": { "x_axis": list(x_axis), "y_axis": list(y_axis), "items": labels }, "annotations": [ { "type": "line", "x1": 0, "y1": float(cutoff),
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    de_data = (
        pd.read_csv(args.raw_file, sep="\t")
        .rename(columns={"Unnamed: 0": "gene_id"})
        .fillna(value=1)
    )

    # Make sure all listed numeric columns are valid numeric variables based
    # on a union of numeric column names from cuffdiff, edgeR, deseq2 and test
    # files.
    numeric_columns = [
        "baseMean",
        "log2FoldChange",
        "lfcSE",
        "stat",
        "pvalue",
        "padj",
        "value_1",
        "value_2",
        "log2(fold_change)",
        "test_stat",
        "p_value",
        "q_value",
        "logfc",
        "fdr",
        "stat",
        "logFC",
        "logCPM",
        "LR",
        "Pvalue",
        "FDR",
    ]
    de_columns = de_data.columns
    for name in (label for label in numeric_columns if label in de_columns):
        if is_numeric_dtype(de_data[name]):
            continue
        msg = (
            f"Column {name} is not numeric. Please make sure "
            f"that the input file has valid numeric values (i.e. "
            f"periods for decimal places)."
        )
        print(error(msg))
        raise ValueError(msg)

    # Output columns, collected in the order in which they are written.
    columns = {}

    if args.gene_id:
        ids = de_data.index if args.gene_id == "index" else de_data[args.gene_id]
        columns["gene_id"] = list(ids.astype(str))

    if args.logfc:
        fold_changes = np.array(de_data[args.logfc])
        # Infinite fold changes are stored as 0.
        fold_changes[np.isinf(fold_changes)] = 0
        columns["logfc"] = list(fold_changes)

    # The remaining statistics are copied over unchanged.
    for key in ("fdr", "pvalue", "fwer", "logodds", "stat"):
        source_column = getattr(args, key)
        if source_column:
            columns[key] = list(de_data[source_column])

    col_order = list(columns)

    with open(args.output_json, "w") as f:
        json.dump(columns, f, separators=(",", ":"), allow_nan=False)

    outdf = pd.DataFrame(columns)[col_order]
    outdf.to_csv(args.output_file, sep="\t", index=False, compression="gzip")
barcode, filename = "", "" if len(t) == 2: barcode, filename = t[0:2] if len(t) > 2 and isnum(t[0]): barcode, filename = t[1:3] barcode, filename = barcode.strip(), filename.strip() if barcode and filename: pool_maps[barcode] = filename if barcode_length > 0 and barcode_length != len(barcode): send_message( error("Barcodes should be of the same length.")) exit(1) else: barcode_length = len(barcode) for bar, _map in iteritems(pool_maps): print("{}: {}".format(bar, _map)) def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps, progress_start): """Parse multiplexed file.""" pool_name = reads1_file.split(".")[0] def nicename(a): return a.replace("#", "").replace(" ",
def set_error(msg):
    """Send ``msg`` as an error message and raise it as a ValueError."""
    report = error(msg)
    send_message(report)
    raise ValueError(msg)
parser.add_argument('-g', '--genes', nargs='+', default=[], help='subset of gene ids')
parser.add_argument('-d', '--dstfunc', default='euclidean', help='distance function')
parser.add_argument('-l', '--linkage', default='average', help='clustering linkage function')
parser.add_argument('--filter', help="Filter genes with low expression", action="store_true")

args = parser.parse_args()

# Supported distance functions: 'spearman' and 'pearson' map to callables
# returning 1 - correlation, 'euclidean' maps to the metric name itself.
distance_map = {
    'spearman': lambda x, y: 1 - spearmanr(x, y).correlation,
    'pearson': lambda x, y: 1 - np.corrcoef(x, y)[0][1],
    'euclidean': 'euclidean'
}

# Validate the arguments; every failure is printed with error() and then
# raised as ValueError.
if args.dstfunc not in distance_map:
    msg = "Invalid distance function {}".format(args.dstfunc)
    print(error(msg))
    raise ValueError(msg)

if args.linkage not in ['average', 'single', 'complete']:
    msg = "Invalid clustering linkage function {}".format(args.linkage)
    print(error(msg))
    raise ValueError(msg)

# Sample ids are required and there must be one per sample file.
if not args.sampleids or len(args.sampleids) != len(args.sample_files):
    msg = "Number of sample ids must match the number of files"
    print(error(msg))
    raise ValueError(msg)

# read data
matrix = []
# None when no gene ids were given on the command line.
gene_subset = set(args.genes) if args.genes else None