def main():
    """Write one gene list file per term found in a GAF file.

    Reads the (optionally gzipped) file given by ``args.gaf_file``, groups
    the values of the second tab-separated column by the value of the fifth
    column and writes every group to ``<term without ':'>.tab``. For each
    written file an export line and a ``run`` command for the
    ``upload-geneset`` process are printed, followed by the total number of
    gene sets.
    """
    options = parse_arguments()

    members_by_term = defaultdict(list)
    with gzopen(options.gaf_file) as annotation_lines:
        for raw_line in annotation_lines:
            # Lines beginning with '!' are skipped.
            if not raw_line.startswith('!'):
                fields = raw_line.strip().split('\t')
                members_by_term[fields[4]].append(fields[1])

    for term, members in members_by_term.items():
        outfile = '{}.tab'.format(str(term).replace(':', ''))
        with open(outfile, "wt") as handle:
            tsv = csv.writer(handle, delimiter=str('\t'), lineterminator='\n')
            tsv.writerows([member] for member in members)

        print(export(outfile))
        payload = {
            'process': 'upload-geneset',
            'input': {
                'src': outfile,
                'source': options.source
            }
        }
        print('run {}'.format(json.dumps(payload, separators=(',', ':'))))

    print('{{"num_genesets":{}}}'.format(len(members_by_term)))
def get_measured(sample_exp, sample_name, exp_type, only_zero=False, only_nonzero=False, log2=False):
    """Get measured expression values.

    If specified, also log2 transform and only keep (non)zero values.

    :param sample_exp: path of a tab-delimited expression file (opened
        through ``utils.gzopen``) with ``Gene`` and ``Expression`` columns
    :param sample_name: unused; kept for interface compatibility
    :param exp_type: unused; kept for interface compatibility
    :param only_zero: keep only genes whose expression equals zero
    :param only_nonzero: keep only genes whose expression differs from zero
    :param log2: apply ``np.log2`` to the (filtered) values
    :return: float ``pandas.Series`` of expressions indexed by ``Gene``
    :raises AssertionError: if both ``only_zero`` and ``only_nonzero`` are set
    """
    # Close the input handle deterministically instead of leaking it.
    with utils.gzopen(sample_exp) as handle:
        exp = pd.read_csv(handle, delimiter='\t', index_col='Gene')
    exp = exp.loc[:, 'Expression'].astype('float')

    assert not (only_zero and only_nonzero)
    if only_zero:
        exp = exp[exp == 0]
    elif only_nonzero:
        # Boolean mask replaces ``Series.nonzero()``, which no longer exists
        # in pandas >= 1.0; NaN values are kept, exactly as before.
        exp = exp[exp != 0]

    if log2:
        # Zero values (if not filtered out above) become -inf.
        exp = np.log2(exp)

    return exp
def get_measured(
    sample_exp, sample_name, exp_type, only_zero=False, only_nonzero=False, log2=False
):
    """Get measured expression values.

    If specified, also log2 transform and only keep (non)zero values.

    :param sample_exp: path of a tab-delimited expression file (opened
        through ``utils.gzopen``) with ``Gene`` and ``Expression`` columns
    :param sample_name: unused; kept for interface compatibility
    :param exp_type: unused; kept for interface compatibility
    :param only_zero: keep only genes whose expression equals zero
    :param only_nonzero: keep only genes whose expression differs from zero
    :param log2: apply ``np.log2`` to the (filtered) values
    :return: float ``pandas.Series`` of expressions indexed by ``Gene``
    :raises AssertionError: if both ``only_zero`` and ``only_nonzero`` are set
    """
    # Close the input handle deterministically instead of leaking it.
    with utils.gzopen(sample_exp) as handle:
        exp = pd.read_csv(handle, delimiter="\t", index_col="Gene")
    exp = exp.loc[:, "Expression"].astype("float")

    assert not (only_zero and only_nonzero)
    if only_zero:
        exp = exp[exp == 0]
    elif only_nonzero:
        # Boolean mask replaces ``Series.nonzero()``, which no longer exists
        # in pandas >= 1.0; NaN values are kept, exactly as before.
        exp = exp[exp != 0]

    if log2:
        # Zero values (if not filtered out above) become -inf.
        exp = np.log2(exp)

    return exp
args = parser.parse_args()
out_file = open(args.out, "w")

# Accumulators filled while the expression time course files are read.
header = ["Gene"]
geneset = []
exp = defaultdict(list)
experiments = iter(args.experiments)

for etc in args.files:
    # A missing input file aborts the script with exit status 1.
    if not os.path.isfile(etc):
        exit(1)

    with utils.gzopen(etc) as f:
        etc_data = json.load(f)

    # One experiment label is consumed per input file, in order.
    x = next(experiments)
    # One output column per time point, labelled "<experiment> - <tp>h".
    header.extend(
        ' - '.join((x, str(tp))) + 'h' for tp in etc_data["etc"]["timePoints"]
    )

    expressed = etc_data["etc"]["genes"]
    gn = set(expressed)
    geneset.append(gn)
    for g in gn:
        exp[g].append(expressed[g])

# Only genes present in every file are kept.
genes = set.intersection(*geneset)
import csv import re import sys import utils parser = argparse.ArgumentParser( description='Create BEDGRAPH coverage file for a tab file w.r.t. given GFF3 annotations.') parser.add_argument('--tab', dest='tab_file', help='Tab file') parser.add_argument('--tab-coverage-col', dest='tab_col_val', help='Tab column with coverage value') parser.add_argument('--gff3', dest='gff3_file', help='GFF3 file') args = parser.parse_args() # Fetch gene ids and their expressions from tab file with utils.gzopen(args.tab_file) as f: rdr = csv.reader(f, delimiter='\t') rdr.next() # skip header tab_vals = {row[0]: float(row[int(args.tab_col_val)]) for row in rdr} genes = {} # Fetch gene regions and chromosomes they belong to with open(args.gff3_file, 'r') as f: rdr = csv.reader(f, delimiter='\t') gene_id_regex = re.compile(r'ID=([A-Za-z0-9_]+);') for i, row in enumerate(rdr): # skip GFF3 headers if row[0][0:2] == '##': continue # skip if not mRNA if row[2] != 'mRNA' or row[2] != 'transcript':
print('{"rc":"1"}') exit(1) if not (args.input and os.path.isfile(args.input)): print('{"rc":"1"}') exit(1) def isfloat(value): """Check if value is float.""" try: float(value) return True except ValueError: return False with utils.gzopen(args.input) as f: # Split lines by tabs # Ignore lines without a number in second column # Build a dictionary of gene-expression pairs exp = {'genes': {gene_exp[0]: float(gene_exp[1]) for gene_exp in (l.split('\t') for l in f) if len(gene_exp) == 2 and isfloat(gene_exp[1])}} if args.output: with open(args.output, 'w') as f: json.dump(exp, f) else: print('{"exp_json":%s}' % json.dumps(exp, separators=(',', ':')))
if not (args.input and os.path.isfile(args.input)): print('{"rc":"1"}') exit(1) def isfloat(value): """Check if value is float.""" try: float(value) return True except ValueError: return False with utils.gzopen(args.input) as f: # Split lines by tabs # Ignore lines without a number in second column # Build a dictionary of gene-expression pairs exp = { 'genes': { utils.escape_mongokey(gene_exp[0]): float(gene_exp[1]) for gene_exp in (l.split('\t') for l in f) if len(gene_exp) == 2 and isfloat(gene_exp[1]) } } if args.output: with open(args.output, 'w') as f: json.dump(exp, f) else:
# Validate the command line arguments before any file is read.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

if args.linkage not in linkage_map:
    raise ValueError("Invalid clustering linkage function {}".format(
        args.linkage))

if not args.expids or len(args.expids) != len(args.etc_files):
    raise ValueError("Number of experiment ids must match the number of files")

etcs = []
timepoints = set()

# read data
for i, fname in enumerate(args.etc_files):
    # Use a context manager so the file handle is closed after parsing.
    with utils.gzopen(fname) as handle:
        etcjson = json.load(handle)

    tps = etcjson['etc']['timePoints']
    expid = args.expids[i]

    # Compare neighbouring time points pairwise; this avoids re-using the
    # outer loop index name inside the check.
    if not all(earlier <= later for earlier, later in zip(tps, tps[1:])):
        raise ValueError("Timepoints should be ordered")

    etc = {'genes': {}, 'experiment': expid, 'timePoints': np.array(tps)}
    timepoints.update(tps)

    # Keep only the requested genes that are present in this file.
    for gene in args.genes:
        if gene in etcjson['etc']['genes']:
            etc['genes'][gene] = np.array(etcjson['etc']['genes'][gene])

    etcs.append(etc)
# Main
split = os.path.split
# The cache file name is derived from the names of the directories that
# contain the ontology and the annotation files.
ontology_id = split(split(args.ontology)[0])[1]
annotation_id = split(split(args.annotation)[0])[1]
annotation_cache = os.path.join('/tmp', 'GO_' + ontology_id + '_Annotation_' + annotation_id)

if os.path.isfile(annotation_cache):
    # NOTE(review): the cache lives at a predictable path in /tmp and is read
    # with pickle, which can execute arbitrary code if the file was planted by
    # someone else - consider a private cache directory.
    try:
        with open(annotation_cache, 'rb') as fd:
            annotations = pickle.load(fd)
    except Exception:
        # Drop an unreadable cache so the next run rebuilds it, then fail.
        os.remove(annotation_cache)
        raise
else:
    with utils.gzopen(args.ontology) as fd:
        ontology = Ontology(fd)
        annotations = Annotations(file=args.annotation, ontology=ontology)

    # -1 selects the highest pickle protocol available.
    with open(annotation_cache, 'wb') as fd:
        pickle.dump(annotations, fd, -1)

translator = {a.DB_Object_Symbol: a.DB_Object_ID for a in annotations}

orth = {}
genes = set()

if args.orthologues:
    # Each line holds one tab-separated pair; close the handle once the
    # mapping has been built.
    with utils.gzopen(args.orthologues) as orth_file:
        orth = dict(line.strip().split("\t") for line in orth_file)
# Validate the command line arguments before any file is read.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

if args.linkage not in linkage_map:
    raise ValueError("Invalid clustering linkage function {}".format(args.linkage))

if not args.expids or len(args.expids) != len(args.etc_files):
    raise ValueError("Number of experiment ids must match the number of files")

etcs = []
timepoints = set()

# read data
for i, fname in enumerate(args.etc_files):
    # Use a context manager so the file handle is closed after parsing.
    with utils.gzopen(fname) as handle:
        etcjson = json.load(handle)

    tps = etcjson['etc']['timePoints']
    expid = args.expids[i]

    # Pairwise comparison of neighbouring time points; replaces the
    # Python 2 only ``xrange`` index loop and works on Python 2 and 3.
    if not all(earlier <= later for earlier, later in zip(tps, tps[1:])):
        raise ValueError("Timepoints should be ordered")

    etc = {'genes': {}, 'experiment': expid, 'timePoints': np.array(tps)}
    timepoints.update(tps)

    # Keep only the requested genes that are present in this file.
    for gene in args.genes:
        if gene in etcjson['etc']['genes']:
            etc['genes'][gene] = np.array(etcjson['etc']['genes'][gene])

    etcs.append(etc)
import utils

# Exactly one argument (the expression file) is expected; failures are
# reported as an {"rc":"1"} line followed by exit status 1.
# ``print(...)`` with a single argument prints the same text on Python 2
# and Python 3, unlike the ``print`` statement.
if len(sys.argv) != 2:
    print('{"rc":"1"}')
    exit(1)

fname = sys.argv[1]

if not os.path.isfile(fname):
    print('{"rc":"1"}')
    exit(1)


def isfloat(value):
    """Return True if ``float(value)`` succeeds, False on ValueError."""
    try:
        float(value)
        return True
    except ValueError:
        return False


with utils.gzopen(fname) as f:
    # Split lines by tabs
    # Ignore lines without a number in second column
    # Build a dictionary of gene-expression pairs
    exp = {
        'genes': {
            utils.escape_mongokey(gene_exp[0]): float(gene_exp[1])
            for gene_exp in (l.split('\t') for l in f)
            if len(gene_exp) == 2 and isfloat(gene_exp[1])
        }
    }

print('{"exp_json":%s}' % json.dumps(exp, separators=(',', ':')))
args = parser.parse_args()
out_file = open(args.out, "w")

# Accumulators filled while the expression time course files are read.
header = ["Gene"]
geneset = []
exp = defaultdict(list)
experiments = iter(args.experiments)

for etc in args.files:
    # A missing input file aborts the script with exit status 1.
    if not os.path.isfile(etc):
        exit(1)

    with utils.gzopen(etc) as f:
        etc_data = json.load(f)

    # The built-in next() works on Python 2.6+ and Python 3; the iterator
    # method ``.next()`` exists on Python 2 only.
    x = next(experiments)
    # One output column per time point, labelled "<experiment> - <tp>h".
    header = header + [x + ' - ' + tp + 'h' for tp in map(str, etc_data["etc"]["timePoints"])]

    gn = set(etc_data["etc"]["genes"])
    geneset.append(gn)
    for g in gn:
        exp[g].append(etc_data["etc"]["genes"][g])

# Only genes present in every file are kept ...
genes = set.intersection(*geneset)

# ... optionally restricted to the genes requested on the command line.
if args.genes:
    genes = genes.intersection(args.genes)
# pylint: disable=missing-docstring,invalid-name # XXX: Refactor to a comand line tool and remove pylint disable """Change genes names to orthologues ones.""" from __future__ import absolute_import, division, print_function import argparse import csv import utils parser = argparse.ArgumentParser(description='Change genes names to orthologues ones.') parser.add_argument('ortholog_file', help='file with orthologues') parser.add_argument('genes', nargs='*', help='genes names') args = parser.parse_args() orthologues = {} with utils.gzopen(args.ortholog_file) as ortholog_tsv: for ortholog in csv.reader(ortholog_tsv, delimiter='\t'): orthologues[ortholog[0]] = ortholog[1] genes = args.genes for i, gene in enumerate(genes): if gene in orthologues: genes[i] = orthologues[gene] print(' '.join(genes))
def spearman(x, y):
    """Compute Spearman's rank correlation coefficient of x and y."""
    return spearmanr(x, y)[0]


# 2nd element: True if higher value means better score
distance_map = {
    'euclidean': [euclidian, False],
    'pearson': [pearson, True],
    'spearman': [spearman, True]
}

search_gene = args.gene

# The context manager closes the file even if JSON parsing fails.
with utils.gzopen(args.etc_file) as file_handler:
    expressions = json.load(file_handler)

# Validate before the lookup; otherwise an unknown name raises KeyError and
# the descriptive ValueError below can never be reached.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

search_f, rev_sort = distance_map[args.dstfunc]

search_gene_expression = expressions['etc']['genes'][search_gene]

# Score every other gene against the searched gene.
similarity = [{
    'gene': gene,
    'distance': search_f(expressions['etc']['genes'][gene], search_gene_expression)
} for gene in expressions['etc']['genes'] if gene != search_gene]
return pearsonr(x, y)[0] def spearman(x, y): return spearmanr(x, y)[0] # 2nd argument: True if higher value means better score distance_map = { 'euclidean': [euclidian, False], 'pearson': [pearson, True], 'spearman': [spearman, True] } search_gene = args.gene file_handler = utils.gzopen(args.etc_file) expressions = json.load(file_handler) file_handler.close() search_f, rev_sort = distance_map[args.dstfunc] if args.dstfunc not in distance_map: raise ValueError("Invalid distance function {}".format(args.dstfunc)) search_gene_expression = expressions['etc']['genes'][search_gene] similarity = [{'gene': gene, 'distance': search_f(expressions['etc']['genes'][gene], search_gene_expression)} for gene in expressions['etc']['genes'] if gene != search_gene] similarity = filter(lambda x: not math.isnan(x['distance']), similarity) similarity.sort(reverse=rev_sort, key=lambda x: x['distance'])
parser = argparse.ArgumentParser(description='Median gene expressions of multiple experiments.')
parser.add_argument('files', nargs='*', help='expression files')
parser.add_argument('--name', help='expression column name')
parser.add_argument('--out', help='output file')
args = parser.parse_args()

# gene -> list of expression values, one per file that contains the gene
expressions = collections.defaultdict(list)

for f in args.files:
    # A missing input file aborts the script with exit status 1.
    if not os.path.isfile(f):
        exit(1)

    base, ext = os.path.splitext(f)
    # Files ending in .csv are read as ';'-separated, all others as tab-separated.
    delimiter = ';' if ext == '.csv' else '\t'

    with utils.gzopen(f) as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # Skip the header row. The built-in next() works on Python 2.6+ and
        # Python 3; the reader method ``.next()`` exists on Python 2 only.
        header = next(reader)
        # Every data row must hold exactly two columns: gene and expression.
        for gene, exp in reader:
            expressions[gene].append(float(exp))

genes = sorted(expressions.keys())
medians = [np.median(expressions[g]) for g in genes]

# Write to the requested file, or to standard output when --out is not given.
fhandler = open(args.out, 'w') if args.out else sys.stdout
fhandler.write('Gene\t{}Median\n'.format(args.name if args.name else ''))
for gene, med in zip(genes, medians):
    fhandler.write('{}\t{:.6f}\n'.format(gene, med))
args = parser.parse_args()

# Group the input files by (build, species, expression type).
data = {}
samples = zip(args.file_path, args.build, args.species, args.exp_type, args.sample_names)
for exp_file, build, species, exp_type, sample_name in samples:
    group_key = (build, species, exp_type)
    data.setdefault(group_key, []).append([exp_file, sample_name])

for (build, species, exp_type), data_values in data.items():
    df = pd.DataFrame(np.nan, index=[], columns=[])
    header = []

    # Append the expression table of every sample in the group column-wise.
    for exp_file, sample_name in data_values:
        header.append(sample_name)
        with utils.gzopen(exp_file) as csvfile:
            reader = pd.read_csv(csvfile, index_col='Gene', delimiter='\t', dtype=str)
        df = pd.concat([df, reader], axis=1)

    # Add numbers to duplicated sample names.
    counts = Counter(header)
    for sample_name, num in counts.items():
        if num <= 1:
            continue
        for suffix in range(1, num + 1):
            # Always rename the first occurrence that is still unnumbered.
            position = header.index(sample_name)
            header[position] = '{}_{}'.format(sample_name, suffix)

    df.columns = header
    name = '_'.join([species, build, exp_type, 'all_expressions.txt'])
# raise ValueError("Number of experiments must match the number of files")

genes = set()
expressions = []
headers = []
op = set.intersection if args.intersection else set.union
offset = 0

for f in args.files:
    # A missing input file aborts the script with exit status 1.
    if not os.path.isfile(f):
        exit(1)

    base, ext = os.path.splitext(f)
    # Files ending in .csv are read as ';'-separated, all others as tab-separated.
    delimiter = ';' if ext == '.csv' else '\t'

    with utils.gzopen(f) as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # First row is the header; its first cell labels the gene column.
        # The built-in next() works on Python 2.6+ and Python 3; the reader
        # method ``.next()`` exists on Python 2 only.
        header = next(reader)[1:]
        # Use the experiment names from the command line when given,
        # otherwise the column names found in the file.
        headers.append(
            args.experiments[offset:offset + len(header)] if args.experiments else header)
        offset += len(headers[-1])
        expressions.append({r[0]: r[1:] for r in reader})

    file_genes = set(expressions[-1])
    if args.intersection and len(expressions) == 1:
        # Seed the intersection with the first file only. Testing for an
        # empty ``genes`` set instead would restart the intersection whenever
        # it legitimately became empty and produce genes missing from
        # earlier files.
        genes = file_genes
    else:
        genes = op(genes, file_genes)

if args.genes:
    genes = genes.intersection(args.genes)

genes = sorted(genes)
# pylint: disable=missing-docstring,invalid-name # XXX: Refactor to a comand line tool and remove pylint disable """Change genes names to orthologues ones.""" from __future__ import absolute_import, division, print_function import argparse import csv import utils parser = argparse.ArgumentParser( description='Change genes names to orthologues ones.') parser.add_argument('ortholog_file', help='file with orthologues') parser.add_argument('genes', nargs='*', help='genes names') args = parser.parse_args() orthologues = {} with utils.gzopen(args.ortholog_file) as ortholog_tsv: for ortholog in csv.reader(ortholog_tsv, delimiter='\t'): orthologues[ortholog[0]] = ortholog[1] genes = args.genes for i, gene in enumerate(genes): if gene in orthologues: genes[i] = orthologues[gene] print(' '.join(genes))