def main(args):
    """Compute a heatmap/dendrogram from abundance data via the communities API.

    Reads a BIOM or tabbed-table profile from a file or stdin, posts the
    matrix to the API's /compute/heatmap endpoint, and writes the JSON
    result to a file or stdout.

    Returns 0 on success, 1 on any input error.
    """
    # OptionParser re-wraps help text; bypass its formatting so the
    # pre-formatted prehelp/posthelp strings print verbatim.
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--url", dest="url", default=API_URL, help="communities API url")
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_option("", "--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--cluster", dest="cluster", default='ward', help="cluster function, one of: ward, single, complete, mcquitty, median, centroid, default is ward")
    parser.add_option("", "--distance", dest="distance", default='bray-curtis', help="distance function, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis")
    parser.add_option("", "--name", dest="name", type="int", default=0, help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_option("", "--normalize", dest="normalize", type="int", default=0, help="normalize the input data, default is off: 1=true, 0=false")

    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1

    # parse inputs
    rows = []
    cols = []
    data = []
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom, col_name=opts.name)
            except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # retrieve data: the API expects raw='1' when the data should NOT be normalized
    raw = '0' if opts.normalize else '1'
    post = {"raw": raw, "cluster": opts.cluster, "distance": opts.distance, "columns": cols, "rows": rows, "data": data}
    hmap = obj_from_url(opts.url+'/compute/heatmap', data=json.dumps(post, separators=(',',':')))

    # output data
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    out_hdl.write(json.dumps(hmap, separators=(', ',': '), indent=4)+"\n")
    if out_hdl is not sys.stdout:  # never close the shared stdout stream
        out_hdl.close()
    return 0
def main(args):
    """Translate a KO-annotated BIOM profile into SEED Subsystems roles.

    Reads BIOM from a file or stdin, maps each KO row to Subsystems roles
    and fig ids via the communities API helpers (ko2roles / role2figs), and
    emits either a tabbed table or a rewritten BIOM object on stdout.

    Returns 0 on success, 1 on input errors.
    """
    # bypass OptionParser's help re-wrapping so pre-formatted text prints verbatim
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--id", dest="id", default=None, help="KBase Metagenome ID, required")
    parser.add_option("", "--url", dest="url", default=API_URL, help="communities API url")
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--output", dest="output", default='text', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is text")

    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1

    # get biom
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        biom = json.loads(indata)
        rows, cols, matrix = biom_to_matrix(biom)
    except Exception:  # narrowed from bare except
        sys.stderr.write("ERROR: unable to load input biom data\n")
        return 1

    # get SS hierarchy: accession -> ontology record
    ss_hier = dict([ (x['accession'], x) for x in obj_from_url(opts.url+'m5nr/ontology?version=1&source=Subsystems')['data'] ])

    # biom KO -> SS
    ssrows = []
    ssmatrix = []
    for r, rid in enumerate(rows):
        roles, md5s = ko2roles(opts, ss_hier, rid)
        if not roles:
            continue
        for role in roles:
            fig_ids = role2figs(opts, role, md5s)
            if opts.output == 'text':
                # text output: feature list, function, abundance for function, avg evalue for function, organism
                safe_print("%s\t%s\t%d\t%.2e\t%s\n" %(",".join(fig_ids), role, matrix[r][0], 0, 'glob'))
            elif opts.output == 'biom':
                ssrows.append({'id': role, 'metadata': {'accession': fig_ids}})
                ssmatrix.append(matrix[r])

    # biom output: rewrite the input object in place with the SS rows
    if opts.output == 'biom':
        biom['matrix_type'] = 'sparse'
        biom['shape'][0] = len(ssrows)
        biom['rows'] = ssrows
        biom['data'] = ssmatrix
        safe_print(json.dumps(biom)+"\n")
    return 0
def main(args):
    """Translate a KO-annotated BIOM profile into SEED Subsystems roles.

    ArgumentParser variant of the KO->SS converter: reads BIOM from a file
    or stdin, maps each KO row to Subsystems roles and fig ids via the
    communities API helpers, and emits a tabbed table or a rewritten BIOM
    object on stdout.

    Returns 0 on success, 1 on input errors.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--id", dest="id", default=None, help="KBase Metagenome ID, required")
    parser.add_argument("--url", dest="url", default=API_URL, help="communities API url")
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--output", dest="output", default='text', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is text")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1

    # get biom
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        biom = json.loads(indata)
        rows, cols, matrix = biom_to_matrix(biom)
    except Exception:  # narrowed from bare except
        sys.stderr.write("ERROR: unable to load input biom data\n")
        return 1

    # get SS hierarchy: accession -> ontology record
    ss_hier = dict([ (x['accession'], x) for x in obj_from_url(opts.url+'m5nr/ontology?version=1&source=Subsystems')['data'] ])

    # biom KO -> SS
    ssrows = []
    ssmatrix = []
    for r, rid in enumerate(rows):
        roles, md5s = ko2roles(opts, ss_hier, rid)
        if not roles:
            continue
        for role in roles:
            fig_ids = role2figs(opts, role, md5s)
            if opts.output == 'text':
                # text output: feature list, function, abundance for function, avg evalue for function, organism
                safe_print("%s\t%s\t%d\t%.2e\t%s\n" %(",".join(fig_ids), role, matrix[r][0], 0, 'glob'))
            elif opts.output == 'biom':
                ssrows.append({'id': role, 'metadata': {'accession': fig_ids}})
                ssmatrix.append(matrix[r])

    # biom output: rewrite the input object in place with the SS rows
    if opts.output == 'biom':
        biom['matrix_type'] = 'sparse'
        biom['shape'][0] = len(ssrows)
        biom['rows'] = ssrows
        biom['data'] = ssmatrix
        safe_print(json.dumps(biom)+"\n")
    return 0
def main(args):
    """Sub-select a BIOM abundance table by row/column position ranges.

    Positions are 1-based and inclusive; omitted bounds default to the full
    extent.  The selected sub-table is written as tabbed text to a file or
    stdout.

    Returns 0 on success, 1 on error.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("-i", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    # help text fixed: previously said "input" for the output option
    parser.add_argument("-o", "--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_argument("--row_start", dest="row_start", type=int, default=None, help="row position to start table with, default is first")
    parser.add_argument("--row_end", dest="row_end", type=int, default=None, help="row position to end table with, default is last")
    parser.add_argument("--col_start", dest="col_start", type=int, default=None, help="column position to start table with, default is first")
    parser.add_argument("--col_end", dest="col_end", type=int, default=None, help="column position to end table with, default is last")
    parser.add_argument("--stats", dest="stats", action="store_true", default=False, help="include significance stats in output, default is off")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')

    # parse inputs
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        try:
            biom = json.loads(indata)
            rows, cols, data = biom_to_matrix(biom, sig_stats=opts.stats)
        except Exception:  # narrowed from bare except
            sys.stderr.write("ERROR: input BIOM data not correct format\n")
            return 1
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # translate 1-based inclusive positions into python slice bounds
    row_start = 0 if opts.row_start is None else opts.row_start - 1
    row_end = len(rows) if opts.row_end is None else opts.row_end
    col_start = 0 if opts.col_start is None else opts.col_start - 1
    col_end = len(cols) if opts.col_end is None else opts.col_end

    # output data
    try:
        sub_rows = rows[row_start:row_end]
        out_hdl.write("\t%s\n" %"\t".join(cols[col_start:col_end]))
        for i, d in enumerate(data[row_start:row_end]):
            out_hdl.write("%s\t%s\n" %(sub_rows[i], "\t".join(map(str, d[col_start:col_end]))))
        if out_hdl is not sys.stdout:  # never close the shared stdout stream
            out_hdl.close()
    except Exception:
        sys.stderr.write("ERROR: unable to sub-select BIOM, inputted positions are out of bounds\n")
        return 1
    return 0
def main(args):
    """Normalize an abundance profile, locally via R or via the communities API.

    With --rlib the input is written to a temp file and normalized by the
    MGRAST_preprocessing R routine; otherwise the matrix is posted to the
    API's /compute/normalize endpoint.  BIOM input yields BIOM output
    (rows dropped by normalization are removed); text input yields a
    tabbed table.

    Returns 0 on success, 1 on error.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp % VERSION, epilog=posthelp % AUTH_LIST)
    parser.add_argument("--url", dest="url", default=API_URL, help="communities API url")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    # help text typos fixed ("ouput", "filenmae", "fielname")
    parser.add_argument("--outdir", dest="outdir", default=None, help="output is placed in dir as filename.obj, filename.type, only for 'biom' input")
    parser.add_argument("--format", dest="format", default='biom', help="input / output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    maxval = 0
    tmp_in = 'tmp_' + random_str() + '.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                if opts.rlib:
                    maxval = biom_to_tab(biom, tmp_hdl)
                else:
                    rows, cols, data = biom_to_matrix(biom)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                tmp_hdl.close()
                os.remove(tmp_in)  # don't leave the temp file behind on error
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
            # floatify into real nested lists (a lazy map() would break
            # json serialization and the max() scan below on Python 3)
            data = [list(map(float, x)) for x in data]
            if opts.rlib:
                tmp_hdl.write(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        tmp_hdl.close()
        os.remove(tmp_in)  # don't leave the temp file behind on error
        return 1
    finally:
        tmp_hdl.close()

    # check values to see if already normalized, otherwise R fails badly
    if len(data) > 0:
        maxval = max(map(max, data))
    if maxval <= 1:
        os.remove(tmp_in)
        sys.stderr.write("ERROR: data is already normalized.\n")
        return 1

    # retrieve data
    norm = None
    if opts.rlib:
        tmp_out = 'tmp_' + random_str() + '.txt'
        r_cmd = """source("%s/preprocessing.r")
suppressMessages( MGRAST_preprocessing(
    file_in="%s",
    file_out="%s"
))""" % (opts.rlib, tmp_in, tmp_out)
        execute_r(r_cmd)
        with open(tmp_out, 'r') as norm_hdl:
            nrows, ncols, ndata = tab_to_matrix(norm_hdl.read())
        num_data = [list(map(float, x)) for x in ndata]
        norm = {"columns": ncols, "rows": nrows, "data": num_data}
        os.remove(tmp_out)
    else:
        post = {"columns": cols, "rows": rows, "data": data}
        norm = obj_from_url(opts.url + '/compute/normalize', data=json.dumps(post, separators=(',', ':')))

    # output data
    os.remove(tmp_in)
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    if biom and (opts.format == 'biom'):
        # normalization may have removed rows: keep only surviving row objects
        new_rows = [r for r in biom['rows'] if r['id'] in norm['rows']]
        biom['rows'] = new_rows
        biom['data'] = norm['data']
        biom['shape'][0] = len(biom['rows'])
        biom['id'] = biom['id'] + '_normalized'
        biom['matrix_type'] = 'dense'
        biom['matrix_element_type'] = 'float'
        matrix_type = None
        if biom['type'].startswith('Taxon'):
            matrix_type = "Communities.TaxonomicMatrix"
        elif biom['type'].startswith('Function'):
            matrix_type = "Communities.FunctionalMatrix"
        if opts.outdir and matrix_type:
            if not os.path.isdir(opts.outdir):
                os.mkdir(opts.outdir)
            with open(os.path.join(opts.outdir, opts.output + '.obj'), 'w') as ohdl:
                ohdl.write(json.dumps(biom) + "\n")
            with open(os.path.join(opts.outdir, opts.output + '.type'), 'w') as thdl:
                thdl.write(matrix_type)
        else:
            out_hdl.write(json.dumps(biom) + "\n")
    else:
        out_hdl.write("\t%s\n" % "\t".join(norm['columns']))
        for i, d in enumerate(norm['data']):
            out_hdl.write("%s\t%s\n" % (norm['rows'][i], "\t".join(map(str, d))))
    # only stat/close real files: the old code closed stdout and then
    # crashed calling os.stat('-') when writing to stdout
    if out_hdl is not sys.stdout:
        out_hdl.close()
        # drop an empty output file (e.g. when results went to outdir instead)
        if os.path.isfile(opts.output) and os.stat(opts.output).st_size == 0:
            os.remove(opts.output)
    return 0
def main(args):
    """Normalize an abundance profile, locally via R or via the communities API.

    OptionParser variant of the normalizer.  With --rlib the input is
    written to a temp file and normalized by the MGRAST_preprocessing R
    routine; otherwise the matrix is posted to the API's
    /compute/normalize endpoint.  BIOM input yields BIOM output (rows
    dropped by normalization are removed); text input yields a tabbed
    table.

    Returns 0 on success, 1 on error.
    """
    # bypass OptionParser's help re-wrapping so pre-formatted text prints verbatim
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--url", dest="url", default=API_URL, help="communities API url")
    parser.add_option("", "--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    # help text typos fixed ("ouput", "filenmae", "fielname")
    parser.add_option("", "--outdir", dest="outdir", default=None, help="output is placed in dir as filename.obj, filename.type, only for 'biom' input")
    parser.add_option("", "--format", dest="format", default='biom', help="input / output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")

    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    maxval = 0
    tmp_in = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                if opts.rlib:
                    maxval = biom_to_tab(biom, tmp_hdl)
                else:
                    rows, cols, data = biom_to_matrix(biom)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                tmp_hdl.close()
                os.remove(tmp_in)  # don't leave the temp file behind on error
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
            # floatify into real nested lists (a lazy map() would break
            # json serialization and the max() scan below on Python 3)
            data = [list(map(float, x)) for x in data]
            if opts.rlib:
                tmp_hdl.write(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        tmp_hdl.close()
        os.remove(tmp_in)  # don't leave the temp file behind on error
        return 1
    finally:
        tmp_hdl.close()

    # check values to see if already normalized, otherwise R fails badly
    if len(data) > 0:
        maxval = max( map(max, data) )
    if maxval <= 1:
        os.remove(tmp_in)
        sys.stderr.write("ERROR: data is already normalized.\n")
        return 1

    # retrieve data
    norm = None
    if opts.rlib:
        tmp_out = 'tmp_'+random_str()+'.txt'
        r_cmd = """source("%s/preprocessing.r")
suppressMessages( MGRAST_preprocessing(
    file_in="%s",
    file_out="%s"
))"""%(opts.rlib, tmp_in, tmp_out)
        execute_r(r_cmd)
        with open(tmp_out, 'r') as norm_hdl:
            nrows, ncols, ndata = tab_to_matrix(norm_hdl.read())
        num_data = [list(map(float, x)) for x in ndata]
        norm = {"columns": ncols, "rows": nrows, "data": num_data}
        os.remove(tmp_out)
    else:
        post = {"columns": cols, "rows": rows, "data": data}
        norm = obj_from_url(opts.url+'/compute/normalize', data=json.dumps(post, separators=(',',':')))

    # output data
    os.remove(tmp_in)
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    if biom and (opts.format == 'biom'):
        # normalization may have removed rows: keep only surviving row objects
        new_rows = [r for r in biom['rows'] if r['id'] in norm['rows']]
        biom['rows'] = new_rows
        biom['data'] = norm['data']
        biom['shape'][0] = len(biom['rows'])
        biom['id'] = biom['id']+'_normalized'
        biom['matrix_type'] = 'dense'
        biom['matrix_element_type'] = 'float'
        matrix_type = None
        if biom['type'].startswith('Taxon'):
            matrix_type = "Communities.TaxonomicMatrix"
        elif biom['type'].startswith('Function'):
            matrix_type = "Communities.FunctionalMatrix"
        if opts.outdir and matrix_type:
            if not os.path.isdir(opts.outdir):
                os.mkdir(opts.outdir)
            with open(os.path.join(opts.outdir, opts.output+'.obj'), 'w') as ohdl:
                ohdl.write(json.dumps(biom)+"\n")
            with open(os.path.join(opts.outdir, opts.output+'.type'), 'w') as thdl:
                thdl.write(matrix_type)
        else:
            out_hdl.write(json.dumps(biom)+"\n")
    else:
        out_hdl.write( "\t%s\n" %"\t".join(norm['columns']) )
        for i, d in enumerate(norm['data']):
            out_hdl.write( "%s\t%s\n" %(norm['rows'][i], "\t".join(map(str, d))) )
    # only stat/close real files: the old code closed stdout, crashed on
    # os.stat('-') for stdout output, and returned None (not 0) when the
    # output file was non-empty
    if out_hdl is not sys.stdout:
        out_hdl.close()
        # drop an empty output file (e.g. when results went to outdir instead)
        if os.path.isfile(opts.output) and os.stat(opts.output).st_size == 0:
            os.remove(opts.output)
    return 0
def main(args):
    """Compute a heatmap/dendrogram from abundance data via the communities API.

    ArgumentParser variant: reads a BIOM or tabbed-table profile from a
    file or stdin, posts the matrix to the API's /compute/heatmap
    endpoint, and writes the JSON result to a file or stdout.

    Returns 0 on success, 1 on any input error.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp % VERSION, epilog=posthelp % AUTH_LIST)
    parser.add_argument("--url", dest="url", default=API_URL, help="communities API url")
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--cluster", dest="cluster", default='ward', help="cluster function, one of: ward, single, complete, mcquitty, median, centroid, default is ward")
    parser.add_argument("--distance", dest="distance", default='bray-curtis', help="distance function, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis")
    parser.add_argument("--name", dest="name", type=int, default=0, help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_argument("--normalize", dest="normalize", type=int, default=0, help="normalize the input data, default is off: 1=true, 0=false")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1

    # parse inputs
    rows = []
    cols = []
    data = []
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom, col_name=opts.name)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # retrieve data: the API expects raw='1' when the data should NOT be normalized
    raw = '0' if opts.normalize else '1'
    post = {
        "raw": raw,
        "cluster": opts.cluster,
        "distance": opts.distance,
        "columns": cols,
        "rows": rows,
        "data": data
    }
    hmap = obj_from_url(opts.url + '/compute/heatmap', data=json.dumps(post, separators=(',', ':')))

    # output data
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    out_hdl.write(json.dumps(hmap, separators=(', ', ': '), indent=4) + "\n")
    if out_hdl is not sys.stdout:  # never close the shared stdout stream
        out_hdl.close()
    return 0
def main(args):
    """Compute a PCoA from abundance data via the communities API.

    Reads BIOM or tabbed input, optionally groups columns by a BIOM
    metadata field, posts the matrix to /compute/pcoa, and writes either
    the JSON result (biom format) or a tabbed table of the first four
    principal coordinates.

    Returns 0 on success, 1 on input error.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp % VERSION, epilog=posthelp % AUTH_LIST)
    parser.add_argument("--url", dest="url", default=API_URL, help="communities API url")
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_argument("--format", dest="format", default='biom', help="input / output format: 'text' for tabbed table, 'biom' for BIOM / json format, default is biom")
    parser.add_argument("--metadata", dest="metadata", default=None, help="metadata field to group by, only for 'biom' input")
    parser.add_argument("--distance", dest="distance", default='bray-curtis', help="distance metric, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis")
    parser.add_argument("--name", dest="name", type=int, default=0, help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_argument("--normalize", dest="normalize", type=int, default=0, help="normalize the input data, default is off: 1=true, 0=false")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1

    # parse inputs
    rows = []
    cols = []
    data = []
    groups = []
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                col_name = True if opts.name == 1 else False
                rows, cols, data = biom_to_matrix(biom, col_name=col_name)
                if opts.metadata:
                    groups = metadata_from_biom(biom, opts.metadata)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # map column id -> group, skipping 'null' placeholder values
    gmap = {}
    for i, g in enumerate(groups):
        if g != 'null':
            gmap[cols[i]] = g

    # retrieve data: the API expects raw='1' when the data should NOT be normalized
    raw = '0' if opts.normalize == 1 else '1'
    post = {
        "raw": raw,
        "distance": opts.distance,
        "columns": cols,
        "rows": rows,
        "data": data
    }
    pcoa = obj_from_url(opts.url + '/compute/pcoa', data=json.dumps(post, separators=(',', ':')))

    # output data
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    if opts.format == 'biom':
        # annotate each point with its group ("" when no metadata matched)
        for point in pcoa['data']:
            point['group'] = gmap.get(point['id'], "")
        out_hdl.write(json.dumps(pcoa) + "\n")
    else:
        out_hdl.write("ID\tGroup\tPC1\tPC2\tPC3\tPC4\n")
        for d in pcoa['data']:
            out_hdl.write("%s\t%s\t%s\n" % (d['id'], gmap.get(d['id'], ""), "\t".join(map(str, d['pco'][0:4]))))
    if out_hdl is not sys.stdout:  # never close the shared stdout stream
        out_hdl.close()
    return 0
def main(args):
    """Sort and/or truncate an abundance table.

    Optionally orders rows by a column (1-based; 0 means the last column),
    then keeps the first --rows rows and --cols columns.  Input may be
    tabbed text or BIOM; output is tabbed text or an updated BIOM object
    on stdout.

    Returns 0 on success, 1 on error.
    """
    # bypass help re-wrapping so pre-formatted text prints verbatim
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--output", dest="output", default='biom', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--order", dest="order", type=int, default=None, help="column number to order output by (0 for last column), default is no ordering")
    parser.add_argument("--direction", dest="direction", default="desc", help="direction of order. 'asc' for ascending order, 'desc' for descending order, default is desc")
    parser.add_argument("--cols", dest="cols", type=int, default=None, help="number of columns from the left to return from input table, default is all")
    parser.add_argument("--rows", dest="rows", type=int, default=None, help="number of rows from the top to return from input table, default is all")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    if opts.direction not in ['asc', 'desc']:
        sys.stderr.write("ERROR: invalid order direction\n")
        return 1

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom, sig_stats=True)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # first we sort
    if opts.order is not None:
        rev_order = True if opts.direction == 'desc' else False
        order_col = opts.order
        if order_col > len(cols):
            sys.stderr.write("ERROR: --order value is greater than number of columns in table\n")
            return 1  # was missing: previously fell through and crashed on the sort below
        order_col = order_col - 1  # 1-based -> 0-based; 0 becomes -1 (last column)
        rd_sorted = sorted(zip(rows, data), key=lambda x: x[1][order_col], reverse=rev_order)
        # comprehensions instead of zip(*...) so an empty table doesn't raise
        rows = [r for r, _ in rd_sorted]
        data = [d for _, d in rd_sorted]

    # subselect rows / columns from the top-left
    if opts.rows is not None:
        rows = rows[:opts.rows]
        data = data[:opts.rows]
    if opts.cols is not None:
        cols = cols[:opts.cols]
        data = sub_matrix(data, opts.cols)

    # output data
    if biom and (opts.output == 'biom'):
        # get list of new rows and columns with old indexes
        br_ids = [r['id'] for r in biom['rows']]
        bc_ids = [c['id'] for c in biom['columns']]
        rindex = []
        cindex = []
        for r in rows:
            try:
                rindex.append(br_ids.index(r))
            except ValueError:  # row id not in original biom: skip it
                pass
        for c in cols:
            try:
                cindex.append(bc_ids.index(c))
            except ValueError:  # column id not in original biom: skip it
                pass
        # update biom
        biom['id'] = biom['id']+'_altered'
        # NOTE(review): sub_matrix is called with the ORIGINAL column count
        # (biom['shape'][1]), not the subselected one — verify intent
        biom['data'] = sub_matrix(data, biom['shape'][1])
        biom['rows'] = [biom['rows'][r] for r in rindex]
        biom['columns'] = [biom['columns'][c] for c in cindex]
        biom['shape'] = [len(biom['rows']), len(biom['columns'])]
        biom['matrix_type'] = 'dense'
        safe_print(json.dumps(biom)+'\n')
    else:
        safe_print("\t%s\n" %"\t".join(cols))
        for i, d in enumerate(data):
            safe_print("%s\t%s\n" %(rows[i], "\t".join(map(str, d))))
    return 0
def main(args):
    """Correlate per-row abundances against a numeric metadata variable.

    Reads a BIOM or tabbed profile, obtains one numeric metadata value per
    column (from BIOM metadata, a JSON group map, or a tabbed group file),
    runs a linear regression per row, and reports r-value / p-value
    (optionally FDR) as tabbed text or as annotations in the BIOM object.

    Returns 0 on success, 1 on error.
    """
    # bypass OptionParser's help re-wrapping so pre-formatted text prints verbatim
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--output", dest="output", default='biom', help="output format: 'full' for tabbed abundances and stats, 'minimum' for tabbed stats only, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--metadata", dest="metadata", default=None, help="metadata field to correlate, only for 'biom' input")
    parser.add_option("", "--groups", dest="groups", default=None, help="list of groups in JSON or tabbed format - either as input string or filename")
    parser.add_option("", "--group_pos", dest="group_pos", type="int", default=1, help="position of group to use, default is 1 (first)")
    parser.add_option("", "--cutoff", dest="cutoff", default=None, help="only show p-value less than this, default show all")
    parser.add_option("", "--fdr", dest="fdr", action="store_true", default=False, help="output FDR for computed p-values, default is off")

    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['full', 'minimum', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    # parse cutoff once: the old code compared float < str, so the cutoff
    # never filtered anything (and would raise TypeError on Python 3)
    cutoff = None
    if opts.cutoff:
        try:
            cutoff = float(opts.cutoff)
        except ValueError:
            sys.stderr.write("ERROR: invalid cutoff\n")
            return 1

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    groups = []
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            # close the input handle instead of leaking it
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom)
                if opts.metadata:
                    groups = metadata_from_biom(biom, opts.metadata)
            except Exception:  # narrowed from bare except
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            if os.path.isfile(opts.groups):
                with open(opts.groups, 'r') as ghdl:
                    gdata = json.load(ghdl)
            else:
                gdata = json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write("ERROR: position (%d) of metadata is out of bounds\n"%opts.group_pos)
                return 1
            for m in cols:
                found_g = None
                for g, mgs in gdata[opts.group_pos-1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s missing metadata\n"%m)
                    return 1
        # no - its tabbed (json parse failure falls through here by design)
        except Exception:
            if os.path.isfile(opts.groups):
                with open(opts.groups, 'r') as ghdl:
                    gtext = ghdl.read()
            else:
                gtext = opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write("ERROR: position (%d) of metadata is out of bounds\n"%opts.group_pos)
                return 1  # was missing: previously fell through with an invalid position
            for m in cols:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos-1])
                except Exception:
                    sys.stderr.write("ERROR: metagenome %s missing metadata\n"%m)
                    return 1

    # validate metadata
    if len(groups) != len(cols):
        sys.stderr.write("ERROR: Not all metagenomes have metadata\n")
        return 1
    try:
        # list comprehension: the values are indexed below, so a lazy
        # map() object would break on Python 3
        groups = [float(x) for x in groups]
    except Exception:
        sys.stderr.write("ERROR: Metadata is not numeric\n")
        return 1

    # check correlation
    results = []
    pvalues = []
    for i, a in enumerate(rows):  # annotations
        l_meta = []
        l_anno = []
        for j, m in enumerate(cols):  # metagenomes
            l_meta.append(groups[j])
            l_anno.append(float(data[i][j]))
        gradient, intercept, r_value, p_value, std_err = stats.linregress(l_meta, l_anno)
        if biom and (opts.output == 'biom'):
            results.append([('r-value', r_value), ('p-value', p_value)])
            pvalues.append(p_value)
        else:
            if opts.output == 'full':
                l_result = [a]+[float(x) for x in data[i]]+[r_value, p_value]
            elif opts.output == 'minimum':
                l_result = [a, r_value, p_value]
            if (cutoff is None) or (p_value < cutoff):
                results.append(l_result)
                pvalues.append(p_value)

    # calculate fdr
    if opts.fdr and pvalues:
        fdr_values = calculate_fdr(pvalues)
        for i, x in enumerate(fdr_values):
            results[i].append(('fdr', x) if biom and (opts.output == 'biom') else x)

    # output
    if biom and (opts.output == 'biom'):
        # add stats to row data, same order
        new_rows = []
        for i, robj in enumerate(biom['rows']):
            if not robj['metadata']:
                robj['metadata'] = {'correlate': results[i]}
            else:
                robj['metadata']['correlate'] = results[i]
            new_rows.append(robj)
        # update biom
        biom['id'] = biom['id']+'_corr'
        biom['rows'] = new_rows
        safe_print(json.dumps(biom)+'\n')
    else:
        header = ['r-value', 'p-value']
        if opts.output == 'full':
            # fixed: was the no-op "header = header", leaving the per-column
            # abundance values in full output without column headings
            header = cols + header
        if opts.fdr:
            header.append('fdr')
        safe_print("\t%s\n"%"\t".join(header))
        for r in results:
            safe_print(r[0])
            for x in r[1:]:
                # print whole numbers without a decimal tail
                if int(x) == float(x):
                    safe_print("\t%d"%int(x))
                else:
                    safe_print("\t%.5f"%float(x))
            safe_print("\n")
    return 0
def main(args):
    """Correlate per-row abundances against numeric per-column metadata.

    Reads an abundance matrix (BIOM JSON or tabbed text), pairs every column
    (metagenome) with one numeric metadata value (taken from BIOM metadata or
    the --groups option), runs a linear regression per row (annotation), and
    prints r-value / p-value (optionally FDR) either as updated BIOM or as a
    tabbed table.

    Returns:
        int: 0 on success, 1 on any input/validation error.
    """
    # ArgumentParser has no format_epilog hook; monkey-patch both formatters
    # so the prehelp/posthelp text is emitted verbatim.
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--output", dest="output", default='biom', help="output format: 'full' for tabbed abundances and stats, 'minimum' for tabbed stats only, 'biom' for BIOM format, default is biom")
    parser.add_argument("--metadata", dest="metadata", default=None, help="metadata field to correlate, only for 'biom' input")
    parser.add_argument("--groups", dest="groups", default=None, help="list of groups in JSON or tabbed format - either as input string or filename")
    parser.add_argument("--group_pos", dest="group_pos", type=int, default=1, help="position of group to use, default is 1 (first)")
    # type=float: p_value (a float) is compared against this cutoff below; the
    # untyped (string) option raised TypeError on Python 3.
    parser.add_argument("--cutoff", dest="cutoff", type=float, default=None, help="only show p-value less than this, default show all")
    parser.add_argument("--fdr", dest="fdr", action="store_true", default=False, help="output FDR for computed p-values, default is off")

    # get and validate inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['full', 'minimum', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    # BIOM output re-uses the parsed BIOM object; with text input the result
    # rows were never built and the code crashed with NameError on l_result.
    if (opts.output == 'biom') and (opts.format != 'biom'):
        sys.stderr.write("ERROR: 'biom' output requires 'biom' input format\n")
        return 1

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    groups = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom)
                if opts.metadata:
                    groups = metadata_from_biom(biom, opts.metadata)
            except Exception:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            gdata = json.load(open(opts.groups, 'r')) if os.path.isfile(opts.groups) else json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write("ERROR: position (%d) of metadata is out of bounds\n"%opts.group_pos)
                return 1
            for m in cols:
                found_g = None
                for g, mgs in gdata[opts.group_pos-1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s missing metadata\n"%m)
                    return 1
        # no - its tabbed
        except Exception:
            # discard any partial fill left behind by the failed JSON pass
            groups = []
            gtext = open(opts.groups, 'r').read() if os.path.isfile(opts.groups) else opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write("ERROR: position (%d) of metadata is out of bounds\n"%opts.group_pos)
                # was missing: previously fell through with an invalid position
                return 1
            for m in cols:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos-1])
                except Exception:
                    sys.stderr.write("ERROR: metagenome %s missing metadata\n"%m)
                    return 1

    # validate metadata
    if len(groups) != len(cols):
        sys.stderr.write("ERROR: Not all metagenomes have metadata\n")
        return 1
    try:
        # materialize a list: bare map() is a lazy iterator on Python 3, so
        # groups[j] below failed and the conversion error was never caught here
        groups = [float(x) for x in groups]
    except (ValueError, TypeError):
        sys.stderr.write("ERROR: Metadata is not numeric\n")
        return 1

    # check correlation
    results = []
    pvalues = []
    for i, a in enumerate(rows):  # annotations
        l_meta = []
        l_anno = []
        for j, m in enumerate(cols):  # metagenomes
            l_meta.append(groups[j])
            l_anno.append(float(data[i][j]))
        gradient, intercept, r_value, p_value, std_err = stats.linregress(l_meta, l_anno)
        if biom and (opts.output == 'biom'):
            results.append([('r-value', r_value), ('p-value', p_value)])
            pvalues.append(p_value)
        else:
            if opts.output == 'full':
                l_result = [a]+[float(x) for x in data[i]]+[r_value, p_value]
            else:  # 'minimum' (only remaining case after validation above)
                l_result = [a, r_value, p_value]
            if (not opts.cutoff) or (p_value < opts.cutoff):
                results.append(l_result)
                pvalues.append(p_value)

    # calculate fdr
    if opts.fdr and pvalues:
        fdr_values = calculate_fdr(pvalues)
        for i, x in enumerate(fdr_values):
            results[i].append(('fdr', x) if biom and (opts.output == 'biom') else x)

    # output
    if biom and (opts.output == 'biom'):
        # add stats to row metadata, preserving row order
        new_rows = []
        for i, robj in enumerate(biom['rows']):
            if not robj['metadata']:
                robj['metadata'] = {'correlate': results[i]}
            else:
                robj['metadata']['correlate'] = results[i]
            new_rows.append(robj)
        # update biom
        biom['id'] = biom['id']+'_corr'
        biom['rows'] = new_rows
        safe_print(json.dumps(biom)+'\n')
    else:
        header = ['r-value', 'p-value']
        if opts.output == 'full':
            # full rows carry per-metagenome abundances before the stats, so
            # the header needs the column ids (was a no-op 'header = header')
            header = cols + header
        if opts.fdr:
            header.append('fdr')
        safe_print("\t%s\n"%"\t".join(header))
        for r in results:
            safe_print(r[0])
            for x in r[1:]:
                # print whole numbers without a decimal tail
                if int(x) == float(x):
                    safe_print("\t%d"%int(x))
                else:
                    safe_print("\t%.5f"%float(x))
            safe_print("\n")
    return 0
def main(args):
    """Dump a BIOM table as tabbed text, optionally sub-selecting a window.

    Row/column bounds are 1-based; an omitted start means "from the first"
    and an omitted end means "through the last".

    Returns:
        int: 0 on success, 1 on any input/validation error.
    """
    # ArgumentParser has no format_epilog hook; monkey-patch both formatters
    # so the prehelp/posthelp text is emitted verbatim.
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp % VERSION, epilog=posthelp % AUTH_LIST)
    parser.add_argument("-i", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("-o", "--output", dest="output", default='-', help="input: filename or stdout (-), default is stdout")
    parser.add_argument("--row_start", dest="row_start", type=int, default=None, help="row position to start table with, default is first")
    parser.add_argument("--row_end", dest="row_end", type=int, default=None, help="row position to end table with, default is last")
    parser.add_argument("--col_start", dest="col_start", type=int, default=None, help="column position to start table with, default is first")
    parser.add_argument("--col_end", dest="col_end", type=int, default=None, help="column position to end table with, default is last")
    parser.add_argument("--stats", dest="stats", action="store_true", default=False, help="include significance stats in output, default is off")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1

    # parse inputs BEFORE opening the output, so a parse failure cannot
    # leave an empty/truncated output file behind (previously the output
    # was opened first)
    try:
        if opts.input == '-':
            indata = sys.stdin.read()
        else:
            with open(opts.input, 'r') as in_hdl:
                indata = in_hdl.read()
        try:
            biom = json.loads(indata)
            rows, cols, data = biom_to_matrix(biom, sig_stats=opts.stats)
        except Exception:
            sys.stderr.write("ERROR: input BIOM data not correct format\n")
            return 1
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # translate 1-based inclusive options into python slice bounds
    row_start = 0 if opts.row_start is None else opts.row_start - 1
    row_end = len(rows) if opts.row_end is None else opts.row_end
    col_start = 0 if opts.col_start is None else opts.col_start - 1
    col_end = len(cols) if opts.col_end is None else opts.col_end

    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')

    # output data
    try:
        sub_rows = rows[row_start:row_end]
        out_hdl.write("\t%s\n" % "\t".join(cols[col_start:col_end]))
        for i, d in enumerate(data[row_start:row_end]):
            out_hdl.write("%s\t%s\n" % (sub_rows[i], "\t".join(map(str, d[col_start:col_end]))))
    except Exception:
        sys.stderr.write("ERROR: unable to sub-select BIOM, inputted positions are out of bounds\n")
        return 1
    finally:
        # close real files even on the error path, but never close the
        # shared sys.stdout (previously closed unconditionally)
        if out_hdl is not sys.stdout:
            out_hdl.close()
    return 0
def main(args):
    """Order and/or sub-select a BIOM or tabbed abundance table.

    Optionally sorts rows by one column, then keeps the top --rows rows and
    the leftmost --cols columns; emits either updated BIOM or tabbed text.

    Returns:
        int: 0 on success, 1 on any input/validation error.
    """
    # ArgumentParser has no format_epilog hook; monkey-patch both formatters
    # so the prehelp/posthelp text is emitted verbatim.
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp % VERSION, epilog=posthelp % AUTH_LIST)
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--output", dest="output", default='biom', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--order", dest="order", type=int, default=None, help="column number to order output by (0 for last column), default is no ordering")
    parser.add_argument("--direction", dest="direction", default="desc", help="direction of order. 'asc' for ascending order, 'desc' for descending order, default is desc")
    parser.add_argument("--cols", dest="cols", type=int, default=None, help="number of columns from the left to return from input table, default is all")
    parser.add_argument("--rows", dest="rows", type=int, default=None, help="number of rows from the top to return from input table, default is all")

    # get and validate inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    if opts.direction not in ['asc', 'desc']:
        sys.stderr.write("ERROR: invalid order direction\n")
        return 1

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                rows, cols, data = biom_to_matrix(biom, sig_stats=True)
            except Exception:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
    except Exception:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1

    # first we sort
    if opts.order is not None:
        rev_order = opts.direction == 'desc'
        order_col = opts.order
        if order_col > len(cols):
            sys.stderr.write("ERROR: --order value is greater than number of columns in table\n")
            # was missing: previously fell through and sorted on a bad index
            return 1
        # --order is 1-based; 0 maps to -1, i.e. the last column
        order_col = order_col - 1
        rd_sorted = sorted(zip(rows, data), key=lambda x: x[1][order_col], reverse=rev_order)
        if rd_sorted:
            rows, data = (list(t) for t in zip(*rd_sorted))
        else:
            # zip(*[]) raises on an empty table; keep the empty lists
            rows, data = [], []

    # subselect rows / columns from the top-left
    if opts.rows is not None:
        rows = rows[:opts.rows]
        data = data[:opts.rows]
    if opts.cols is not None:
        cols = cols[:opts.cols]
        data = sub_matrix(data, opts.cols)

    # output data
    if biom and (opts.output == 'biom'):
        # map retained row/column ids back to their original BIOM indexes;
        # ids not found in the BIOM are silently dropped (original behavior)
        br_ids = [r['id'] for r in biom['rows']]
        bc_ids = [c['id'] for c in biom['columns']]
        rindex = []
        cindex = []
        for r in rows:
            try:
                rindex.append(br_ids.index(r))
            except ValueError:
                pass
        for c in cols:
            try:
                cindex.append(bc_ids.index(c))
            except ValueError:
                pass
        # update biom; trim data back to the original column count
        # (biom['shape'][1]) before the shape itself is rewritten
        biom['id'] = biom['id'] + '_altered'
        biom['data'] = sub_matrix(data, biom['shape'][1])
        biom['rows'] = [biom['rows'][r] for r in rindex]
        biom['columns'] = [biom['columns'][c] for c in cindex]
        biom['shape'] = [len(biom['rows']), len(biom['columns'])]
        biom['matrix_type'] = 'dense'
        safe_print(json.dumps(biom) + '\n')
    else:
        safe_print("\t%s\n" % "\t".join(cols))
        for i, d in enumerate(data):
            safe_print("%s\t%s\n" % (rows[i], "\t".join(map(str, d))))
    return 0