def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='',
                            description=prehelp % VERSION,
                            epilog=posthelp % AUTH_LIST)
    parser.add_argument(
        "--ids",
        dest="ids",
        default=None,
        help="comma seperated list or file of KBase Metagenome IDs")
    parser.add_argument("--url",
                        dest="url",
                        default=API_URL,
                        help="communities API url")
    parser.add_argument("--user",
                        dest="user",
                        default=None,
                        help="OAuth username")
    parser.add_argument("--passwd",
                        dest="passwd",
                        default=None,
                        help="OAuth password")
    parser.add_argument("--token",
                        dest="token",
                        default=None,
                        help="OAuth token")
    parser.add_argument(
        "--level",
        dest="level",
        default='level3',
        help="functional level to retrieve abundances for, default is level3")
    parser.add_argument(
        "--source",
        dest="source",
        default='Subsystems',
        help="function datasource to filter results by, default is Subsystems")
    parser.add_argument("--filter_level",
                        dest="filter_level",
                        default=None,
                        help="function level to filter by")
    parser.add_argument(
        "--filter_name",
        dest="filter_name",
        default=None,
        help="function name to filter by, file or comma seperated list")
    parser.add_argument(
        "--intersect_source",
        dest="intersect_source",
        default='SEED',
        help="taxon datasource for insersection, default is SEED")
    parser.add_argument("--intersect_level",
                        dest="intersect_level",
                        default=None,
                        help="taxon level for insersection")
    parser.add_argument(
        "--intersect_name",
        dest="intersect_name",
        default=None,
        help="taxon name(s) for insersection, file or comma seperated list")
    parser.add_argument(
        "--output",
        dest="output",
        default='-',
        help="output: filename or stdout (-), default is stdout")
    parser.add_argument(
        "--format",
        dest="format",
        default='biom',
        help=
        "output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom"
    )
    parser.add_argument(
        "--evalue",
        type=int,
        dest="evalue",
        default=15,
        help="negative exponent value for maximum e-value cutoff, default is 15"
    )
    parser.add_argument(
        "--identity",
        type=int,
        dest="identity",
        default=60,
        help="percent value for minimum %% identity cutoff, default is 60")
    parser.add_argument(
        "--length",
        type=int,
        dest="length",
        default=15,
        help="value for minimum alignment length cutoff, default is 15")
    parser.add_argument("--version",
                        type=int,
                        dest="version",
                        default=1,
                        help="M5NR annotation version to use, default is 1")
    parser.add_argument(
        "--temp",
        dest="temp",
        default=None,
        help="filename to temporarly save biom output at each iteration")

    # get inputs
    opts = parser.parse_args()
    if not opts.ids:
        sys.stderr.write("ERROR: one or more ids required\n")
        return 1
    if (opts.filter_name and
        (not opts.filter_level)) or ((not opts.filter_name)
                                     and opts.filter_level):
        sys.stderr.write(
            "ERROR: both --filter_level and --filter_name need to be used together\n"
        )
        return 1
    if (opts.intersect_name and
        (not opts.intersect_level)) or ((not opts.intersect_name)
                                        and opts.intersect_level):
        sys.stderr.write(
            "ERROR: both --intersect_level and --intersect_name need to be used together\n"
        )
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1

    # get auth
    token = get_auth_token(opts)

    # build url
    id_list = []
    if os.path.isfile(opts.ids):
        id_str = open(opts.ids, 'r').read()
        try:
            id_obj = json.loads(id_str)
            if 'elements' in id_obj:
                id_list = id_obj['elements'].keys()
            elif 'members' in id_obj:
                id_list = map(lambda x: x['ID'], id_obj['members'])
        except:
            id_list = id_str.strip().split('\n')
    else:
        id_list = opts.ids.strip().split(',')
    params = [('group_level', opts.level), ('source', opts.source),
              ('evalue', opts.evalue), ('identity', opts.identity),
              ('length', opts.length), ('version', opts.version),
              ('result_type', 'abundance'), ('asynchronous', '1')]
    if opts.intersect_level and opts.intersect_name:
        params.append(('filter_source', opts.intersect_source))
        params.append(('filter_level', opts.intersect_level))
        if os.path.isfile(opts.intersect_name):
            with open(opts.intersect_name) as file_:
                for f in file_:
                    params.append(('filter', f.strip()))
        else:
            for f in opts.intersect_name.strip().split(','):
                params.append(('filter', f))

    # retrieve data
    biom = None
    size = 50
    if len(id_list) > size:
        for i in xrange(0, len(id_list), size):
            sub_ids = id_list[i:i + size]
            cur_params = copy.deepcopy(params)
            for i in sub_ids:
                cur_params.append(('id', i))
            cur_url = opts.url + '/matrix/function?' + urlencode(
                cur_params, True)
            cur_biom = async_rest_api(cur_url, auth=token)
            biom = merge_biom(biom, cur_biom)
            if opts.temp:
                json.dump(biom, open(opts.temp, 'w'))
    else:
        for i in id_list:
            params.append(('id', i))
        url = opts.url + '/matrix/function?' + urlencode(params, True)
        biom = async_rest_api(url, auth=token)
        if opts.temp:
            json.dump(biom, open(opts.temp, 'w'))

    # get sub annotations
    sub_ann = set()
    if opts.filter_name and opts.filter_level:
        # get input filter list
        filter_list = []
        if os.path.isfile(opts.filter_name):
            with open(opts.filter_name) as file_:
                for f in file_:
                    filter_list.append(f.strip())
        else:
            for f in opts.filter_name.strip().split(','):
                filter_list.append(f)
        # annotation mapping from m5nr
        params = [('version', opts.version), ('min_level', opts.level),
                  ('source', opts.source)]
        url = opts.url + '/m5nr/ontology?' + urlencode(params, True)
        data = obj_from_url(url)
        level = 'level4' if opts.level == 'function' else opts.level
        for ann in data['data']:
            if (opts.filter_level
                    in ann) and (level in ann) and (ann[opts.filter_level]
                                                    in filter_list):
                sub_ann.add(ann[level])

    # output data
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')

    if opts.format == 'biom':
        out_hdl.write(json.dumps(biom) + "\n")
    else:
        biom_to_tab(biom, out_hdl, rows=sub_ann)

    out_hdl.close()
    return 0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='',
                            description=prehelp % VERSION,
                            epilog=posthelp % AUTH_LIST)
    parser.add_argument("--url",
                        dest="url",
                        default=API_URL,
                        help="communities API url")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--input",
                        dest="input",
                        default='-',
                        help="input: filename or stdin (-), default is stdin")
    parser.add_argument(
        "--output",
        dest="output",
        default='-',
        help="output: filename or stdout (-), default is stdout")
    parser.add_argument(
        "--outdir",
        dest="outdir",
        default=None,
        help=
        "ouput is placed in dir as filenmae.obj, fielname.type, only for 'biom' input"
    )
    parser.add_argument(
        "--format",
        dest="format",
        default='biom',
        help=
        "input / output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom"
    )

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']

    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    maxval = 0
    tmp_in = 'tmp_' + random_str() + '.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(
            opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                if opts.rlib:
                    maxval = biom_to_tab(biom, tmp_hdl)
                else:
                    rows, cols, data = biom_to_matrix(biom)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
            data = map(lambda x: map(float, x), data)  # floatify it
            if opts.rlib:
                tmp_hdl.write(indata)
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    finally:
        tmp_hdl.close()

    # check values to see if already normalized, otherwise R fails badly
    data = list(data)
    if len(data) > 0:
        maxval = max(map(max, data))
    if maxval <= 1:
        os.remove(tmp_in)
        sys.stderr.write("ERROR: data is already normalized.\n")
        return 1

    # retrieve data
    norm = None
    if opts.rlib:
        tmp_out = 'tmp_' + random_str() + '.txt'
        r_cmd = """source("%s/preprocessing.r")
suppressMessages( MGRAST_preprocessing(
    file_in="%s",
    file_out="%s"
))""" % (opts.rlib, tmp_in, tmp_out)
        execute_r(r_cmd)
        nrows, ncols, ndata = tab_to_matrix(open(tmp_out, 'r').read())
        num_data = map(lambda x: map(float, x), ndata)
        norm = {"columns": ncols, "rows": nrows, "data": num_data}
        os.remove(tmp_out)
    else:
        post = {"columns": cols, "rows": rows, "data": data}
        norm = obj_from_url(opts.url + '/compute/normalize',
                            data=json.dumps(post, separators=(',', ':')))

    # output data
    os.remove(tmp_in)
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')

    if biom and (opts.format == 'biom'):
        # may have rows removed
        new_rows = []
        for r in biom['rows']:
            if r['id'] in norm['rows']:
                new_rows.append(r)
        biom['rows'] = new_rows
        biom['data'] = norm['data']
        biom['shape'][0] = len(biom['rows'])
        biom['id'] = biom['id'] + '_normalized'
        biom['matrix_type'] = 'dense'
        biom['matrix_element_type'] = 'float'
        matrix_type = None
        if biom['type'].startswith('Taxon'):
            matrix_type = "Communities.TaxonomicMatrix"
        elif biom['type'].startswith('Function'):
            matrix_type = "Communities.FunctionalMatrix"
        if opts.outdir and matrix_type:
            if not os.path.isdir(opts.outdir):
                os.mkdir(opts.outdir)
            ohdl = open(os.path.join(opts.outdir, opts.output + '.obj'), 'w')
            thdl = open(os.path.join(opts.outdir, opts.output + '.type'), 'w')
            ohdl.write(json.dumps(biom) + "\n")
            thdl.write(matrix_type)
            ohdl.close()
            thdl.close()
        else:
            out_hdl.write(json.dumps(biom) + "\n")
    else:
        out_hdl.write("\t%s\n" % "\t".join(norm['columns']))
        for i, d in enumerate(norm['data']):
            out_hdl.write("%s\t%s\n" %
                          (norm['rows'][i], "\t".join(map(str, d))))

    out_hdl.close()
    if os.stat(opts.output).st_size == 0:
        os.remove(opts.output)
    return 0
예제 #3
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--user", dest="user", default=None, help="OAuth username")
    parser.add_argument("--passwd", dest="passwd", default=None, help="OAuth password")
    parser.add_argument("--token", dest="token", default=None, help="OAuth token")
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--plot", dest="plot", default=None, help="filename for output plot")
    parser.add_argument("--distance", dest="distance", default='bray-curtis', help="distance metric, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis")
    parser.add_argument("--metadata", dest="metadata", default=None, help="metadata field to color by, only for 'biom' input")
    parser.add_argument("--groups", dest="groups", default=None, help="list of groups in JSON or tabbed format - either as input string or filename")
    parser.add_argument("--group_pos", dest="group_pos", type=int, default=1, help="position of group to use, default is 1 (first)")
    parser.add_argument("--color_auto", dest="color_auto", type=int, default=0, help="auto-create colors based on like group names, default is use group name as color: 1=true, 0=false")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--height", dest="height", type=float, default=10, help="image height in inches, default is 6")
    parser.add_argument("--width", dest="width", type=float, default=10, help="image width in inches, default is 6")
    parser.add_argument("--dpi", dest="dpi", type=int, default=300, help="image DPI, default is 300")
    parser.add_argument("--three", dest="three", type=int, default=0, help="create 3-D PCoA, default is 2-D: 1=true, 0=false")
    parser.add_argument("--name", dest="name", type=int, default=0, help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_argument("--label", dest="label", type=int, default=0, help="label image rows, default is off: 1=true, 0=false")
    
    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if not opts.plot:
        sys.stderr.write("ERROR: missing output filename\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    if opts.metadata:
        opts.color_auto = 1
    for o in ['reference', 'color_auto', 'three', 'name', 'label']:
        if getattr(opts, o) not in [0, 1]:
            sys.stderr.write("ERROR: invalid value for '%s'\n"%o)
            return 1
    
    # get auth
    token = get_auth_token(opts)
    
    # parse inputs
    tmp_in  = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups  = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata  = json.loads(indata)
                mg_list = map(lambda x: x['id'], indata['columns'])
                col_name = True if opts.name == 1 else False
                biom_to_tab(indata, tmp_hdl, col_name=col_name)
                if opts.metadata:
                    groups = metadata_from_biom(indata, opts.metadata)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()
    
    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            gdata = json.load(open(opts.groups, 'r')) if os.path.isfile(opts.groups) else json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
                return 1
            for m in mg_list:
                found_g = None
                for g, mgs in gdata[opts.group_pos-1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1                  
        # no - its tabbed
        except:
            gtext = open(opts.groups, 'r').read() if os.path.isfile(opts.groups) else opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
            for m in mg_list:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos-1])
                except:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1
    
    # print groups to file for R input
    tmp_group = None
    if len(groups) == len(mg_list):
        tmp_group = 'tmp_'+random_str()+'.txt'
        hdl_group = open(tmp_group, 'w')
        hdl_group.write("\tgroup\n")
        for i, m in enumerate(mg_list):
            hdl_group.write("%s\t%s\n"%(m, ''.join([x if ord(x) < 128 else '?' for x in groups[i]])))
        hdl_group.close()
    elif len(groups) > 0:
        sys.stderr.write("Warning: Not all metagenomes in a group\n")
    
    # build R cmd
    three = 'c(1,2,3)' if opts.three == 1 else 'c(1,2)'
    label = 'TRUE' if opts.label == 1 else 'FALSE'
    table = '"%s"'%tmp_group if tmp_group else 'NA'
    color = 'TRUE' if opts.color_auto == 1 else 'FALSE'
    r_cmd = """source("%s/plot_mg_pcoa.r")
suppressMessages( plot_mg_pcoa(
    table_in="%s",
    image_out="%s",
    plot_pcs=%s,
    dist_metric="%s",
    label_points=%s,
    color_table=%s,
    color_column=1,
    auto_colors=%s,
    image_height_in=%.1f,
    image_width_in=%.1f,
    image_res_dpi=%d
))"""%(opts.rlib, tmp_in, opts.plot, three, opts.distance, label, table, color, opts.height, opts.width, opts.dpi)
    execute_r(r_cmd)
    
    # cleanup
    os.remove(tmp_in)
    if tmp_group:
        os.remove(tmp_group)
    
    return 0
예제 #4
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='text', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is text")
    parser.add_argument("--groups", dest="groups", default=None, help="groups in JSON format - either as input string or filename")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--top", dest="top", type=int, default=10, help="display only the top N most changing groups, default is 10")
    parser.add_argument("--stat_test", dest="stat_test", default='Kruskal-Wallis', help="supported statistical tests, one of: Kruskal-Wallis, t-test-paired, Wilcoxon-paired, t-test-unpaired, Mann-Whitney-unpaired-Wilcoxon, ANOVA-one-way, default is Kruskal-Wallis")
    
    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    
    # get inputs
    tmp_in  = 'tmp_'+random_str()+'.txt'
    tmp_out = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups  = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata = json.loads(indata)
                biom_to_tab(indata, tmp_hdl)
                mg_list = map(lambda x: x['id'], indata['columns'])
                try:
                    groups = map(lambda x: x['group'], indata['columns'])
                except:
                    pass
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()
    
    # get groups if not in BIOM
    if not groups:
        try:
            grdata = json.load(open(opts.groups, 'r')) if os.path.isfile(opts.groups) else json.loads(opts.groups)
        except:
            sys.stderr.write("ERROR: unable to parse groups JSON\n")
            return 1
        for mg in mg_list:
            found_gr = None
            for gr in grdata.keys():
                if mg in grdata[gr]:
                    found_gr = gr
                    break
            if found_gr:
                groups.append(found_gr)
            else:
                sys.stderr.write("ERROR: metagenome %s not in a group\n"%mg)
                return 1
    
    # build R cmd
    group_str  = 'c('+','.join(map(lambda x: '"%s"'%x, groups))+')'
    r_cmd = """source("%s/group_stats_plot.r")
suppressMessages( group_stats_plot(
    file_in="%s",
    file_out="%s",
    figure_out=NULL,
    stat_test="%s",
    order_by=NULL,
    order_decreasing=FALSE,
    my_grouping=%s
))"""%(opts.rlib, tmp_in, tmp_out, opts.stat_test, group_str)
    execute_r(r_cmd)
    
    # output results
    results = open(tmp_out, 'r').readlines()
    output  = "\n".join(results[0:opts.top+1])
    os.remove(tmp_in)
    os.remove(tmp_out)
    safe_print(output)
    
    return 0
def main(args):
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--url", dest="url", default=API_URL, help="communities API url")
    parser.add_option("", "--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_option("", "--outdir", dest="outdir", default=None, help="ouput is placed in dir as filenmae.obj, fielname.type, only for 'biom' input")
    parser.add_option("", "--format", dest="format", default='biom', help="input / output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    
    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    
    # parse inputs
    biom = None
    rows = []
    cols = []
    data = []
    maxval = 0
    tmp_in = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                if opts.rlib:
                    maxval = biom_to_tab(biom, tmp_hdl)
                else:
                    rows, cols, data = biom_to_matrix(biom)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            rows, cols, data = tab_to_matrix(indata)
            data = map(lambda x: map(float, x), data) # floatify it
            if opts.rlib:
                tmp_hdl.write(indata)
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    finally:
        tmp_hdl.close()
    
    # check values to see if already normalized, otherwise R fails badly
    if len(data) > 0:
        maxval = max( map(max, data) )
    if maxval <= 1:
        os.remove(tmp_in)
        sys.stderr.write("ERROR: data is already normalized.\n")
        return 1
    
    # retrieve data
    norm = None
    if opts.rlib:
        tmp_out = 'tmp_'+random_str()+'.txt'
        r_cmd = """source("%s/preprocessing.r")
suppressMessages( MGRAST_preprocessing(
    file_in="%s",
    file_out="%s"
))"""%(opts.rlib, tmp_in, tmp_out)
        execute_r(r_cmd)
        nrows, ncols, ndata = tab_to_matrix(open(tmp_out, 'r').read())
        num_data = map(lambda x: map(float, x), ndata)
        norm = {"columns": ncols, "rows": nrows, "data": num_data}
        os.remove(tmp_out)
    else:
        post = {"columns": cols, "rows": rows, "data": data}
        norm = obj_from_url(opts.url+'/compute/normalize', data=json.dumps(post, separators=(',',':')))
    
    # output data
    os.remove(tmp_in)
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    
    if biom and (opts.format == 'biom'):
        # may have rows removed
        new_rows = []
        for r in biom['rows']:
            if r['id'] in norm['rows']:
                new_rows.append(r)
        biom['rows'] = new_rows
        biom['data'] = norm['data']
        biom['shape'][0] = len(biom['rows'])
        biom['id'] = biom['id']+'_normalized'
        biom['matrix_type'] = 'dense'
        biom['matrix_element_type'] = 'float'
        matrix_type = None
        if biom['type'].startswith('Taxon'):
            matrix_type = "Communities.TaxonomicMatrix"
        elif biom['type'].startswith('Function'):
            matrix_type = "Communities.FunctionalMatrix"
        if opts.outdir and matrix_type:
            if not os.path.isdir(opts.outdir):
                os.mkdir(opts.outdir)
            ohdl = open(os.path.join(opts.outdir, opts.output+'.obj'), 'w')
            thdl = open(os.path.join(opts.outdir, opts.output+'.type'), 'w')
            ohdl.write(json.dumps(biom)+"\n")
            thdl.write(matrix_type)
            ohdl.close()
            thdl.close()
        else:
            out_hdl.write(json.dumps(biom)+"\n")
    else:
        out_hdl.write( "\t%s\n" %"\t".join(norm['columns']) )
        for i, d in enumerate(norm['data']):
            out_hdl.write( "%s\t%s\n" %(norm['rows'][i], "\t".join(map(str, d))) )
    
    out_hdl.close()
    if os.stat(opts.output).st_size == 0:
        os.remove(opts.output);
    return 0
예제 #6
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--ids", dest="ids", default=None, help="comma seperated list or file of KBase Metagenome IDs")
    parser.add_argument("--url", dest="url", default=API_URL, help="communities API url")
    parser.add_argument("--user", dest="user", default=None, help="OAuth username")
    parser.add_argument("--passwd", dest="passwd", default=None, help="OAuth password")
    parser.add_argument("--token", dest="token", default=None, help="OAuth token")
    parser.add_argument("--level", dest="level", default='genus', help="taxon level to retrieve abundances for, default is genus")
    parser.add_argument("--source", dest="source", default='SEED', help="taxon datasource to filter results by, default is SEED")
    parser.add_argument("--hit_type", dest="hit_type", default='lca', help="Set of organisms to search results by, one of: all, single, lca")
    parser.add_argument("--filter_level", dest="filter_level", default=None, help="taxon level to filter by")
    parser.add_argument("--filter_name", dest="filter_name", default=None, help="taxon name to filter by, file or comma seperated list")
    parser.add_argument("--intersect_source", dest="intersect_source", default='Subsystems', help="function datasource for insersection, default is Subsystems")
    parser.add_argument("--intersect_level", dest="intersect_level", default=None, help="function level for insersection")
    parser.add_argument("--intersect_name", dest="intersect_name", default=None, help="function name(s) for insersection, file or comma seperated list")
    parser.add_argument("--output", dest="output", default='-', help="output: filename or stdout (-), default is stdout")
    parser.add_argument("--format", dest="format", default='biom', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--evalue", type=int, dest="evalue", default=15, help="negative exponent value for maximum e-value cutoff, default is 15")
    parser.add_argument("--identity", type=int, dest="identity", default=60, help="percent value for minimum %% identity cutoff, default is 60")
    parser.add_argument("--length", type=int, dest="length", default=15, help="value for minimum alignment length cutoff, default is 15")
    parser.add_argument("--version", type=int, dest="version", default=1, help="M5NR annotation version to use, default is 1")
    parser.add_argument("--temp", dest="temp", default=None, help="filename to temporarly save biom output at each iteration")
    
    # get inputs
    opts = parser.parse_args()
    if not opts.ids:
        sys.stderr.write("ERROR: one or more ids required\n")
        return 1
    if (opts.filter_name and (not opts.filter_level)) or ((not opts.filter_name) and opts.filter_level):
        sys.stderr.write("ERROR: both --filter_level and --filter_name need to be used together\n")
        return 1
    if (opts.intersect_name and (not opts.intersect_level)) or ((not opts.intersect_name) and opts.intersect_level):
        sys.stderr.write("ERROR: both --intersect_level and --intersect_name need to be used together\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    
    # get auth
    token = get_auth_token(opts)
    
    # build url
    id_list = []
    if os.path.isfile(opts.ids):
        id_str = open(opts.ids,'r').read()
        try:
            id_obj  = json.loads(id_str)
            if 'elements' in id_obj:
                id_list = id_obj['elements'].keys()
            elif 'members' in id_obj:
                id_list = map(lambda x: x['ID'], id_obj['members'])
        except:
            id_list = id_str.strip().split('\n')
    else:
        id_list = opts.ids.strip().split(',')
    params = [ ('group_level', opts.level), 
               ('source', opts.source),
               ('hit_type', opts.hit_type),
               ('evalue', opts.evalue),
               ('identity', opts.identity),
               ('length', opts.length),
               ('version', opts.version),
               ('result_type', 'abundance'),
               ('asynchronous', '1') ]
    if opts.intersect_level and opts.intersect_name:
        params.append(('filter_source', opts.intersect_source))
        params.append(('filter_level', opts.intersect_level))
        if os.path.isfile(opts.intersect_name):
            with open(opts.intersect_name) as file_:
                for f in file_:
                    params.append(('filter', f.strip()))
        else:
            for f in opts.intersect_name.strip().split(','):
                params.append(('filter', f))

    # retrieve data
    biom = None
    size = 50
    if len(id_list) > size:
        for i in xrange(0, len(id_list), size):
            sub_ids = id_list[i:i+size]
            cur_params = copy.deepcopy(params)
            for i in sub_ids:
                cur_params.append(('id', i))
            cur_url  = opts.url+'/matrix/organism?'+urlencode(cur_params, True)
            cur_biom = async_rest_api(cur_url, auth=token)
            biom = merge_biom(biom, cur_biom)
            if opts.temp:
                json.dump(biom, open(opts.temp, 'w'))
    else:
        for i in id_list:
            params.append(('id', i))
        url = opts.url+'/matrix/organism?'+urlencode(params, True)
        biom = async_rest_api(url, auth=token)
        if opts.temp:
            json.dump(biom, open(opts.temp, 'w'))
    
    # get sub annotations
    sub_ann = set()
    if opts.filter_name and opts.filter_level:
        # get input filter list
        filter_list = []
        if os.path.isfile(opts.filter_name):
            with open(opts.filter_name) as file_:
                for f in file_:
                    filter_list.append(f.strip())
        else:
            for f in opts.filter_name.strip().split(','):
                filter_list.append(f)
        # annotation mapping from m5nr
        params = [ ('version', opts.version),
                   ('min_level', opts.level) ]
        url = opts.url+'/m5nr/taxonomy?'+urlencode(params, True)
        data = obj_from_url(url)
        for ann in data['data']:
            if (opts.filter_level in ann) and (opts.level in ann) and (ann[opts.filter_level] in filter_list):
                sub_ann.add(ann[opts.level])
    
    # output data
    if (not opts.output) or (opts.output == '-'):
        out_hdl = sys.stdout
    else:
        out_hdl = open(opts.output, 'w')
    
    if opts.format == 'biom':
        out_hdl.write(json.dumps(biom)+"\n")
    else:
        biom_to_tab(biom, out_hdl, rows=sub_ann)
    
    out_hdl.close()
    return 0
예제 #7
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='',
                            description=prehelp % VERSION,
                            epilog=posthelp % AUTH_LIST)
    parser.add_argument("--input",
                        dest="input",
                        default='-',
                        help="input: filename or stdin (-), default is stdin")
    parser.add_argument(
        "--format",
        dest="format",
        default='text',
        help=
        "input format: 'text' for tabbed table, 'biom' for BIOM format, default is text"
    )
    parser.add_argument(
        "--groups",
        dest="groups",
        default=None,
        help="groups in JSON format - either as input string or filename")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        default=10,
        help="display only the top N most changing groups, default is 10")
    parser.add_argument(
        "--stat_test",
        dest="stat_test",
        default='Kruskal-Wallis',
        help=
        "supported statistical tests, one of: Kruskal-Wallis, t-test-paired, Wilcoxon-paired, t-test-unpaired, Mann-Whitney-unpaired-Wilcoxon, ANOVA-one-way, default is Kruskal-Wallis"
    )

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1

    # get inputs
    tmp_in = 'tmp_' + random_str() + '.txt'
    tmp_out = 'tmp_' + random_str() + '.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(
            opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata = json.loads(indata)
                biom_to_tab(indata, tmp_hdl)
                mg_list = map(lambda x: x['id'], indata['columns'])
                try:
                    groups = map(lambda x: x['group'], indata['columns'])
                except:
                    pass
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()

    # get groups if not in BIOM
    if not groups:
        try:
            grdata = json.load(open(opts.groups, 'r')) if os.path.isfile(
                opts.groups) else json.loads(opts.groups)
        except:
            sys.stderr.write("ERROR: unable to parse groups JSON\n")
            return 1
        for mg in mg_list:
            found_gr = None
            for gr in grdata.keys():
                if mg in grdata[gr]:
                    found_gr = gr
                    break
            if found_gr:
                groups.append(found_gr)
            else:
                sys.stderr.write("ERROR: metagenome %s not in a group\n" % mg)
                return 1

    # build R cmd
    group_str = 'c(' + ','.join(map(lambda x: '"%s"' % x, groups)) + ')'
    r_cmd = """source("%s/group_stats_plot.r")
suppressMessages( group_stats_plot(
    file_in="%s",
    file_out="%s",
    figure_out=NULL,
    stat_test="%s",
    order_by=NULL,
    order_decreasing=TRUE,
    my_grouping=%s
))""" % (opts.rlib, tmp_in, tmp_out, opts.stat_test, group_str)
    execute_r(r_cmd)

    # output results
    results = open(tmp_out, 'r').readlines()
    output = "\n".join(results[0:opts.top + 1])
    os.remove(tmp_in)
    os.remove(tmp_out)
    safe_print(output)

    return 0
def main(args):
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--user", dest="user", default=None, help="OAuth username")
    parser.add_option("", "--passwd", dest="passwd", default=None, help="OAuth password")
    parser.add_option("", "--token", dest="token", default=None, help="OAuth token")
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--plot", dest="plot", default=None, help="filename for output plot")
    parser.add_option("", "--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_option("", "--height", dest="height", type="float", default=8.5, help="image height in inches, default is 4")
    parser.add_option("", "--width", dest="width", type="float", default=11, help="image width in inches, default is 5")
    parser.add_option("", "--dpi", dest="dpi", type="int", default=300, help="image DPI, default is 300")
    parser.add_option("", "--name", dest="name", type="int", default=0, help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_option("", "--label", dest="label", type="int", default=0, help="label image rows, default is off: 1=true, 0=false")
    
    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if not opts.plot:
        sys.stderr.write("ERROR: missing output filename\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    for o in ['reference', 'name', 'label']:
        if getattr(opts, o) not in [0, 1]:
            sys.stderr.write("ERROR: invalid value for '%s'\n"%o)
            return 1
    
    return 0
    # get auth
    token = get_auth_token(opts)
    
    # parse input for R
    tmp_in  = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata = json.loads(indata)
                col_name = True if opts.name == 1 else False
                biom_to_tab(indata, tmp_hdl, col_name=col_name)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()
    
    # build R cmd
    label = 'TRUE' if opts.label == 1 else 'FALSE'
    r_cmd = """source("%s/plot_mg_boxplot.r")
suppressMessages( plot_mg_boxplot(
    table_in="%s",
    image_out="%s",
    label_rows=%s,
    image_height_in=%.1f,
    image_width_in=%.1f,
    image_res_dpi=%d
))"""%(opts.rlib, tmp_in, opts.plot, label, opts.height, opts.width, opts.dpi)
    execute_r(r_cmd)
    
    # cleanup
    os.remove(tmp_in)
    
    return 0
예제 #9
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='',
                            description=prehelp % VERSION,
                            epilog=posthelp % AUTH_LIST)
    parser.add_argument("--user",
                        dest="user",
                        default=None,
                        help="OAuth username")
    parser.add_argument("--passwd",
                        dest="passwd",
                        default=None,
                        help="OAuth password")
    parser.add_argument("--token",
                        dest="token",
                        default=None,
                        help="OAuth token")
    parser.add_argument("--input",
                        dest="input",
                        default='-',
                        help="input: filename or stdin (-), default is stdin")
    parser.add_argument(
        "--format",
        dest="format",
        default='biom',
        help=
        "input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom"
    )
    parser.add_argument("--plot",
                        dest="plot",
                        default=None,
                        help="filename for output plot")
    parser.add_argument(
        "--cluster",
        dest="cluster",
        default='ward',
        help=
        "cluster function, one of: ward, single, complete, mcquitty, median, centroid, default is ward"
    )
    parser.add_argument(
        "--distance",
        dest="distance",
        default='bray-curtis',
        help=
        "distance function, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis"
    )
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--height",
                        dest="height",
                        type=float,
                        default=10,
                        help="image height in inches, default is 5")
    parser.add_argument("--width",
                        dest="width",
                        type=float,
                        default=10,
                        help="image width in inches, default is 4")
    parser.add_argument("--dpi",
                        dest="dpi",
                        type=int,
                        default=300,
                        help="image DPI, default is 300")
    parser.add_argument("--order",
                        dest="order",
                        type=int,
                        default=0,
                        help="order columns, default is off: 1=true, 0=false")
    parser.add_argument(
        "--name",
        dest="name",
        type=int,
        default=0,
        help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_argument(
        "--label",
        dest="label",
        type=int,
        default=0,
        help="label image rows, default is off: 1=true, 0=false")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if not opts.plot:
        sys.stderr.write("ERROR: missing output filename\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    for o in ['reference', 'order', 'name', 'label']:
        if getattr(opts, o) not in [0, 1]:
            sys.stderr.write("ERROR: invalid value for '%s'\n" % o)
            return 1

    # get auth
    token = get_auth_token(opts)

    # parse input for R
    tmp_in = 'tmp_' + random_str() + '.txt'
    tmp_hdl = open(tmp_in, 'w')
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(
            opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata = json.loads(indata)
                col_name = True if opts.name == 1 else False
                biom_to_tab(indata, tmp_hdl, col_name=col_name)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()

    # build R cmd
    order = 'TRUE' if opts.order == 1 else 'FALSE'
    label = 'TRUE' if opts.label == 1 else 'FALSE'
    r_cmd = """source("%s/plot_mg_heatdend.r")
suppressMessages( plot_mg_heatdend(
    table_in="%s",
    image_out="%s",
    order_columns=%s,
    label_rows=%s,
    image_height_in=%.1f,
    image_width_in=%.1f,
    image_res_dpi=%d
))""" % (opts.rlib, tmp_in, opts.plot, order, label, opts.height, opts.width,
         opts.dpi)
    execute_r(r_cmd)

    # cleanup
    os.remove(tmp_in)

    return 0
예제 #10
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='',
                            description=prehelp % VERSION,
                            epilog=posthelp % AUTH_LIST)
    parser.add_argument("--user",
                        dest="user",
                        default=None,
                        help="OAuth username")
    parser.add_argument("--passwd",
                        dest="passwd",
                        default=None,
                        help="OAuth password")
    parser.add_argument("--token",
                        dest="token",
                        default=None,
                        help="OAuth token")
    parser.add_argument("--input",
                        dest="input",
                        default='-',
                        help="input: filename or stdin (-), default is stdin")
    parser.add_argument(
        "--format",
        dest="format",
        default='biom',
        help=
        "input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom"
    )
    parser.add_argument("--plot",
                        dest="plot",
                        default=None,
                        help="filename for output plot")
    parser.add_argument(
        "--distance",
        dest="distance",
        default='bray-curtis',
        help=
        "distance metric, one of: bray-curtis, euclidean, maximum, manhattan, canberra, minkowski, difference, default is bray-curtis"
    )
    parser.add_argument(
        "--metadata",
        dest="metadata",
        default=None,
        help="metadata field to color by, only for 'biom' input")
    parser.add_argument(
        "--groups",
        dest="groups",
        default=None,
        help=
        "list of groups in JSON or tabbed format - either as input string or filename"
    )
    parser.add_argument("--group_pos",
                        dest="group_pos",
                        type=int,
                        default=1,
                        help="position of group to use, default is 1 (first)")
    parser.add_argument(
        "--color_auto",
        dest="color_auto",
        type=int,
        default=0,
        help=
        "auto-create colors based on like group names, default is use group name as color: 1=true, 0=false"
    )
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--height",
                        dest="height",
                        type=float,
                        default=10,
                        help="image height in inches, default is 6")
    parser.add_argument("--width",
                        dest="width",
                        type=float,
                        default=10,
                        help="image width in inches, default is 6")
    parser.add_argument("--dpi",
                        dest="dpi",
                        type=int,
                        default=300,
                        help="image DPI, default is 300")
    parser.add_argument(
        "--three",
        dest="three",
        type=int,
        default=0,
        help="create 3-D PCoA, default is 2-D: 1=true, 0=false")
    parser.add_argument(
        "--name",
        dest="name",
        type=int,
        default=0,
        help="label columns by name, default is by id: 1=true, 0=false")
    parser.add_argument(
        "--label",
        dest="label",
        type=int,
        default=0,
        help="label image rows, default is off: 1=true, 0=false")

    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if not opts.plot:
        sys.stderr.write("ERROR: missing output filename\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    if opts.metadata:
        opts.color_auto = 1
    for o in ['reference', 'color_auto', 'three', 'name', 'label']:
        if getattr(opts, o) not in [0, 1]:
            sys.stderr.write("ERROR: invalid value for '%s'\n" % o)
            return 1

    # get auth
    token = get_auth_token(opts)

    # parse inputs
    tmp_in = 'tmp_' + random_str() + '.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups = []
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(
            opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                indata = json.loads(indata)
                mg_list = map(lambda x: x['id'], indata['columns'])
                col_name = True if opts.name == 1 else False
                biom_to_tab(indata, tmp_hdl, col_name=col_name)
                if opts.metadata:
                    groups = metadata_from_biom(indata, opts.metadata)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()

    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            gdata = json.load(open(opts.groups, 'r')) if os.path.isfile(
                opts.groups) else json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write(
                    "ERROR: position (%d) of group is out of bounds\n" %
                    opts.group_pos)
                return 1
            for m in mg_list:
                found_g = None
                for g, mgs in gdata[opts.group_pos - 1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n" %
                                     m)
                    return 1
        # no - its tabbed
        except:
            gtext = open(opts.groups, 'r').read() if os.path.isfile(
                opts.groups) else opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write(
                    "ERROR: position (%d) of group is out of bounds\n" %
                    opts.group_pos)
            for m in mg_list:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos - 1])
                except:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n" %
                                     m)
                    return 1

    # print groups to file for R input
    tmp_group = None
    if len(groups) == len(mg_list):
        tmp_group = 'tmp_' + random_str() + '.txt'
        hdl_group = open(tmp_group, 'w')
        hdl_group.write("\tgroup\n")
        for i, m in enumerate(mg_list):
            hdl_group.write(
                "%s\t%s\n" %
                (m, ''.join([x if ord(x) < 128 else '?' for x in groups[i]])))
        hdl_group.close()
    elif len(groups) > 0:
        sys.stderr.write("Warning: Not all metagenomes in a group\n")

    # build R cmd
    three = 'c(1,2,3)' if opts.three == 1 else 'c(1,2)'
    label = 'TRUE' if opts.label == 1 else 'FALSE'
    table = '"%s"' % tmp_group if tmp_group else 'NA'
    color = 'TRUE' if opts.color_auto == 1 else 'FALSE'
    r_cmd = """source("%s/plot_mg_pcoa.r")
suppressMessages( plot_mg_pcoa(
    table_in="%s",
    image_out="%s",
    plot_pcs=%s,
    dist_metric="%s",
    label_points=%s,
    color_table=%s,
    color_column=1,
    auto_colors=%s,
    image_height_in=%.1f,
    image_width_in=%.1f,
    image_res_dpi=%d
))""" % (opts.rlib, tmp_in, opts.plot, three, opts.distance, label, table,
         color, opts.height, opts.width, opts.dpi)
    execute_r(r_cmd)

    # cleanup
    os.remove(tmp_in)
    if tmp_group:
        os.remove(tmp_group)

    return 0
def main(args):
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_option("", "--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--output", dest="output", default='biom', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_option("", "--plot", dest="plot", default=None, help="filename for output plot, optional")
    parser.add_option("", "--stat_test", dest="stat_test", default='Kruskal-Wallis', help="supported statistical tests, one of: Kruskal-Wallis, t-test-paired, Wilcoxon-paired, t-test-unpaired, Mann-Whitney-unpaired-Wilcoxon, ANOVA-one-way, default is Kruskal-Wallis")
    parser.add_option("", "--metadata", dest="metadata", default=None, help="metadata field to group by, only for 'biom' input")
    parser.add_option("", "--groups", dest="groups", default=None, help="list of groups in JSON or tabbed format - either as input string or filename")
    parser.add_option("", "--group_pos", dest="group_pos", type="int", default=1, help="position of group to use, default is 1 (first)")
    parser.add_option("", "--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_option("", "--order", dest="order", default=None, help="column number to order output by, default is last column")
    parser.add_option("", "--direction", dest="direction", default="desc", help="direction of order. 'asc' for ascending order, 'desc' for descending order, default is desc")
    parser.add_option("", "--height", dest="height", type="float", default=6, help="image height in inches, default is 6")
    parser.add_option("", "--width", dest="width", type="float", default=6, help="image width in inches, default is 6")
    parser.add_option("", "--dpi", dest="dpi", type="int", default=300, help="image DPI, default is 300")
    
    # get inputs
    (opts, args) = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    if opts.direction not in ['asc', 'desc']:
        sys.stderr.write("ERROR: invalid order direction\n")
        return 1
    
    # parse inputs
    tmp_in  = 'tmp_'+random_str()+'.txt'
    tmp_out = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups  = []
    biom = None
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                mg_list = map(lambda x: x['id'], biom['columns'])
                biom_to_tab(biom, tmp_hdl)
                if opts.metadata:
                    groups = metadata_from_biom(biom, opts.metadata)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()
    
    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            gdata = json.load(open(opts.groups, 'r')) if os.path.isfile(opts.groups) else json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
                return 1
            for m in mg_list:
                found_g = None
                for g, mgs in gdata[opts.group_pos-1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1                  
        # no - its tabbed
        except:
            gtext = open(opts.groups, 'r').read() if os.path.isfile(opts.groups) else opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
            for m in mg_list:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos-1])
                except:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1
    
    # validate groups
    if len(groups) != len(mg_list):
        sys.stderr.write("ERROR: Not all metagenomes in a group\n")
        return 1
    
    # build R cmd
    fig_out    = '"%s"'%opts.plot if opts.plot else 'NULL'
    order_by   = 'NULL' if opts.order is None else int(opts.order)
    order_desc = 'TRUE' if opts.direction == 'desc' else 'FALSE'
    group_str  = 'c('+','.join(map(lambda x: '"%s"'%x, groups))+')'
    r_cmd = """source("%s/group_stats_plot.r")
suppressMessages( group_stats_plot(
    file_in="%s",
    file_out="%s",
    figure_out=%s,
    figure_width_in=%.1f,
    figure_height_in=%.1f,
    figure_res_dpi=%d,
    stat_test="%s",
    order_by=%s,
    order_decreasing=%s,
    my_grouping=%s
))"""%(opts.rlib, tmp_in, tmp_out, fig_out, opts.height, opts.width, opts.dpi, opts.stat_test, order_by, order_desc, group_str)
    execute_r(r_cmd)
    
    # output results
    results = open(tmp_out, 'r').read()
    os.remove(tmp_in)
    os.remove(tmp_out)
    
    if biom and (opts.output == 'biom'):
        cnum = biom['shape'][1]
        rids = [r['id'] for r in biom['rows']]
        rrows, rcols, rdata = tab_to_matrix(results)
        if (len(rrows) != biom['shape'][0]) or (len(rcols[:cnum]) != biom['shape'][1]):
            sys.stderr.write("ERROR: significance test returned invalid results\n")
            return 1
        # add stats to row data, re-order
        new_rows = []
        new_data = []
        for i, row in enumerate(rdata):
            # get row
            rindex = rids.index(rrows[i])
            robj = biom['rows'][rindex]
            if not robj['metadata']:
                robj['metadata'] = {'significance': []}
            else:
                robj['metadata']['significance'] = []
            # add stats
            for j, stat in enumerate(row[cnum:]):
                try:
                    stat = float(stat)
                except:
                    stat = None
                robj['metadata']['significance'].append((rcols[cnum:][j], stat))
            new_rows.append(robj)
            new_data.append(row[:cnum])
        # update biom
        biom['id'] = biom['id']+'_sig'
        biom['rows'] = new_rows
        biom['data'] = new_data
        biom['matrix_type'] = 'dense'
        for i, g in enumerate(groups):
            biom['columns'][i]['group'] = g
        safe_print(json.dumps(biom)+'\n')
    else:
        safe_print(results)
    return 0
예제 #12
0
def main(args):
    ArgumentParser.format_description = lambda self, formatter: self.description
    ArgumentParser.format_epilog = lambda self, formatter: self.epilog
    parser = ArgumentParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_argument("--input", dest="input", default='-', help="input: filename or stdin (-), default is stdin")
    parser.add_argument("--format", dest="format", default='biom', help="input format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--output", dest="output", default='biom', help="output format: 'text' for tabbed table, 'biom' for BIOM format, default is biom")
    parser.add_argument("--plot", dest="plot", default=None, help="filename for output plot, optional")
    parser.add_argument("--stat_test", dest="stat_test", default='Kruskal-Wallis', help="supported statistical tests, one of: Kruskal-Wallis, t-test-paired, Wilcoxon-paired, t-test-unpaired, Mann-Whitney-unpaired-Wilcoxon, ANOVA-one-way, default is Kruskal-Wallis")
    parser.add_argument("--metadata", dest="metadata", default=None, help="metadata field to group by, only for 'biom' input")
    parser.add_argument("--groups", dest="groups", default=None, help="list of groups in JSON or tabbed format - either as input string or filename")
    parser.add_argument("--group_pos", dest="group_pos", type=int, default=1, help="position of group to use, default is 1 (first)")
    parser.add_argument("--rlib", dest="rlib", default=None, help="R lib path")
    parser.add_argument("--order", dest="order", default=None, help="column number to order output by, default is last column")
    parser.add_argument("--direction", dest="direction", default="desc", help="direction of order. 'asc' for ascending order, 'desc' for descending order, default is desc")
    parser.add_argument("--height", dest="height", type=float, default=6, help="image height in inches, default is 6")
    parser.add_argument("--width", dest="width", type=float, default=6, help="image width in inches, default is 6")
    parser.add_argument("--dpi", dest="dpi", type=int, default=300, help="image DPI, default is 300")
    
    # get inputs
    opts = parser.parse_args()
    if (opts.input != '-') and (not os.path.isfile(opts.input)):
        sys.stderr.write("ERROR: input data missing\n")
        return 1
    if opts.format not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid input format\n")
        return 1
    if opts.output not in ['text', 'biom']:
        sys.stderr.write("ERROR: invalid output format\n")
        return 1
    if (not opts.rlib) and ('KB_PERL_PATH' in os.environ):
        opts.rlib = os.environ['KB_PERL_PATH']
    if not opts.rlib:
        sys.stderr.write("ERROR: missing path to R libs\n")
        return 1
    if opts.direction not in ['asc', 'desc']:
        sys.stderr.write("ERROR: invalid order direction\n")
        return 1
    
    # parse inputs
    tmp_in  = 'tmp_'+random_str()+'.txt'
    tmp_out = 'tmp_'+random_str()+'.txt'
    tmp_hdl = open(tmp_in, 'w')
    mg_list = []
    groups  = []
    biom = None
    try:
        indata = sys.stdin.read() if opts.input == '-' else open(opts.input, 'r').read()
        if opts.format == 'biom':
            try:
                biom = json.loads(indata)
                mg_list = map(lambda x: x['id'], biom['columns'])
                biom_to_tab(biom, tmp_hdl)
                if opts.metadata:
                    groups = metadata_from_biom(biom, opts.metadata)
            except:
                sys.stderr.write("ERROR: input BIOM data not correct format\n")
                return 1
        else:
            tmp_hdl.write(indata)
            mg_list = indata.split('\n')[0].strip().split('\t')
    except:
        sys.stderr.write("ERROR: unable to load input data\n")
        return 1
    tmp_hdl.close()
    
    # get groups if not in BIOM metadata and option used
    if (len(groups) == 0) and opts.groups:
        # is it json ?
        ## example of 2 group sets in json format
        ## [ {"group1": ["mg_id_1", "mg_id_2"], "group2": ["mg_id_3", "mg_id_4", "mg_id_5"]},
        ##   {"group1": ["mg_id_1", "mg_id_2", "mg_id_3"], "group2": ["mg_id_4", "mg_id_5"]} ]
        try:
            gdata = json.load(open(opts.groups, 'r')) if os.path.isfile(opts.groups) else json.loads(opts.groups)
            if opts.group_pos > len(gdata):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
                return 1
            for m in mg_list:
                found_g = None
                for g, mgs in gdata[opts.group_pos-1].items():
                    if m in mgs:
                        found_g = g
                        break
                if found_g:
                    groups.append(found_g)
                else:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1                  
        # no - its tabbed
        except:
            gtext = open(opts.groups, 'r').read() if os.path.isfile(opts.groups) else opts.groups
            grows, gcols, gdata = tab_to_matrix(gtext)
            if opts.group_pos > len(gdata[0]):
                sys.stderr.write("ERROR: position (%d) of group is out of bounds\n"%opts.group_pos)
            for m in mg_list:
                try:
                    midx = gcols.index(m)
                    groups.append(gdata[midx][opts.group_pos-1])
                except:
                    sys.stderr.write("ERROR: metagenome %s not in a group\n"%m)
                    return 1
    
    # validate groups
    if len(groups) != len(mg_list):
        sys.stderr.write("ERROR: Not all metagenomes in a group\n")
        return 1
    
    # build R cmd
    fig_out    = '"%s"'%opts.plot if opts.plot else 'NULL'
    order_by   = 'NULL' if opts.order is None else int(opts.order)
    order_desc = 'TRUE' if opts.direction == 'desc' else 'FALSE'
    group_str  = 'c('+','.join(map(lambda x: '"%s"'%x, groups))+')'
    r_cmd = """source("%s/group_stats_plot.r")
suppressMessages( group_stats_plot(
    file_in="%s",
    file_out="%s",
    figure_out=%s,
    figure_width_in=%.1f,
    figure_height_in=%.1f,
    figure_res_dpi=%d,
    stat_test="%s",
    order_by=%s,
    order_decreasing=%s,
    my_grouping=%s
))"""%(opts.rlib, tmp_in, tmp_out, fig_out, opts.height, opts.width, opts.dpi, opts.stat_test, order_by, order_desc, group_str)
    execute_r(r_cmd)
    
    # output results
    results = open(tmp_out, 'r').read()
    os.remove(tmp_in)
    os.remove(tmp_out)
    
    if biom and (opts.output == 'biom'):
        cnum = biom['shape'][1]
        rids = [r['id'] for r in biom['rows']]
        rrows, rcols, rdata = tab_to_matrix(results)
        if (len(rrows) != biom['shape'][0]) or (len(rcols[:cnum]) != biom['shape'][1]):
            sys.stderr.write("ERROR: significance test returned invalid results\n")
            return 1
        # add stats to row data, re-order
        new_rows = []
        new_data = []
        for i, row in enumerate(rdata):
            # get row
            rindex = rids.index(rrows[i])
            robj = biom['rows'][rindex]
            if not robj['metadata']:
                robj['metadata'] = {'significance': []}
            else:
                robj['metadata']['significance'] = []
            # add stats
            for j, stat in enumerate(row[cnum:]):
                try:
                    stat = float(stat)
                except:
                    stat = None
                robj['metadata']['significance'].append((rcols[cnum:][j], stat))
            new_rows.append(robj)
            new_data.append(row[:cnum])
        # update biom
        biom['id'] = biom['id']+'_sig'
        biom['rows'] = new_rows
        biom['data'] = new_data
        biom['matrix_type'] = 'dense'
        for i, g in enumerate(groups):
            biom['columns'][i]['group'] = g
        safe_print(json.dumps(biom)+'\n')
    else:
        safe_print(results)
    return 0