Exemplo n.º 1
0
def prepare_data(path, config):
    """Load abundance data and labels from *path* and build tree-based inputs.

    Steps performed, in order:

    1. Read ``<path>/abundance.tsv`` (tab-separated, no header row, first
       column used as the index), transpose it, and divide every row by its
       row sum.
    2. Read ``<path>/labels.txt`` (comma-delimited strings) and factorize the
       labels into integer codes plus the array of unique label values.
    3. Pass the table through ``filter_data`` using the threshold read from
       the ``[Evaluation] FilterThresh`` config entry (and an opposite
       threshold fixed at 0.0).
    4. Load a pickled tree from
       ``<path>/PopPhy-tree-<thresh>-core.pkl``; if that fails, build a new
       ``Graph``, prune it with the feature table, and pickle it to the same
       location.
    5. Run ``generate_maps`` on every row of the filtered table in parallel
       and collect its three outputs per row.

    Args:
        path: Directory containing ``abundance.tsv`` and ``labels.txt``; also
            where the tree pickle is read from / written to.
        config: Object exposing ``get(section, option)`` (for example a
            ``configparser`` instance -- TODO confirm against the caller).

    Returns:
        A 9-tuple ``(my_maps, my_benchmark, my_benchmark_tree, features,
        tree_features, labels, label_set, g, features_df)`` where

        * ``my_maps`` stacks item 1 of each ``generate_maps`` result,
        * ``my_benchmark`` stacks item 0 of each result,
        * ``my_benchmark_tree`` is item 2 of each result after columns that
          share a tree-feature name have been averaged,
        * ``features`` is the list of column names left after filtering,
        * ``tree_features`` is the index of the de-duplicated tree features,
        * ``labels`` / ``label_set`` are the output of ``pd.factorize``,
        * ``g`` is the tree object and ``features_df`` the result of
          ``get_feature_df(features)``.
    """
    core_filt_thresh = float(config.get('Evaluation', 'FilterThresh'))
    opp_filt_thresh = 0.0

    data = pd.read_csv(path + '/abundance.tsv',
                       index_col=0,
                       sep='\t',
                       header=None)
    labels = np.genfromtxt(path + '/labels.txt', dtype=np.str_, delimiter=',')

    # After the transpose each row is normalised by its own sum.
    # presumably rows are samples and columns are features -- TODO confirm.
    data = data.transpose()
    data = data.divide(data.sum(axis=1), axis=0)
    labels, label_set = pd.factorize(labels)

    data = filter_data(data, labels, core_filt_thresh, opp_filt_thresh)

    features = list(data.columns.values)
    print("There are %d raw features..." % (len(features)))
    features_df = get_feature_df(features)

    print("Building tree structure...")
    tree_path = path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl"
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # `path` at directories whose contents are trusted.
    try:
        with open(tree_path, 'rb') as tree_file:
            g = pickle.load(tree_file)
    except Exception:
        # Any load failure (missing, truncated or stale cache) falls back to
        # rebuilding the tree; KeyboardInterrupt/SystemExit still propagate.
        print("Tree file not found...")
        print("Contsructing tree..")
        g = Graph()
        g.build_graph()
        g.prune_graph(features_df)
        with open(tree_path, 'wb') as tree_file:
            pickle.dump(g, tree_file)
    else:
        print("Found tree file...")

    print("Populating trees...")
    results = Parallel(n_jobs=num_cores)(
        delayed(generate_maps)(x, g, features_df) for x in data.values)

    # Pick each output per sample instead of np.take(results, i, 1): coercing
    # the whole result list to one array fails on recent NumPy when the three
    # outputs have different shapes.
    my_benchmark = np.array([sample[0] for sample in results])
    my_maps = np.array([sample[1] for sample in results])
    my_benchmark_tree = np.array([sample[2] for sample in results])

    tree_features = g.graph_vector_features()

    # One row per tree feature; rows sharing a feature name are averaged
    # (groupby also sorts the feature names).
    my_benchmark_df = pd.DataFrame(index=tree_features,
                                   data=np.transpose(my_benchmark_tree))
    my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean()

    tree_features = my_benchmark_df.index
    my_benchmark_tree = np.transpose(my_benchmark_df.values)

    num_tree_features = len(tree_features)
    print("There are %d tree features..." % (num_tree_features))
    return (my_maps, my_benchmark, my_benchmark_tree, features, tree_features,
            labels, label_set, g, features_df)
Exemplo n.º 2
0
def prepare_data(path, config, k, m):
    """Load training abundance data and labels and build tree-based inputs.

    Steps performed, in order:

    1. Read ``<path>/pois_t2d_trainabun_1+1.tsv`` (tab-separated, no header
       row, first column used as the index), transpose it, and divide every
       row by its row sum.
    2. Read ``<path>/pois_t2d_trainlabel_1+1.txt`` (comma-delimited strings)
       and factorize the labels into integer codes plus the unique values.
    3. Pass the table through ``filter_data`` using the threshold read from
       the ``[Evaluation] FilterThresh`` config entry (and an opposite
       threshold fixed at 0.0).
    4. Load a pickled tree from ``<path>/PopPhy-tree-<thresh>-core.pkl``; if
       that fails, build a new ``Graph``, prune it with the feature table and
       post-process it with ``removeRepeatName`` and ``routeToRoot``.
    5. Run ``generate_maps`` on every row of the filtered table in parallel
       and collect its three outputs per row.

    Original author's notes on the dataset this was developed with (not
    verified here): the raw file had 542 microbe rows and 232 sample columns;
    the labels were ``'n'`` / ``'Cirrhosis'`` (114 and 118 samples); each
    sample's abundances summed to 100 before normalisation; filtering reduced
    542 features to 269.

    Args:
        path: Directory containing the abundance and label files; also where
            the tree pickle is looked up.
        config: Object exposing ``get(section, option)`` (for example a
            ``configparser`` instance -- TODO confirm against the caller).
        k: Forwarded unchanged to ``generate_maps``.
        m: Forwarded unchanged to ``generate_maps``.

    Returns:
        A 9-tuple ``(my_maps, my_benchmark, my_benchmark_tree, features,
        tree_features, labels, label_set, g, features_df)`` where

        * ``my_maps`` stacks item 1 of each ``generate_maps`` result,
        * ``my_benchmark`` stacks item 0 of each result,
        * ``my_benchmark_tree`` is item 2 of each result after columns that
          share a tree-feature name have been averaged,
        * ``features`` is the list of column names left after filtering,
        * ``tree_features`` is the index of the de-duplicated tree features,
        * ``labels`` / ``label_set`` are the output of ``pd.factorize``,
        * ``g`` is the tree object and ``features_df`` the result of
          ``get_feature_df(features)``.
    """
    core_filt_thresh = float(config.get('Evaluation', 'FilterThresh'))
    opp_filt_thresh = 0.0

    data = pd.read_csv(path + '/pois_t2d_trainabun_1+1.tsv',
                       index_col=0,
                       sep='\t',
                       header=None)
    labels = np.genfromtxt(path + '/pois_t2d_trainlabel_1+1.txt',
                           dtype=np.str_,
                           delimiter=',')

    # After the transpose rows are samples and columns are microbe features
    # (per the original author's notes); each row is normalised by its sum.
    data = data.transpose()
    data = data.divide(data.sum(axis=1), axis=0)
    labels, label_set = pd.factorize(labels)

    # Original author's note: this either filters or reorders the data; on
    # their dataset it reduced the feature count.
    data = filter_data(data, labels, core_filt_thresh, opp_filt_thresh)

    features = list(data.columns.values)
    print("There are %d raw features..." % (len(features)))
    # Original author's note: splits each microbe name into its taxonomic
    # ranks, one row per feature.
    features_df = get_feature_df(features)

    print("Building tree structure...")
    tree_path = path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl"
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # `path` at directories whose contents are trusted.
    try:
        with open(tree_path, 'rb') as tree_file:
            g = pickle.load(tree_file)
    except Exception:
        # Any load failure falls back to rebuilding the tree;
        # KeyboardInterrupt/SystemExit still propagate.
        print("Tree file not found...")
        print("Contsructing tree..")
        # Original author's note: build_graph builds the generic tree from
        # the parenthesised tree file; prune_graph trims it to the features
        # actually present in this dataset.
        g = Graph()
        g.build_graph()
        g.prune_graph(features_df)
        # These two steps run only on a rebuilt tree; a tree loaded from the
        # pickle above is used exactly as stored.
        g.removeRepeatName()
        g.routeToRoot()
        # NOTE: the rebuilt tree is intentionally not written back to disk
        # (the pickle.dump call was disabled by the original author).
    else:
        print("Found tree file...")

    print("Populating trees...")
    # Each x is one row of data.values, i.e. the feature values of a single
    # sample without names (per the original author's notes).
    results = Parallel(n_jobs=num_cores)(
        delayed(generate_maps)(x, g, features_df, k, m) for x in data.values)

    # Pick each output per sample instead of np.take(results, i, 1): coercing
    # the whole result list to one array fails on recent NumPy when the three
    # outputs have different shapes.
    my_benchmark = np.array([sample[0] for sample in results])
    my_maps = np.array([sample[1] for sample in results])
    my_benchmark_tree = np.array([sample[2] for sample in results])

    tree_features = g.graph_vector_features()

    # One row per tree feature; rows sharing a feature name are averaged
    # (groupby also sorts the feature names).
    my_benchmark_df = pd.DataFrame(index=tree_features,
                                   data=np.transpose(my_benchmark_tree))
    my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean()

    tree_features = my_benchmark_df.index
    my_benchmark_tree = np.transpose(my_benchmark_df.values)

    num_tree_features = len(tree_features)
    print("There are %d tree features..." % (num_tree_features))
    return (my_maps, my_benchmark, my_benchmark_tree, features, tree_features,
            labels, label_set, g, features_df)