Exemplo n.º 1
0
def prop4da(dataset):
    """Convert the descriptor CSV files found next to *dataset* to ARFF and tab files.

    The directory and base name (extension stripped) are taken from *dataset*.
    Labels are read with ``read_y_from_master`` from ``<root>/<name>-master.csv``
    and the class set is obtained with ``infer_classes``.

    Two groups of files are then converted:

    * every ``<root>/*-cdk-*.csv`` file; files whose first line starts with
      ``"Title"`` are parsed with ``cdkdeskui2dense``, all others with
      ``cdkdeskuifps2dense``;
    * ``<root>/<name>-ob-spectrophores.csv``, parsed here as rows of
      comma-separated floats.

    For each input an ``.arff`` and a ``.txt`` file with the same stem are
    written through ``mlio.save_arff`` and ``mlio.save_tab``.

    Args:
        dataset: Path whose directory holds the CSV files and whose base name
            (without extension) prefixes the master and spectrophores files.

    Returns:
        None. The function is used for the files it writes.

    Raises:
        StopIteration: If a CDK CSV file is empty (no header line to read).
        ValueError: If a spectrophores field cannot be parsed as a float.
        IndexError: If the spectrophores file contains no rows.
    """
    root, name = op.split(dataset)
    name = op.splitext(name)[0]
    y = read_y_from_master(op.join(root, name + "-master.csv"))
    classes = infer_classes(y)

    # Process CDK descriptors
    for descs in glob.glob(op.join(root, "*-cdk-*.csv")):
        # Only the first line is needed to pick the parser; the parsers take the
        # path and open the file themselves, so the handle is released right away.
        # next(reader) works on both Python 2 and 3, unlike reader.next().
        with open(descs) as reader:
            header = next(reader)

        out_stem = op.splitext(descs)[0]
        if header.startswith("Title"):
            x, features = cdkdeskui2dense(descs)
            mlio.save_arff(x, y, out_stem + ".arff", feature_names=features, classes=classes)
        else:
            x, relation_name = cdkdeskuifps2dense(descs)
            # NOTE(review): names are generated from x.shape[0] (the first axis)
            # here, while the spectrophores branch below counts the entries of a
            # row. Confirm which axis of x holds the features.
            features = mlio.generate_names(x.shape[0], descs)
            mlio.save_arff(
                x,
                y,
                out_stem + ".arff",
                relation_name=relation_name,
                classes=classes,
                feature_names=features,
            )
        mlio.save_tab(x, y, out_stem + ".txt", classes=classes)

    # Process ob spectrophores
    specs_stem = op.join(root, name + "-ob-spectrophores")
    with open(specs_stem + ".csv") as reader:
        # Real lists (not map objects) so that len() and np.array behave the
        # same on Python 2 and Python 3.
        rows = [[float(field.strip()) for field in line.split(",")] for line in reader]
    x = np.array(rows)
    feature_names = mlio.generate_names(len(rows[0]))
    mlio.save_arff(x, y, specs_stem + ".arff", classes=classes, feature_names=feature_names)
    mlio.save_tab(x, y, specs_stem + ".txt", classes=classes)
Exemplo n.º 2
0
    # For every dataset: load its labels, then convert the spectrophores CSV and
    # each CDK descriptor CSV under <root>/<dataset>/ to ARFF and tab files.
    # `root` and `datasets` are defined outside this fragment.
    for dataset in datasets:
        # Progress trace (Python 2 print statement).
        print dataset

        y = read_y(root, dataset)
        classes = infer_classes(y)

        # Process ob spectrophores
        specs = op.join(root, dataset, dataset + '-ob-spectrophores.csv')
        with open(specs) as reader:
            # NOTE: `specs` is rebound here from the file path to the list of parsed rows.
            specs = []
            for line in reader:
                # One row per line: comma-separated fields, stripped and parsed as floats.
                # Relies on Python 2 `map` returning a list.
                specs.append(map(lambda a: float(a.strip()), line.split(',')))
            x = np.array(specs)
        mlio.save_arff(x, y, op.join(root, dataset, dataset + '-ob-spectrophores.arff'), classes=classes)
        mlio.save_tab(x, y, op.join(root, dataset, dataset + '-ob-spectrophores.txt'), classes=classes)

        # Process CDK descriptors
        for descs in glob.glob(op.join(root, dataset, '*-cdk-*.csv')):
            with open(descs) as reader:
                # Only the first line is read from this handle; it selects the parser.
                # The parsers below receive the path, not the handle.
                header = reader.next()
                if header.startswith('Title'):
                    # First line starts with 'Title': the parser also returns feature names.
                    x, features = cdkdeskui2dense(descs)
                    mlio.save_arff(x, y, op.splitext(descs)[0] + '.arff', feature_names=features, classes=classes)
                    mlio.save_tab(x, y, op.splitext(descs)[0] + '.txt', classes=classes)
                else:
                    # Otherwise the parser returns a name that is used as the ARFF relation name;
                    # no feature names are passed in this branch.
                    x, name = cdkdeskuifps2dense(descs)
                    mlio.save_arff(x, y, op.splitext(descs)[0] + '.arff', relation_name=name, classes=classes)
                    mlio.save_tab(x, y, op.splitext(descs)[0] + '.txt', classes=classes)

#TODO: Save the compound ID too