Example #1
    def test_aleph_mysql(self):
        conv = AlephConverter(self.context, target_att_val='east')
        aleph = Aleph()
        theory, features = aleph.induce('induce_features', conv.positive_examples(),
                                        conv.negative_examples(),
                                        conv.background_knowledge())

        with open(os.path.join(RESULTS_FOLDER, 'wrappers', 'aleph', 'trains.arff')) as f:
            self.assertMultiLineEqual(theory, f.read())
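
Note: in the 'induce_features' mode used here, the theory returned by Aleph.induce is an ARFF-formatted feature table, which is why the test compares it line by line against the reference trains.arff.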
Example #2
from rdm.db import DBVendor, DBConnection, DBContext, AlephConverter
from rdm.wrappers import Aleph

# Provide connection information
connection = DBConnection(
    'ilp',  # User
    'ilp123',  # Password
    'workflow.ijs.si',  # Host
    'ilp',  # Database
)

# Define learning context
context = DBContext(connection, target_table='trains', target_att='direction')

# Convert the data and induce features using Aleph
conv = AlephConverter(context, target_att_val='east')
aleph = Aleph()
theory, features = aleph.induce('induce_features', conv.positive_examples(),
                                conv.negative_examples(),
                                conv.background_knowledge())
print(theory)
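
Since 'induce_features' returns the theory as an ARFF string, it can be parsed into a 0/1 feature matrix the same way Example #3 below does. A minimal sketch, assuming the liac-arff package (its loads returns a dict with a 'data' key):

import arff

data = arff.loads(str(theory))
entries, targets = [], []
for row in data['data']:
    row = list(row)
    targets.append(row[-1])  # last column is the class
    entries.append([1 if x == "+" else 0 for x in row[:-1]])  # '+' -> 1, '-' -> 0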
Example #3
# Imports assumed by this excerpt (the snippet itself ships without them);
# the rdm paths follow the python-rdm docs, while context_converter is
# project-specific and must come from the surrounding codebase.
import pickle
import time

import arff  # liac-arff
import numpy as np
import pandas as pd
from sklearn import preprocessing

from rdm.db import (AlephConverter, OrangeConverter, RSDConverter,
                    TreeLikerConverter, mapper)
from rdm.validation import cv_split
from rdm.wrappers import Aleph, RSD, TreeLiker, Wordification


def transform(algorithm,
              context,
              target_att_value,
              seed,
              result_file,
              transformations,
              fold_nums=10):
    fold_num = 0
    # Propositionalize each train/test fold and pickle the resulting matrices.
    for train_context, test_context in cv_split(context,
                                                folds=fold_nums,
                                                random_seed=seed):
        fold_num += 1

        print("FOLD", fold_num)
        with open(result_file, 'a') as f:
            f.write("FOLD {}\n".format(fold_num))

        # Aleph
        if algorithm == "aleph":

            start = time.time()
            conv = AlephConverter(train_context,
                                  target_att_val=target_att_value)
            aleph = Aleph()
            train_arff, features = aleph.induce('induce_features',
                                                conv.positive_examples(),
                                                conv.negative_examples(),
                                                conv.background_knowledge(),
                                                printOutput=False)

            data = arff.loads(str(train_arff))
            entries = []
            targets = []

            # The last ARFF column is the class; the rest are '+'/'-' features.
            for entry in data['data']:
                en = list(entry)
                targets.append(en[-1])
                entries.append([1 if x == "+" else 0 for x in en[:-1]])

            # Map the induced features onto the unseen test fold.
            tmp_learner = 'aleph'
            test_arff = mapper.domain_map(features,
                                          tmp_learner,
                                          train_context,
                                          test_context,
                                          format="csv",
                                          positive_class=target_att_value)
            test_ins = test_arff.split("\n")

            entries_test = []
            targets_test = []

            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':  # skip blank lines
                    targets_test.append(en[-1])
                    entries_test.append([1 if x == "+" else 0
                                         for x in en[:-1]])

            # Binarize the test labels against the chosen target value.
            targets_test = [
                'positive' if x == target_att_value else 'negative'
                for x in targets_test
            ]

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets.ravel())  # encode string labels as integers
            targets_train_encoded = le.transform(train_targets.ravel())
            targets_test_encoded = le.transform(test_targets.ravel())

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            # Persist this fold's matrices for downstream learners.
            with open("{}_{}_train.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(train_data, f)
            with open("{}_{}_test.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(test_data, f)

            print(algorithm, "TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # RSD
        elif algorithm == "rsd":

            start = time.time()
            conv = RSDConverter(train_context)
            rsd = RSD()
            features, train_arff, _ = rsd.induce(conv.background_knowledge(),
                                                 examples=conv.all_examples(),
                                                 cn2sd=False)

            data = arff.loads(str(train_arff))
            entries = []
            targets = []

            # As above: the last column is the class, the rest '+'/'-' features.
            for entry in data['data']:
                en = list(entry)
                targets.append(en[-1])
                entries.append([1 if x == "+" else 0 for x in en[:-1]])

            # Map the induced features onto the test fold.
            tmp_learner = 'rsd'
            test_arff = mapper.domain_map(features,
                                          tmp_learner,
                                          train_context,
                                          test_context,
                                          format="csv")
            test_ins = test_arff.split("\n")

            entries_test = []
            targets_test = []

            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':  # skip blank lines
                    targets_test.append(en[-1])
                    entries_test.append([1 if x == "+" else 0
                                         for x in en[:-1]])

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets.ravel())  # encode string labels as integers
            targets_train_encoded = le.transform(train_targets.ravel())
            targets_test_encoded = le.transform(test_targets.ravel())

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            with open("{}_{}_train.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(train_data, f)
            with open("{}_{}_test.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(test_data, f)

            print(algorithm, "TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # TreeLiker
        elif algorithm == "treeliker":

            start = time.time()
            conv = TreeLikerConverter(train_context)
            conv2 = TreeLikerConverter(test_context)
            treeliker = TreeLiker(conv.dataset(), conv.default_template(),
                                  conv2.dataset())
            train_arff, test_arff = treeliker.run()
            # Parse the ARFF output: rows after the "@data" tag are examples;
            # '+' marks an active feature and the last column is the class.
            wtag = False
            entries = []
            targets = []
            entries_test = []
            targets_test = []

            for entry in train_arff.split("\n"):
                if wtag:
                    en = [x.replace(" ", "") for x in entry.split(",")]
                    if len(en) > 1:
                        targets.append(en[-1])
                        entries.append([1 if "+" in x else 0 for x in en[:-1]])
                if "@data" in entry:
                    wtag = True

            wtag = False
            for entry in test_arff.split("\n"):
                if wtag:
                    en = [x.replace(" ", "") for x in entry.split(",")]
                    if len(en) > 1:
                        targets_test.append(en[-1])
                        entries_test.append(
                            [1 if "+" in x else 0 for x in en[:-1]])
                if "@data" in entry:
                    wtag = True

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets.ravel())  # encode string labels as integers
            targets_train_encoded = le.transform(train_targets.ravel())
            targets_test_encoded = le.transform(test_targets.ravel())

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            with open("{}_{}_train.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(train_data, f)
            with open("{}_{}_test.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(test_data, f)

            print(algorithm, "TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # Wordification
        elif algorithm == "wordification":

            start = time.time()
            corange = OrangeConverter(train_context)
            torange = OrangeConverter(test_context)
            wordification = Wordification(corange.target_Orange_table(),
                                          corange.other_Orange_tables(),
                                          train_context)
            wordification.run(1)
            wordification.calculate_weights()
            train_arff = wordification.to_arff()
            wordification_test = Wordification(torange.target_Orange_table(),
                                               torange.other_Orange_tables(),
                                               test_context)
            wordification_test.run(1)
            wordification_test.calculate_weights()

            # Re-weight the test documents with the idf scores learned on the
            # training fold (term frequency times stored idf weight).
            idfs = wordification.idf
            docs = wordification_test.resulting_documents
            classes = [str(a) for a in wordification_test.resulting_classes]
            feature_names = wordification.word_features
            feature_vectors = []
            for doc in docs:
                doc_vec = [doc.count(feature) * idfs[feature]
                           for feature in feature_names]
                feature_vectors.append(doc_vec)
            print(feature_vectors, classes)

            test_arff = wordification_test.to_arff()

            entries = []
            targets = []
            wtag = False

            # Wordification writes numeric ARFF values and an uppercase @DATA tag.
            for entry in train_arff.split("\n"):
                if wtag:
                    en = [x.replace(" ", "") for x in entry.split(",")]
                    if len(en) > 1:
                        targets.append(en[-1])
                        entries.append([float(x) for x in en[:-1]])
                if "@DATA" in entry:
                    wtag = True

            # The test fold was vectorized above with the training idf weights.
            targets_test = classes
            entries_test = feature_vectors

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            # Fit on both splits so unseen test labels do not break transform().
            le.fit(np.concatenate([train_targets, test_targets]).ravel())
            targets_train_encoded = le.transform(train_targets.ravel())
            targets_test_encoded = le.transform(test_targets.ravel())

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            with open("{}_{}_train.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(train_data, f)
            with open("{}_{}_test.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(test_data, f)

            print(algorithm, "TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # relaggs / nrelaggs (default branch)
        else:
            converter = context_converter(train_context,
                                          test_context,
                                          verbose=0)
            train_data = converter.get_train()
            test_data = converter.get_test()
            plan = converter.get_plan()

            # Persist the fold's matrices and the conversion plan.
            with open("{}_{}_train.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(train_data, f)
            with open("{}_{}_test.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(test_data, f)
            with open("{}_{}_plan.p".format(transformations, fold_num),
                      "wb") as f:
                pickle.dump(plan, f)

            run_time = converter.get_time()
            print(algorithm, "TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))
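
A hypothetical invocation of transform, reusing the connection and context from Example #2; the seed, result file, and transformations tag are illustrative values only:

connection = DBConnection('ilp', 'ilp123', 'workflow.ijs.si', 'ilp')
context = DBContext(connection, target_table='trains', target_att='direction')

# Writes aleph_1_train.p / aleph_1_test.p ... per fold, plus timing lines.
transform('aleph', context, target_att_value='east', seed=42,
          result_file='results.txt', transformations='aleph')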
Example #4
        # Find features on the train set

        start = timer()
        if learner == "RSD":
            conv = RSDConverter(train_context)
            rsd = RSD()

            features, train_arff, _ = rsd.induce(
                conv.background_knowledge(),  # Background knowledge
                examples=conv.all_examples(),  # Training examples
                cn2sd=False  # Disable built-in subgroup discovery
            )

        if learner == "aleph":
            conv = AlephConverter(context, target_att_val=target_attr_value)
            aleph = Aleph()
            train_arff, features = aleph.induce('induce_features',
                                                conv.positive_examples(),
                                                conv.negative_examples(),
                                                conv.background_knowledge(),
                                                printOutput=False)

        if learner == "treeliker":
            conv = TreeLikerConverter(train_context)
            conv2 = TreeLikerConverter(test_context)
            treeliker = TreeLiker(conv.dataset(), conv.default_template(),
                                  conv2.dataset())  # Runs RelF by default
            train_arff, test_arff = treeliker.run()
            wtag = False
            entries = []
            targets = []