Exemplo n.º 1
0
def try_params(n_instances, params, base, train, valid, test, istest):
    """
    Builds an attribute-selected classifier (InfoGain evaluator + Ranker
    search) from ``params`` and scores it.

    :param n_instances: number of instances to train on; rounded to an int
    :param params: dict with the keys 'missingMerge' and
        'binarizeNumericAttributes'
    :param base: wrapper of the base classifier; its ``jobject`` is handed to
        the meta-classifier
    :param train: training data
    :param valid: validation data, used when ``istest`` is false
    :param test: test data, used when ``istest`` is true
    :param istest: selects the test run instead of the train/validate run
    :return: the result of ``test_weka_classifier`` or
        ``train_and_eval_weka_classifier``
    """
    n_instances = int(round(n_instances))
    pprint(params)

    # Translate the boolean hyper-parameters into evaluator flags.
    options = []
    if params['missingMerge'] == False:
        options.append("-M")
    if params['binarizeNumericAttributes'] == True:
        options.append("-B")

    search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
        options=options)

    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    # AttributeSelectedClassifier exposes its base learner as the bean
    # property "classifier"; "base" is not one of its properties, so the
    # supplied learner was never actually installed.
    clf.set_property("classifier", base.jobject)

    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
Exemplo n.º 2
0
def try_params(n_instances, params, base, train, valid, test, istest):
    """
    Builds an attribute-selected classifier (CfsSubsetEval evaluator plus
    the search named in ``params``) and scores it.

    :param n_instances: number of instances to train on; rounded to an int
    :param params: dict with the keys 'missingSeparate', 'locallyPredictive'
        and 'search' (class name relative to ``weka.attributeSelection.``)
    :param base: wrapper of the base classifier; its ``jobject`` is handed to
        the meta-classifier
    :param train: training data
    :param valid: validation data, used when ``istest`` is false
    :param test: test data, used when ``istest`` is true
    :param istest: selects the test run instead of the train/validate run
    :return: the result of ``test_weka_classifier`` or
        ``train_and_eval_weka_classifier``
    """
    n_instances = int(round(n_instances))
    pprint(params)

    # Translate the boolean hyper-parameters into evaluator flags.
    options = []
    if params['missingSeparate'] == True:
        options.append("-M")
    if params['locallyPredictive'] == False:
        options.append("-L")

    search = ASSearch(classname="weka.attributeSelection." + params['search'])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=options)

    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    # The search object used to be created and then dropped, so the
    # 'search' hyper-parameter had no effect on the classifier.
    clf.set_property("search", search.jobject)
    # AttributeSelectedClassifier exposes its base learner as the bean
    # property "classifier"; "base" is not one of its properties.
    clf.set_property("classifier", base.jobject)

    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
Exemplo n.º 3
0
def Feature_Selection(infile):
    """
    Runs CfsSubsetEval/BestFirst attribute selection on a CSV file located
    in the current working directory and prints the result summary.

    The first column is removed before selection and the last remaining
    column is used as the class attribute. The JVM is started for the
    duration of the call and is stopped even when loading or selection fails.

    :param infile: name of the CSV file, relative to the working directory
    :return: None
    """
    csvpath = os.path.join(os.getcwd(), infile)

    jvm.start(packages=True, max_heap_size="4g")
    try:
        # Single-argument print() calls produce the same output on
        # Python 2 and Python 3.
        print("\n\n")
        print("Loaded file:  %s" % (infile,))
        csvloader = Loader(classname="weka.core.converters.CSVLoader")
        csvdata = csvloader.load_file(csvpath)

        remover = Filter(
            classname="weka.filters.unsupervised.attribute.Remove",
            options=["-R", " 1"])
        remover.inputformat(csvdata)
        filtered_data = remover.filter(csvdata)
        filtered_data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.BestFirst",
                          options=["-D", "1", "-N", "5"])
        evaluator = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1"])
        attribs = AttributeSelection()
        attribs.search(search)
        attribs.evaluator(evaluator)
        attribs.select_attributes(filtered_data)
        print("Summary of Attribute Selection: ")
        print(attribs.results_string)
    finally:
        # Always release the JVM, otherwise a failure above leaves it running.
        jvm.stop()
    return
Exemplo n.º 4
0
    def search(self):
        """
        Returns the search.

        Calls ``getSearch`` on the wrapped Java object and wraps the result.

        :return: the search in use
        :rtype: ASSearch
        """
        java_search = javabridge.call(
            self.jobject, "getSearch", "()Lweka/attributeSelection/ASSearch;")
        return ASSearch(jobject=java_search)
Exemplo n.º 5
0
def main():
    """
    Example run: subset selection followed by a cross-validated attribute
    ranking on the anneal dataset.
    """

    # load the dataset and use the last attribute as class
    dataset_path = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + dataset_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(dataset_path)
    dataset.class_is_last()

    # subset selection: CfsSubsetEval driven by a BestFirst search
    helper.print_title("Attribute selection")
    best_first = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "1", "-N", "5"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])
    selector = AttributeSelection()
    selector.search(best_first)
    selector.evaluator(cfs_eval)
    selector.select_attributes(dataset)
    chosen = selector.selected_attributes
    print("# attributes: " + str(selector.number_attributes_selected))
    print("attributes (as numpy array): " + str(chosen))
    print("attributes (as list): " + str(list(selector.selected_attributes)))
    print("result string:\n" + selector.results_string)

    # ranking: InfoGain scores ordered by Ranker, 2-fold cross-validation
    helper.print_title("Attribute ranking (2-fold CV)")
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-N", "-1"])
    info_gain = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    ranking = AttributeSelection()
    ranking.ranking(True)
    ranking.folds(2)
    ranking.crossvalidation(True)
    ranking.seed(42)
    ranking.search(ranker)
    ranking.evaluator(info_gain)
    ranking.select_attributes(dataset)
    print("ranked attributes:\n" + str(ranking.ranked_attributes))
    print("result string:\n" + ranking.results_string)
Exemplo n.º 6
0
def get_search(params):
    """
    Creates a Ranker search whose -N option is taken from ``params``.

    :param params: dict providing the key 'num_attr'
    :return: the configured ASSearch object
    """
    ranker_options = ["-N", str(params['num_attr'])]
    return ASSearch(classname="weka.attributeSelection.Ranker",
                    options=ranker_options)
Exemplo n.º 7
0
 def showAttributeRanking(self, data):
     """
     Runs an InfoGain/Ranker attribute selection on ``data`` and prints the
     number of selected attributes, the selection and the result string.

     :param data: the dataset handed to ``select_attributes``
     """
     ranker_options = ["-T", "-1.7976931348623157E308", "-N", "-1"]
     ranker = ASSearch(classname="weka.attributeSelection.Ranker",
                       options=ranker_options)
     info_gain = ASEvaluation(
         classname="weka.attributeSelection.InfoGainAttributeEval")

     selector = AttributeSelection()
     selector.set_search(ranker)
     selector.set_evaluator(info_gain)
     selector.select_attributes(data)

     selected_count = selector.get_number_attributes_selected()
     print("# attributes: " + str(selected_count))
     selected = selector.get_selected_attributes()
     print("attributes: " + str(selected))
     print("result string:\n" + selector.to_results_string())
Exemplo n.º 8
0
def get_search(params):
    """
    Creates a BestFirst search whose -D and -N options come from ``params``.

    :param params: dict providing the keys 'direction' and 'nodes'
    :return: the configured ASSearch object
    """
    best_first_options = [
        "-D", str(params['direction']),
        "-N", str(params['nodes']),
    ]
    return ASSearch(classname="weka.attributeSelection.BestFirst",
                    options=best_first_options)
Exemplo n.º 9
0
 def filter_data(self, data):
     """
     Pushes ``data`` through the supervised AttributeSelection filter,
     configured with CfsSubsetEval and a BestFirst search.

     :param data: the dataset to filter
     :return: the filtered dataset
     """
     print("Filtering Data..\n")
     best_first = ASSearch(classname="weka.attributeSelection.BestFirst",
                           options=["-D", "1", "-N", "5"])
     cfs_eval = ASEvaluation(
         classname="weka.attributeSelection.CfsSubsetEval",
         options=["-P", "1", "-E", "1"])

     selection_filter = Filter(
         classname="weka.filters.supervised.attribute.AttributeSelection")
     selection_filter.set_property("evaluator", cfs_eval.jobject)
     selection_filter.set_property("search", best_first.jobject)
     selection_filter.inputformat(data)
     return selection_filter.filter(data)
Exemplo n.º 10
0
 def featureSelection(self):
     """
     Selects attributes of ``self.original_data`` with GeneticSearch and
     CfsSubsetEval, then stores the outcome on the instance:
     ``selected_features``, ``num_features`` and ``data_selected``
     (the dataset returned by ``reduce_dimensionality``).
     """
     genetic_options = [
         "-Z", "1024", "-G", "20", "-C", "0.6", "-M", "0.3"]
     genetic_search = ASSearch(
         classname="weka.attributeSelection.GeneticSearch",
         options=genetic_options)
     cfs_eval = ASEvaluation(
         classname="weka.attributeSelection.CfsSubsetEval",
         options=["-P", "1", "-E", "1"])

     selector = AttributeSelection()
     selector.search(genetic_search)
     selector.evaluator(cfs_eval)
     selector.select_attributes(self.original_data)

     self.selected_features = selector.selected_attributes
     self.num_features = selector.number_attributes_selected
     reduced = selector.reduce_dimensionality(self.original_data)
     self.data_selected = reduced
Exemplo n.º 11
0
def cfs(table, cores):
    """
    Runs CfsSubsetEval with a BestFirst search on a CSV table.

    :param table: path of the CSV file to load; the last attribute is used
        as class
    :param cores: value passed for both the -P and -E evaluator options;
        may be an int or a string
    :return: the selected attribute indices as a list
    """
    loader = Loader("weka.core.converters.CSVLoader")
    anneal_data = loader.load_file(table)
    anneal_data.class_is_last()
    logger.info("Running attribute selection for: " + str(table.split("/")[-1]) + ". Please, wait a moment.")

    # Weka option arrays hold strings only; convert so an int is accepted.
    cores_opt = str(cores)
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "0", "-N", "5"])
    evaluation = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-Z", "-P", cores_opt, "-E", cores_opt])

    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    logger.info("Selected attributes: " + str(attsel.selected_attributes))
    anneal_data.delete(index=None)  # TODO: deleting instances does not work yet

    return list(attsel.selected_attributes)
def use_filter(data):
    """
    Applies the supervised AttributeSelection filter (CfsSubsetEval with a
    GreedyStepwise "-B" search) and prints the filtered dataset.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    stepwise = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    selection_filter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    selection_filter.set_property("evaluator", cfs_eval.jobject)
    selection_filter.set_property("search", stepwise.jobject)
    selection_filter.inputformat(data)
    reduced = selection_filter.filter(data)
    print(str(reduced))
def use_low_level(data):
    """
    Performs attribute selection through the AttributeSelection wrapper,
    setting evaluator and search directly on its ``jwrapper``, and prints
    the selected indices.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n3. Low-level")
    selector = AttributeSelection()
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    stepwise = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    java_selector = selector.jwrapper
    java_selector.setEvaluator(cfs_eval.jobject)
    selector.jwrapper.setSearch(stepwise.jobject)
    selector.select_attributes(data)
    index_list = selector.selected_attributes.tolist()
    print("selected attribute indices (starting with 0):\n" + str(index_list))
Exemplo n.º 14
0
def relieff(filter_data, feature_names):
    """
    Selects attributes with ReliefFAttributeEval + Ranker and maps the
    selected indices to names.

    :param filter_data: dataset handed to ``select_attributes``
    :param feature_names: sequence indexed by the selected attribute indices
    :return: list of feature names for all selected indices except the last
    """
    # Ranker search paired with the ReliefF evaluator
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    # the last option pair (-K) is the number of nearest neighbours
    relief_eval = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=["-M", "-1", "-D", "1", "-K", "10"])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(relief_eval)
    selector.select_attributes(filter_data)
    selected = selector.selected_attributes

    # the wrapper reports the class column as the final entry, so skip it
    names = []
    for index in selected[:-1]:
        names.append(feature_names[index])
    return names
Exemplo n.º 15
0
def use_filter(data):
    """
    Applies the ``wfilters.AttributeSelection`` filter (CfsSubsetEval with a
    GreedyStepwise "-B" search), prints the filtered dataset and then the
    evaluator and search configured on the filter.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    selection_filter = wfilters.AttributeSelection()
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    stepwise = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    selection_filter.evaluator = cfs_eval
    selection_filter.search = stepwise
    selection_filter.inputformat(data)
    reduced = selection_filter.filter(data)
    print(str(reduced))
    print("Evaluator:\n", selection_filter.evaluator)
    print("Search:\n", selection_filter.search)
Exemplo n.º 16
0
def feature_selection_weka(x_train, y_train, x_test, input_path, features):
    """
    Reduces train and test frames to the features chosen by Weka's
    CfsSubsetEval with a ranked GreedyStepwise search.

    The chosen columns are cached in
    ``<input_path>selected_features_weka_<features>.csv``; when that file
    already exists Weka is not invoked and the cached column names are used.

    :param x_train: training features (DataFrame)
    :param y_train: training target, written as the 'target' column of the
        temporary CSV that Weka loads
    :param x_test: features of the second dataset to reduce (DataFrame)
    :param input_path: prefix prepended verbatim to the cache file name
    :param features: percentage of the columns of ``x_train`` to keep
    :return: tuple ``(x_train_filtered, x_test_filtered)``
    """
    percent = int(x_train.shape[1] * (features / 100.0))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('Weka', exist_ok=True)

    cache_file = input_path + f'selected_features_weka_{features}.csv'
    if not os.path.exists(cache_file):
        # Drop columns that hold the same value in every row.
        x_train = x_train.loc[:, (x_train != x_train.iloc[0]).any()]
        weka_frame = x_train.copy()
        # Positional, letter-suffixed names replace the original headers
        # in the CSV handed to Weka.
        weka_frame.columns = [str(a) + "a" for a in range(weka_frame.shape[1])]
        weka_frame['target'] = y_train
        weka_frame.to_csv('Weka/train_weka_format.csv', index=False)

        from weka.attribute_selection import ASEvaluation, AttributeSelection, ASSearch
        from weka.core.converters import Loader
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file('Weka/train_weka_format.csv',
                                class_index='last')

        search = ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                          options=["-C", "-R", "-N", f"{percent}"])
        evaluator = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1", "-L"])
        attsel = AttributeSelection()
        attsel.search(search)
        attsel.evaluator(evaluator)
        attsel.select_attributes(data)
        ranked_attributes = pd.DataFrame(attsel.ranked_attributes,
                                         columns=['Feature', 'Rank'])
        ranked_attributes['Feature'] = ranked_attributes['Feature'].astype(int)
        set_of_features = ranked_attributes.loc[:percent - 1, 'Feature']

        # Select once, then both cache the frame and read its column names.
        chosen = x_train.iloc[:, set_of_features]
        chosen.to_csv(cache_file)
        selected_features = chosen.columns
    else:
        selected_features = pd.read_csv(cache_file, index_col=0).columns

    x_train_filtered = x_train.loc[:, selected_features]
    x_val_filtered = x_test.loc[:, selected_features]

    return x_train_filtered, x_val_filtered
Exemplo n.º 17
0
def information_gain(filter_data, feature_names):
    """
    Selects attributes with InfoGainAttributeEval + Ranker and maps the
    selected indices to names.

    :param filter_data: dataset handed to ``select_attributes``
    :param feature_names: sequence indexed by the selected attribute indices
    :return: list of feature names for all selected indices except the last
    """
    # -T is the score threshold, -N the number of attributes to return
    ranker_options = ["-T", "-1.7976931348623157E308", "-N", "-1"]
    ranker = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=ranker_options)
    # the evaluator is created without any options
    gain_eval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval", options=[])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(gain_eval)
    selector.select_attributes(filter_data)
    selected = selector.selected_attributes

    # the wrapper reports the class column as the final entry, so skip it
    names = []
    for index in selected[:-1]:
        names.append(feature_names[index])
    return names
Exemplo n.º 18
0
def get_search(params):
    """
    Creates a GreedyStepwise search from boolean hyper-parameters.

    Each GreedyStepwise flag switches its feature ON (-C conservative
    forward selection, -B backward search, -R ranked list), so a flag is
    emitted when the matching parameter is true. The previous code emitted
    the flags when the parameters were False, inverting their meaning.

    :param params: dict providing the keys 'conservation', 'backward' and
        'ranked'
    :return: the configured ASSearch object
    """
    flag_by_param = (
        ('conservation', "-C"),
        ('backward', "-B"),
        ('ranked', "-R"),
    )
    options = [flag for key, flag in flag_by_param if params[key]]

    return ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                    options=options)
def use_classifier(data):
    """
    Cross-validates an AttributeSelectedClassifier (J48 base learner,
    CfsSubsetEval evaluator, GreedyStepwise "-B" search) and prints the
    evaluation summary.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    meta = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    stepwise = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    j48 = Classifier(classname="weka.classifiers.trees.J48")
    # Nested option strings need carefully escaped quotes; assigning the
    # Java objects through bean properties sidesteps that.
    for prop_name, wrapper in (("classifier", j48),
                               ("evaluator", cfs_eval),
                               ("search", stepwise)):
        meta.set_property(prop_name, wrapper.jobject)
    # 10 folds, random generator seeded with 1
    cv_result = Evaluation(data)
    cv_result.crossvalidate_model(meta, data, 10, Random(1))
    print(cv_result.summary())
def all_feature(file):
    """
    Ranks the attributes of a dataset with four evaluators.

    The JVM is started for the duration of the call and is stopped even
    when loading or ranking fails.

    :param file: path of the dataset, loaded with ``converters.load_any_file``;
        the last attribute is used as class
    :return: tuple ``(chi, info_gain, gain_ratio, symmetric_uncertainty)``;
        each entry is the first column of ``ranked_attributes`` cast to int
    """
    # Order matters: it defines the order of the returned tuple.
    evaluator_classes = (
        "weka.attributeSelection.ChiSquaredAttributeEval",
        "weka.attributeSelection.InfoGainAttributeEval",
        "weka.attributeSelection.GainRatioAttributeEval",
        "weka.attributeSelection.SymmetricalUncertAttributeEval",
    )

    jvm.start(packages=True)
    try:
        data = converters.load_any_file(file)
        data.class_is_last()

        search = ASSearch(
            classname="weka.attributeSelection.Ranker",
            options=["-T", "-1.7976931348623157E308", "-N", "-1"])
        attsel = AttributeSelection()
        attsel.search(search)

        rankings = []
        for classname in evaluator_classes:
            # The same selection object is reused; only the evaluator changes.
            attsel.evaluator(ASEvaluation(classname=classname))
            attsel.select_attributes(data)
            rankings.append(attsel.ranked_attributes[:, 0].astype(int))
    finally:
        # Release the JVM on failure as well as on success.
        jvm.stop()

    chi, info_gain, gain_ratio, symmetric_uncertainty = rankings
    return chi, info_gain, gain_ratio, symmetric_uncertainty
Exemplo n.º 21
0
def get_IG(ofile_dir, loader):
	"""
	Computes InfoGain scores for every attribute of a dataset.

	When fewer than two attributes are selected, the scores are parsed from
	the textual "Ranked attributes" section of the result string and
	``ofile_dir`` is deleted afterwards; otherwise the scores are read from
	``ranked_attributes``.

	:param ofile_dir: path of the dataset to load
	:param loader: loader object providing ``load_file``
	:return: tuple ``(results, mean_score)``; ``results`` maps attribute
		name to score, ``mean_score`` is 0.0 when nothing could be parsed
	"""
	import shutil

	data = loader.load_file(ofile_dir)
	data.class_is_last()

	evaluator = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
	search = ASSearch(classname="weka.attributeSelection.Ranker", options=["-T", "-1.7976931348623157E308", "-N", "-1"])
	attsel = AttributeSelection()
	attsel.search(search)
	attsel.evaluator(evaluator)

	attsel.select_attributes(data)

	if attsel.number_attributes_selected < 2:
		results = {}
		in_ranking = False
		for line in attsel.results_string.split('\n'):
			if in_ranking:
				# The section ends at the first line with no more than
				# two space-separated pieces.
				if len(line.split(' ')) <= 2:
					break
				tokens = [tok for tok in line.split(' ') if tok != '']
				if not tokens:
					# Whitespace-only line: nothing left to parse.
					break
				# tokens: score, attribute index, then the (possibly
				# multi-word) attribute name.
				name = ' '.join(tokens[2:]).strip()
				results[name] = float(tokens[0].strip())
			if "Ranked attributes" in line:
				in_ranking = True
		# Avoid ZeroDivisionError when no ranked line was found.
		mean_score = sum(results.values()) / len(results) if results else 0.0
		# Best-effort removal without going through a shell, so paths
		# containing spaces or shell metacharacters are handled safely.
		try:
			if os.path.isdir(ofile_dir):
				shutil.rmtree(ofile_dir, ignore_errors=True)
			elif os.path.exists(ofile_dir):
				os.remove(ofile_dir)
		except OSError:
			pass
	else:
		ranked = attsel.ranked_attributes
		results = {str(data.attribute(attr[0]).name): attr[1] for attr in ranked}
		mean_score = ranked[:, 1].mean()

	return results, mean_score
Exemplo n.º 22
0
def try_params(n_instances, params, base, train, valid, test, istest):
    """
    Builds an attribute-selected classifier (ReliefF evaluator + Ranker
    search) from ``params`` and scores it.

    :param n_instances: number of instances to train on; rounded to an int
    :param params: dict with the keys 'weightByDistance', 'sampleSize',
        'numNeighbours' and 'sigma'
    :param base: wrapper of the base classifier; its ``jobject`` is handed to
        the meta-classifier
    :param train: training data
    :param valid: validation data, used when ``istest`` is false
    :param test: test data, used when ``istest`` is true
    :param istest: selects the test run instead of the train/validate run
    :return: the result of ``test_weka_classifier`` or
        ``train_and_eval_weka_classifier``
    """
    n_instances = int(round(n_instances))
    pprint(params)

    # Translate the hyper-parameters into evaluator options.
    options = []
    if params['weightByDistance'] == True:
        options.append("-W")
    options.extend([
        "-M", str(params['sampleSize']),
        "-K", str(params['numNeighbours']),
        "-A", str(params['sigma']),
    ])

    search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=options)

    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    # AttributeSelectedClassifier exposes its base learner as the bean
    # property "classifier"; "base" is not one of its properties, so the
    # supplied learner was never actually installed.
    clf.set_property("classifier", base.jobject)

    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
Exemplo n.º 23
0
def select_attribute(file):
    """
    Ranks the attributes of an ARFF file by correlation with the class,
    writes the results next to the file and records the top-ranked
    attribute indices in the module-level lists Field2/5/10/50.

    :param file: pathlib path of the ARFF file; files with another
        extension are reported and skipped
    :return: None
    """
    global Field50
    global Field10
    global Field5
    global Field2

    filename = file.parts[-1]  # Get filename from Pathlib object
    data_dir = file.parents[0]  # Data directory currently in

    print("Selecting attributes from %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    filename_base = filename[:-5]  # Removes '.arff' from filename
    data = load_Arff_file(file)  # Load data from arff
    data.class_is_first()  # Set first attr as class

    # Define attribute search
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "0.01", "-N", "-1"])
    # Define attribute evaluator
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=[])

    # Run attribute selection
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)

    # Define filepath and output results
    attsel_output = filename_base + "_attsel_results.txt"
    output_select_attribute(attsel, data_dir / attsel_output)

    # Debug analysis. Slicing (instead of indexing a fixed range) keeps this
    # from raising IndexError when fewer attributes pass the threshold than
    # a bucket asks for.
    selected = attsel.selected_attributes
    print(selected)
    Field2.extend(selected[:2])
    Field5.extend(selected[:5])
    Field10.extend(selected[:10])
    Field50.extend(selected[:50])
    print(Field2)
    print(Field5)
    print(Field10)
    print(Field50)

    if len(set(Field10)) == len(Field10):
        print("no duplicates found")
    else:
        # Duplicates are detected on Field10 only, but all buckets are
        # de-duplicated (set() does not preserve order).
        print("duplicate found")
        Field50 = list(set(Field50))
        Field10 = list(set(Field10))
        Field5 = list(set(Field5))
        Field2 = list(set(Field2))
def _train_and_report_model(loader, arff_name, model_name, title):
    """Train NaiveBayes on one ARFF file, save the model and print a 10-fold CV report."""
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    from weka.core.dataset import Instances
    import weka.core.serialization as serialization

    data = loader.load_file(os.path.join(outputModel, arff_name))
    data.class_is_last()

    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    cls.build_classifier(data)

    # Store the trained model together with the dataset header.
    serialization.write_all(
        os.path.join(outputModel, model_name + ".model"),
        [cls, Instances.template_instances(data)])

    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(title)
    print(cls)
    print(evl.summary())
    print(evl.class_details())


def createTrainedModel():
    """
    Trains, serializes and cross-validates the gender and the age model.

    Both models are NaiveBayes classifiers built from ``genderTrain.arff``
    and ``ageTrain.arff`` in ``outputModel``; the models are written to
    ``GenderModel.model`` and ``AgeModel.model`` in the same directory.
    The process is terminated with ``os._exit(0)`` afterwards, so this
    function never returns.
    """
    import sys
    from weka.core.converters import Loader

    loader = Loader(classname="weka.core.converters.ArffLoader")
    _train_and_report_model(loader, "genderTrain.arff", "GenderModel",
                            "Gender model predictions")
    _train_and_report_model(loader, "ageTrain.arff", "AgeModel",
                            "Age model predictions")

    # os._exit() skips the normal interpreter shutdown and does not flush
    # stdio buffers, so flush explicitly or the reports above may be lost.
    sys.stdout.flush()
    os._exit(0)
Exemplo n.º 25
0
# Make sure the Weka package named by chisq_name is installed; a fresh
# install only becomes usable after restarting, so stop here in that case.
chisq_installed = any(p.name == chisq_name for p in pkg.installed_packages())
if not chisq_installed:
    pkg.install_package(chisq_name)
    print("pkg %s installed, please restart" % chisq_name)
    jvm.stop()
    sys.exit(1)

data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\GWAS SNPs_2018\\random50_combo_Nonoverlap_\\"
globbed_files = glob.glob(data_dir + "*.csv")
# Run a 10-fold cross-validated CfsSubsetEval/BestFirst selection per CSV file.
for csv in globbed_files:
    data = converters.load_any_file(csv)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "10"])
    # The second option used to be "E" without its dash, so it was not
    # recognised as the -E option.
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    evl = Evaluation(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)
    print(evl)
    # write the report for each file
Exemplo n.º 26
0
# Make sure the Weka package named by chisq_name is installed. The
# stop-and-exit after a fresh install is intentionally disabled here.
chisq_installed = any(p.name == chisq_name for p in pkg.installed_packages())
if not chisq_installed:
    pkg.install_package(chisq_name)
    print("pkg %s installed, please restart" % chisq_name)
    #jvm.stop()
    #sys.exit(1)
# The literal previously contained the invalid escape "\D"; it is now spelled
# with explicit backslashes and yields exactly the same runtime string.
data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\\\Data driven model _paper [June 25_2018\\NonOverlap_featureSelection\\mRMRReport\\"
globbed_files = glob.glob(data_dir + "*.csv")
# Run a 10-fold cross-validated CfsSubsetEval/RerankingSearch selection per
# CSV file.
for csv in globbed_files:
    data = converters.load_any_file(csv)
    data.class_is_last()
    search = ASSearch(
        classname="weka.attributeSelection.RerankingSearch",
        options=["-method", "2", "-blockSize", "50", "-rankingMeasure", "0"])
    # The second option used to be "E" without its dash, so it was not
    # recognised as the -E option.
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    evl = Evaluation(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)
    print(evl)
Exemplo n.º 27
0
def main():
    """Run the password-protected training/evaluation pipeline on the fraud data.

    Starts the JVM, asks for the administrator password (three attempts),
    loads ``dataSources/fraud.arff``, prints the class counts before and after
    ``preprocess``, and then calls ``classify`` twice for every configured
    learner: first with ``True`` and then with ``False`` as third argument
    (presumably cross-validation vs. a 70% train/test split -- confirm
    against ``classify``).

    Every learner except the Hoeffding tree is wrapped in an
    ``AttributeSelectedClassifier`` that uses ``InfoGainAttributeEval`` with a
    ``Ranker`` search (options ``-N 4``).

    Returns:
        None. Returns early, before any data is loaded, when the password is
        entered incorrectly three times.
    """
    jvm.start(packages=True, max_heap_size="4g")

    print(
        "Hi! This is a protected command, please insert the password to proceed!"
    )
    # NOTE(review): the password is hard-coded in plain text; consider moving
    # it out of the source.
    max_attempts = 3
    for attempt in range(max_attempts):
        password = input('')
        if password.strip() == 'DMMLproject':
            print("All good!")
            break
        if attempt == max_attempts - 1:
            print(
                "This command is protected and can be used only by an administrator, please use another command."
            )
            return
        print("Wrong password, please provide the correct password")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("dataSources/fraud.arff")
    print("Before Preprocessing: \n")

    # NOTE(review): index 22 is presumably the class attribute of the raw
    # data; after preprocessing the class index is read from the dataset.
    classStats = data.attribute_stats(22).nominal_counts
    print("#instances(Class 0): ", classStats[0])
    print("#instances(Class 1): ", classStats[1])

    preProcessedData = preprocess(data)

    print("After Preprocessing: \n")
    classStats = preProcessedData.attribute_stats(
        preProcessedData.class_index).nominal_counts
    print("#instances(Class 0): ", classStats[0])
    print("#instances(Class 1): ", classStats[1])

    # setup classifier with attribute selection
    classifier = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval")
    assearch = ASSearch(classname="weka.attributeSelection.Ranker",
                        options=["-N", "4"])

    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)

    base1 = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    base2 = Classifier(classname="weka.classifiers.trees.RandomForest",
                       options=[
                           "-P", "70", "-I", "30", "-num-slots", "1", "-K",
                           "0", "-M", "1.0", "-S", "1", "-depth", "50"
                       ])
    base3 = Classifier(classname="weka.classifiers.trees.J48",
                       options=["-C", "0.25", "-M", "2"])
    base4 = Classifier(classname="weka.classifiers.trees.J48",
                       options=["-U", "-M", "2"])
    base5 = Classifier(classname="weka.classifiers.trees.HoeffdingTree",
                       options=[
                           "-L", "2", "-S", "1", "-E", "1.0E7", "-H", "0.05",
                           "-M", "0.01", "-G", "200.0", "-N", "0.0"
                       ])
    base6 = Classifier(classname="weka.classifiers.lazy.IBk",
                       options=['-K', '1', '-W', '0'])
    base7 = Classifier(classname="weka.classifiers.bayes.BayesNet")

    # One row per learner:
    # (banner, base learner, wrap in attribute selection?, model path)
    experiments = (
        ("----------NaiveBayes----------", base1, True,
         'models/naiveBayes.model'),
        ("----------RandomForest----------", base2, True,
         'models/randomForest.model'),
        # decision tree with pruning
        ("----------DecisionTree----------", base3, True,
         'models/prunedJ48.model'),
        # decision tree without pruning
        ("----------DecisionTreeUnpruned----------", base4, True,
         'models/unprunedJ48.model'),
        # The Hoeffding tree is passed to classify() directly, without the
        # attribute-selection wrapper.
        ("----------HoeffdingTree----------", base5, False,
         'models/HoeffdingTree.model'),
        # Both KNN runs use the KNN model path, in the same True/False order
        # as every other learner.
        ("----------KNN----------", base6, True, 'models/knn.model'),
        ("----------BayesianBelief----------", base7, True,
         'models/bayesianBelief.model'),
    )

    for banner, base, use_attribute_selection, model_path in experiments:
        print(banner)
        if use_attribute_selection:
            # The single wrapper instance is reused; only its base changes.
            classifier.set_property("classifier", base.jobject)
            model = classifier
        else:
            model = base
        # Third argument of classify(): presumably True = cross-validation,
        # False = train/test split -- confirm against classify().
        for cross_validate in (True, False):
            classify(preProcessedData,
                     model,
                     cross_validate,
                     model_path,
                     splitPerc=70,
                     randomSeed=10)
                from weka.core.converters import Loader

                loader = Loader(classname="weka.core.converters.CSVLoader")
                dataTrain = loader.load_file(path + '/' + str(Window[window]) +
                                             'd_FOLDS_train_' + str(fold) +
                                             '.csv')
                dataTest = loader.load_file(path + '/' + str(Window[window]) +
                                            'd_FOLDS_test_' + str(fold) +
                                            '.csv')
                dataTrain.class_is_last()
                dataTest.class_is_last()

                from weka.attribute_selection import AttributeSelection, ASEvaluation, ASSearch
                search = ASSearch(
                    classname="weka.attributeSelection.RerankingSearch"
                )  #,options=["-method", "2"])
                evaluator = ASEvaluation(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=['-B', 'weka.classifiers.bayes.NaiveBayes'])

                Eval = AttributeSelection(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=[
                        '-B', 'weka.classifiers.bayes.NaiveBayes', '--',
                        "-S 'weka.attributeSelection.RerankingSearch -method 2'"
                    ])

                from weka.filters import Filter

                NominalToBinary = Filter(
Exemplo n.º 29
0
    (["-F", "10", "-T", "-1", "-B", "weka.classifiers.trees.J48"], ["-D",
                                                                    "0"]),
    (["-F", "10", "-T", "-1", "-B",
      "weka.classifiers.trees.J48"], ["-D", "0", "-N", "1"]),
    (["-F", "10", "-T", "-1", "-B",
      "weka.classifiers.trees.J48"], ["-D", "1", "-N", "2"]),
    (["-F", "10", "-T", "-1", "-B",
      "weka.classifiers.trees.J48"], ["-D", "2", "-N", "2"]),
)

# Attribute selection: one WrapperSubsetEval + BestFirst run for every
# (evaluator options, search options) pair listed in ``setups``.
for evl, search in setups:
    aseval = ASEvaluation(
        options=evl,
        classname="weka.attributeSelection.WrapperSubsetEval",
    )
    assearch = ASSearch(
        options=search,
        classname="weka.attributeSelection.BestFirst",
    )
    print("\n--> Attribute selection\n")
    # Show the command line of both components before running the selection.
    print(aseval.to_commandline())
    print(assearch.to_commandline())
    attsel = AttributeSelection()
    attsel.search(assearch)
    attsel.evaluator(aseval)
    attsel.select_attributes(data)
    print(attsel.results_string)

# Cross-validation: evaluator/search pair for the cross-validated selection
# (WrapperSubsetEval around J48, combined with a BestFirst search).
aseval = ASEvaluation(
    options=["-F", "10", "-B", "weka.classifiers.trees.J48"],
    classname="weka.attributeSelection.WrapperSubsetEval",
)
assearch = ASSearch(
    options=["-D", "0", "-N", "5"],
    classname="weka.attributeSelection.BestFirst",
)
print("\n--> Attribute selection (cross-validation)\n")
    if p.name == chisq_name:
        chisq_installed = True
if not chisq_installed:
    pkg.install_package(chisq_name)
    print("pkg %s installed, please restart" % chisq_name)
    jvm.stop()
    sys.exit(1)
"""
# UNC directory holding the CSV files to process.
data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\"
globbed_files = glob.glob(data_dir + "*.csv")

# Run a 10-fold cross-validated attribute selection on every CSV file found.
for csv in globbed_files:
    data = converters.load_any_file(csv)
    # The class attribute is the last column of each file.
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch",
                      options=[
                          "-population-size", "200", "-generations", "500",
                          "-crossover-probability", "0.6"
                      ])
    # CfsSubsetEval flags: -P is the thread-pool size, -E the number of
    # threads.  Both flags need their leading dash to be parsed as options.
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    # NOTE(review): this Evaluation object is not used by any visible line of
    # the loop -- confirm whether it is still needed.
    evl = Evaluation(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)
Exemplo n.º 31
0
    (
        ["-F", "10", "-T", "-1", "-B", "weka.classifiers.trees.J48"],
        ["-D", "1", "-N", "2"]
    ),
    (
        ["-F", "10", "-T", "-1", "-B", "weka.classifiers.trees.J48"],
        ["-D", "2", "-N", "2"]
    ),
)

# Attribute selection: every entry of ``setups`` supplies the option list for
# the WrapperSubsetEval evaluator and for the BestFirst search, in that order.
for evl, search in setups:
    aseval = ASEvaluation(
        classname="weka.attributeSelection.WrapperSubsetEval",
        options=evl,
    )
    assearch = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=search,
    )
    print("\n--> Attribute selection\n")
    # Echo the exact configuration of both components.
    print(aseval.to_commandline())
    print(assearch.to_commandline())
    # Wire evaluator and search together and run the selection on ``data``.
    attsel = AttributeSelection()
    attsel.search(assearch)
    attsel.evaluator(aseval)
    attsel.select_attributes(data)
    print(attsel.results_string)

# Cross-validation: build the evaluator (WrapperSubsetEval around J48) and the
# BestFirst search used for the cross-validated attribute selection.
aseval = ASEvaluation(
    classname="weka.attributeSelection.WrapperSubsetEval",
    options=["-F", "10", "-B", "weka.classifiers.trees.J48"],
)
assearch = ASSearch(
    classname="weka.attributeSelection.BestFirst",
    options=["-D", "0", "-N", "5"],
)
print("\n--> Attribute selection (cross-validation)\n")