def Feature_Selection(infile):
    """Run CFS subset attribute selection on a CSV file and print the summary.

    The file is looked up in the current working directory. Its first
    attribute is removed, the last remaining attribute is used as the class,
    and a BestFirst search with a CfsSubsetEval evaluator is run on the result.
    The selection summary is printed to stdout.

    The JVM is started by this function and is always stopped again, even when
    loading or selection fails.

    :param infile: name of the CSV file, relative to the working directory
    :type infile: str
    :return: None
    """
    csvpath = os.getcwd() + '/' + infile

    jvm.start(packages=True, max_heap_size="4g")
    try:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("\n\n")
        print("Loaded file:  " + infile)

        csvloader = Loader(classname="weka.core.converters.CSVLoader")
        csvdata = csvloader.load_file(csvpath)

        # Drop the first attribute before selecting.
        remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                         options=["-R", " 1"])
        remover.inputformat(csvdata)
        filtered_data = remover.filter(csvdata)
        filtered_data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.BestFirst",
                          options=["-D", "1", "-N", "5"])
        evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                                 options=["-P", "1", "-E", "1"])
        attribs = AttributeSelection()
        attribs.search(search)
        attribs.evaluator(evaluator)
        attribs.select_attributes(filtered_data)

        print("Summary of Attribute Selection: ")
        print(attribs.results_string)
    finally:
        # Stop the JVM on every exit path so a failure does not leave it running.
        jvm.stop()
    return
def main():
    """
    Demonstrates subset selection and cross-validated ranking on the anneal dataset.
    """

    # locate and load the example data; the class is the last attribute
    arff_path = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + arff_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(arff_path)
    dataset.class_is_last()

    # subset selection: BestFirst search scored by CfsSubsetEval
    helper.print_title("Attribute selection")
    best_first = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "1", "-N", "5"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])
    selector = AttributeSelection()
    selector.search(best_first)
    selector.evaluator(cfs_eval)
    selector.select_attributes(dataset)

    count_text = str(selector.number_attributes_selected)
    print("# attributes: " + count_text)
    array_text = str(selector.selected_attributes)
    print("attributes (as numpy array): " + array_text)
    list_text = str(list(selector.selected_attributes))
    print("attributes (as list): " + list_text)
    print("result string:\n" + selector.results_string)

    # ranking: information gain with the Ranker search, 2-fold cross-validation
    helper.print_title("Attribute ranking (2-fold CV)")
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-N", "-1"])
    info_gain = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    selector = AttributeSelection()
    selector.ranking(True)
    selector.folds(2)
    selector.crossvalidation(True)
    selector.seed(42)
    selector.search(ranker)
    selector.evaluator(info_gain)
    selector.select_attributes(dataset)

    ranked_text = str(selector.ranked_attributes)
    print("ranked attributes:\n" + ranked_text)
    print("result string:\n" + selector.results_string)
def showAttributeRanking(self, data):
    """Rank the attributes of ``data`` by information gain and print the outcome.

    Prints the number of selected attributes, the selected attributes and the
    results string produced by the attribute selection.

    :param data: dataset passed to ``select_attributes``
    """
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    info_gain = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval")

    selector = AttributeSelection()
    selector.set_search(ranker)
    selector.set_evaluator(info_gain)
    selector.select_attributes(data)

    selected_count = selector.get_number_attributes_selected()
    print("# attributes: " + str(selected_count))
    selected = selector.get_selected_attributes()
    print("attributes: " + str(selected))
    report = selector.to_results_string()
    print("result string:\n" + report)
def cfs(table, cores):
    """Run CFS subset selection on a CSV table and return the selected attributes.

    :param table: path of the CSV file to load; its last attribute is the class
    :param cores: value passed to CfsSubsetEval for both the ``-P`` and ``-E`` options
    :return: the selected attributes as a list
    """
    csv_loader = Loader("weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(table)
    dataset.class_is_last()

    table_name = str(table.split("/")[-1])
    logger.info("Running attribute selection for: " + table_name + ". Please, wait a moment.")

    best_first = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "0", "-N", "5"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-Z", "-P", cores, "-E", cores])

    selector = AttributeSelection()
    selector.search(best_first)
    selector.evaluator(cfs_eval)
    selector.select_attributes(dataset)
    logger.info("Selected attributes: " + str(selector.selected_attributes))

    # TODO: deleting the instances does not work yet
    dataset.delete(index=None)

    return list(selector.selected_attributes)
def use_low_level(data):
    """
    Runs attribute selection by configuring the wrapped Java object directly.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n3. Low-level")

    selector = AttributeSelection()
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    stepwise = ASSearch(
        classname="weka.attributeSelection.GreedyStepwise",
        options=["-B"])

    # evaluator and search are set on the underlying Java wrapper
    java_selector = selector.jwrapper
    java_selector.setEvaluator(cfs_eval.jobject)
    selector.jwrapper.setSearch(stepwise.jobject)

    selector.select_attributes(data)
    selected = selector.selected_attributes
    index_list = selected.tolist()
    print("selected attribute indices (starting with 0):\n" + str(index_list))
def featureSelection(self):
    """Select features from ``self.original_data`` with GeneticSearch and CfsSubsetEval.

    Stores the outcome on the instance:

    - ``selected_features``: the selected attributes
    - ``num_features``: the number of selected attributes
    - ``data_selected``: ``original_data`` reduced to the selected attributes
    """
    genetic = ASSearch(
        classname="weka.attributeSelection.GeneticSearch",
        options=["-Z", "1024", "-G", "20", "-C", "0.6", "-M", "0.3"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])

    selector = AttributeSelection()
    selector.search(genetic)
    selector.evaluator(cfs_eval)
    selector.select_attributes(self.original_data)

    self.selected_features = selector.selected_attributes
    self.num_features = selector.number_attributes_selected
    reduced = selector.reduce_dimensionality(self.original_data)
    self.data_selected = reduced
def relieff(filter_data, feature_names):
    """Rank attributes with ReliefF and return the matching feature names.

    :param filter_data: dataset passed to ``select_attributes``
    :param feature_names: sequence indexed by the selected attribute indices
    :return: list of entries of ``feature_names``, one per selected attribute
             except the last one
    """
    # Ranker search; ReliefF evaluator whose -K option is the number of
    # nearest neighbours
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    relief_eval = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=["-M", "-1", "-D", "1", "-K", "10"])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(relief_eval)
    selector.select_attributes(filter_data)

    # the wrapper appends the class column index to the result, so the last
    # entry is dropped before mapping indices to names
    picked = selector.selected_attributes[:-1]
    names = []
    for index in picked:
        names.append(feature_names[index])
    return names
def feature_selection_weka(x_train, y_train, x_test, input_path, features):
    """Restrict train and test frames to a Weka-selected subset of columns.

    The selection is cached in ``<input_path>selected_features_weka_<features>.csv``.
    When that file exists its columns are reused. Otherwise constant columns
    are dropped from ``x_train``, the data is written to
    ``Weka/train_weka_format.csv``, GreedyStepwise with CfsSubsetEval is run on
    it, and the top-ranked columns are stored in the cache file.

    :param x_train: training features (pandas DataFrame)
    :param y_train: training target, written as the ``target`` column
    :param x_test: test features (pandas DataFrame)
    :param input_path: prefix the cache file name is appended to
    :param features: percentage of the original column count to keep
    :return: tuple ``(x_train_filtered, x_val_filtered)``
    """
    # the column budget is computed before constant columns are removed
    percent = int(x_train.shape[1] * (features / 100.0))

    if not os.path.exists('Weka'):
        os.mkdir('Weka')

    cache_file = input_path + f'selected_features_weka_{features}.csv'
    if os.path.exists(cache_file):
        selected_features = pd.read_csv(cache_file, index_col=0).columns
    else:
        # keep only columns that differ from the first row somewhere
        varies = (x_train != x_train.iloc[0]).any()
        x_train = x_train.loc[:, varies]

        # export with synthetic column names plus the target as last column
        export = x_train.copy()
        export.columns = [f"{position}a" for position in range(export.shape[1])]
        export['target'] = y_train
        export.to_csv('Weka/train_weka_format.csv', index=False)

        from weka.attribute_selection import ASEvaluation, AttributeSelection, ASSearch
        from weka.core.converters import Loader, Saver

        csv_loader = Loader(classname="weka.core.converters.CSVLoader")
        data = csv_loader.load_file('Weka/train_weka_format.csv', class_index='last')

        stepwise = ASSearch(
            classname="weka.attributeSelection.GreedyStepwise",
            options=["-C", "-R", "-N", f"{percent}"])
        cfs_eval = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1", "-L"])
        selector = AttributeSelection()
        selector.search(stepwise)
        selector.evaluator(cfs_eval)
        selector.select_attributes(data)

        ranking = pd.DataFrame(selector.ranked_attributes, columns=['Feature', 'Rank'])
        ranking['Feature'] = ranking['Feature'].astype(int)
        top_positions = ranking.loc[:percent - 1, 'Feature']

        chosen = x_train.iloc[:, top_positions]
        chosen.to_csv(cache_file)
        selected_features = chosen.columns

    x_train_filtered = x_train.loc[:, selected_features]
    x_val_filtered = x_test.loc[:, selected_features]
    return x_train_filtered, x_val_filtered
def information_gain(filter_data, feature_names):
    """Rank attributes by information gain and return the matching feature names.

    :param filter_data: dataset passed to ``select_attributes``
    :param feature_names: sequence indexed by the selected attribute indices
    :return: list of entries of ``feature_names``, one per selected attribute
             except the last one
    """
    # Ranker: -T is the score threshold, -N the number of attributes returned
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    # the information gain evaluator is created without options
    gain_eval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
        options=[])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(gain_eval)
    selector.select_attributes(filter_data)

    # the wrapper appends the class column index to the result, so the last
    # entry is dropped before mapping indices to names
    picked = selector.selected_attributes[:-1]
    names = []
    for index in picked:
        names.append(feature_names[index])
    return names
def all_feature(file):
    """Rank the attributes of a dataset with four single-attribute evaluators.

    The dataset is loaded with ``converters.load_any_file`` and its last
    attribute is used as the class. A Ranker search is combined in turn with
    the chi-squared, information gain, gain ratio and symmetrical uncertainty
    evaluators.

    The JVM is started by this function and is always stopped again, even when
    loading or selection fails.

    :param file: path of the dataset to load
    :return: tuple ``(chi, info_gain, gain_ratio, symmetric_uncertainty)``;
             each element is the first column of ``ranked_attributes`` cast
             to ``int``
    """
    # Order matters: it defines the order of the returned tuple.
    evaluator_classes = (
        "weka.attributeSelection.ChiSquaredAttributeEval",
        "weka.attributeSelection.InfoGainAttributeEval",
        "weka.attributeSelection.GainRatioAttributeEval",
        "weka.attributeSelection.SymmetricalUncertAttributeEval",
    )

    jvm.start(packages=True)
    try:
        data = converters.load_any_file(file)
        data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.Ranker",
                          options=["-T", "-1.7976931348623157E308", "-N", "-1"])
        attsel = AttributeSelection()
        attsel.search(search)

        # The same AttributeSelection object is reused; only the evaluator
        # is swapped between runs.
        rankings = []
        for classname in evaluator_classes:
            attsel.evaluator(ASEvaluation(classname=classname))
            attsel.select_attributes(data)
            rankings.append(attsel.ranked_attributes[:, 0].astype(int))
    finally:
        # Stop the JVM on every exit path so a failure does not leave it running.
        jvm.stop()

    chi, info_gain, gain_ratio, symmetric_uncertainty = rankings
    return chi, info_gain, gain_ratio, symmetric_uncertainty
def get_IG(ofile_dir, loader):
    """Compute information-gain scores for the attributes of a data file.

    The file is loaded with ``loader.load_file``, its last attribute is used
    as the class, and a Ranker search with InfoGainAttributeEval is run.

    When fewer than two attributes are selected, the scores are parsed out of
    the textual results string and ``ofile_dir`` is then removed with
    ``rm -r``. Otherwise the scores are read from ``ranked_attributes``.

    :param ofile_dir: path passed to ``loader.load_file``
    :param loader: object providing ``load_file``
    :return: tuple ``(results, mean_score)`` where ``results`` maps attribute
             name to score and ``mean_score`` is the mean of the scores
    """
    data = loader.load_file(ofile_dir)
    data.class_is_last()
    evaluator = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
    search = ASSearch(classname="weka.attributeSelection.Ranker", options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    results = {}
    if attsel.number_attributes_selected < 2:
        # Fallback: read the scores from the text report instead of
        # ranked_attributes.
        flag = 0  # becomes 1 once the "Ranked attributes" header line was seen
        output = attsel.results_string
        for i in output.split('\n'):
            if (flag != 0):
                if len(i.split(' '))>2:
                    # keep only the non-empty space-separated tokens of the row
                    t=[]
                    for f in i.split(' '):
                        if f!='':
                            t.append(f)
                    # tokens from position 2 onwards are joined into the name;
                    # token 0 is parsed as the score
                    r_tax = ''
                    for c in range(len(t)):
                        if c>1:
                            r_tax = r_tax+t[c]+' '
                    results.update({str(r_tax.strip()): float(t[0].strip())})
                else:
                    # first line with at most two pieces ends the ranked section
                    break
            if "Ranked attributes" in i:
                flag = 1
        # NOTE(review): raises ZeroDivisionError if no ranked rows were parsed
        mean_score = sum(results.values())/len(results.values())
        # NOTE(review): shell command built by string concatenation; a path
        # with spaces or shell metacharacters breaks or is unsafe - confirm
        # callers only pass trusted paths
        os.system("rm -r "+ofile_dir)
    else:
        # map each ranked (index, score) row to (attribute name, score)
        results = dict([(str(data.attribute(attr[0]).name), attr[1]) for attr in attsel.ranked_attributes])
        mean_score = attsel.ranked_attributes[:,1].mean()
    return results, mean_score
jvm.stop() sys.exit(1) """ data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\" globbed_files = glob.glob(data_dir + "*.csv") for csv in globbed_files: data = converters.load_any_file(csv) data.class_is_last() search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch", options=[ "-population-size", "200", "-generations", "500", "-crossover-probability", "0.6" ]) evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "1", "E", "1"]) attsel = AttributeSelection() attsel.folds(10) attsel.crossvalidation(True) attsel.seed(1) attsel.search(search) attsel.evaluator(evaluator) attsel.select_attributes(data) evl = Evaluation(data) print("# attributes: " + str(attsel.number_attributes_selected)) print("attributes: " + str(attsel.selected_attributes)) print("result string:\n" + attsel.results_string) print(evl) # write the report for each file with open(f"{csv}._report.csv", "a") as outfile: outfile.write(attsel.results_string) #with open(f"{csv}._label.txt","a") as output:
'd_FOLDS_test_' + str(fold) + '.csv') dataTrain.class_is_last() dataTest.class_is_last() from weka.attribute_selection import AttributeSelection, ASEvaluation, ASSearch search = ASSearch( classname="weka.attributeSelection.RerankingSearch" ) #,options=["-method", "2"]) evaluator = ASEvaluation( classname='weka.attributeSelection.ClassifierAttributeEval', options=['-B', 'weka.classifiers.bayes.NaiveBayes']) Eval = AttributeSelection( classname='weka.attributeSelection.ClassifierAttributeEval', options=[ '-B', 'weka.classifiers.bayes.NaiveBayes', '--', "-S 'weka.attributeSelection.RerankingSearch -method 2'" ]) from weka.filters import Filter NominalToBinary = Filter( classname= "weka.filters.unsupervised.attribute.NominalToBinary", options=["-R", "5,7,8"]) NumericToNominal = Filter( classname= "weka.filters.unsupervised.attribute.NumericToNominal") ReplaceMV = Filter( classname= "weka.filters.unsupervised.attribute.ReplaceMissingValues")
def select_attribute(file):
    """Rank the attributes of an ARFF file by correlation and collect the top ones.

    The first attribute is used as the class. Attributes are ranked with
    CorrelationAttributeEval and a Ranker search (threshold 0.01), the results
    are written next to the input file as ``<name>_attsel_results.txt``, and
    the first 2, 5, 10 and 50 selected attributes are added to the module-level
    lists ``Field2``, ``Field5``, ``Field10`` and ``Field50``. Those lists are
    then de-duplicated.

    When fewer attributes are selected than a list asks for, all available
    ones are added instead of raising ``IndexError``.

    :param file: path of the ARFF file
    :type file: pathlib.Path
    :return: None; files without an ``.arff`` suffix are reported and skipped
    """
    global Field50
    global Field10
    global Field5
    global Field2

    filename = file.parts[-1]   # Get filename from Pathlib object
    out_dir = file.parents[0]   # Data directory currently in

    print("Selecting attributes from %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    filename_base = filename[:-5]   # Removes '.arff' from filename

    data = load_Arff_file(file)     # Load data from arff
    data.class_is_first()           # Set first attr as class

    # Define Attribute selection
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "0.01", "-N", "-1"])

    # Define Attribute Evaluator
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=[])

    # Run attribution selection
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)

    # Define filepath and output results
    attsel_output = filename_base + "_attsel_results.txt"
    output_select_attribute(attsel, out_dir / attsel_output)

    # Debug Analysis
    selected = attsel.selected_attributes
    print(selected)

    # Slicing takes at most `limit` entries, so a short selection no longer
    # raises IndexError after some lists were already modified.
    for bucket, limit in ((Field2, 2), (Field5, 5), (Field10, 10), (Field50, 50)):
        bucket.extend(selected[:limit])

    print(Field2)
    print(Field5)
    print(Field10)
    print(Field50)

    if len(set(Field10)) == len(Field10):
        print("no duplicates found")
    else:
        print("duplicate found")

    # Drop duplicates accumulated across calls (order is not preserved).
    Field50 = list(set(Field50))
    Field10 = list(set(Field10))
    Field5 = list(set(Field5))
    Field2 = list(set(Field2))
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
    """
    Run the pipeline: split the data, select features, then train and
    evaluate every enabled model.

    Progress messages are appended to ``self.logs`` and the matching entries
    of ``self.stages`` are switched to ``Pipeline.RUNNING`` and
    ``Pipeline.COMPLETE`` as the run advances. Each trained classifier is
    serialized next to the input file. Model summaries are stored in
    ``self.gist.models`` and the gist is written to ``<name>.cgist``.

    The JVM is started by this method and is always stopped again, even when
    a step fails.

    :param cdat: input data object; only referenced by the disabled ARFF
                 export below
    :param heap_size: JVM maximum heap size in megabytes
    :param seed: seed for the stratified fold split; a random value in
                 [0, 1000] is used when None
    :param verbose: only referenced by the disabled ARFF export below
    """
    preprocess_codes = ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum')

    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')

    para = self.config

    self.logs.append('Reading Pipeline Configuration')

    head = ''
    name = get_rand_uuid_str()

    self.logs.append('Reading Input File')

    for i, stage in enumerate(self.stages):
        if stage.code in preprocess_codes:
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            # Directory and base name of the input file drive all output paths.
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)

    self.logs.append('Parsing to ARFF')

    path = os.path.join(head, '{name}.arff'.format(name = name))
    # NOTE: the ARFF export is disabled (original note: "This bug, I don't
    # know why, using Config.schema instead."), so nothing is written to `path`.
    # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

    for i, stage in enumerate(self.stages):
        if stage.code in preprocess_codes:
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Saved ARFF at {path}'.format(path = path))
    self.logs.append('Splitting to Training and Testing Sets')

    JVM.start(max_heap_size = '{size}m'.format(size = heap_size))
    try:
        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save = Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        # First pass runs the filter with -V added for the training set; the
        # second pass drops -V for the test set.
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        # NOTE(review): `feat` is filled below but not stored or returned.
        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                # `options` must go to the Weka wrapper constructor. It was
                # previously passed to str.format(), which ignored it, so the
                # configured options never reached Weka.
                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(
                        classname = comb.Search.NAME
                    ),
                    options = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(
                        classname = comb.Evaluator.NAME
                    ),
                    options = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label = model.LABEL
                summary.name = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # Class indices for the ROC/PRC plots, taken from the instances.
                for instance in data:
                    iclass = list(range(instance.num_classes))

                options = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)

                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                    name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary = evaluation.summary()

                frame = pd.DataFrame(data = evaluation.confusion_matrix)
                axes = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)

                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                    'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                # Each plot is rendered into an in-memory buffer and stored
                # as a base64 string.
                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # NOTE(review): the predictions computed here are not used.
                for instance in test:
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models
    finally:
        # Stop the JVM on every exit path so a failure does not leave it running.
        JVM.stop()

    JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

    self.logs.append('Pipeline Complete')

    self.set_status(Pipeline.COMPLETE)