Python AttributeSelection.AttributeSelection示例，weka.attribute_selection.AttributeSelection.AttributeSelection Python示例

示例#1

0

显示文件

def Feature_Selection(infile):
    """Run CFS subset attribute selection on a CSV file and print the summary.

    The file name is appended to the current working directory, the loaded
    data is passed through a ``Remove`` filter configured with ``-R " 1"``,
    and the last remaining attribute is set as the class.

    :param infile: name of the CSV file, relative to the current directory
    :type infile: str
    """
    directory = os.getcwd() + '/'
    csvpath = directory + infile

    jvm.start(packages=True, max_heap_size="4g")
    try:
        # Single-argument print() calls emit the same text on Python 2 and 3.
        print("\n\n")
        print("Loaded file:  " + infile)
        csvloader = Loader(classname="weka.core.converters.CSVLoader")
        csvdata = csvloader.load_file(csvpath)

        remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                         options=["-R", " 1"])
        remover.inputformat(csvdata)
        filtered_data = remover.filter(csvdata)
        filtered_data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.BestFirst",
                          options=["-D", "1", "-N", "5"])
        evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                                 options=["-P", "1", "-E", "1"])
        attribs = AttributeSelection()
        attribs.search(search)
        attribs.evaluator(evaluator)
        attribs.select_attributes(filtered_data)
        print("Summary of Attribute Selection: ")
        print(attribs.results_string)
    finally:
        # Always shut the JVM down, even when loading or selection fails.
        jvm.stop()

示例#2

0

显示文件

def main():
    """
    Demonstrates subset selection and cross-validated ranking on the anneal data.
    """

    # locate and load the anneal dataset; the last attribute becomes the class
    dataset_path = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + dataset_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(dataset_path)
    dataset.class_is_last()

    # subset selection: BestFirst search driven by CfsSubsetEval
    helper.print_title("Attribute selection")
    best_first = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "1", "-N", "5"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])
    subset_selector = AttributeSelection()
    subset_selector.search(best_first)
    subset_selector.evaluator(cfs_eval)
    subset_selector.select_attributes(dataset)

    count_text = str(subset_selector.number_attributes_selected)
    print("# attributes: " + count_text)
    array_text = str(subset_selector.selected_attributes)
    print("attributes (as numpy array): " + array_text)
    list_text = str(list(subset_selector.selected_attributes))
    print("attributes (as list): " + list_text)
    print("result string:\n" + subset_selector.results_string)

    # ranking: Ranker search with InfoGain, evaluated via 2-fold cross-validation
    helper.print_title("Attribute ranking (2-fold CV)")
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-N", "-1"])
    info_gain = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    rank_selector = AttributeSelection()
    rank_selector.ranking(True)
    rank_selector.folds(2)
    rank_selector.crossvalidation(True)
    rank_selector.seed(42)
    rank_selector.search(ranker)
    rank_selector.evaluator(info_gain)
    rank_selector.select_attributes(dataset)

    ranked_text = str(rank_selector.ranked_attributes)
    print("ranked attributes:\n" + ranked_text)
    print("result string:\n" + rank_selector.results_string)

示例#3

0

显示文件

 def showAttributeRanking(self, data):
     """Rank the attributes of ``data`` with InfoGain and print the outcome.

     :param data: dataset handed to ``select_attributes``
     """
     ranker_options = ["-T", "-1.7976931348623157E308", "-N", "-1"]
     ranker = ASSearch(
         classname="weka.attributeSelection.Ranker", options=ranker_options)
     info_gain = ASEvaluation(
         classname="weka.attributeSelection.InfoGainAttributeEval")

     selector = AttributeSelection()
     selector.set_search(ranker)
     selector.set_evaluator(info_gain)
     selector.select_attributes(data)

     # Report the count, the indices and Weka's textual summary, in that order.
     count_text = str(selector.get_number_attributes_selected())
     print("# attributes: " + count_text)
     indices_text = str(selector.get_selected_attributes())
     print("attributes: " + indices_text)
     print("result string:\n" + selector.to_results_string())

示例#4

0

显示文件

文件： analysis.py 项目： AngelChiPoot/ALUSE

def cfs(table, cores):
    """Select attributes of a CSV table using CfsSubsetEval with BestFirst search.

    :param table: path of the CSV file to load; its last attribute is the class
    :type table: str
    :param cores: value supplied for both the ``-P`` and ``-E`` evaluator
        options; may be an int or a str
    :return: the selected attribute indices reported by the selection
    :rtype: list
    """
    loader = Loader("weka.core.converters.CSVLoader")
    anneal_data = loader.load_file(table)
    anneal_data.class_is_last()
    logger.info("Running attribute selection for: %s. Please, wait a moment.",
                table.split("/")[-1])

    # Weka option lists must contain strings only, so coerce a numeric core count.
    cores_opt = str(cores)
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "0", "-N", "5"])
    evaluation = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                              options=["-Z", "-P", cores_opt, "-E", cores_opt])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    logger.info("Selected attributes: %s", attsel.selected_attributes)
    anneal_data.delete(index=None)  # TODO: deleting instances still does not work

    return list(attsel.selected_attributes)

示例#5

0

显示文件

文件： attribute_selection_test.py 项目： tulbadex/python-weka-wrapper-examples

def use_low_level(data):
    """
    Drives attribute selection through the wrapped Java object directly.

    :param data: the dataset to run the selection on
    :type data: Instances
    """
    print("\n3. Low-level")
    selector = AttributeSelection()
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    greedy_search = ASSearch(
        classname="weka.attributeSelection.GreedyStepwise",
        options=["-B"])

    # configure the underlying Java object instead of using the Python setters
    java_selector = selector.jwrapper
    java_selector.setEvaluator(cfs_eval.jobject)
    java_selector.setSearch(greedy_search.jobject)

    selector.select_attributes(data)
    chosen = selector.selected_attributes
    chosen_text = str(chosen.tolist())
    print("selected attribute indices (starting with 0):\n" + chosen_text)

示例#6

0

显示文件

 def featureSelection(self):
     """Run GeneticSearch/CfsSubsetEval selection on ``self.original_data``.

     Stores the selected indices in ``self.selected_features``, their count in
     ``self.num_features`` and the reduced dataset in ``self.data_selected``.
     """
     genetic_options = ["-Z", "1024", "-G", "20", "-C", "0.6", "-M", "0.3"]
     genetic_search = ASSearch(
         classname="weka.attributeSelection.GeneticSearch",
         options=genetic_options)
     cfs_eval = ASEvaluation(
         classname="weka.attributeSelection.CfsSubsetEval",
         options=["-P", "1", "-E", "1"])

     selector = AttributeSelection()
     selector.search(genetic_search)
     selector.evaluator(cfs_eval)
     selector.select_attributes(self.original_data)

     # Publish the results on the instance for later steps.
     self.selected_features = selector.selected_attributes
     self.num_features = selector.number_attributes_selected
     reduced = selector.reduce_dimensionality(self.original_data)
     self.data_selected = reduced

示例#7

0

显示文件

def relieff(filter_data, feature_names):
    """Return feature names in the order selected by ReliefF with a Ranker search.

    :param filter_data: dataset handed to ``select_attributes``
    :param feature_names: sequence mapping an attribute index to its name
    :return: names of the selected attributes, excluding the final entry
    :rtype: list
    """
    # Ranker set up with threshold -T and count -N as given
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    # the trailing -K value is the number of nearest neighbours
    relief_eval = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=["-M", "-1", "-D", "1", "-K", "10"])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(relief_eval)
    selector.select_attributes(filter_data)
    selected = selector.selected_attributes

    # the wrapper appends the class column index to the results, so drop the
    # final entry before translating indices to names
    names = []
    for index in selected[:-1]:
        names.append(feature_names[index])
    return names

示例#8

0

显示文件

文件： utils.py 项目： noabartal/DL-for-TSC-with-agg-features

def feature_selection_weka(x_train, y_train, x_test, input_path, features):
    """Filter train/test frames down to Weka-selected columns, caching the choice.

    The selection is stored in ``selected_features_weka_{features}.csv`` under
    ``input_path``; when that file already exists its columns are reused and
    Weka is not invoked.

    :param x_train: training features (pandas DataFrame)
    :param y_train: training targets, written as the ``target`` column
    :param x_test: test features (pandas DataFrame)
    :param input_path: prefix for the cache file path
    :param features: percentage of ``x_train`` columns to keep
    :return: ``(x_train_filtered, x_val_filtered)``
    """
    percent = int(x_train.shape[1] * (features / 100.0))
    if not os.path.exists('Weka'):
        os.mkdir('Weka')

    cache_file = input_path + f'selected_features_weka_{features}.csv'
    if os.path.exists(cache_file):
        # A previous run already stored the selection; reuse its columns.
        selected_features = pd.read_csv(cache_file, index_col=0).columns
    else:
        # Drop columns whose values all equal the first row's value.
        varying = (x_train != x_train.iloc[0]).any()
        x_train = x_train.loc[:, varying]

        # Write a copy with generated column names plus the target for Weka.
        export = x_train.copy()
        export.columns = [str(a) + "a" for a in range(export.shape[1])]
        export['target'] = y_train
        export.to_csv('Weka/train_weka_format.csv', index=False)

        from weka.attribute_selection import ASEvaluation, AttributeSelection, ASSearch
        from weka.core.converters import Loader, Saver
        csv_loader = Loader(classname="weka.core.converters.CSVLoader")
        data = csv_loader.load_file('Weka/train_weka_format.csv',
                                    class_index='last')

        greedy = ASSearch(
            classname="weka.attributeSelection.GreedyStepwise",
            options=["-C", "-R", "-N", f"{percent}"])
        cfs_eval = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1", "-L"])
        selector = AttributeSelection()
        selector.search(greedy)
        selector.evaluator(cfs_eval)
        selector.select_attributes(data)

        ranking = pd.DataFrame(selector.ranked_attributes,
                               columns=['Feature', 'Rank'])
        ranking['Feature'] = ranking['Feature'].astype(int)
        # label-based slice: keeps rows 0 .. percent - 1 inclusive
        top_indices = ranking.loc[:percent - 1, 'Feature']

        chosen = x_train.iloc[:, top_indices]
        chosen.to_csv(cache_file)
        selected_features = chosen.columns

    x_train_filtered = x_train.loc[:, selected_features]
    x_val_filtered = x_test.loc[:, selected_features]
    return x_train_filtered, x_val_filtered

示例#9

0

显示文件

def information_gain(filter_data, feature_names):
    """Return feature names in the order selected by InfoGain with a Ranker search.

    :param filter_data: dataset handed to ``select_attributes``
    :param feature_names: sequence mapping an attribute index to its name
    :return: names of the selected attributes, excluding the final entry
    :rtype: list
    """
    # -T sets the score threshold, -N how many attributes are returned
    ranker_options = ["-T", "-1.7976931348623157E308", "-N", "-1"]
    ranker = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=ranker_options)
    # the evaluator is created without any options
    gain_eval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval", options=[])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(gain_eval)
    selector.select_attributes(filter_data)
    selected = selector.selected_attributes

    # the wrapper appends the class column index to the results, so drop the
    # final entry before translating indices to names
    names = []
    for index in selected[:-1]:
        names.append(feature_names[index])
    return names

示例#10

0

显示文件

文件： feature_selection.py 项目： mo-fa/Ensemble-Feature-Selection

def all_feature(file):
    """Rank the attributes of a dataset with four single-attribute evaluators.

    Starts the JVM, loads ``file`` with the last attribute as the class, runs
    a Ranker search once per evaluator and stops the JVM again, even when an
    error occurs part-way through.

    :param file: path of the dataset passed to ``converters.load_any_file``
    :return: ``(chi, info_gain, gain_ratio, symmetric_uncertainty)``; each is
        the first column of ``ranked_attributes`` cast to ``int``
    :rtype: tuple
    """
    # Order matters: it defines the order of the returned tuple.
    evaluator_classes = (
        "weka.attributeSelection.ChiSquaredAttributeEval",
        "weka.attributeSelection.InfoGainAttributeEval",
        "weka.attributeSelection.GainRatioAttributeEval",
        "weka.attributeSelection.SymmetricalUncertAttributeEval",
    )

    jvm.start(packages=True)
    try:
        data = converters.load_any_file(file)
        data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.Ranker",
                          options=["-T", "-1.7976931348623157E308", "-N", "-1"])
        # One selection object is reused; only the evaluator is swapped.
        attsel = AttributeSelection()
        attsel.search(search)

        rankings = []
        for classname in evaluator_classes:
            attsel.evaluator(ASEvaluation(classname=classname))
            attsel.select_attributes(data)
            rankings.append(attsel.ranked_attributes[:, 0].astype(int))
    finally:
        # Do not leave the JVM running when loading or selection fails.
        jvm.stop()

    chi, info_gain, gain_ratio, symmetric_uncertainty = rankings
    return chi, info_gain, gain_ratio, symmetric_uncertainty

示例#11

0

显示文件

def get_IG(ofile_dir, loader):
    """Compute information-gain scores for the attributes of a dataset.

    :param ofile_dir: path handed to ``loader.load_file``; it is removed with
        ``rm -r`` when fewer than two attributes are selected
    :param loader: object providing ``load_file``
    :return: ``(results, mean_score)`` where ``results`` maps an attribute
        name to its score and ``mean_score`` is the mean of those scores
    """
    data = loader.load_file(ofile_dir)
    data.class_is_last()

    info_gain_eval = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
    ranker = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    attsel = AttributeSelection()
    attsel.search(ranker)
    attsel.evaluator(info_gain_eval)

    attsel.select_attributes(data)

    if not attsel.number_attributes_selected < 2:
        # Usual case: read names and scores from the ranked attribute array.
        results = {
            str(data.attribute(attr[0]).name): attr[1]
            for attr in attsel.ranked_attributes
        }
        mean_score = attsel.ranked_attributes[:, 1].mean()
        return results, mean_score

    # Fewer than two attributes selected: parse the lines that follow the
    # "Ranked attributes" header of the textual report instead.
    results = {}
    in_ranking = False
    for line in attsel.results_string.split('\n'):
        if in_ranking:
            if len(line.split(' ')) <= 2:
                break
            tokens = [tok for tok in line.split(' ') if tok != '']
            # token 0 is the score; tokens from position 2 onward form the name
            name = ''.join(tok + ' ' for tok in tokens[2:])
            results[str(name.strip())] = float(tokens[0].strip())
        if "Ranked attributes" in line:
            in_ranking = True
    mean_score = sum(results.values()) / len(results.values())
    os.system("rm -r " + ofile_dir)

    return results, mean_score

示例#12

0

显示文件

文件： EvlSearch.py 项目： JunyaZ/Paper1-A-Computational-Model-for-Identifying-Genotype-Markers-Linked-to-Phenotypic-Subgroups-in-ASD

    jvm.stop()
    sys.exit(1)
"""
# Run cross-validated attribute selection on every CSV file in data_dir and
# append each textual report to "<csv path>._report.csv".
data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\"
globbed_files = glob.glob(data_dir + "*.csv")
# Named csv_path rather than csv so the standard-library module name is not shadowed.
for csv_path in globbed_files:
    data = converters.load_any_file(csv_path)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch",
                      options=[
                          "-population-size", "200", "-generations", "500",
                          "-crossover-probability", "0.6"
                      ])
    # The evaluator flag must be spelled "-E"; a bare "E" is not an option.
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    evl = Evaluation(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)
    print(evl)
    # write the report for each file
    with open(f"{csv_path}._report.csv", "a") as outfile:
        outfile.write(attsel.results_string)
    #with open(f"{csv}._label.txt","a") as output:

示例#13

0

显示文件

文件： WekaWrapperClassificationFS.py 项目： LinoFernandes/Classification

                                            'd_FOLDS_test_' + str(fold) +
                                            '.csv')
                dataTrain.class_is_last()
                dataTest.class_is_last()

                from weka.attribute_selection import AttributeSelection, ASEvaluation, ASSearch
                search = ASSearch(
                    classname="weka.attributeSelection.RerankingSearch"
                )  #,options=["-method", "2"])
                evaluator = ASEvaluation(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=['-B', 'weka.classifiers.bayes.NaiveBayes'])

                Eval = AttributeSelection(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=[
                        '-B', 'weka.classifiers.bayes.NaiveBayes', '--',
                        "-S 'weka.attributeSelection.RerankingSearch -method 2'"
                    ])

                from weka.filters import Filter

                NominalToBinary = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NominalToBinary",
                    options=["-R", "5,7,8"])
                NumericToNominal = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal")
                ReplaceMV = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.ReplaceMissingValues")

示例#14

0

显示文件

def select_attribute(file):
    """Rank the attributes of an ARFF file by correlation and record the top ones.

    The first attribute is used as the class. The results are written next to
    the input via ``output_select_attribute`` and the leading 2, 5, 10 and 50
    selected indices are appended to the module-level lists ``Field2``,
    ``Field5``, ``Field10`` and ``Field50``. If ``Field10`` then holds
    duplicates, all four lists are de-duplicated (order is not preserved).

    :param file: path of the input file; files not ending in ``.arff`` are
        reported and skipped
    :type file: pathlib.Path
    """
    global Field50
    global Field10
    global Field5
    global Field2

    filename = file.parts[-1]  # Get filename from Pathlib object
    out_dir = file.parents[0]  # Data directory currently in

    print("Selecting attributes from %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    filename_base = filename[:-5]  # Removes '.arff' from filename
    data = load_Arff_file(file)  # Load data from arff
    data.class_is_first()  # Set first attr as class

    # Define Attribute selection
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "0.01", "-N", "-1"])
    # Define Attribute Evaluator
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=[])

    # Run attribution selection
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)

    # Define filepath and output results
    attsel_output = filename_base + "_attsel_results.txt"
    output_select_attribute(attsel, out_dir / attsel_output)

    # Debug Analysis
    selected = attsel.selected_attributes
    print(selected)
    # Slices instead of fixed-range indexing: the Ranker threshold may leave
    # fewer than 50 attributes, which used to raise IndexError.
    Field2.extend(selected[:2])
    Field5.extend(selected[:5])
    Field10.extend(selected[:10])
    Field50.extend(selected[:50])
    print(Field2)
    print(Field5)
    print(Field10)
    print(Field50)

    if len(set(Field10)) == len(Field10):
        print("no duplicates found")

    else:
        print("duplicate found")
        Field50 = list(set(Field50))
        Field10 = list(set(Field10))
        Field5 = list(set(Field5))
        Field2 = list(set(Field2))

示例#15

0

显示文件

文件： pipeline.py 项目： swarnimshukla/candis

    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        """Execute the configured pipeline: split, select features, train, evaluate.

        Progress messages are appended to ``self.logs`` and stage statuses are
        updated on ``self.stages``. For every enabled model a serialized
        ``.model`` file is written; the collected summaries are stored in
        ``self.gist.models`` and written to a ``.cgist`` file.

        :param cdat: input data object; currently unused because the ARFF
            export call is commented out
        :param heap_size: maximum JVM heap size in megabytes
        :param seed: seed for the fold split; a random value in [0, 1000] is
            passed to ``assign_if_none`` as the fallback
        :param verbose: currently unused (only referenced by commented-out code)
        """
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This bug, I don't know why, using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        # '-V' inverts the fold selection, which yields the training portion.
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                # The options belong to the Weka wrapper constructors, not to
                # str.format(), where they were silently ignored before.
                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # NOTE(review): iclass is overwritten for every instance, so only
                # the last one counts, and it stays undefined for an empty dataset.
                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))

                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)

                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)

                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # NOTE(review): the per-instance predictions are computed but not stored.
                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)