Example #1
def main():
    """
    Runs an associator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """

    parser = argparse.ArgumentParser(
        description='Executes an associator from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file")
    parser.add_argument("associator", help="associator classname, e.g., weka.associations.Apriori")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional associator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        associator = Associator(classname=parsed.associator)
        if len(parsed.option) > 0:
            associator.options = parsed.option
        loader = converters.loader_for_file(parsed.train)
        data = loader.load_file(parsed.train)
        associator.build_associations(data)
        print(str(associator))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
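
For reference, the same Apriori run can be scripted directly instead of going through argparse; a minimal sketch, assuming a nominal dataset (the path vote.arff is a placeholder):

import weka.core.jvm as jvm
from weka.associations import Associator
from weka.core.converters import load_any_file

jvm.start(packages=True)
try:
    data = load_any_file("vote.arff")  # placeholder dataset with nominal attributes
    apriori = Associator(classname="weka.associations.Apriori", options=["-N", "20"])
    apriori.build_associations(data)
    print(apriori)
finally:
    jvm.stop()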
Example #2
def main():
    """
    Runs a data generator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Executes a data generator from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("datagenerator", help="data generator classname, e.g., "
                                              + "weka.datagenerators.classifiers.classification.LED24")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional data generator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        generator = DataGenerator(classname=parsed.datagenerator)
        if len(parsed.option) > 0:
            generator.options = parsed.option
        DataGenerator.make_data(generator, parsed.option)
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
def affective_vectorizer(tweets, filename):
    '''
    Vectorizes the tweets and saves the vectors as csv.
    :param tweets: list of tweets
    :param filename: name of the saved file
    '''
    jvm.start(packages=True)
    install_package('AffectiveTweets')

    data = dataset.create_instances_from_lists([[t] for t in tweets])

    filter = Filter(
        classname=
        'weka.filters.unsupervised.attribute.TweetToLexiconFeatureVector',
        options=[
            '-F', '-D', '-R', '-A', '-T', '-L', '-N', '-P', '-J', '-H', '-Q',
            '-stemmer', 'weka.core.stemmers.NullStemmer', '-stopwords-handler',
            'weka.core.tokenizers.TweetNLPTokenizer', '-I', '1', '-U',
            '-tokenizer', 'weka.core.tokenizers.TweetNLPTokenizer'
        ])
    filter.inputformat(data)
    filtered_data = filter.filter(data)

    converters.save_any_file(filtered_data, 'data/affect-vectors/' + filename)

    jvm.stop()
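
A hypothetical call to the function above; it assumes the output directory data/affect-vectors/ exists and that the AffectiveTweets package can be installed:

tweets = [
    "Feeling great about the release :)",
    "Worst update ever, so disappointed.",
]
affective_vectorizer(tweets, "sample.csv")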
def start_search(file_name, type):
	start = time.clock() # time Preprocessing
	tile_set, characteristic, nmrClass = read_file(file_name)
	calculate_char_heuristic(tile_set, characteristic) # do before you add the place holder tiles
	tile_set = generate_placeholders(tile_set, characteristic, nmrClass) # gets place holder tiles
	# kill jvm that was started after calling read file
	# the jvm is used for the machine learning filtering 
	# so that weka can be run
	jvm.stop() 
	calculate_order_heuristic(tile_set)

	# add up heuristic from all tiles and make starting node
	heuristic_val = 0
	for tile in tile_set:
		heuristic_val += tile.heuristic_cost
		# print tile.heuristic_order_cost,
		# print tile.get_tile()
	root = Node(tile_set, [], heuristic_val, characteristic,0,0, heuristic=heuristic_val) # makes start state for search

	end = time.clock() # time Preprocessing
	print "Preprocessing Time:  " + str(end-start)
	
	# picks algorithm
	if (int(type) == 0): # uniform cost search
		best_solution, node_count = aStar([root])
		output_soultion(best_solution, node_count)
	elif (int(type) == 1): # puzzle building
		best_solution = puzzle_building_search([root]) 
def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)
    with open('ModelData/wekaHeader.arff', 'r') as header_file:
        atributos = header_file.readlines()

    with open('ModelData/predictionFiles/inst.arff', 'w') as inst_file:
        inst_file.writelines(atributos)
        inst_file.write("\n" + insta + '\n')

    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])

    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()

    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediccion = clases[int(pred)]
    jvm.stop()
    return prediccion
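
For context, serialization.read_all() above expects a model file written with its counterpart; a sketch of how such a file could be produced (the training path is a placeholder, not the author's actual setup):

import weka.core.jvm as jvm
import weka.core.serialization as serialization
from weka.core.converters import Loader
from weka.classifiers import Classifier

jvm.start()
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("ModelData/training.arff")  # placeholder training set
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(data)
    # read_all() returns the stored objects in order, classifier first
    serialization.write_all("ModelData/77PercentModelPaisajes.model", [cls])
finally:
    jvm.stop()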
Example #6
def main():
    """
    Runs an associator from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """

    parser = argparse.ArgumentParser(
        description='Executes an associator from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file")
    parser.add_argument("associator", help="associator classname, e.g., weka.associations.Apriori")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional associator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        associator = Associator(classname=parsed.associator)
        if len(parsed.option) > 0:
            associator.options = parsed.option
        loader = converters.loader_for_file(parsed.train)
        data = loader.load_file(parsed.train)
        associator.build_associations(data)
        print(str(associator))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #7
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
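
Besides percent_correct, the Evaluation object built above exposes further result views; a small sketch (to be placed inside run(), before jvm.stop()):

print(evaluationj48.summary())                   # overall statistics
print(evaluationj48.matrix("Confusion matrix"))  # textual confusion matrix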
Example #8
def main():
    data_provider = DataProvider(test_size=0.2)
    jvm.start()
    trainig_data = data_provider.get_weka_training_data()  # for weka
    try:
        weka_bayes_network = WekaBayesNetwork(trainig_data['labels'],
                                              ALL_ALGORITHMS['k2'])
        weka_bayes_network.train_bayes(trainig_data['train_set'])
        weka_bayes_network.draw_graph()
        results = weka_bayes_network.predict_and_compare(
            trainig_data['test_set'])

        # trainig_data = data_provider.get_training_data() # for simple
        # chow_liu_bayes_network = SimpleBayesNetwork(trainig_data['feature_names'], ALL_ALGORITHMS['chowLiu'])
        # chow_liu_bayes_network.train_bayes(trainig_data['x_train'], trainig_data['y_train'])
        # chow_liu_bayes_network.draw_graph()
        # results = chow_liu_bayes_network.predict_and_compare(trainig_data['x_test'], trainig_data['y_test'])

        print('correct_recurrences:', results['correct_recurrences'])
        print('correct_no_recurrences:', results['correct_no_recurrences'])
        print('incorrect_recurrences:', results['incorrect_recurrences'])
        print('incorrect_no_recurrences:', results['incorrect_no_recurrences'])
    except:
        print('Exception was caught!')
        tb = traceback.format_exc()
        print(tb)

    jvm.stop()
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")

        data.class_is_last()  # set class attribute

        # randomize data
        folds = k  # k: number of folds, assumed to be defined elsewhere in the module
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
Example #10
def Feature_Selection(infile):
    directory = os.getcwd() + '/'
    csvpath = directory + infile

    jvm.start(packages=True, max_heap_size="4g")
    print "\n\n"
    print "Loaded file: ", infile
    csvloader = Loader(classname="weka.core.converters.CSVLoader")
    csvdata = csvloader.load_file(csvpath)

    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", "1"])
    remover.inputformat(csvdata)
    filtered_data = remover.filter(csvdata)
    filtered_data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "5"])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attribs = AttributeSelection()
    attribs.search(search)
    attribs.evaluator(evaluator)
    attribs.select_attributes(filtered_data)
    print "Summary of Attribute Selection: "
    print attribs.results_string
    jvm.stop()
    return
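
The chosen attributes are also available programmatically from the AttributeSelection object; a small sketch to place before jvm.stop() in the function above (indices are 0-based):

indices = attribs.selected_attributes
print("Selected attribute indices: %s" % list(indices))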
def remove_package(pkg):
    if packages.is_installed(pkg):
        print("Removing %s..." % pkg)
        packages.uninstall_package(pkg)
        print("Removed %s, please re-run script!" % pkg)
        jvm.stop()
        sys.exit(0)
    print('No such package is installed')
Example #12
def start_up_weka(path):
    try:
        jvm.start()

        uninstall_unofficial_packages(path)
        install_packages(path)
    finally:
        jvm.stop()
Example #13
def main():
    """
    Runs a clusterer from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Performs clustering from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file")
    parser.add_argument("-T", metavar="test", dest="test", help="test set file")
    parser.add_argument("-d", metavar="outmodel", dest="outmodel", help="model output file name")
    parser.add_argument("-l", metavar="inmodel", dest="inmodel", help="model input file name")
    parser.add_argument("-p", metavar="attributes", dest="attributes", help="attribute range")
    parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds")
    parser.add_argument("-s", metavar="seed", dest="seed", help="seed value for randomization")
    parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index")
    parser.add_argument("-g", metavar="graph", dest="graph", help="graph output file (if supported)")
    parser.add_argument("clusterer", help="clusterer classname, e.g., weka.clusterers.SimpleKMeans")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional clusterer options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)
    params = []
    if parsed.train is not None:
        params.extend(["-t", parsed.train])
    if parsed.test is not None:
        params.extend(["-T", parsed.test])
    if parsed.outmodel is not None:
        params.extend(["-d", parsed.outmodel])
    if parsed.inmodel is not None:
        params.extend(["-l", parsed.inmodel])
    if parsed.attributes is not None:
        params.extend(["-p", parsed.attributes])
    if parsed.numfolds is not None:
        params.extend(["-x", parsed.numfolds])
    if parsed.seed is not None:
        params.extend(["-s", parsed.seed])
    if parsed.classindex is not None:
        params.extend(["-c", parsed.classindex])
    if parsed.graph is not None:
        params.extend(["-g", parsed.graph])

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        clusterer = Clusterer(classname=parsed.clusterer)
        if len(parsed.option) > 0:
            clusterer.options = parsed.option
        print(ClusterEvaluation.evaluate_clusterer(clusterer, params))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #14
def stop():
    """
    Stop a weka connection.

    May be called multiple times, but note that a new connection
    cannot be started after calling this.
    """
    if MODULE_SUPPORTED:
        jvm.stop()
def install_package(pkg):
    # install weka package if necessary
    if not packages.is_installed(pkg):
        print("Installing %s..." % pkg)
        packages.install_package(pkg)
        print("Installed %s, please re-run script!" % pkg)
        jvm.stop()
        sys.exit(0)
    print('Package already installed.')
Example #16
def metadata_path_start(now, args, datasets_names, queue=None):
    """
    Creates a metadata path for this run.

    :param now: current time, as a datetime.datetime object.
    :param args: args object, as generated by argparse library
    :param datasets_names: names of datasets, as a list
    :param queue: queue of processes
    """

    jvm.start()

    str_time = now.strftime('%d-%m-%Y-%H-%M-%S')

    joined = os.getcwd() if not os.path.isabs(args.metadata_path) else ''
    to_process = [args.metadata_path, str_time]

    for path in to_process:
        joined = os.path.join(joined, path)
        if not os.path.exists(joined):
            os.mkdir(joined)

    with open(os.path.join(joined, 'parameters.json'), 'w') as f:
        json.dump({k: getattr(args, k) for k in args.__dict__}, f, indent=2)

    these_paths = []
    for dataset_name in datasets_names:
        local_joined = os.path.join(joined, dataset_name)
        these_paths += [local_joined]

        if not os.path.exists(local_joined):
            os.mkdir(local_joined)
            os.mkdir(os.path.join(local_joined, 'overall'))

        y_tests = []
        class_name = None
        for n_fold in range(1, 11):
            train_data, test_data = read_datasets(
                os.path.join(args.datasets_path, dataset_name), n_fold)
            y_tests += [test_data.values(test_data.class_attribute.index)]
            class_name = train_data.class_attribute.name

        # concatenates array of y's
        pd.DataFrame(np.concatenate(y_tests),
                     columns=[class_name
                              ]).to_csv(os.path.join(local_joined, 'overall',
                                                     'y_test.txt'),
                                        index=False)

    jvm.stop()

    if queue is not None:
        queue.put(these_paths)

    return joined
def run_multi_subset_experiment(alg_names, ds_names):
    try:
        jvm.start()

        for alg_name in alg_names:
            run_subset_experiment(alg_name, ds_names)

    except Exception as e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
Example #18
 def quit(self):
     """Ask if user want to quit. If yes, close the GUI.
     """
     if tkMessageBox.askokcancel("Quit", "Do you want to quit?"):
         try:
             import weka.core.jvm as jvm
             jvm.stop()
         except:
             pass
         self._root.quit()
         self._root.destroy()
def main(argv):
    global input
    global outputModel
    global inputModel
    global outputPrediction
    global isTest
    global my_list

    input = " "
    outputModel = " "
    inputModel = " "
    outputPrediction = " "
    counter = 0
    isTest = 0
    my_list = []

    try:
        opts, args = getopt.getopt(argv, "hi:o:m:",
                                   ["ifile=", "ofile=", "mfile="])
        print len(opts)
    except getopt.GetoptError:
        sys.exit(2)
    if len(opts) == 3:
        isTest = 1
    else:
        isTest = 0

    for opt, arg in opts:
        print arg
        if len(opts) == 3:
            if opt == "-i":
                input = arg
            elif opt == "-o":
                outputPrediction = arg
            elif opt == "-m":
                inputModel = arg
        else:
            if opt == "-i":
                input = arg
            elif opt == "-o":
                outputModel = arg

    if isTest == 0:
        import weka.core.jvm as jvm
        jvm.start()
        excractFeatures()

    if isTest == 1:
        import weka.core.jvm as jvm
        jvm.start()
        excractFeatures()
        jvm.stop()
        sys.exit()
Example #20
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """

    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index, open(os.path.join(output_folder, dataset_name + '.json'), 'w'), indent=2
    )

    jvm.stop()
    warnings.filterwarnings('default')
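
A hypothetical invocation of the helper above (paths are placeholders; the output folder must already exist):

generate_folds(
    dataset_path='datasets/iris.arff',
    output_folder='folds/iris',
    n_folds=10,
    random_state=42,
)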
def start_search(file_name, type):
	tile_set, characteristic, nmrClass = read_file(file_name) # gets data from file
	tile_set = generate_placeholders(tile_set, characteristic, nmrClass) # gets place holder tiles
	jvm.stop()
	root = Node(tile_set, [], 0.0, characteristic,0,0) # makes start state for search

	# picks algorithm
	if (int(type) == 0): # uniform cost search
		best_solution, node_count = uniform_cost([root])
		output_soultion(best_solution, node_count)
	elif (int(type) == 1): # puzzle building
		best_solution = puzzle_building_search([root])
Example #22
def evaluate(request, data):
    decoded_data = unquote(data) if data else []
    parsed_keys = parse_qs(decoded_data)
    parsed_string = parsed_keys['keywords'][0]
    filename = parsed_keys['filename'][0]
    quantity = int(parsed_keys['quantity'][0])
    startdate = parsed_keys['startdate'][0]
    enddate = parsed_keys['enddate'][0]
    language = parsed_keys['language'][0]
    file_location = get_file_destinantion(filename)

    if not path.exists(file_location):
        return render(request, 'model_not_found.html')

    keywords_parsed = parsed_string.split(',')
    query = ' OR '.join(keywords_parsed)
    tweets = []

    jvm.start()

    model = load_classification_model(filename)

    tweetCriteria = manager\
        .TweetCriteria()\
        .setQuerySearch(query)\
        .setSince(startdate)\
        .setUntil(enddate)\
        .setLang(language)\
        .setMaxTweets(quantity)
    tweets = manager.TweetManager.getTweets(tweetCriteria)
    cleaned_tweets = get_cleaned_tweets(tweets)
    dataset = create_dataset(cleaned_tweets)

    predictions = evaluate_model_and_testset(model, dataset)

    jvm.stop()

    labels = model['classes']
    values = get_value_of_classes(model['classes'], predictions)
    fill_color = generate_random_colors(model['classes'])
    predictions_colors = get_predictions_colors(predictions, model['classes'], fill_color)

    data = {
        "keywords": keywords_parsed if keywords_parsed else [],
        "tweets": zip(tweets, cleaned_tweets, predictions, predictions_colors),
        "size": len(tweets),
        "labels": labels,
        "values": values,
        "fill_color": fill_color,
    }

    return render(request, 'evaluator.html', { "data": data })
Example #23
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # write a prediction for each row to the output file
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
Example #24
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    prediction = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break  # only the first instance is needed
    jvm.stop()
    return prediction
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM (a single call; system classpath, packages and heap size set together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    speed_instance = Instance.create_instance(numpy.atleast_1d(distance), classname='weka.core.DenseInstance', weight=1.0)  # wrap the distance value(s) in a 1-D array of attribute values
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example #26
def start_search(file_name, type):
    tile_set, characteristic, nmrClass = read_file(
        file_name)  # gets data from file
    tile_set = generate_placeholders(tile_set, characteristic,
                                     nmrClass)  # gets place holder tiles
    jvm.stop()
    root = Node(tile_set, [], 0.0, characteristic, 0,
                0)  # makes start state for search

    # picks algorithm
    if (int(type) == 0):  # uniform cost search
        best_solution, node_count = uniform_cost([root])
        output_soultion(best_solution, node_count)
    elif (int(type) == 1):  # puzzle building
        best_solution = puzzle_building_search([root])
Example #27
def query_instance(attributes, model="out.model"):
    """
        get the cluster for defined attributes
        :params attributes: array or list
        :returns: cluster id
    """
    jvm.start()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    jvm.stop()
    return cluster_id
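
A hypothetical call of query_instance; the attribute values must match the schema the serialized clusterer was trained on:

cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="out.model")
print(cluster_id)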
def main():
    """
    Runs attribute selection from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Performs attribute selection from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-i", metavar="input", dest="input", required=True, help="input file")
    parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index")
    parser.add_argument("-s", metavar="search", dest="search", help="search method, classname and options")
    parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds")
    parser.add_argument("-n", metavar="seed", dest="seed", help="the seed value for randomization")
    parser.add_argument("evaluator", help="evaluator classname, e.g., weka.attributeSelection.CfsSubsetEval")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional evaluator options")
    parsed = parser.parse_args()
    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)
    params = []
    if parsed.input is not None:
        params.extend(["-i", parsed.input])
    if parsed.classindex is not None:
        params.extend(["-c", parsed.classindex])
    if parsed.search is not None:
        params.extend(["-s", parsed.search])
    if parsed.numfolds is not None:
        params.extend(["-x", parsed.numfolds])
    if parsed.seed is not None:
        params.extend(["-n", parsed.seed])

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        evaluation = ASEvaluation(classname=parsed.evaluator)
        if len(parsed.option) > 0:
            evaluation.options = parsed.option
        print(AttributeSelection.attribute_selection(evaluation, params))
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
Example #29
 def dict2arff(self, fileIn, fileOut):
     '''
     :param fileIn: name of csv file
     :param fileOut: name of new arff file
     :return:
     '''
     dataIn = os.path.join(self.dataDir, fileIn)
     dataOut = os.path.join(self.dataDir, fileOut)
     logger.info('[%s] : [INFO] Starting conversion of %s to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
     try:
         jvm.start()
         convertCsvtoArff(dataIn, dataOut)
     except Exception as inst:
         logger.error('[%s] : [ERROR] Exception occurred while converting to arff with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
     finally:
         jvm.stop()
     logger.info('[%s] : [INFO] Finished conversion of %s to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
Example #30
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'  # raw string so the backslashes stay literal

    # Start JVM (a single call; system classpath, packages and heap size set together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
Example #31
def batch_riaa_checking(inputDir):

    # Start JVM (a single call; system classpath, packages and heap size set together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    riaa_ok = 0
    riaa_ko = 0

    for file in os.listdir(inputDir):
        if file.endswith(".wav"):
            riaa_flag = riaa_checker(os.path.join(inputDir, file))
            if (riaa_flag == 'riaa_ko'): riaa_ko+=1
            if (riaa_flag == 'riaa_ok'): riaa_ok+=1
    
    # Stop JVM
    jvm.stop()      
    
    return (riaa_ko, riaa_ok)
Example #32
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
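
A hypothetical call of classify with two CSV files on disk (paths are placeholders; the class attribute must be the last column):

preds, distr = classify("train.csv", "test.csv", name="RF", tuning=False)
for label, prob in zip(preds, distr):
    print(label, prob)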
Example #33
import weka.core.jvm as jvm

# The calls below show alternative ways of starting the JVM;
# only one of them should be used per process.
jvm.start()
jvm.start(system_cp=True, packages=True)
jvm.start(packages="/usr/local/lib/python2.7/dist-packages/weka")
jvm.start(max_heap_size="512m")

data_dir = "CSDMC2010_SPAM/CSDMC2010_SPAM/TRAINING"

from weka.classifiers import Classifier

cls = Classifier(classname="weka.classifiers.trees.J48")
cls.options = ["-C", "0.3"]
print(cls.options)


jvm.stop()
Example #34
def evaluate_j48(datasets_path, intermediary_path):
    # for examples on how to use this function, refer to
    # http://pythonhosted.org/python-weka-wrapper/examples.html#build-classifier-on-dataset-output-predictions
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    from weka.classifiers import Classifier
    from sklearn.metrics import precision_score, accuracy_score, f1_score

    from networkx.drawing.nx_agraph import graphviz_layout

    jvm.start()

    json_results = {
        'runs': {
            '1': dict()
        }
    }

    try:
        for dataset in os.listdir(datasets_path):
            dataset_name = dataset.split('.')[0]

            json_results['runs']['1'][dataset_name] = dict()

            loader = Loader(classname="weka.core.converters.ArffLoader")

            y_pred_all = []
            y_true_all = []
            heights = []
            n_nodes = []

            for n_fold in it.count():
                try:
                    train_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_train.arff' % (dataset_name, n_fold)))
                    val_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_val.arff' % (dataset_name, n_fold)))
                    test_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_test.arff' % (dataset_name, n_fold)))

                    train_s.relationname = dataset_name
                    val_s.relationname = dataset_name
                    test_s.relationname = dataset_name

                    train_s.class_is_last()
                    val_s.class_is_last()
                    test_s.class_is_last()

                    warnings.warn('WARNING: appending validation set in training set.')
                    for inst in val_s:
                        train_s.add_instance(inst)

                    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
                    # cls = Classifier(classname="weka.classifiers.trees.REPTree",
                    # options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1", "-I", "0.0"])
                    cls.build_classifier(train_s)

                    warnings.warn('WARNING: will only work for binary splits!')
                    graph = cls.graph.encode('ascii')
                    out = StringIO.StringIO(graph)
                    G = nx.Graph(nx.nx_pydot.read_dot(out))

                    # TODO plotting!
                    # fig = plt.figure(figsize=(40, 30))
                    # pos = graphviz_layout(G, root='N0', prog='dot')
                    #
                    # edgelist = G.edges(data=True)
                    # nodelist = G.nodes(data=True)
                    #
                    # edge_labels = {(x1, x2): v['label'] for x1, x2, v in edgelist}
                    # node_colors = {node_id: ('#98FB98' if 'shape' in _dict else '#0099FF') for node_id, _dict in nodelist}
                    # node_colors['N0'] = '#FFFFFF'
                    # node_colors = node_colors.values()
                    #
                    # nx.draw_networkx_nodes(G, pos, node_color=node_colors)
                    # nx.draw_networkx_edges(G, pos, style='dashed', arrows=False)
                    # nx.draw_networkx_labels(G, pos, {k: v['label'] for k, v in G.node.iteritems()})
                    # nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
                    # plt.axis('off')
                    # plt.show()
                    # exit(0)
                    # TODO plotting!

                    heights += [max(map(len, nx.shortest_path(G, source='N0').itervalues()))]
                    n_nodes += [len(G.node)]

                    y_test_true = []
                    y_test_pred = []

                    # y_train_true = []
                    # y_train_pred = []

                    # for index, inst in enumerate(train_s):
                    #     y_train_true += [inst.get_value(inst.class_index)]
                    #     y_train_pred += [cls.classify_instance(inst)]

                    for index, inst in enumerate(test_s):
                        y_test_true += [inst.get_value(inst.class_index)]
                        y_test_pred += [cls.classify_instance(inst)]

                    y_true_all += y_test_true
                    y_pred_all += y_test_pred

                except Exception as e:
                    break

            json_results['runs']['1'][dataset_name] = {
                'confusion_matrix': confusion_matrix(y_true_all, y_pred_all).tolist(),
                'height': heights,
                'n_nodes': n_nodes,
            }

        # interpret results
        # NOTE: this reloads results from a hard-coded path, replacing the results computed above
        json_results = json.load(open('/home/henry/Desktop/j48/j48_results.json', 'r'))

        n_runs = len(json_results['runs'].keys())
        some_run = json_results['runs'].keys()[0]
        n_datasets = len(json_results['runs'][some_run].keys())

        df = pd.DataFrame(
            columns=['run', 'dataset', 'test_acc', 'height mean', 'height std', 'n_nodes mean', 'n_nodes std'],
            index=np.arange(n_runs * n_datasets),
            dtype=np.float32
        )

        df['dataset'] = df['dataset'].astype(np.object)

        count_row = 0
        for n_run, run in json_results['runs'].iteritems():
            for dataset_name, dataset in run.iteritems():
                conf_matrix = np.array(dataset['confusion_matrix'], dtype=np.float32)

                test_acc = np.diag(conf_matrix).sum() / conf_matrix.sum()

                height_mean = np.mean(dataset['height'])
                height_std = np.std(dataset['height'])
                n_nodes_mean = np.mean(dataset['n_nodes'])
                n_nodes_std = np.std(dataset['n_nodes'])

                df.loc[count_row] = [
                    int(n_run), str(dataset_name), float(test_acc),
                    float(height_mean), float(height_std), float(n_nodes_mean), float(n_nodes_std)
                ]
                count_row += 1

        print df
        json.dump(json_results, open('j48_results.json', 'w'), indent=2)
        df.to_csv('j48_results.csv', sep=',', quotechar='\"', index=False)

    finally:
        jvm.stop()
Example #35
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # Known bug (cause unclear); using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Search.NAME,
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                ))
                ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Evaluator.NAME,
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                ))

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
Example #36
 def stop(self):
     jvm.stop()
 def __del__(self):
     #stop JVM
     print "jvm stop"
     jvm.stop() 
def main():
    """
    Specify list of files to multi_file_curve, classify, and export results as csv.
    """
    try:
        # start up a JVM to run weka on
        jvm.start(max_heap_size='512m')

        # classifiers
        naive_bayes = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
        zero_r = Classifier(classname='weka.classifiers.rules.ZeroR')
        bayes_net = Classifier(classname='weka.classifiers.bayes.BayesNet',
                               options=['-D', '-Q', 'weka.classifiers.bayes.net.search.local.K2',
                                        '--', '-P', '1', '-S', 'BAYES', '-E',
                                        'weka.classifiers.bayes.net.estimate.SimpleEstimator',
                                        '--', '-A', '0.5'])
        d_tree = Classifier(classname='weka.classifiers.trees.J48',
                            options=['-C', '0.25', '-M', '2'])

        file_list = [
            'data/aggregated_data.csv'
        ]

        name_list = [
            'multi-class'
        ]

        # classify and export
        percent_range = range(1, 101, 1)
        zero_r_curves = multi_file_curve(classifier=zero_r, classifier_name='zero_r',
                                         name_list=name_list, in_file_list=file_list,
                                         percentages=percent_range)
        naive_bayes_curves = multi_file_curve(classifier=naive_bayes, classifier_name='naive_bayes',
                                              name_list=name_list, in_file_list=file_list,
                                              percentages=percent_range)
        bayes_net_curves = multi_file_curve(classifier=bayes_net, classifier_name='bayes_net',
                                            name_list=name_list, in_file_list=file_list,
                                            percentages=percent_range)
        d_tree_curves = multi_file_curve(classifier=d_tree, classifier_name='d_tree',
                                         name_list=name_list, in_file_list=file_list,
                                         percentages=percent_range)

        # export
        csv_header = [
            'approach',
            'classifier',
            'percentage_dataset_training',
            'accuracy',
            'f_measure'
        ]
        with open('analysis/learning_curves.csv', 'wb') as f:
            csv_writer = csv.writer(f, delimiter=',')
            csv_writer.writerow(csv_header)
            for r in zero_r_curves:
                csv_writer.writerow(r)
            for r in naive_bayes_curves:
                csv_writer.writerow(r)
            for r in bayes_net_curves:
                csv_writer.writerow(r)
            for r in d_tree_curves:
                csv_writer.writerow(r)

    except RuntimeError:
        typ, value, tb = sys.exc_info()
        print typ
        print value
        print tb

        traceback.print_exc()
        pdb.post_mortem(tb)
    finally:
        jvm.stop()
Example #39
def run_classifier(path, prot, sel, cols, prot_vals, beta):
        
    DIs = dict()
    jvm.start()

    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
    
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribue, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()
        
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
    
        # count the number of each combination
        pos_and_pred = float(0.0)
        pos_and_not_pred = float(0.0)
        neg_and_pred = float(0.0)
        neg_and_not_pred = float(0.0)
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0: # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI

    jvm.stop()

    return DIs
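
For reference, writing p_pos = pos_and_pred, p_neg = pos_and_not_pred, n_pos = neg_and_pred, n_neg = neg_and_not_pred, the loop above computes

$$\mathrm{BER} = \frac{1}{2}\left(\frac{p_{neg}}{p_{pos} + p_{neg}} + \frac{n_{pos}}{n_{pos} + n_{neg}}\right), \qquad \mathrm{DI} = 1 - \frac{1 - 2\,\mathrm{BER}}{\beta + 1 - 2\,\mathrm{BER}},$$

with BER folded to 1 - BER whenever it exceeds 0.5, exactly as in the code.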
Example #40
def main():
    """
    Runs a filter from the command-line. Calls JVM start/stop automatically.
    Use -h to see all options.
    """
    parser = argparse.ArgumentParser(
        description='Executes a filter from the command-line. Calls JVM start/stop automatically.')
    parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories")
    parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m")
    parser.add_argument("-i", metavar="input1", dest="input1", required=True, help="input file 1")
    parser.add_argument("-o", metavar="output1", dest="output1", required=True, help="output file 1")
    parser.add_argument("-r", metavar="input2", dest="input2", help="input file 2")
    parser.add_argument("-s", metavar="output2", dest="output2", help="output file 2")
    parser.add_argument("-c", metavar="classindex", default="-1", dest="classindex",
                        help="1-based class attribute index")
    parser.add_argument("filter", help="filter classname, e.g., weka.filters.AllFilter")
    parser.add_argument("option", nargs=argparse.REMAINDER, help="additional filter options")
    parsed = parser.parse_args()
    if parsed.input2 is None and parsed.output2 is not None:
        raise Exception("No second input file provided ('-r ...')!")

    jars = []
    if parsed.classpath is not None:
        jars = parsed.classpath.split(os.pathsep)
    params = []
    if parsed.input1 is not None:
        params.extend(["-i", parsed.input1])
    if parsed.output1 is not None:
        params.extend(["-o", parsed.output1])
    if parsed.input2 is not None:
        params.extend(["-r", parsed.input2])
    if parsed.output2 is not None:
        params.extend(["-s", parsed.output2])
    if parsed.classindex is not None:
        params.extend(["-c", parsed.classindex])

    jvm.start(jars, max_heap_size=parsed.heap, packages=True)

    logger.debug("Commandline: " + join_options(sys.argv[1:]))

    try:
        flter = Filter(parsed.filter)
        if len(parsed.option) > 0:
            flter.options = parsed.option
        loader = Loader(classname="weka.core.converters.ArffLoader")
        in1 = loader.load_file(parsed.input1)
        cls = parsed.classindex
        if str(parsed.classindex) == "first":
            cls = "0"
        if str(parsed.classindex) == "last":
            cls = str(in1.num_attributes - 1)
        in1.class_index = int(cls)
        flter.inputformat(in1)
        out1 = flter.filter(in1)
        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(out1, parsed.output1)
        if parsed.input2 is not None:
            in2 = loader.load_file(parsed.input2)
            in2.class_index = int(cls)
            out2 = flter.filter(in2)
            saver.save_file(out2, parsed.output2)
    except Exception as e:
        print(e)
    finally:
        jvm.stop()
def testing():
    logging.disable("weka")  # note: logging.disable() expects a numeric level; passing a logger name here is questionable

    print "PROSES KLASIFIKASI\n------------------"

    jvm.start()

    pruning = 0
    while pruning < 2:

        persen_train = 0
        while persen_train < 4:

            fitur_hapus = 15
            while fitur_hapus >= 0:

                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0

                nama = "hasilTest/"
                if(pruning == 0):
                    nama += "unpruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                else:
                    nama += "pruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"

                if(fitur_hapus > 0):
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"

                f = open(nama, "w")

                if pruning == 0:
                    print("Without pruning")
                    f.write("C4.5 decision tree results without pruning (unpruned)\n")
                else:
                    print("With pruning")
                    f.write("C4.5 decision tree results with pruning\n")
                f.write("With a training set of " + str(persen) + "%\n")

                if fitur_hapus > 0:
                    f.write("Using remove on feature " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")

                f.write("No. Akurasi Recall Presisi F-Measure ROC\n")

                if persen_train == 0:
                    print "40% Data Training"
                elif persen_train == 1:
                    print "50% Data Training"
                elif persen_train == 2:
                    print "60% Data Training"
                else:
                    print "70% Data Training"

                print "Fitur yang dihapus:", fitur_hapus
                print "\nNo.\tAkurasi\tRecall\tPresisi\tF-Measure\tROC"
                while count < 100:
                    loader = Loader(classname="weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()

                    if fitur_hapus > 0:
                        # remove the selected attribute (1-based index) before training
                        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = data

                    # shuffle the instances using a time-based seed
                    randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize", options=["-S", str(int(time.time()))])
                    randomize.inputformat(data_baru)
                    data_random = randomize.filter(data_baru)
                    data_random.class_is_last()

                    if pruning == 0:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])            # unpruned tree
                    else:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25"])    # pruned tree, default confidence factor

                    # evaluate on a random train/test split with the configured training percentage
                    evaluation = Evaluation(data_random)
                    evaluation.evaluate_train_test_split(classifier, data_random, percentage=persen)

                    f.write(str(count + 1) + ". " + str(evaluation.weighted_true_positive_rate) + " " + str(evaluation.weighted_recall) + " " + str(evaluation.weighted_precision) + " " + str(evaluation.weighted_f_measure) + " " + str(evaluation.weighted_area_under_roc) + "\n")
                    print(count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc)

                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)

                    count += 1
                    time.sleep(1)   # ensure the time-based Randomize seed differs between runs

                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()

                f.write( ""  + "\n")
                f.write( "Rata-Rata"  + "\n")
                f.write( "Akurasi:" + str(sum(list_akurasi) / 100.0)  + "\n")
                f.write( "Recall:" + str(sum(list_recall) / 100.0)  + "\n")
                f.write( "Presisi:" + str(sum(list_presisi) / 100.0)  + "\n")
                f.write( "F-Measure:" + str(sum(list_fmeasure) / 100.0)  + "\n")
                f.write( "ROC:" + str(sum(list_roc) / 100.0)  + "\n")
                f.write( ""  + "\n")
                f.write( "Max"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[-1] ) + "\n")
                f.write( "Recall:" + str(list_recall[-1] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[-1] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[-1] ) + "\n")
                f.write( "ROC:" + str(list_roc[-1] ) + "\n")
                f.write( ""  + "\n")
                f.write( "Min"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[0] ) + "\n")
                f.write( "Recall:" + str(list_recall[0] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[0] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[0] ) + "\n")
                f.write( "ROC:" + str(list_roc[0] ) + "\n")
                f.write( ""  + "\n")

                print ""
                print "Rata-Rata"
                print "Akurasi:", sum(list_akurasi) / 100.0
                print "Recall:", sum(list_recall) / 100.0
                print "Presisi:", sum(list_presisi) / 100.0
                print "F-Measure:", sum(list_fmeasure) / 100.0
                print "ROC:", sum(list_roc) / 100.0
                print ""
                print "Max"
                print "Akurasi:", list_akurasi[-1]
                print "Recall:", list_recall[-1]
                print "Presisi:", list_presisi[-1]
                print "F-Measure:", list_fmeasure[-1]
                print "ROC:", list_roc[-1]
                print ""
                print "Min"
                print "Akurasi:", list_akurasi[0]
                print "Recall:", list_recall[0]
                print "Presisi:", list_presisi[0]
                print "F-Measure:", list_fmeasure[0]
                print "ROC:", list_roc[0]
                print ""

                f.close()
                fitur_hapus -= 1

            persen_train += 1

        pruning += 1

    jvm.stop()
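
# A minimal illustrative sketch (not part of the example above) of a single
# iteration of the loop in testing(): one percentage-split evaluation of J48.
# The dataset name "hasil.arff" is taken from the example; the function name and
# parameters are hypothetical, and the JVM is assumed to be running already.
def single_split_evaluation(percentage=70, pruned=True):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("hasil.arff")
    data.class_is_last()
    options = ["-C", "0.25"] if pruned else ["-U"]   # J48 options as used in testing()
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=options)
    evaluation = Evaluation(data)
    evaluation.evaluate_train_test_split(classifier, data, percentage=percentage)
    print(evaluation.weighted_recall, evaluation.weighted_precision,
          evaluation.weighted_f_measure, evaluation.weighted_area_under_roc)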