def main(): """ Runs a datagenerator from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description='Executes a data generator from the command-line. Calls JVM start/stop automatically.') parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("datagenerator", help="data generator classname, e.g., " + "weka.datagenerators.classifiers.classification.LED24") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional data generator options") parsed = parser.parse_args() jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: generator = DataGenerator(classname=parsed.datagenerator) if len(parsed.option) > 0: generator.options = parsed.option DataGenerator.make_data(generator, parsed.option) except Exception, e: print(e)
def read_file(file_name):
    tile_set_list = []
    characteristic = []
    jvm.start()
    nmrClass = Classifier(jobject=serialization.read("models/lmt_3sd.model"))
    with open(file_name) as f:  # opens file
        # reads in the characteristic protein sequence and converts it to expected chemical shift values
        tile_characteristic = f.readline()
        characteristic = re.findall(r'\b[A-Za-z]{3,4}\b', tile_characteristic)
        characteristic = letters_to_numbers(characteristic)
        for line in f:  # reads in NMR data
            # reads each line and grabs the numbers and "na" entries
            # file format: "a b c d"
            a, b, c, d = re.findall(r'\b\d+\.\d*\b|\bna\b', line)
            # dealing with missing data
            if a == "na":
                a = -1
            if b == "na":
                b = -1
            if c == "na":
                c = -1
            if d == "na":
                d = -1
            # adds a new Tile to tile_set_list
            if not (a == -1 and b == -1 and c == -1 and d == -1):
                tile_set_list.append(Tile(a, b, c, d, nmrClass))
    return tile_set_list, characteristic, nmrClass
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    :param dataf: -> data to be clustered
    :param options: -> SimpleKMeans options
                    N -> number of clusters
                    A -> distance function to use (default is "weka.core.EuclideanDistance -R first-last")
                    I -> maximum number of iterations (default 500)
                    num-slots -> number of execution slots, 1 means no parallelism
                    S -> random number seed (default 10)
                    example => ["-N", "10", "-S", "10"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp=temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        print(traceback.format_exc())
def runclustermodel(self, model, method, dataf, temp=True):
    anomalies = []
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        cluster = self.loadClusterModel(model, method)
        clusterMembership = []
        print(cluster.number_of_clusters)
        for inst in data:
            try:
                cl = cluster.cluster_instance(inst)
            except Exception:
                logger.error('[%s] : [ERROR] Mismatch model and data attributes',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
                continue
            dist = cluster.distribution_for_instance(inst)
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
            clusterMembership.append(cl)
        # print data.attribute_by_name('key')
        # print data.num_instances
        # print data.get_instance(3)
        pa = self.calcThreashold(dict(Counter(clusterMembership)), 21)
        if pa == 0:
            logger.warning('[%s] : [WARN] Most instances are computed as anomalies, possible error encountered!',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
            print("Most instances are computed as anomalies, possible error encountered!")
        else:
            for a in pa:
                # print data.get_instance(a).get_value(0)  # todo: always set key as first attribute
                anomalies.append(data.get_instance(a).get_value(0))
            print("Detected using %s anomalies at timestamp(s) %s" % (model, str(anomalies)))
    except Exception:
        print(traceback.format_exc())
    return anomalies
def main(): """ Runs a associator from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description='Executes an associator from the command-line. Calls JVM start/stop automatically.') parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file") parser.add_argument("associator", help="associator classname, e.g., weka.associations.Apriori") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional associator options") parsed = parser.parse_args() jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: associator = Associator(classname=parsed.associator) if len(parsed.option) > 0: associator.options = parsed.option loader = converters.loader_for_file(parsed.train) data = loader.load_file(parsed.train) associator.build_associations(data) print(str(associator)) except Exception, e: print(e)
def main(): """ Runs a clusterer from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description='Performs clustering from the command-line. Calls JVM start/stop automatically.') parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file") parser.add_argument("-T", metavar="test", dest="test", help="test set file") parser.add_argument("-d", metavar="outmodel", dest="outmodel", help="model output file name") parser.add_argument("-l", metavar="inmodel", dest="inmodel", help="model input file name") parser.add_argument("-p", metavar="attributes", dest="attributes", help="attribute range") parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds") parser.add_argument("-s", metavar="seed", dest="seed", help="seed value for randomization") parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index") parser.add_argument("-g", metavar="graph", dest="graph", help="graph output file (if supported)") parser.add_argument("clusterer", help="clusterer classname, e.g., weka.clusterers.SimpleKMeans") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional clusterer options") parsed = parser.parse_args() jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) params = [] if parsed.train is not None: params.extend(["-t", parsed.train]) if parsed.test is not None: params.extend(["-T", parsed.test]) if parsed.outmodel is not None: params.extend(["-d", parsed.outmodel]) if parsed.inmodel is not None: params.extend(["-l", parsed.inmodel]) if parsed.attributes is not None: params.extend(["-p", parsed.attributes]) if parsed.numfolds is not None: params.extend(["-x", parsed.numfolds]) if parsed.seed is not None: params.extend(["-s", parsed.seed]) if parsed.classindex is not None: params.extend(["-c", parsed.classindex]) if parsed.graph is not None: params.extend(["-g", parsed.graph]) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: clusterer = Clusterer(classname=parsed.clusterer) if len(parsed.option) > 0: clusterer.options = parsed.option print(ClusterEvaluation.evaluate_clusterer(clusterer, params)) except Exception as e: print(e) finally: jvm.stop()
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None): """ Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>. :type dataset_path: str :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff) :type output_folder: str :param output_folder: Path to store both index file with folds and fold files. :type n_folds: int :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10. :type random_state: int :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed). """ import warnings warnings.filterwarnings('error') dataset_name = dataset_path.split('/')[-1].split('.')[0] af = load_arff(dataset_path) df = load_dataframe(af) skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state) fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]]) fold_index = dict() jvm.start() csv_loader = Loader(classname="weka.core.converters.CSVLoader") arff_saver = Saver(classname='weka.core.converters.ArffSaver') for i, (arg_rest, arg_test) in enumerate(fold_iter): fold_index[i] = list(arg_test) _temp_path = 'temp_%s_%d.csv' % (dataset_name, i) fold_data = df.loc[arg_test] # type: pd.DataFrame fold_data.to_csv(_temp_path, sep=',', index=False) java_arff_dataset = csv_loader.load_file(_temp_path) java_arff_dataset.relationname = af['relation'] java_arff_dataset.class_is_last() arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i))) os.remove(_temp_path) json.dump( fold_index, open(os.path.join(output_folder, dataset_name + '.json'), 'w'), indent=2 ) jvm.stop() warnings.filterwarnings('default')
def run(arff_path, model_out):
    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arff_path)
    data.class_is_last()

    cls = Logistic()
    cls.build_classifier(data)
    cls.save_model(model_out)

    coefficients = cls.coefficients
    for coeff in coefficients:
        print(str(coeff))

    return coefficients
def start(*args, **kwargs): """ Open a weka connection. May be called multiple times, but not after calling stop(). Arguments: *args, **kwargs: Any additional arguments to pass to jvm.start(). """ if MODULE_SUPPORTED: jvm.start(*args, **kwargs)
def assign_classify(file_location, output="classified.out", model="naivebayes.model"): data = read_csv_file(file_location) jvm.start() # load clusters obj = serialization.read(model) classifier = Classifier(jobject=obj) # create file with cluster group with open(output, 'w') as cluster_file: for index, attrs in enumerate(data): inst = Instance.create_instance(attrs[1:]) pred = classifier.classify_instance(inst) print(str(index + 1) + ": label index=" + str(pred)) jvm.stop()
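# Hedged note on assign_classify() above: classify_instance() needs the instance to carry a
# dataset header so the class attribute is defined. A sketch of attaching one, assuming a
# header ARFF matching the training data exists (the file name here is hypothetical):
def _instance_with_header(attrs):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    header = loader.load_file("naivebayes_header.arff")  # hypothetical header file
    header.class_is_last()
    inst = Instance.create_instance(attrs)
    inst.dataset = header  # same pattern as the playback/riaa checker snippets below
    return inst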
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    try:
        for index, inst in enumerate(data):
            pred = classifier.classify_instance(inst)
            dist = classifier.distribution_for_instance(inst)
            return int(pred)
    finally:
        # stop the JVM before handing back the prediction
        jvm.stop()
def run(self): jvm.start() # start Java VM for WEKA self.show_banner() # call function to draw the banner on the UI self.generate_csv( ) # call function to generate the csv file which will contain the url which will be tested self.generate_arff( ) # call function which will use another class to extract features from the url and save to a .arff file self.show_banner() # call function to draw the banner on the UI self.weka_predict( ) # call function to predict whether the url is "phishy" or not via the generated .arff file jvm.stop() return self.prediction # return the predicted result back to MainHandler
def main(): """ Runs a associator from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description= 'Executes an associator from the command-line. Calls JVM start/stop automatically.' ) parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("-t", metavar="train", dest="train", required=True, help="training set file") parser.add_argument( "associator", help="associator classname, e.g., weka.associations.Apriori") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional associator options") parsed = parser.parse_args() jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: associator = Associator(classname=parsed.associator) if len(parsed.option) > 0: associator.options = parsed.option loader = converters.loader_for_file(parsed.train) data = loader.load_file(parsed.train) associator.build_associations(data) print(str(associator)) except Exception, e: print(e)
def __init__(self, classname="weka.classifiers.functions.SMO", options='default'): """Constructor. Parameters ---------- classname : string, optional, default = 'weka.classifiers.functions.SMO' Classifier initialized as default. options : string, optional, default = 'default' Classifier options initialized as default. Use the string 'default' to default options. """ if not jvm.started: jvm.start() self.classname = Config("ClassName", classname, str) self.options = Config("Options", options, str) self.reset()
def main(): try: jvm.start() loader = Loader(classname="weka.core.converters.CSVLoader") data = loader.load_file("./data/adult.csv") data.class_is_last() # set class attribute folds = k learning_curve(k, data) except Exception as e: raise e finally: jvm.stop()
def query_instance(attributes, model="out.model"):
    """
    get the cluster for defined attributes
    :params attributes: array or list
    :returns: cluster id
    """
    jvm.start()
    # create instance (create_instance builds a DenseInstance from the attribute values)
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    jvm.stop()
    return cluster_id
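# Hedged usage sketch for query_instance() above; the attribute values and the model path are
# illustrative and must match the attributes the clusterer in "out.model" was built on.
def _query_instance_example():
    cluster_id = query_instance([1.0, 2.5, 0.0], model="out.model")
    print("instance falls into cluster " + str(cluster_id))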
def main(): try: jvm.start() loader = Loader(classname="weka.core.converters.CSVLoader") data = loader.load_file("./data/adult.csv") # load training data data.class_is_last() # set class attribute NaiveBayes(data) DecisionTree(data) except Exception as e: raise e finally: jvm.stop()
def weka_algorithm(algorithm, type, minsup, minconf): # Weka worker start jvm.start() loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(arr_file) associator = Associator(classname=f"weka.associations.{algorithm}", options=["-M", minsup, "-C", minconf, "-N", "100"]) associator.build_associations(data) with open(f"results/{type}_weka_{algorithm}.output", "w") as fw: fw.write(associator.__str__()) fw.close() # Weka worker end jvm.stop()
def TestClassification(arff, modelInput, results, sampleName):
    # start the Java VM
    jvm.start()
    # load the analysis model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # analyse the results
    resultsFile = open(results, "w")
    if sampleName:
        resultsFile.write("Sample ID\tOriginal\tPredicted\tBenign probability\tMalignant probability\n")
        print("Sample ID\tOriginal\tPredicted\tBenign probability\tMalignant probability")
        sampleNameListFile = open(getAbsPath() + "/temp.txt", "r")
        sampleNameList = []
        for snlf in sampleNameListFile:
            sampleNameList.append(snlf.split("\n")[0])
        sampleNameListFile.close()
    else:
        resultsFile.write("Index\tOriginal\tPredicted\tBenign probability\tMalignant probability\n")
        print("Index\tOriginal\tPredicted\tBenign probability\tMalignant probability")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        if sampleName:
            sampleID = sampleNameList[index]
        else:
            sampleID = str(index + 1)
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred != inst.get_value(inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write("%s\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%s\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # shut down the Java VM
    jvm.stop()
    print("Testing finished")
def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None): BustersAgent.__init__(self, index, inference, ghostAgents) self.previousDistances = [0,0,0,0] jvm.start(max_heap_size="512m") self.loader = Loader(classname="weka.core.converters.ArffLoader") self.data = self.loader.load_file("data/game_toCluster.arff") self.data.delete_last_attribute() self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"]) self.clusterer.build_clusterer(self.data) self.inst = "" self.data = self.loader.load_file("data/game_toCluster.arff") addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"]) addCluster.inputformat(self.data) filtered = addCluster.filter(self.data) self.f = open('data/addCluster.arff', 'w+') self.f.write(str(filtered)) self.clustered_data = self.classifyData('data/addCluster.arff')
def index(): if request.method == "GET": return render_template('bot.html') if request.method == "POST": # jvm.stop() jvm.start() f = open("instances.arff", "a") args = request.form.to_dict() weight_lb = float(args['weight']) * 2.20462 bmi = (weight_lb / pow(float(args['height']), 2)) * 703 hypertensive_status = args['hypertensive_status'] heart_disease_status = args['heart_disease_status'] if heart_disease_status == "Yes": heart_disease_status = '1' else: heart_disease_status = '0' if hypertensive_status == "Yes": hypertensive_status = '1' else: hypertensive_status = '0' st = "\n"+args['gender']+","+args['age']+","+hypertensive_status+","+heart_disease_status+","+args['marrital_status'] + \ ","+args['work_type']+","+args['residence']+"," + \ args['hypertension']+","+str(bmi)+",'"+args['smoking_status'].lower()+"',?" print(st) f.write(st) f.close() objects = serialization.read_all("J48.model") loader = Loader(classname="weka.core.converters.ArffLoader") csr = Classifier(jobject=objects[0]) output_results = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.CSV") data1 = loader.load_file("instances.arff") data1.class_is_last() ev2 = Evaluation(data1) ev2.test_model(csr, data1, output_results) TESTDATA = StringIO("Instance,Actual,Predicted," + output_results.buffer_content()) df = pd.read_csv(TESTDATA) prediction = list(df.Predicted).pop().split(":")[1] print(prediction) # jvm.stop() response = {"status": "200", "prediction": prediction} return Response(json.dumps(response, indent=2), mimetype="application/json")
def functionProcessamento(self, ca1_r, ca1_l, ca2_ca3_r, ca2_ca3_l, sub_r, sub_l, sexo, id):
    jvm.start()
    path = os.path.dirname(os.path.abspath(__file__))
    # TODO: check the individual's sex so the correct model is loaded
    modelo = path + "\\naive_bayes_feminino_novo.model"
    if (sexo == "Male"):
        print("Individual is male")
        modelo = path + "\\naive_bayes_feminino_novo.model"
    objects = serialization.read_all(modelo)
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    arquivo = open(path + "\\novo_individuo.arff", "w")
    conteudo = list()
    conteudo.append("@relation alzheimer \n\n")
    conteudo.append("@attribute doente {SIM, NAO} \n")
    conteudo.append("@attribute ca1_right real \n")
    conteudo.append("@attribute ca1_left real \n")
    conteudo.append("@attribute ca2_ca3_right real\n")
    conteudo.append("@attribute ca2_ca3_left real \n")
    conteudo.append("@attribute subic_right real \n")
    conteudo.append("@attribute subic_left real \n\n")
    conteudo.append("@data \n")
    # the individual's values are written here
    conteudo.append("SIM," + str(ca1_r) + "," + str(ca1_l) + "," + str(ca2_ca3_r) + "," + str(ca2_ca3_l) + ","
                    + str(sub_r) + "," + str(sub_l))
    print(conteudo)
    arquivo.writelines(conteudo)
    arquivo.close()
    data = loader.load_file(path + "\\novo_individuo.arff")
    data.class_is_last()
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        pc_doenca = round(((pred) * 100), 2)
        pc_saudavel = round(((100 - pc_doenca)), 2)
        print(" Alzheimer's probability=" + str(pc_doenca) + "%, healthy probability=" + str(pc_saudavel) + "%")
        alzheimer = Alzheimer.objects.get(id=id)
        alzheimer.resultado_ad = pc_doenca
        alzheimer.resultado_cn = pc_saudavel
        alzheimer.status_seg = 2
        alzheimer.save()
    jvm.stop()
def main(args=None): """ Runs a datagenerator from the command-line. Calls JVM start/stop automatically. Use -h to see all options. :param args: the command-line arguments to use, uses sys.argv if None :type args: list """ parser = argparse.ArgumentParser( description= 'Executes a data generator from the command-line. Calls JVM start/stop automatically.' ) parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("datagenerator", help="data generator classname, e.g., " + "weka.datagenerators.classifiers.classification.LED24") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional data generator options") parsed = parser.parse_args(args=args) jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: generator = DataGenerator(classname=parsed.datagenerator) if len(parsed.option) > 0: generator.options = parsed.option DataGenerator.make_data(generator, parsed.option) except Exception: print(traceback.format_exc()) finally: jvm.stop()
def main(): """ Runs attribute selection from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description='Performs attribute selection from the command-line. Calls JVM start/stop automatically.') parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument("-i", metavar="input", dest="input", required=True, help="input file") parser.add_argument("-c", metavar="class index", dest="classindex", help="1-based class attribute index") parser.add_argument("-s", metavar="search", dest="search", help="search method, classname and options") parser.add_argument("-x", metavar="num folds", dest="numfolds", help="number of folds") parser.add_argument("-n", metavar="seed", dest="seed", help="the seed value for randomization") parser.add_argument("evaluator", help="evaluator classname, e.g., weka.attributeSelection.CfsSubsetEval") parser.add_argument("option", nargs=argparse.REMAINDER, help="additional evaluator options") parsed = parser.parse_args() jars = [] if parsed.classpath is not None: jars = parsed.classpath.split(os.pathsep) params = [] if parsed.input is not None: params.extend(["-i", parsed.input]) if parsed.classindex is not None: params.extend(["-c", parsed.classindex]) if parsed.search is not None: params.extend(["-s", parsed.search]) if parsed.numfolds is not None: params.extend(["-x", parsed.numfolds]) if parsed.seed is not None: params.extend(["-n", parsed.seed]) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + join_options(sys.argv[1:])) try: evaluation = ASEvaluation(classname=parsed.evaluator) if len(parsed.option) > 0: evaluation.options = parsed.option print(AttributeSelection.attribute_selection(evaluation, params)) except Exception as e: print(e) finally: jvm.stop()
def dict2arff(self, fileIn, fileOut):
    '''
    :param fileIn: name of csv file
    :param fileOut: name of new arff file
    :return:
    '''
    dataIn = os.path.join(self.dataDir, fileIn)
    dataOut = os.path.join(self.dataDir, fileOut)
    logger.info('[%s] : [INFO] Starting conversion of %s to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
    try:
        jvm.start()
        convertCsvtoArff(dataIn, dataOut)
    except Exception as inst:
        # log the failure instead of silently swallowing it
        logger.error('[%s] : [ERROR] Exception occured while converting to arff with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
    finally:
        jvm.stop()
    logger.info('[%s] : [INFO] Finished conversion of %s to %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), dataIn, dataOut)
def SimpleLogistic(obj):
    jvm.start(packages=True)
    # TODO: the First_trial_classification.arff / First_trial_regression.arff datasets contain a Date
    #       feature; update everything in the new_models folder to the latest data
    # TODO: write code that generates new_models/input_classification.arff from the contents of obj

    # load model
    cls = Classifier(jobject=serialization.read("new_models/SimpleLogistic.model"))
    loader = Loader(classname="weka.core.converters.ArffLoader")
    # classification task, so the predicted class is nominal and cannot be fetched as a numpy array
    data = loader.load_file("new_models/input_classification.arff")
    data.class_is_last()

    for index, inst in enumerate(data):
        audience_class = cls.classify_instance(inst)

    jvm.stop()
    return audience_class + 1  # +1 because the class index is 0-based
def predicaoCluster(matricula, curso, tipo_predicao):
    dados = retornarDadosCurso(curso)
    # select the student's attributes
    aluno = dados.loc[dados['MATRICULA'] == matricula][:]
    aluno.drop('MATRICULA', axis=1, inplace=True)
    aluno.drop('APROVADO', axis=1, inplace=True)
    aluno.drop('COD_DISCIPLINA', axis=1, inplace=True)
    aluno.drop('SIT_MATRICULA', axis=1, inplace=True)
    aluno = aluno.head(1)
    aluno.to_csv('aluno_temp.csv', index=False)

    from weka.clusterers import Clusterer
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    import weka.core.serialization as serialization

    jvm.start()
    if curso == 'si':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_si_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_si_evasao.model")
    elif curso == 'eca':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_eca_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_eca_evasao.model")

    cluster = Clusterer(jobject=model[0])
    loader = Loader(classname="weka.core.converters.CSVLoader")
    dado_aluno = loader.load_file("aluno_temp.csv")
    for aluno in dado_aluno:
        cluster_aluno_pertence = cluster.cluster_instance(aluno)
    # jvm.stop()
    caracteristica = retornarCaracteristicaCluster(curso, tipo_predicao, cluster_aluno_pertence)
    return caracteristica
def extract_features():
    jvm.start()
    # load the request data
    content = request.json
    # build the path to the selected sound
    path = './src/sounds/test/' + content['file']
    # extract and store the features
    features = Features.extract_feature_sound(path)
    # drop the last position (the class)
    features.pop(6)
    # append a value in the last position to keep the record layout compatible
    features.append(0)
    # store the parameters passed in the request - learning rate and training time
    settings = content['settings']
    # store the classification result
    classification = NeuralNetwork.perceptron_classifier(features, settings)
    # store all extracted features
    all_features = [{
        'title': "Zero Crossing",
        'value': features[0]
    }, {
        'title': "Spectral Centroid",
        'value': features[1]
    }, {
        'title': "Spectral Rolloff",
        'value': features[2]
    }, {
        'title': "Mel Spectrogram",
        'value': features[3]
    }, {
        'title': "MFCC",
        'value': features[4]
    }, {
        'title': "Chroma STFT",
        'value': features[5]
    }]
    jvm.stop()
    # return the classification result and the extracted features
    return jsonify({
        'result': classification,
        'features': all_features,
        'status': 200
    }), 200
def associateRule(request): jvm.start() data_dir = os.path.dirname(os.path.abspath(__file__)) data = converters.load_any_file(data_dir + "/templates/upload_files/export.csv") data.class_is_last() associator = Associator(classname="weka.associations.Apriori", options=["-C", "-1", "-I"]) # associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"]) associator.build_associations(data) rules = str(associator) jvm.stop() return HttpResponse(rules)
def run(): jvm.start() load_csv = Loader("weka.core.converters.CSVLoader") data_csv = load_csv.load_file( "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv" ) saver = Saver("weka.core.converters.ArffSaver") saver.save_file( data_csv, "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff" ) load_arff = Loader("weka.core.converters.ArffLoader") data_arff = load_arff.load_file( "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff" ) data_arff.class_is_last() cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"]) cls.build_classifier(data_arff) for index, inst in enumerate(data_arff): pred = cls.classify_instance(inst) dist = cls.distribution_for_instance(inst) # save tree prune in txt file saveFile = open( "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt", "w") saveFile.write(str(cls)) # print(cls) saveFile.close() global j48 J48_class = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"]) J48_class.build_classifier(data_arff) evaluationj48 = Evaluation(data_arff) evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100)) j48 = str(evaluationj48.percent_correct) jvm.stop() return j48
def detectarSpam(tuitsConDatos, modeloFilename):
    """
    #
    # @tuitsConDatos : list of status dictionaries with the keys
    #                  tweetText, tweet_id, favorite_count and retweet_count
    #
    # @return predicciones : list of predictions, one per input tweet.
    #                        Each prediction is a dictionary with the keys
    #                        index, actual, predicted, error and distribution
    #
    """
    predicciones = []
    try:
        # start the JVM once, with the system classpath and package support
        jvm.start(system_cp=True, packages=True)
        predicciones = detectarSpam_(tuitsConDatos, modeloFilename)
    except Exception:
        print(traceback.format_exc())
    return predicciones
def e_model_tree(): # train_data, test_data = b_i_impute_data() # train_data.to_csv("./train_data.csv", index=False) # test_data.to_csv("./test_data.csv",index=False) jvm.start() train_data = converters.load_any_file("train_data.csv") train_data.class_is_first() test_data = converters.load_any_file("test_data.csv") test_data.class_is_first() print("1") cls = Classifier(classname="weka.classifiers.trees.LMT") print("2") cls.build_classifier(train_data) print("3") evl = Evaluation(train_data) evl.crossvalidate_model(cls, train_data, 5, Random(1)) print("Train Accuracy:", evl.percent_correct) print("Train summary") print(evl.summary()) print("Train class details") print(evl.class_details()) print("Train confusion matrix") print(evl.confusion_matrix) plcls.plot_roc(evl, class_index=[0, 1], wait=True) plt.suptitle("Train ROC Curve", fontsize=20, y=0.95) savefig("./plots/e_train_roc_curve.png") evl = Evaluation(test_data) evl.test_model(cls, test_data) print("Test Accuracy:", evl.percent_correct) print("Test summary") print(evl.summary()) print(" Testclass details") print(evl.class_details()) print("Testconfusion matrix") print(evl.confusion_matrix) plcls.plot_roc(evl, class_index=[0, 1], wait=True) plt.suptitle("Test ROC Curve", fontsize=20, y=0.95) savefig("./plots/e_test_roc_curve.png")
def main(): try: jvm.start() loader = Loader(classname="weka.core.converters.CSVLoader") training_data = loader.load_file( "./data/adult.csv") # load training set testing_data = loader.load_file( "./data/adult_test.csv") # load test set training_data.class_is_last() testing_data.class_is_last() testNB(training_data, testing_data) testDtree(training_data, testing_data) except Exception as e: raise e finally: jvm.stop()
def all_feature(file): jvm.start(packages=True) data = converters.load_any_file(file) data.class_is_last() search = ASSearch(classname="weka.attributeSelection.Ranker", options=["-T", "-1.7976931348623157E308", "-N", "-1"]) attsel = AttributeSelection() attsel.search(search) evaluator = ASEvaluation( classname="weka.attributeSelection.ChiSquaredAttributeEval") attsel.evaluator(evaluator) attsel.select_attributes(data) t = attsel.ranked_attributes[:, 0] chi = t.astype(int) evaluator = ASEvaluation( classname="weka.attributeSelection.InfoGainAttributeEval") attsel.evaluator(evaluator) attsel.select_attributes(data) t = attsel.ranked_attributes[:, 0] info_gain = t.astype(int) evaluator = ASEvaluation( classname="weka.attributeSelection.GainRatioAttributeEval") attsel.evaluator(evaluator) attsel.select_attributes(data) t = attsel.ranked_attributes[:, 0] gain_ratio = t.astype(int) evaluator = ASEvaluation( classname="weka.attributeSelection.SymmetricalUncertAttributeEval") attsel.evaluator(evaluator) attsel.select_attributes(data) t = attsel.ranked_attributes[:, 0] symmetric_uncertainty = t.astype(int) jvm.stop() return chi, info_gain, gain_ratio, symmetric_uncertainty
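# Hedged usage sketch for all_feature() above; the ARFF path is illustrative. Each returned
# array holds attribute indices ranked by the corresponding evaluator.
def _all_feature_example():
    chi, info_gain, gain_ratio, symmetric_uncertainty = all_feature("features.arff")
    print("top-ranked attribute by chi-squared: " + str(chi[0]))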
def main(): """ Runs a datagenerator from the command-line. Calls JVM start/stop automatically. Use -h to see all options. """ parser = argparse.ArgumentParser( description= 'Executes a data generator from the command-line. Calls JVM start/stop automatically.' ) parser.add_argument("-j", metavar="classpath", dest="classpath", help="additional classpath, jars/directories") parser.add_argument("-X", metavar="heap", dest="heap", help="max heap size for jvm, e.g., 512m") parser.add_argument( "datagenerator", help= "data generator classname, e.g., weka.datagenerators.classifiers.classification.LED24" ) parser.add_argument("option", nargs=argparse.REMAINDER, help="additional data generator options") parsed = parser.parse_args() jars = [] if not parsed.classpath is None: jars = parsed.classpath.split(os.pathsep) jvm.start(jars, max_heap_size=parsed.heap, packages=True) logger.debug("Commandline: " + utils.join_options(sys.argv[1:])) try: generator = DataGenerator(classname=parsed.datagenerator) if len(parsed.option) > 0: generator.set_options(parsed.option) DataGenerator.make_data(generator, parsed.option) except Exception, e: print(e)
def main(read_path, write_path, fileformat='png'): some_exception = None try: jvm.start() dataset_names = os.listdir(read_path) for dataset in dataset_names: print(dataset) train_data, test_data = read_datasets(os.path.join(read_path, dataset), n_fold=1) for inst in test_data: train_data.add_instance(inst) y = train_data.values(train_data.class_attribute.index) fig, ax = plt.subplots(figsize=(1, 1)) # type: (plt.Figure, plt.Axes) classes = sorted(np.unique(y)) xticks = np.arange(len(classes)) counts = Counter(y) ax.bar(xticks, height=[counts[c] for c in classes]) ax.set_xticks(xticks) ax.set_xticklabels(classes) plt.axis('off') plt.savefig(os.path.join(write_path, '.'.join([dataset, fileformat])), format=fileformat, transparent=True) plt.clf() plt.close() except Exception as e: some_exception = e finally: jvm.stop() if some_exception is not None: raise some_exception
def TrainingModel(arff, modelOutput, clsfier):
    # start the Java VM
    jvm.start()
    # load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # train with the chosen algorithm (RandomForest gave the highest TPR/TNR in Weka GUI trials)
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # evaluate the model
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # collect the statistics
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("Algorithm: " + clsfier)
    print("Sensitivity TPR: " + str(TPR))
    print("Specificity TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # save the model
    clsf.serialize(modelOutput, header=train)
    # shut down the JVM
    jvm.stop()
    print("Finished building the analysis model")
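# Hedged usage sketch for TrainingModel() above; the file names are illustrative and clsfier is
# appended to "weka.classifiers.", so e.g. "trees.RandomForest" selects RandomForest.
def _training_model_example():
    TrainingModel("training_set.arff", "random_forest.model", "trees.RandomForest")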
def main(argv):
    if len(argv) <= 1:
        print('op action testfile/batch')
        return
    jvm.start()
    global op
    op = argv[1]
    action = argv[2]
    if action == "train":
        train(getpara(op, 'heaptime'), getpara(op, 'op') + getpara('hash', 'new'), argv[3:])
    if action == "output_model":
        output_model(getpara(op, 'heaptime'), getpara(op, 'op') + getpara('hash', 'new'), argv[3:])
    if action == "test":
        pred = {}
        real = {}
        objs = getpara(op, 'heaptime')
        for obj in objs:
            pred[obj] = []
            real[obj] = []
        # for i in [101,102,103,104,105,106,108,109,110,111,112,114,115,116,117,118,119]:
        for i in [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19] + \
                [101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119]:
            # for i in [4,5,8,9,10,12,18,19]:
            test(objs, getpara(op, 'grid'), ['randtime_tpch_%d.csv' % i], pred, real)
        for obj in objs:
            print(obj, metric(pred[obj], real[obj]), len(pred[obj]))
    if action == "test_manual":
        pred = {}
        real = {}
        objs = getpara(op, 'heaptime')
        for obj in objs:
            pred[obj] = []
            real[obj] = []
        for i in [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19]:
            # for i in [4,5,8,9,10,12,18,19]:
            test_manual(objs, getpara(op, 'grid'), ['randtime_tpch_%d.csv' % i], pred, real)
        for obj in objs:
            print(obj, metric(pred[obj], real[obj]), len(pred[obj]), min(real[obj]), max(real[obj]),
                  sum(real[obj]) / len(real[obj]))
    if action == "testsingle":
        test_single()
    jvm.stop()
def classify(train, test, name="RF", tuning=False): jvm.start() if isinstance(train, list) and isinstance(test, list): train = weka_instance(train) trn_data = converters.load_any_file(train) test = weka_instance(test) tst_data = converters.load_any_file(test) elif os.path.isfile(train) and os.path.isfile(test): trn_data = converters.load_any_file(train) tst_data = converters.load_any_file(test) else: trn = csv_as_ndarray(train) tst = csv_as_ndarray(test) trn_data = converters.ndarray_to_instances(trn, relation="Train") tst_data = converters.ndarray_to_instances(tst, relation="Test") trn_data.class_is_last() tst_data.class_is_last() # t = time() if tuning: opt = tune(train) else: opt = default_opt # print("Time to tune: {} seconds".format(time() - t)) cls = Classifier(classname=classifiers[name.lower()], options=opt) cls.build_classifier(trn_data) distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data] preds = [cls.classify_instance(inst) for inst in tst_data] jvm.stop() return preds, distr
def main(): args = parse() jvm.start(max_heap_size=args.max_heap_size) #loading data print("**************************************************") print("* LOADING DATA *") print("**************************************************") data_loader = DataLoader(datapath = args.Datapath, target_name = args.target_agent, arff_data_path = args.arff_data_path, num_games = args.num_games_source, ) #data_loader.load_target_source_data() print("**************************************************") print("* TRAINING *") print("**************************************************") model = Classifier(classname="weka.classifiers.trees.REPTree") classifier = TwoStageTransfer(arff_data_path = args.arff_data_path, savepath = args.savepath, target_name = args.target_agent, num_target = args.num_games_target, num_source = args.num_games_source, boosting_iter=args.boosting_iter, fold=args.fold, max_source_dataset=args.max_source, model = model) classifier.load_data_from_arff() classifier.train() print("**************************************************") print("* EVALUATING *") print("**************************************************") print("Evaluate for ", args.target_agent) classifier.evaluate_model() jvm.stop()
def dbscanTrain(self, dataf, options, mname, temp=True):
    '''
    :param dataf: -> data to be clustered
    :param options: -> DBSCAN options
                    E -> epsilon (default = 0.9)
                    M -> minPoints (default = 6)
                    D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                    I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                    example => ["-E", "0.9", "-M", "6",
                                "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                                "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
        clusterDBSCAN.build_clusterer(data)
        print(clusterDBSCAN)
        self.saveModel(clusterDBSCAN, 'dbscan', mname)
        # cluster the data
    except Exception:
        print(traceback.format_exc())
def save_all_scores_on_test():
    jvm.start()
    for user in user_list:
        user_test_dir = os.listdir("../data/arff_files/" + str(user) + "/test/")
        user_test_dir.sort()
        n = len(user_test_dir)
        c = 0
        for expression_index in range(n):
            print("\n", expression_index, "=>", str(expression_list[expression_index]), ':',
                  str(user_test_dir[expression_index]))
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../results_test/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            model_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            test_data_file = "../data/arff_files/" + str(user) + "/test/" + str(user_test_dir[expression_index])
            print(test_data_file, "=>", model_dir, "all algos", "=>", target_dir, "\n")

            loader = Loader(classname="weka.core.converters.ArffLoader")
            test_data = loader.load_file(test_data_file)
            test_data.class_is_last()

            for algo in algo_func_dict.keys():
                print("Algorithm: " + algo.upper())
                # if algo.upper() == "MLP_CLASSIFIER_10":
                #     continue
                model_file = model_dir + algo + ".model"
                print(model_file)
                j_obj = serialization.read(model_file)
                print(j_obj)
                trained_model = Classifier(jobject=j_obj)
                scores_matrix = get_classifier_score(trained_model, test_data)
                # print scores_matrix[:5]
                out_file = target_dir + algo + "_scores.csv"
                # writing scores to target file
                # scores_matrix = scores_matrix.astype(np.str)
                np.savetxt(out_file, scores_matrix, delimiter=",", fmt="%s")
                c = c + 1
                print(str(c) + ": Test Scores Saved =>" + str(out_file))
    jvm.stop()
def runclustermodel(self, model, method, dataf, temp=True): anomalies = [] try: jvm.start(max_heap_size=self.wHeap) data = self.loadData(dataf, temp) cluster = self.loadClusterModel(model, method) clusterMembership = [] print(cluster.number_of_clusters) for inst in data: try: cl = cluster.cluster_instance(inst) except Exception as inst: logger.error('[%s] : [ERROR] Mismatch model and data attributes', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) dist = cluster.distribution_for_instance(inst) print(("cluster=" + str(cl) + ", distribution=" + str(dist))) clusterMembership.append(cl) # print data.attribute_by_name('key') # print data.num_instances # print data.get_instance(3) pa = self.calcThreashold(dict(Counter(clusterMembership)), 21) if pa == 0: logger.warning('[%s] : [WARN] Most instances are computed as anomalies, possible error encountered!', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),) print("Most instances are computed as anomalies, possible error encountered!") else: for a in pa: # print data.get_instance(a).get_value(0) #todo always set key as first atribute anomalies.append(data.get_instance(a).get_value(0)) print("Detected using %s anomalies at timestamp(s) %s" % (model, str(anomalies))) except Exception as e: print((traceback.format_exc())) finally: jvm.stop() return anomalies
def main(): jvm.start() vote_classifier_train('./data/final/bolean_for_weka.csv', 'boolean_target', True) vote_classifier_train('./data/final/bolean_for_weka.csv', 'boolean_target', False) j48('./data/final/bolean_for_weka.csv', 'boolean_target', True) j48('./data/final/bolean_for_weka.csv', 'boolean_target', False) naive_bayse('./data/final/bolean_for_weka.csv', 'boolean_target', True) naive_bayse('./data/final/bolean_for_weka.csv', 'boolean_target', False) random_tree('./data/final/bolean_for_weka.csv', 'boolean_target', True) random_tree('./data/final/bolean_for_weka.csv', 'boolean_target', False) vote_classifier_train( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', True) vote_classifier_train( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', False) j48( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', True) j48( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', False) naive_bayse( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', True) naive_bayse( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', False) random_tree( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', True) random_tree( './data/final/20 Percent Training Set reducedAttacks_data feature selected with normalized data.csv', 'reduced attacks to 4', False) jvm.stop()
def registerInitialState(self, gameState):
    BustersAgent.registerInitialState(self, gameState)
    self.distancer = Distancer(gameState.data.layout, False)
    # for computing the class values used by the policies
    self.clusters = 8
    self.classes = 4
    self.classCounts = [[0 for i in range(self.classes)] for j in range(self.clusters)]
    self.classIndex = 2
    self.clusterIndex = 3
    self.readInstances()
    # used to store the training instances
    self.numInstances = 52
    self.numAttributes = 4
    # self.instances = [[" " for i in range(self.numAttributes)] for j in range(self.numInstances)]
    self.ins = [" " for i in range(self.numInstances)]
    # the library needs the Java virtual machine (JVM) to be running
    jvm.start()
    # build the model
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(
        "/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff")
    self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(self.clusters)])
    self.clusterer.build_clusterer(data)
    print(self.clusterer)
    # apply the policy
    self.politicaMax()
def detectarSpam(tuitsConDatos, modeloFilename):
    """
    #
    # @tuitsConDatos : list of status dictionaries with the keys
    #                  tweetText, tweet_id, favorite_count and retweet_count
    #
    # @return predicciones : list of predictions, one per input tweet.
    #                        Each prediction is a dictionary with the keys
    #                        index, actual, predicted, error and distribution
    #
    """
    bashCommand = "python manage.py runserver;"
    predicciones = []
    try:
        # start the JVM once, with the system classpath and package support
        jvm.start(system_cp=True, packages=True)
        predicciones = detectarSpam_(tuitsConDatos, modeloFilename)
    except Exception:
        print(traceback.format_exc())
        os.kill(os.getpid(), signal.SIGKILL)
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
    return predicciones
def emTrain(self, dataf, options, mname, temp=True):
    '''
    :param dataf: -> data to be clustered
    :param options: -> EM options
                    I -> number of iterations
                    N -> number of clusters
                    M -> minimum standard deviation for normal density (default = 1.0E-6)
                    num-slots -> number of execution slots, 1 means no parallelism
                    S -> random seed (default = 100)
                    example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
        clusterEM.build_clusterer(data)
        print(clusterEM)
        self.saveModel(clusterEM, 'em', mname)
    except Exception:
        print(traceback.format_exc())
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM (a single call; classpath, packages and heap size can be passed together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify (a 1-element array holding the DTW distance)
    speed_instance = Instance.create_instance(numpy.array([distance]), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data

    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)

    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'

    # print os.path.basename(inputFile) + ' --- ' + speed_class

    # Stop JVM
    jvm.stop()

    print("SPEED IS: " + speed_class)

    return speed_class
def riaa_checker(inputFile):
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM (a single call; classpath, packages and heap size can be passed together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    # cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data

    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)

    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'

    # print os.path.basename(inputFile) + ' --- ' + riaa_class

    # Stop JVM
    jvm.stop()

    print("RIAA FILTERING?: " + riaa_class)

    return riaa_class
def batch_riaa_checking(inputDir):
    # Start JVM (a single call; classpath, packages and heap size can be passed together)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    riaa_ok = 0
    riaa_ko = 0
    for file in os.listdir(inputDir):
        if file.endswith(".wav"):
            riaa_flag = riaa_checker(os.path.join(inputDir, file))
            if (riaa_flag == 'riaa_ko'):
                riaa_ko += 1
            if (riaa_flag == 'riaa_ok'):
                riaa_ok += 1

    # Stop JVM
    jvm.stop()

    return (riaa_ko, riaa_ok)
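# Hedged sketch of the JVM lifecycle the checker snippets above depend on: in python-weka-wrapper
# the JVM is per-process, extra jvm.start() calls after the first are ignored, and the JVM
# cannot be restarted after jvm.stop(), so it is safest to start once and stop once per run.
def _jvm_once_example():
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    try:
        pass  # run riaa_checker()/batch_riaa_checking() style work here
    finally:
        jvm.stop()  # no further jvm.start() is possible in this process afterwards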
capabilities = classifier.capabilities print(capabilities) # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() data_capabilities = Capabilities.for_instances(iris_data) print(data_capabilities) print("classifier handles dataset: " + str(capabilities.supports(data_capabilities))) # disable/enable helper.print_title("Disable/Enable") capability = Capability(member="UNARY_ATTRIBUTES") capabilities.disable(capability) capabilities.min_instances = 10 print("Removing: " + str(capability)) print(capabilities) if __name__ == "__main__": try: jvm.start() main() except Exception as e: print(traceback.format_exc()) finally: jvm.stop()
def main():
    jvm.start(class_path=['./python-weka-wrapper.jar', './weka.jar'], max_heap_size="1024m")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("testoneuser.arff")
    print("Number of Attributes: " + str(data.num_attributes()))
    print("Number of Items: " + str(data.num_instances()))
    data.set_class_index(0)
    # print(data)
    # c = Classifier(classname='weka.classifiers.trees.J48', options=['-C', '0.3'])
    # c.build_classifier(data)

    # TODO: Load the data set, and extract the features
    '''
    '''
    '''
    dataset = loadarff(open('testoneuser.arff', 'r'))
    data = dataset[0]
    #print("Data Length: " + len(data))
    v = data['rollon']
    print("Data: " + str(v))
    '''

    # Temporary: Load the data set
    iris = datasets.load_iris()

    # Get the data length
    dataLength = iris.data.shape[0]

    # iris.data
    data = np.random.random((dataLength, 1))
    Zmax, Zmin = data.max(), data.min()
    data = (data - Zmin) / (Zmax - Zmin)
    data *= 6
    data = np.around(data)

    # Add the new column
    data = np.hstack((data, iris.data))
    data = np.hstack((data, np.reshape(iris.target, (-1, 1))))

    total_correct = collections.defaultdict(int)
    total = collections.defaultdict(int)

    # Iterate through the users leaving one out at a time
    for ignoredUserId in range(0, 7):
        # get the training set
        training_data = data[data[:, 0] != ignoredUserId, :]
        training_label = training_data[:, -1]
        classifiers = cls.getAllClassifiers(splitDataOnFeatures(training_data), training_label)

        # Get the test set
        testData = data[data[:, 0] == ignoredUserId, :]
        allTestData = splitDataOnFeatures(testData)
        correct_values = testData[:, -1]

        # Predict the value based on the classifier
        results = predictAll(classifiers, allTestData, correct_values)

        # Find any differences
        for type, result in results.items():
            total_correct[type] = total_correct[type] + result[0]
            total[type] = total[type] + result[1]

    for type, correct in total_correct.items():
        print("Accuracy for " + str(type) + ": " + str(correct / total[type]))
from weka.core.dataset import Attribute, Instances import javabridge import numpy as np import pdb import json import os from nemoApi import nemoApi, AIParam from nemoConfig import nemoConfig # Start JVM on file load # Required that only ONE jvm exist for all threads jvm.start(class_path=["mysql-connector-java-5.1.38-bin.jar"]) class WekaWrapper: def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0): self.questionID = questionID self.algorithm = algorithm self.classifier = classifier self.parameters = parameters self.modelParams = modelParams self.api = nemoApi() self.config = nemoConfig() self.optimizer = optimizer self.predict = predict self.prediction = None
""" self.assertEqual("\\n\\t", str(classes.backquote("\n\t"))) self.assertEqual("hello\\tworld", str(classes.backquote("hello\tworld"))) self.assertEqual("\t\n", str(classes.unbackquote("\\t\\n"))) self.assertEqual("hello\tworld\n", str(classes.unbackquote("hello\\tworld\\n"))) def test_from_and_to_commandline(self): """ Tests the from_commandline and to_commandline methods. """ cmdline = "weka.classifiers.trees.J48 -C 0.3 -M 4" cls = classes.from_commandline( cmdline=cmdline, classname="weka.classifiers.Classifier") self.assertIsNotNone(cls) self.assertEqual(cmdline, cls.to_commandline()) def suite(): """ Returns the test suite. :return: the test suite :rtype: unittest.TestSuite """ return unittest.TestLoader().loadTestsFromTestCase(TestClasses) if __name__ == '__main__': jvm.start(packages=True) # necessary for setupgenerator unittest.TextTestRunner().run(suite()) jvm.stop()
cls = classifiers.Classifier( classname="weka.classifiers.functions.SMOreg", options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"]) ms.classifier = cls self.assertEqual(cls.to_commandline(), ms.classifier.to_commandline(), msg="classifiers differ") cls = classifiers.Classifier(classname="weka.classifiers.functions.LinearRegression") ms.classifier = cls ms.evaluation = ms.tags_evaluation.find("RMSE") self.assertEqual("RMSE", str(ms.evaluation), "evaluation differs: " + str(ms.evaluation)) ms.evaluation = "ACC" self.assertEqual("ACC", str(ms.evaluation), "evaluation differs: " + str(ms.evaluation)) cls = classifiers.Classifier(classname="weka.classifiers.trees.J48") ms.classifier = cls def suite(): """ Returns the test suite. :return: the test suite :rtype: unittest.TestSuite """ return unittest.TestLoader().loadTestsFromTestCase(TestClassifiers) if __name__ == '__main__': jvm.start(packages=True) # necessary for multisearch unittest.TextTestRunner().run(suite()) jvm.stop()
#!/usr/bin/env python import weka.core.jvm as jvm from weka.core.converters import Loader from weka.classifiers import Classifier, Evaluation jvm.logger.setLevel(jvm.logging.WARNING) jvm.start(packages=True, max_heap_size="512m") # Each instance has nominal class and numeric attributes loader = Loader(classname="weka.core.converters.ArffLoader") trainData = loader.load_file('segment-challenge.arff') trainData.class_is_last() testData = loader.load_file('segment-test.arff') testData.class_is_last() # Default C4.5 tree classifier = Classifier(classname="weka.classifiers.trees.J48") # Search for the best parameters and build a classifier with them classifier.build_classifier(trainData) print("\n\n=========== Classifier information ================\n\n") print(classifier.options) print(classifier) print("\n\n=========== Train results ================\n\n") evaluation = Evaluation(trainData) evaluation.test_model(classifier, trainData) print(classifier.to_commandline()) print(evaluation.matrix())
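# The script above loads segment-test.arff into testData but only evaluates on the training
# data; a hedged follow-up evaluating the same classifier on the held-out test set:
print("\n\n=========== Test results ================\n\n")
test_evaluation = Evaluation(trainData)
test_evaluation.test_model(classifier, testData)
print(test_evaluation.summary())
print(test_evaluation.matrix())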
def __init__(self, dataDir='.'):
    self.dataDir = dataDir
    jvm.start()
mparam.expression = "pow(BASE,I)" lparam = ListParameter() lparam.prop = "classifier.C" lparam.values = ["-2.0", "-1.0", "0.0", "1.0", "2.0"] multi.parameters = [mparam, lparam] cls = Classifier( classname="weka.classifiers.functions.SMOreg", options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"]) multi.classifier = cls multi.build_classifier(train) print("Model:\n" + str(multi)) print("\nBest setup:\n" + multi.best.to_commandline()) def main(): """ Calls the parameter optimization method(s). """ #gridsearch() multisearch() if __name__ == "__main__": try: jvm.start(packages=True) main() except Exception, e: print(traceback.format_exc()) finally: jvm.stop()
import weka.core.jvm as jvm

# alternative ways of starting the JVM -- only one of these variants is needed per process,
# since start() calls after the first have no effect
jvm.start()
jvm.start(system_cp=True, packages=True)
jvm.start(packages="/usr/local/lib/python2.7/dist-packages/weka")
jvm.start(max_heap_size="512m")

data_dir = "CSDMC2010_SPAM/CSDMC2010_SPAM/TRAINING"

from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.options = ["-C", "0.3"]
print(cls.options)

jvm.stop()
from utilities import *
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

jvm.start(max_heap_size="3072m")

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()

# classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

evaluation = Evaluation(data)
# evaluation.crossvalidate_model(classifier, data, 10, Random(42))
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))

res = evaluation.summary()
res += "\n" + evaluation.matrix()

# f = open('./Dataset/resultsGrid.txt', 'w')
# f.write(res)

print(res)
jvm.stop()