def get_weka_instance(self, categorical=False):
    """
    Converts this BoardDataModel to a weka.core.dataset.Instance object.
    Instance objects must be tied to some dataset. The continuous version of our board
    dataset is used by default; if the 'categorical' param is True, the categorical
    dataset is used instead.
    :param categorical: boolean: use the categorical dataset when constructing this instance (default: False)
    :return: a weka.core.dataset.Instance object representing this instance
    """
    if categorical:
        instance_vector = self.representation + [self.next_player, 5]  # the five is a fake score attribute
        weka_instance = Instance.create_instance(instance_vector)
        weka_instance.dataset = categorical_dataset
        weka_instance.set_missing(weka_instance.class_index)
    else:
        instance_vector = self.representation + [self.next_player, 0]  # the zero is a fake score attribute
        weka_instance = Instance.create_instance(instance_vector)
        weka_instance.dataset = continuous_dataset
    return weka_instance
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load clusters
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)
    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])
            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)
            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])
            print(str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % (','.join(map(str, tmp))))
def predict(obj, opstats, tpch=True):
    threshold = {'ylsize': 1, 'ydsize': 1, 'olsize': 1, 'odsize': 1, 'yreal': 0.01, 'oreal': 0.01}
    s = 0.0
    for op in opstats:
        if len(opstats[op]) <= 1:
            continue
        values = [opstats[op][k] for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']]
        values.append(0)  # should be obj
        for k in addf():
            values.append(opstats[op][k])
        v = classifiers['hash,' + obj].classify_instance(Instance.create_instance(values))
        # print obj, op, values, v
        s += v
    # else:
    #     zeroref = {'nT': 1, 'nT_delta': 0, 'nK': 1, 'nK_delta': 0, 'long': 1, 'str': 0, 'strsum': 0}
    #     s = manual_pred(obj, zeroref)
    #     for op in opstats:
    #         prediction = manual_pred(obj, opstats[op])
    #         s = s + prediction - manual_pred(obj, zeroref)
    return max(s, threshold[obj])
def predBtn_clicked(self):
    gender = self.gender_entry.get()
    age = int(self.age_entry.get())
    height = int(self.height_entry.get())
    weight = int(self.weight_entry.get())
    sociability = self.sociability_entry.get()
    stability = self.stability_entry.get()

    '''Create the model'''
    objects = serialization.read_all("J48.model")
    cls = Classifier(jobject=objects[0])
    data = Instances(jobject=objects[1])

    '''Create the test set to be classified'''
    gender_values = ["Man", "Woman"]
    sociability_values = ["Introvert", "Extrovert"]
    stability_values = ["Stable", "Unstable"]
    values = [
        gender_values.index(gender),
        age,
        height,
        weight,
        self.BMI(weight, height),
        stability_values.index(stability),
        sociability_values.index(sociability),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    inst.dataset = data

    '''Classification'''
    prediction = int(cls.classify_instance(inst))
    self.controller.show_frame("Result").show(prediction)
    self.clear()
def df_to_instances(self):
    """
    Transforms a pandas data frame into a Weka Instances object.
    :param df: pandas data frame
    :param relation: relation name, string
    :param attr_label: label attribute, string
    :return: Weka Instances object
    """
    atts = []
    for col in self.df.columns:
        if col != self.attr_label:
            att = Attribute.create_numeric(col)
        else:
            att = Attribute.create_nominal(col, ['0', '1'])
        atts.append(att)

    nrow = len(self.df)
    result = Instances.create_instances(self.relation, atts, nrow)
    # data
    for i in range(nrow):
        inst = Instance.create_instance(
            self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
        result.add_instance(inst)
    return result
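# Hedged usage sketch (not part of the original source): df_to_instances() above is a
# method, so for illustration it is called as a plain function with a small wrapper
# object carrying the 'df', 'relation' and 'attr_label' names it expects. The data
# values are purely illustrative and assume a running JVM.
import pandas as pd
from types import SimpleNamespace

wrapper = SimpleNamespace(
    df=pd.DataFrame({"f1": [0.1, 0.7], "f2": [1.2, 3.4], "label": [0.0, 1.0]}),
    relation="example",
    attr_label="label",
)
instances = df_to_instances(wrapper)
instances.class_is_last()
print(instances)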
def getIntent(self, user_input):
    """
    Identifies the intent for a user input by running a prediction against the
    GLaDOS dataset representation.
    :param user_input: str, the user's input
    :return: the identified intent, or "desconocido" (unknown) if the confidence is too low
    :rtype: str
    """
    vector_input = self.transformUserInput(user_input)
    inst = Instance.create_instance(vector_input)
    # print(inst)
    self.data.add_instance(inst)
    for index, inst in enumerate(self.data):
        pred = int(self.cls.classify_instance(inst))
        dist = self.cls.distribution_for_instance(inst)
        # print("{}: label index={}, class distribution={}".format(index + 1, pred, dist))
    intent = "desconocido"
    # classify the last instance (the one just added for the user input)
    pred = int(self.cls.classify_instance(inst))
    dist = self.cls.distribution_for_instance(inst)
    # print("{}: label index={}, class distribution={}".format(index + 1, pred, dist))
    if max(dist) > 0.7:
        intent = self.intens.value(pred)
    return intent
def transfer_example_to_instance(self, input_values):
    value_list = copy.deepcopy(input_values)
    # append a dummy label value so the dimensions match the training data
    value_list.append(-1)
    # Instance.new_instance()
    return Instance.create_instance(value_list)
def test_single():
    # feature order: ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
    objs = ['olsize', 'ylsize']
    for obj in objs:
        c = Classifier(jobject=serialization.read(model_file('hash', obj)))
        values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
        values.append(0)  # should be obj
        ins = Instance.create_instance(values)
        prediction = c.classify_instance(ins)
        print(obj, prediction)
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # classify each row and report the predicted label index
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate the DTW distance to the reference
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    speed_instance = Instance.create_instance(numpy.ndarray(distance), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    speed_flag = cls.classify_instance(speed_instance)

    if distance == 0:
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'

    # print(os.path.basename(inputFile) + ' --- ' + speed_class)

    # Stop JVM
    jvm.stop()

    print("SPEED IS: " + speed_class)

    return speed_class
def query_instance(attributes, model="kmeans.model"):
    """
    Get the cluster for the given attribute values.
    :param attributes: array or list of attribute values
    :return: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load clusterer and get the cluster id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    return cluster_id
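# Hedged usage sketch (not in the original source): assumes the JVM is already running
# and that "kmeans.model" was trained on four numeric attributes; the values below are
# purely illustrative.
cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
print("assigned cluster:", cluster_id)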
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att], len(tweets))

    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)

    dataset.class_is_last()
    return dataset
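# Hedged usage sketch (not in the original source): create_dataset() above relies on a
# module-level 'class_values' list; the labels and tweets here are illustrative and a
# running JVM is assumed.
class_values = ["positive", "negative"]
unlabelled = create_dataset(["great service!", "never again..."])
print(unlabelled)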
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 june - based on check_readability() in sent_match.py
    @ sent:
    @ stats:  SentStatistics instance
    @ params: parameters for SentMatch (HitEx)
    @ match:  SentMatch instance
    # TO DO: add argument for choosing between WEKA and sklearn,
    #        adapt to both sents and texts, in- vs cross-domain setups
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    feature_names = fs.keys()

    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]

    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()

    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        # if pred_cefr < 1 or pred_cefr > 5:
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items() if v == pred_cefr][0]
    return (level_diff, nominal_level, fs)  # return also fs -> for detailed info in webservice
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: "
                + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i + 1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """
    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))
    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
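# Hedged follow-up sketch (not in the original source): a dataset built as in main()
# above can be written out as an ARFF file with a Saver, mirroring the pattern used in
# other examples in this listing; 'data' is assumed to be the Instances object from main().
from weka.core.converters import Saver

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(data, "data.arff")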
def predict(self, modelName, x, arffName, debug=False):
    # Load the ARFF file to learn the structure of the instances
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arffName)
    # The class is assumed to be the last attribute
    data.class_is_last()

    # Load the model generated in Weka
    objects = serialization.read_all(modelName)
    cls = Classifier(jobject=objects[0])

    if debug:
        print("Loaded model...")
        print(cls)

    # Create the instance corresponding to the input and classify it
    if debug:
        print("Input", x)

    # Append a dummy value for the instance's class
    if data.class_attribute.is_nominal:
        x.append('a')
    else:
        x.append(0)

    # Convert nominal values to the integer position they occupy within their label list
    # print(data.num_attributes)
    for i in range(0, data.num_attributes):
        attribute = data.attribute(i)
        if attribute.is_nominal:
            x[i] = attribute.index_of(x[i])
            # print(x[i])

    # Make the prediction
    inst = Instance.create_instance(x)
    inst.dataset = data
    pred = cls.classify_instance(inst)
    if data.class_attribute.is_nominal:
        pred = data.class_attribute.value(pred)

    if debug:
        print("Prediction", pred)

    return pred
def riaa_checker(inputFile):
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    # cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    riaa_flag = cls.classify_instance(bark_instance)

    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'

    # print(os.path.basename(inputFile) + ' --- ' + riaa_class)

    # Stop JVM
    jvm.stop()

    print("RIAA FILTERING?: " + riaa_class)

    return riaa_class
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance

    # header: one numeric attribute per input column plus a nominal class attribute
    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))

    # data: copy each row, appending the class value (or a placeholder if y is absent)
    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        if y is None:
            row = [*X[i], '0']
        elif isinstance(y, pd.Series):
            row = [*X[i], y.iloc[i]]
        else:
            row = [*X[i], y[i]]
        instances.add_instance(Instance.create_instance(row))
    instances.class_is_last()

    return instances, labels
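# Hedged usage sketch (not in the original source): converts a tiny sklearn-style X/y
# pair and trains a J48 classifier on the result; the data and numeric class labels are
# illustrative and a running JVM is assumed.
import numpy as np
from weka.classifiers import Classifier

X = np.array([[0.1, 1.0], [0.9, 0.2], [0.4, 0.6]])
y = np.array([0, 1, 0])
train, labels = sklearn_input_to_weka(X, y)
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)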
def bayes_classifier(features):
    # load the dataset
    instancias = load_any_file("caracteristicas.arff")
    # mark the last attribute as the class
    instancias.class_is_last()
    # load the Naive Bayes classifier and train it on the image feature dataset
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(instancias)
    # create a new instance from the extracted features
    new_instance = Instance.create_instance(features)
    # add the new instance to the dataset
    instancias.add_instance(new_instance)
    # tie the new instance to the dataset
    new_instance.dataset = instancias
    # classify the new instance, returning the probability of it belonging to each of the defined classes
    classification = classifier.distribution_for_instance(new_instance)
    print("Classificação", " - Apu: ", round(classification[0] * 100, 2),
          " Nelson: ", round(classification[1] * 100, 2))
    return classification
def calculate_amino_type(self, model, pro):
    if pro:
        # the 12th index is 2 so we can pick it out; all others are zero so it is not placed in other locations
        return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0,
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # build an instance for the model
    i = Instance.create_instance(values=[1.0, self.a, self.b])
    if self.a == -1 and self.b == -1:
        # placeholder
        return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    elif self.a == -1:
        # update instance for missing data
        i.set_missing(1)
    elif self.b == -1:
        # update instance for missing data
        i.set_missing(2)
    # read in blank dataset
    from weka.core.converters import Loader
    loader = Loader("weka.core.converters.ArffLoader")
    myDataset = loader.load_file("weka/testingthisthingout.arff")
    myDataset.set_class_index(0)
    # use model to predict amino acid type
    i.set_dataset(myDataset)
    return model.distribution_for_instance(i)
def to_instance(header, x, y=None, weight=1.0):
    """
    Generates an Instance from the data.

    :param header: the data structure to adhere to
    :type header: Instances
    :param x: the 1D vector with input variables
    :type x: ndarray
    :param y: the optional class value
    :type y: object
    :param weight: the weight for the Instance
    :type weight: float
    :return: the generated Instance
    :rtype: Instance
    """
    values = []
    for i in range(len(x)):
        if header.attribute(i).is_nominal:
            values.append(header.attribute(i).index_of(str(x[i])))
        elif header.attribute(i).is_numeric:
            values.append(x[i])
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), header.attribute(i).type_str()))

    if y is not None and header.has_class():
        if y == missing_value():
            values.append(missing_value())
        elif header.class_attribute.is_nominal:
            values.append(header.class_attribute.index_of(str(y)))
        elif header.class_attribute.is_numeric:
            values.append(y)
        else:
            raise Exception("Unsupported attribute type for class attribute: %s" % header.class_attribute.type_str())

    result = Instance.create_instance(values, weight=weight)
    result.dataset = header
    return result
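# Hedged usage sketch (not in the original source): assumes 'header' is an Instances
# object (e.g. iris.arff with the class attribute set) whose attributes match the four
# numeric values below and whose nominal class has an "Iris-setosa" label.
new_inst = to_instance(header, [5.1, 3.5, 1.4, 0.2], y="Iris-setosa")
print(new_inst)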
def classify_json_object(lang, tag, json_data):
    model = load_classifier(lang, tag)

    # create dataset
    attr = create_attributes(lang, tag)
    dataset = Instances.create_instances(lang + "_dataset", attr, 0)

    # create an instance
    n_feature = 0
    tag_list = ""
    tag_feature = ""
    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE
    # print(attr)

    val = []
    for tag in tag_list:
        for i in range(0, n_feature):
            for ftr in tag_feature:
                cur_key = tag + str(i + 1)
                val.append(json_data[cur_key][cur_key + "_" + ftr])
                # print(cur_key + "_" + ftr, json_data[cur_key][cur_key + "_token"], json_data[cur_key][cur_key + "_" + ftr])
    val.append(0)

    inst = Instance.create_instance(val)
    dataset.add_instance(inst)
    dataset.class_is_last()
    pred = classify_new_instance(model, dataset)
    return pred
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data,
        iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50,
                  title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
if index == 0:
    atts = []
    ref_present = ("Reference value" in row) or ("Reference Value" in row)
    for idx, col in enumerate(row):
        col = col.lower()
        atts.append(Attribute.create_numeric(col))
        if not ref_present and (idx == 0):
            atts.append(Attribute.create_numeric("reference value"))
    data = Instances.create_instances("irdc", atts, 0)
else:
    values = []
    for idx, col in enumerate(row):
        values.append(float(col))
        if not ref_present and (idx == 0):
            values.append(float('NaN'))
    inst = Instance.create_instance(values)
    data.add_instance(inst)

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")
groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]
for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None, relation_name=None, num_nominal_labels=None, num_class_labels=None): """ Turns the 2D matrix and the optional 1D class vector into an Instances object. :param X: the input variables, 2D matrix :type X: ndarray :param y: the optional class value column, 1D vector :type y: ndarray :param att_names: the list of attribute names :type att_names: list :param att_types: the list of attribute types (C=categorical, N=numeric), assumes numeric by default if not provided :param class_name: the name of the class attribute :type class_name: str :param class_type: the type of the class attribute (C=categorical, N=numeric) :type class_type: str :param relation_name: the name for the dataset :type relation_name: str :param num_nominal_labels: the dictionary with the number of labels (key is 0-based attribute index) :type num_nominal_labels: dict :param num_class_labels: the number of labels in the class attribute :type num_class_labels: int :return: the generated Instances object :rtype: Instances """ if len(X) == 0: raise Exception("No data to convert!") # defaults if att_types is None: att_types = determine_attribute_types(X) if att_names is None: att_names = [] for i in range(len(X[0])): att_names.append("att-" + str(i+1)) if relation_name is None: relation_name = "scikit-weka @ " + str(datetime.now()) if class_name is None: if "class" not in att_names: class_name = "class" else: class_name = "class-" + str(len(att_names) + 1) if y is not None: if class_type is None: class_type = determine_attribute_type(y) # create header atts = [] for i in range(len(X[0])): att_name = att_names[i] att_type = att_types[i] if att_type == "N": atts.append(Attribute.create_numeric(att_name)) elif att_type == "C": if (num_nominal_labels is not None) and (i in num_nominal_labels): values = [] for l in range(num_nominal_labels[i]): values.append("_%d" % l) else: labels = set() for n in range(len(X)): r = X[n] v = str(r[i]) labels.add(v) values = sorted(labels) atts.append(Attribute.create_nominal(att_name, values)) else: raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_type)) if y is not None: if class_type == "N": atts.append(Attribute.create_numeric(class_name)) elif class_type == "C": if num_class_labels is not None: values = [] for l in range(num_class_labels): values.append("_%d" % l) else: values = sorted(set([str(x) for x in y])) atts.append(Attribute.create_nominal(class_name, values)) result = Instances.create_instances(relation_name, atts, len(X)) if y is not None: result.class_index = result.num_attributes - 1 # data for n in range(len(X)): values = [] r = X[n] for i in range(len(r)): if att_types[i] == "C": values.append(atts[i].index_of(str(r[i]))) elif att_types[i] == "N": values.append(r[i]) else: raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_types[i])) if y is not None: if class_type == "C": values.append(atts[-1].index_of(str(y[n]))) elif class_type == "N": values.append(y[n]) else: raise Exception("Unsupported attribute type for class: %s" % class_type) inst = Instance.create_instance(values) result.add_instance(inst) return result
def perceptron_classifier(cls, features, settings): # carrega o dataset loader = Loader("weka.core.converters.ArffLoader") instancias = loader.load_file( "./src/results/caracteristicas_sounds.arff") # sinaliza que o ultimo atributo é a classe instancias.class_is_last() # Define os Parametros learning_rate = str(settings['learningRate']) training_time = str(settings['trainingTime']) momentum = "0.2" hidden_layers = "a" seed = 2 cross_validation = 20 print('Learning Rate', learning_rate) print('Training Time', training_time) # Carrega o classificafor Multilayer Perceptron de acordo com os parametros definidos classifier = Classifier( classname="weka.classifiers.functions.MultilayerPerceptron", options=[ "-L", learning_rate, "-M", momentum, "-N", training_time, "-V", "0", "-S", str(seed), "-E", "20", "-H", hidden_layers ]) # Constroi o Classificador e Valida o dataset classifier.build_classifier(instancias) evaluation = Evaluation(instancias) # Aplica o Cross Validation rnd = Random(seed) rand_data = Instances.copy_instances(instancias) rand_data.randomize(rnd) if rand_data.class_attribute.is_nominal: rand_data.stratify(cross_validation) for i in range(cross_validation): # treina as instancias train = instancias.train_cv(cross_validation, i) # testa as instancias test = instancias.test_cv(cross_validation, i) # Constroi e Valida o Classificador cls = Classifier.make_copy(classifier) cls.build_classifier(train) evaluation.test_model(cls, test) # Cria uma nova instância com base nas caracteristicas extraidas new_instance = Instance.create_instance(features) # Adiciona a nova instância ao dataset instancias.add_instance(new_instance) # Liga a nova instancia ao dataset treinado com o classificador new_instance.dataset = train # Classifica a nova instância trazendo as probabilidades de ela pertencer as classes definidas classification = classifier.distribution_for_instance(new_instance) result = { 'cat': round(classification[0] * 100, 2), 'dog': round(classification[1] * 100, 2) } print("=== Setup ===") print("Classifier: " + classifier.to_commandline()) print("Dataset: " + instancias.relationname) print("Cross Validation: " + str(cross_validation) + "folds") print("Seed: " + str(seed)) print("") print( evaluation.summary("=== " + str(cross_validation) + " -fold Cross-Validation ===")) print("Classificação", " - Gato: ", result['cat'], " Cachorro: ", result['dog']) return result
fc.classifier = cls
fc.build_classifier(train_data)

# Create test data
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")
test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)
inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()])
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Run classifier
article_instance = test_dataset.get_instance(0)
prediction = fc.classify_instance(article_instance)
article_type = article_instance.class_attribute.value(int(prediction))
if article_type in ('good', 'neutral', 'bad'):
    articles_collection.update_one(
        {"_id": article_id},
        {"$set": {
def test(objs, paras, testfile1, pred, real): testfile = preprocess(testfile1, True) xref = {'x_nT':1,'x_nT_delta':0,'x_nK':1,'x_nK_delta':0,'x_long':1,'x_str':0,'x_strsum':0} add_features(xref, 'x') zeroref = [] for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']: zeroref.append(xref['x_%s' % k]) zeroref.append(0) # should be obj for k in addf(): zeroref.append(xref['x_%s' % k]) with open(testfile) as fin: reader = csv.DictReader(fin) linecount = 0 for line in reader: ops = [] for h in line: if h.startswith('op'): ops.append(h[:h.find('_')]) for op in ops: add_features(line, op) stats = {} valid = True real_line = {} for h in line: if h.startswith('op'): k = h[:h.find('_')] v = h[h.find('_')+1:] if k not in stats: stats[k] = {} stats[k][v] = pfloat(line[h]) if stats[k][v] is None: valid = False elif h in objs: real_line[h] = pfloat(line[h]) if real_line[h] is None: valid = False if not valid: continue linecount += 1 if linecount > 250: continue #for k in stats: # assert len(paras) == len(stats[k]) # for v in stats[k]: # assert v in paras for obj in objs: c = Classifier(jobject=serialization.read(model_file('hash', obj))) zerovalue = c.classify_instance(Instance.create_instance(zeroref)) #s = 0 s = zerovalue for op in stats: values = [] for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']: values.append(stats[op][k]) values.append(0) # should be obj for k in addf(): values.append(stats[op][k]) ins = Instance.create_instance(values) prediction = c.classify_instance(ins) #print ' ', obj, op, values, prediction, prediction - zerovalue #s += pred s = s + max(prediction - zerovalue, 0) #print obj, 'real', real_line[obj], 'pred', s pred[obj].append(s) real[obj].append(real_line[obj]) print 'test', testfile, 'linecount', linecount subprocess.call('rm %s' % testfile, shell=True)
ref_present = ("Reference value" in row) or ("Reference Value" in row) for idx, col in enumerate(row): col = col.lower() atts.append(Attribute.create_numeric(col)) if not ref_present and (idx == 0): atts.append( Attribute.create_numeric("reference value")) data = Instances.create_instances("irdc", atts, 0) else: values = [] for idx, col in enumerate(row): values.append(float(col)) if not ref_present and (idx == 0): values.append(float('NaN')) inst = Instance.create_instance(values) data.add_instance(inst) saver = Saver(classname="weka.core.converters.ArffSaver") saver.save_file(data, data_dir + os.sep + outfile) # train/test/predict print("Train/test/predict...") groups = ["DataSet1", "DataSet2"] # groups = ["DataSet2"] for group in groups: print(group) train = data_dir + os.sep + group + "_Cal.arff" test = data_dir + os.sep + group + "_Test.arff"
def train(request): jvm.start() d_att1 = Attribute.create_numeric("bodydearword.feature") d_att2 = Attribute.create_numeric("bodyform.feature") d_att3 = Attribute.create_numeric("bodyhtml.feature") d_att4 = Attribute.create_numeric("bodymultipart.feature") d_att5 = Attribute.create_numeric("bodynumchars.feature") d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature") d_att7 = Attribute.create_numeric("bodynumuniqwords.feature") d_att8 = Attribute.create_numeric("bodynumwords.feature") d_att9 = Attribute.create_numeric("bodyrichness.feature") d_att10 = Attribute.create_numeric("bodysuspensionword.feature") d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature") d_att12 = Attribute.create_numeric("externalsabinary.feature") d_att13 = Attribute.create_numeric("externalsascore.feature") d_att14 = Attribute.create_numeric("scriptjavascript.feature") d_att15 = Attribute.create_numeric("scriptonclick.feature") d_att16 = Attribute.create_numeric("scriptpopup.feature") d_att17 = Attribute.create_numeric("scriptstatuschange.feature") d_att18 = Attribute.create_numeric("scriptunmodalload.feature") d_att19 = Attribute.create_numeric("senddiffreplyto.feature") d_att20 = Attribute.create_numeric("sendnumwords.feature") d_att21 = Attribute.create_numeric("sendunmodaldomain.feature") d_att22 = Attribute.create_numeric("subjectbankword.feature") d_att23 = Attribute.create_numeric("subjectdebitword.feature") d_att24 = Attribute.create_numeric("subjectfwdword.feature") d_att25 = Attribute.create_numeric("subjectnumchars.feature") d_att26 = Attribute.create_numeric("subjectnumwords.feature") d_att27 = Attribute.create_numeric("subjectreplyword.feature") d_att28 = Attribute.create_numeric("subjectrichness.feature") d_att29 = Attribute.create_numeric("subjectverifyword.feature") d_att30 = Attribute.create_numeric("urlatchar.feature") d_att31 = Attribute.create_numeric("urlbaglink.feature") d_att32 = Attribute.create_numeric("urlip.feature") d_att33 = Attribute.create_numeric("urlnumdomains.feature") d_att34 = Attribute.create_numeric("urlnumexternallink.feature") d_att35 = Attribute.create_numeric("urlnumimagelink.feature") d_att36 = Attribute.create_numeric("urlnuminternallink.feature") d_att37 = Attribute.create_numeric("urlnumip.feature") d_att38 = Attribute.create_numeric("urlnumlink.feature") d_att39 = Attribute.create_numeric("urlnumperiods.feature") d_att40 = Attribute.create_numeric("urlnumport.feature") d_att41 = Attribute.create_numeric("urlport.feature") d_att42 = Attribute.create_numeric("urltwodoains.feature") d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature") d_att44 = Attribute.create_numeric("urlwordclicklink.feature") d_att45 = Attribute.create_numeric("urlwordherelink.feature") d_att46 = Attribute.create_numeric("urlwordloginlink.feature") d_att47 = Attribute.create_numeric("urlwordupdatelink.feature") d_att48 = Attribute.create_nominal("class", {'phish', 'ham'}) # data_dir = settings.BASE_DIR + "/phishing/public/datasets/" # loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(data_dir + "dataset.arff") data.class_is_last() cls = Classifier(classname="weka.classifiers.trees.J48") cls.options = ["-C", "0.3"] cls.build_classifier(data) serialization.write(data_dir + "out.model", cls) classifier = Classifier(jobject=serialization.read(data_dir + "out.model")) dataset = Instances.create_instances("test", [ d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9, d_att10, d_att11, d_att12, d_att13, d_att14, 
d_att15, d_att16, d_att17, d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25, d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33, d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41, d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48 ], 0) values = [ 0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Instance.missing_value() ] inst = Instance.create_instance(values) dataset.add_instance(inst) dataset.class_is_last() # print(str(dataset)) var = '' for inst1 in dataset: pred = classifier.classify_instance(inst1) var = inst1.class_attribute.value(int(pred)) if var == 'ham': print('No es pishing') # do somthing else: print('Es pishing') # do somthing print(var) jvm.stop() return HttpResponse(str(var))
def main(): global stop_spinning, name, upper_clothing, lower_clothing, outer_clothing, shoes_clothing, upper_indices, lower_indices, outer_indices, shoes_indices ''' Classifies clothing using stored classification models for each user ''' FSM = ClothingFSM() #FSM.username_server() clothingdb = MySQLdb.connect(host="localhost", user="******", passwd="mypassword", # Change to your SQL DB password db = "userprofiles") cursor = clothingdb.cursor() cursor.execute("SELECT * FROM clothing") name = "Study" #Populate clothing dictionaries with user's wardrobe for row in cursor.fetchall(): print str(row[2]) print str(row[6]) if str(row[0]) == name: if str(row[1]) == "Upper Body": try: upper_clothing[row[2]].append(row[6]) except: print "Problem appending clothing to dictionary" if str(row[1]) == "Lower Body": try: lower_clothing[row[3]].append(row[6]) except: print "Problem appending clothing to dictionary" if str(row[1]) == "Outerwear": try: outer_clothing[row[4]].append(row[6]) except: print "Problem appending clothing to dictionary" if str(row[1]) == "Shoes": try: shoes_clothing[row[5]].append(row[6]) except: print "Problem appending clothing to dictionary" print upper_clothing, lower_clothing, outer_clothing, shoes_clothing # FSM.received_user_info() #In final program, we will receive this information from database #Set to true or false if receiving features vs testing defaults receive_features = True if receive_features == False: #Wait to Receive input #Example inputs from user/weather API features['casual_formal'] = 3 #5 is very comfortable 1 is not comfortable features['comfort'] = 3 #1 is not snowing 2 is light snow 3 is heavy snow features['snow'] = 1 #1 is not raining 3 is raining(no medium) features['rain'] = 3 #If user is spending their time mostly outside, set warmth to outsidewarmth. 
If not, set warmth features['warmth'] = 1 features['outside_warmth'] = 4 #1 is no 0 is yes features['athletic'] = 1 snowstring = '' rainstring = '' athleticstring = '' else: FSM.features_server() upper_array = [None] * 14 lower_array = [None] * 7 outer_array = [None] * 3 shoes_array = [None] * 4 upper_prediction_array = [] lower_prediction_array = [] outer_prediction_array = [] shoes_prediction_array = [] warmth_att = Attribute.create_numeric("Warmth") comfort_att = Attribute.create_numeric("Comfort") casual_att = Attribute.create_numeric("Casual") rain_att = Attribute.create_numeric("Rain") snow_att = Attribute.create_numeric("Snow") athletic_att = Attribute.create_numeric("Athletic") upper_attributes = [warmth_att, casual_att, comfort_att, athletic_att] lower_attributes = [warmth_att, casual_att, comfort_att, athletic_att] outer_attributes = [warmth_att, casual_att, comfort_att, snow_att, rain_att] shoes_attributes = [casual_att, comfort_att, athletic_att] Instances.create_instances("upper_instances", upper_attributes, 0) Instances.create_instances("lower_instances", lower_attributes, 0) Instances.create_instances("outer_instances", outer_attributes, 0) Instances.create_instances("shoes_instances", shoes_attributes, 0) #Simulate their wardrobe #Upper # Tank Top if len(upper_clothing['Tank Top']) == 0: upper_array[0] = 0 else: upper_array[0] = 1 # T-Shirt if len(upper_clothing['T-Shirt']) == 0: upper_array[1] = 0 else: upper_array[1] = 1 # Long-Sleeved Shirt if len(upper_clothing['Long-sleeved Shirt']) == 0: upper_array[2] = 0 else: upper_array[2] = 1 # Athletic Top if len(upper_clothing['Athletic Top']) == 0: upper_array[3] = 0 else: upper_array[3] = 1 # Button-down Shirt if len(upper_clothing['Button-down Shirt']) == 0: upper_array[4] = 0 else: upper_array[4] = 1 # Polo Shirt if len(upper_clothing['Polo Shirt']) == 0: upper_array[5] = 0 else: upper_array[5] = 1 # Dress Shirt if len(upper_clothing['Dress Shirt']) == 0: upper_array[6] = 0 else: upper_array[6] = 1 # Suit Jacket if len(upper_clothing['Suit Jacket']) == 0: upper_array[7] = 0 else: upper_array[7] = 1 # Blazer if len(upper_clothing['Blazer']) == 0: upper_array[8] = 0 else: upper_array[8] = 1 # Hoodie if len(upper_clothing['Hoodie']) == 0: upper_array[9] = 0 else: upper_array[9] = 1 # Sweater if len(upper_clothing['Sweater']) == 0: upper_array[10] = 0 else: upper_array[10] = 1 # Blouse if len(upper_clothing['Blouse']) == 0: upper_array[11] = 0 else: upper_array[11] = 1 # Day Dress if len(upper_clothing['Day Dress']) == 0: upper_array[12] = 0 else: upper_array[12] = 1 # Evening Dress if len(upper_clothing['Evening Dress']) == 0: upper_array[13] = 0 else: upper_array[13] = 1 #Lower # Regular Shorts if len(lower_clothing['Shorts']) == 0: lower_array[0] = 0 else: lower_array[0] = 1 # Athletic Shorts if len(lower_clothing['Athletic Shorts']) == 0: lower_array[1] = 0 else: lower_array[1] = 1 # Athletic Pants if len(lower_clothing['Athletic Pants']) == 0: lower_array[2] = 0 else: lower_array[2] = 1 # Jeans if len(lower_clothing['Jeans']) == 0: lower_array[3] = 0 else: lower_array[3] = 1 # Trousers if len(lower_clothing['Trousers']) == 0: lower_array[4] = 0 else: lower_array[4] = 1 # Skirt if len(lower_clothing['Skirt']) == 0: lower_array[5] = 0 else: lower_array[5] = 1 # Dress Pants if len(lower_clothing['Dress Pants']) == 0: lower_array[6] = 0 else: lower_array[6] = 1 #Outer # Light Jacket if len(outer_clothing['Light Jacket']) == 0: outer_array[0] = 0 else: outer_array[0] = 1 # Heavy Jacket if len(outer_clothing['Winter Jacket']) == 
0: outer_array[1] = 0 else: outer_array[1] = 1 # Rain Jacket if len(outer_clothing['Rain Jacket']) == 0: outer_array[2] = 0 else: outer_array[2] = 1 #Shoes # Casual Shoes if len(shoes_clothing['Casual Shoes']) == 0: shoes_array[0] = 0 else: shoes_array[0] = 1 # Athletic Shoes if len(shoes_clothing['Athletic Shoes']) == 0: shoes_array[1] = 0 else: shoes_array[1] = 1 # Dress Shoes if len(shoes_clothing['Dress Shoes']) == 0: shoes_array[2] = 0 else: shoes_array[2] = 1 # Dressy Casual Shoes if len(shoes_clothing['Business Casual Shoes']) == 0: shoes_array[3] = 0 else: shoes_array[3] = 1 upper_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], features['athletic']] lower_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], math.fabs(1-features['athletic'])] outer_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], features['rain'], features['snow']] shoes_list = [features['casual_formal'], features['comfort'], math.fabs(1-features['athletic'])] upper_instance = Instance.create_instance(upper_list, classname='weka.core.DenseInstance', weight= 1.0) lower_instance = Instance.create_instance(lower_list, classname='weka.core.DenseInstance', weight= 1.0) outer_instance = Instance.create_instance(outer_list, classname='weka.core.DenseInstance', weight= 1.0) shoes_instance = Instance.create_instance(shoes_list, classname='weka.core.DenseInstance', weight= 1.0) upper_path = '/home/leo/models/uppermodel2.model' lower_path = '/home/leo/models/lowermodel2.model' outer_path = '/home/leo/models/outermodel2.model' shoes_path = '/home/leo/models/shoesmodel7.model' upper_classifier = Classifier(jobject=serialization.read(upper_path)) lower_classifier = Classifier(jobject=serialization.read(lower_path)) outer_classifier = Classifier(jobject=serialization.read(outer_path)) shoes_classifier = Classifier(jobject=serialization.read(shoes_path)) upper_predictions = upper_classifier.distribution_for_instance(upper_instance) lower_predictions = lower_classifier.distribution_for_instance(lower_instance) outer_predictions = outer_classifier.distribution_for_instance(outer_instance) shoes_predictions = shoes_classifier.distribution_for_instance(shoes_instance) if features['rain'] == 1: rainstring = 'No' if features['rain'] == 3: rainstring = 'Yes' if features['snow'] == 1: snowstring = 'No' if features['snow'] == 3: snowstring = 'Yes' if features['athletic'] == 1: athleticstring = 'No' if features['athletic'] == 0: athleticstring = 'Yes' print "Features being Classified:" print "Outside Warmth:", features['outside_warmth'], "Inside-Outside:", features['inside_outside'], "Casual-Formal:", features['casual_formal'], "Comfort:", features['comfort'], "Athletic:", athleticstring, "Rain:", rainstring, "Snow:", snowstring #Remove Clothing Options User Doesn't Own for i in range(len(upper_array)): if upper_array[i] == 0: upper_prediction_array.append(0) else: upper_prediction_array.append(upper_predictions[i]) for i in range(len(lower_array)): if lower_array[i] == 0: lower_prediction_array.append(0) else: lower_prediction_array.append(lower_predictions[i]) for i in range(len(outer_array)): if outer_array[i] == 0: outer_prediction_array.append(0) else: outer_prediction_array.append(outer_predictions[i]) for i in range(len(shoes_array)): if shoes_array[i] == 0: shoes_prediction_array.append(0) else: shoes_prediction_array.append(shoes_predictions[i]) #Find the top 3 options for each classifier max_index_upper1 = 0 max_index_upper2 = 0 
max_index_upper3 = 0 max_index_upper4 = 0 max_index_upper5 = 0 for i in range(1,len(upper_prediction_array)): n = upper_prediction_array[max_index_upper1] if upper_prediction_array[i] > n: max_index_upper1 = i upper_prediction_array[max_index_upper1] = 0 for i in range(1, len(upper_prediction_array)): n = upper_prediction_array[max_index_upper2] if upper_prediction_array[i] > n: max_index_upper2 = i upper_prediction_array[max_index_upper2] = 0 for i in range(1, len(upper_prediction_array)): n = upper_prediction_array[max_index_upper3] if upper_prediction_array[i] > n: max_index_upper3 = i upper_prediction_array[max_index_upper3] = 0 for i in range(1, len(upper_prediction_array)): n = upper_prediction_array[max_index_upper4] if upper_prediction_array[i] > n: max_index_upper4 = i upper_prediction_array[max_index_upper4] = 0 for i in range(1, len(upper_prediction_array)): n = upper_prediction_array[max_index_upper5] if upper_prediction_array[i] > n: max_index_upper5 = i upper_indices = [max_index_upper1, max_index_upper2, max_index_upper3, max_index_upper4, max_index_upper5] max_index_lower1 = 0 max_index_lower2 = 0 max_index_lower3 = 0 max_index_lower4 = 0 max_index_lower5 = 0 for i in range(1,len(lower_prediction_array)): n = lower_prediction_array[max_index_lower1] if lower_prediction_array[i] > n: max_index_lower1 = i lower_prediction_array[max_index_lower1] = 0 for i in range(1,len(lower_prediction_array)): n = lower_prediction_array[max_index_lower2] if lower_prediction_array[i] > n: max_index_lower2 = i lower_prediction_array[max_index_lower2] = 0 for i in range(1,len(lower_prediction_array)): n = lower_prediction_array[max_index_lower3] if lower_prediction_array[i] > n: max_index_lower3 = i lower_prediction_array[max_index_lower3] = 0 for i in range(1, len(lower_prediction_array)): n = lower_prediction_array[max_index_lower4] if lower_prediction_array[i] > n: max_index_upper4 = i lower_prediction_array[max_index_lower4] = 0 for i in range(1, len(lower_prediction_array)): n = lower_prediction_array[max_index_lower5] if lower_prediction_array[i] > n: max_index_lower5 = i lower_indices = [max_index_lower1, max_index_lower2, max_index_lower3, max_index_lower4, max_index_lower5] max_index_outer1 = 0 max_index_outer2 = 0 max_index_outer3 = 0 for i in range(1, len(outer_prediction_array)): n = outer_prediction_array[max_index_outer1] if outer_prediction_array[i] > n: max_index_outer1 = i outer_prediction_array[max_index_outer1] = 0 for i in range(1, len(outer_prediction_array)): n = outer_prediction_array[max_index_outer2] if outer_prediction_array[i] > n: max_index_outer2 = i outer_prediction_array[max_index_outer2] = 0 for i in range(1, len(outer_prediction_array)): n = outer_prediction_array[max_index_outer3] if outer_prediction_array[i] > n: max_index_outer3 = i outer_indices = [max_index_outer1, max_index_outer2, max_index_outer3] max_index_shoes1 = 0 max_index_shoes2 = 0 max_index_shoes3 = 0 max_index_shoes4 = 0 for i in range(1, len(shoes_prediction_array)): n = shoes_prediction_array[max_index_shoes1] if shoes_prediction_array[i] > n: max_index_shoes1 = i shoes_prediction_array[max_index_shoes1] = 0 for i in range(1, len(shoes_prediction_array)): n = shoes_prediction_array[max_index_shoes2] if shoes_prediction_array[i] > n: max_index_shoes2 = i shoes_prediction_array[max_index_shoes2] = 0 for i in range(1, len(shoes_prediction_array)): n = shoes_prediction_array[max_index_shoes3] if shoes_prediction_array[i] > n: max_index_shoes3 = i shoes_prediction_array[max_index_shoes3] = 0 for i 
in range(1, len(shoes_prediction_array)): n = shoes_prediction_array[max_index_shoes4] if shoes_prediction_array[i] > n: max_index_shoes4 = i shoes_indices = [max_index_shoes1, max_index_shoes2, max_index_shoes3, max_index_shoes4] print "Outer Indices:", outer_indices FSM.received_inputs() print "Exiting Program"
# In[4]:
from weka.core import dataset
from weka.core.dataset import Instance

# In[5]:
age, gender, mar_stat, ocd_hist, q2, q5, q10, q12, q13, q15, q17 = input(
    "Input list here : ").split(" ")

# In[6]:
x = [age, gender, mar_stat, ocd_hist, q2, q5, q10, q12, q13, q15, q17]
x.append(Instance.missing_value())
data.add_instance(inst=Instance.create_instance(x))
classify = classifier.classify_instance(inst=data.get_instance(index=data.num_instances - 1))
if classify == 0.0:
    print("No OCD")
else:
    print("OCD")

# In[7]:
# print(data)

# In[8]:
jvm.stop()