def splitTrainSet(data, m_numLabledData=10):
    total = data.num_instances
    labeled_amount = int(m_numLabledData * total / 100)
    unlabeled_amount = total - labeled_amount
    rand = Random(1)
    data.randomize(rand)
    labledDataSet = Instances.create_instances(data.relationname, data.attributes(), labeled_amount)
    UnlabledDataSet = Instances.create_instances(data.relationname, data.attributes(), unlabeled_amount)
    for i in range(labeled_amount):
        labledDataSet.add_instance(data.get_instance(i))
    labledDataSet.randomize(rand)
    for i in range(unlabeled_amount):
        UnlabledDataSet.add_instance(data.get_instance(labeled_amount + i))
    # labledDataSet.randomize(rand)
    labledDataSet.class_is_last()
    # UnlabledDataSet.randomize(rand)
    UnlabledDataSet.class_is_last()
    return labledDataSet, UnlabledDataSet
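A minimal usage sketch (not part of the original snippet); it assumes python-weka-wrapper3, a running JVM, and an ARFF file named "train.arff" whose class is the last attribute:

# hypothetical call site for splitTrainSet; file name and split size are illustrative only
import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("train.arff")   # assumed input file
data.class_is_last()
labeled, unlabeled = splitTrainSet(data, m_numLabledData=20)  # 20% labelled
print(labeled.num_instances, unlabeled.num_instances)
jvm.stop()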
def df_to_instances(self):
    '''
    transform pandas data frame to arff style data
    :param df: panda data frame
    :param relation: relation, string
    :param attr_label: label attribute, string
    :return: arff style data
    '''
    atts = []
    for col in self.df.columns:
        if col != self.attr_label:
            att = Attribute.create_numeric(col)
        else:
            att = Attribute.create_nominal(col, ['0', '1'])
        atts.append(att)

    nrow = len(self.df)
    result = Instances.create_instances(self.relation, atts, nrow)

    # data
    for i in range(nrow):
        inst = Instance.create_instance(
            self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
        result.add_instance(inst)

    return result
def create_dataset_header():
    """
    Creates the dataset header.

    :return: the header
    :rtype: Instances
    """
    att_msg = Attribute.create_string("Message")
    att_cls = Attribute.create_nominal("Class", ["miss", "hit"])
    result = Instances.create_instances("MessageClassificationProblem", [att_msg, att_cls], 0)
    return result
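A minimal usage sketch (not from the original source); it assumes a running JVM and uses the same add_string_value pattern shown in the tweet example further down. The message text is made up:

from weka.core.dataset import Instance

header = create_dataset_header()
msg_idx = header.attribute(0).add_string_value("hello world")      # store the string, keep its internal index
inst = Instance.create_instance([msg_idx, Instance.missing_value()])  # class value unknown
header.add_instance(inst)
header.class_is_last()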
def addNominals(self, dataset):
    # Add the nominal values for all columns, in case a column has none
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if (not a.is_nominal) or (a.name in ignoreAttributes):
            atts.append(a)
        else:
            newValues = list(a.values)
            pvalue = 'DefaultNominal'
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att], len(tweets))
    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)
    dataset.class_is_last()
    return dataset
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 june - based on check_readability() in sent_match.py
    @ sent:
    @ stats:  SentStatistics instance
    @ params: parameters for SentMatch (HitEx)
    @ match:  SentMatch instance
    # TO DO: add argument for choosing bw WEKA and sklearn
             adapt to both sents and texts
             in- vs cross-domain setups
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    feature_names = fs.keys()

    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]

    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(
        Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()

    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        # if pred_cefr < 1 or pred_cefr > 5:
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items() if v == pred_cefr][0]
    return (level_diff, nominal_level, fs)  # return also fs -> for detailed info in webservice
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: "
                + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i + 1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
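A minimal usage sketch (not from the original source); it assumes a running JVM, and the matrix values and relation name are made up:

# hypothetical call: two rows, two numeric columns named via the template
import numpy

data = ndarray_to_instances(numpy.array([[1.0, 2.0], [3.0, 4.0]]), "example", att_template="@-!")
print(data)  # attributes are named "example-0" and "example-1"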
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """
    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))
    data = Instances.create_instances("data", atts, 10)
    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)
    print(data)
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance

    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))

    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        if y is None:
            row = [*X[i], '0']
        elif isinstance(y, pd.Series):
            row = [*X[i], y.iloc[i]]
        else:
            row = [*X[i], y[i]]
        instances.add_instance(Instance.create_instance(row))
    instances.class_is_last()
    return instances, labels
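A minimal usage sketch (not from the original source); it assumes a running JVM and uses made-up feature values with integer 0/1 labels, which map directly onto the nominal label indices:

import numpy as np

X = np.array([[5.1, 3.5], [6.2, 2.9], [4.7, 3.2]])  # illustrative values only
y = np.array([0, 1, 0])
dataset, labels = sklearn_input_to_weka(X, y)
print(dataset.num_instances, labels)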
def addPatientNominals(self, patient, dataset):
    # Add the nominal values for the patient to the master header, in case they aren't already there
    # Loop and add patient's nominal values in case they aren't in masterDataset
    # newDataset will be the new master header
    # Waiting on prediction patient to be defined
    # Should be like {sex_cd: "m", ...}
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if (not a.is_nominal) or (a.name in ignoreAttributes):
            atts.append(a)
        else:
            newValues = list(a.values)
            # print a.name
            pvalue = patient[a.name]
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
def classify_json_object(lang, tag, json_data):
    model = load_classifier(lang, tag)

    # create dataset
    attr = create_attributes(lang, tag)
    dataset = Instances.create_instances(lang + "_dataset", attr, 0)

    # create an instance
    n_feature = 0
    tag_list = ""
    tag_feature = ""
    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE
    # print(attr)
    val = []
    for tag in tag_list:
        for i in range(0, n_feature):
            for ftr in tag_feature:
                cur_key = tag + str(i + 1)
                val.append(json_data[cur_key][cur_key + "_" + ftr])
                # print(cur_key + "_" + ftr, json_data[cur_key][cur_key + "_token"], json_data[cur_key][cur_key + "_" + ftr])
    val.append(0)
    inst = Instance.create_instance(val)
    dataset.add_instance(inst)
    dataset.class_is_last()

    pred = classify_new_instance(model, dataset)
    return pred
def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None,
                 relation_name=None, num_nominal_labels=None, num_class_labels=None):
    """
    Turns the 2D matrix and the optional 1D class vector into an Instances object.

    :param X: the input variables, 2D matrix
    :type X: ndarray
    :param y: the optional class value column, 1D vector
    :type y: ndarray
    :param att_names: the list of attribute names
    :type att_names: list
    :param att_types: the list of attribute types (C=categorical, N=numeric), assumes numeric by default if not provided
    :param class_name: the name of the class attribute
    :type class_name: str
    :param class_type: the type of the class attribute (C=categorical, N=numeric)
    :type class_type: str
    :param relation_name: the name for the dataset
    :type relation_name: str
    :param num_nominal_labels: the dictionary with the number of labels (key is 0-based attribute index)
    :type num_nominal_labels: dict
    :param num_class_labels: the number of labels in the class attribute
    :type num_class_labels: int
    :return: the generated Instances object
    :rtype: Instances
    """
    if len(X) == 0:
        raise Exception("No data to convert!")

    # defaults
    if att_types is None:
        att_types = determine_attribute_types(X)
    if att_names is None:
        att_names = []
        for i in range(len(X[0])):
            att_names.append("att-" + str(i + 1))
    if relation_name is None:
        relation_name = "scikit-weka @ " + str(datetime.now())
    if class_name is None:
        if "class" not in att_names:
            class_name = "class"
        else:
            class_name = "class-" + str(len(att_names) + 1)
    if y is not None:
        if class_type is None:
            class_type = determine_attribute_type(y)

    # create header
    atts = []
    for i in range(len(X[0])):
        att_name = att_names[i]
        att_type = att_types[i]
        if att_type == "N":
            atts.append(Attribute.create_numeric(att_name))
        elif att_type == "C":
            if (num_nominal_labels is not None) and (i in num_nominal_labels):
                values = []
                for l in range(num_nominal_labels[i]):
                    values.append("_%d" % l)
            else:
                labels = set()
                for n in range(len(X)):
                    r = X[n]
                    v = str(r[i])
                    labels.add(v)
                values = sorted(labels)
            atts.append(Attribute.create_nominal(att_name, values))
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i + 1), att_type))
    if y is not None:
        if class_type == "N":
            atts.append(Attribute.create_numeric(class_name))
        elif class_type == "C":
            if num_class_labels is not None:
                values = []
                for l in range(num_class_labels):
                    values.append("_%d" % l)
            else:
                values = sorted(set([str(x) for x in y]))
            atts.append(Attribute.create_nominal(class_name, values))
    result = Instances.create_instances(relation_name, atts, len(X))
    if y is not None:
        result.class_index = result.num_attributes - 1

    # data
    for n in range(len(X)):
        values = []
        r = X[n]
        for i in range(len(r)):
            if att_types[i] == "C":
                values.append(atts[i].index_of(str(r[i])))
            elif att_types[i] == "N":
                values.append(r[i])
            else:
                raise Exception("Unsupported attribute type for column %d: %s" % ((i + 1), att_types[i]))
        if y is not None:
            if class_type == "C":
                values.append(atts[-1].index_of(str(y[n])))
            elif class_type == "N":
                values.append(y[n])
            else:
                raise Exception("Unsupported attribute type for class: %s" % class_type)
        inst = Instance.create_instance(values)
        result.add_instance(inst)

    return result
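A minimal usage sketch (not from the original source); it assumes a running JVM, and the arrays, attribute names, and class labels are made up:

# hypothetical call: two numeric columns plus a categorical class
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = np.array(["yes", "no", "yes"])
data = to_instances(X, y, att_names=["a", "b"], att_types="NN", class_name="cls", class_type="C")
print(data)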
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")
fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls
fc.build_classifier(train_data)

# Create test data
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")
test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)
inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()])
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Run classifier
article_instance = test_dataset.get_instance(0)
prediction = fc.classify_instance(article_instance)
article_type = article_instance.class_attribute.value(int(prediction))
if article_type in ('good', 'neutral', 'bad'):
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50,
                  title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def train(request):
    jvm.start()
    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", ['phish', 'ham'])  # list, not set, so label order is deterministic
    # data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    # loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)
    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))
    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9, d_att10,
        d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17, d_att18, d_att19, d_att20,
        d_att21, d_att22, d_att23, d_att24, d_att25, d_att26, d_att27, d_att28, d_att29, d_att30,
        d_att31, d_att32, d_att33, d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40,
        d_att41, d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 1, 4,
        0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))
    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('No es pishing')
            # do something
        else:
            print('Es pishing')
            # do something
        print(var)
    jvm.stop()
    return HttpResponse(str(var))
with open(data_dir + os.sep + infile, "rb") as csvfile:
    print(infile)
    outfile = os.path.splitext(infile)[0] + ".arff"
    reader = csv.reader(csvfile)
    data = None
    ref_present = True
    for index, row in enumerate(reader):
        if index == 0:
            atts = []
            ref_present = ("Reference value" in row) or ("Reference Value" in row)
            for idx, col in enumerate(row):
                col = col.lower()
                atts.append(Attribute.create_numeric(col))
                if not ref_present and (idx == 0):
                    atts.append(Attribute.create_numeric("reference value"))
            data = Instances.create_instances("irdc", atts, 0)
        else:
            values = []
            for idx, col in enumerate(row):
                values.append(float(col))
                if not ref_present and (idx == 0):
                    values.append(float('NaN'))
            inst = Instance.create_instance(values)
            data.add_instance(inst)
    saver = Saver(classname="weka.core.converters.ArffSaver")
    saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")
def main():
    global stop_spinning, name, upper_clothing, lower_clothing, outer_clothing, shoes_clothing, \
        upper_indices, lower_indices, outer_indices, shoes_indices
    '''
    Classifies clothing using stored classification models for each user
    '''
    FSM = ClothingFSM()
    # FSM.username_server()
    clothingdb = MySQLdb.connect(host="localhost",
                                 user="******",
                                 passwd="mypassword",  # Change to your SQL DB password
                                 db="userprofiles")
    cursor = clothingdb.cursor()
    cursor.execute("SELECT * FROM clothing")
    name = "Study"

    # Populate clothing dictionaries with user's wardrobe
    for row in cursor.fetchall():
        print str(row[2])
        print str(row[6])
        if str(row[0]) == name:
            if str(row[1]) == "Upper Body":
                try:
                    upper_clothing[row[2]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Lower Body":
                try:
                    lower_clothing[row[3]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Outerwear":
                try:
                    outer_clothing[row[4]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Shoes":
                try:
                    shoes_clothing[row[5]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
    print upper_clothing, lower_clothing, outer_clothing, shoes_clothing

    # FSM.received_user_info()
    # In final program, we will receive this information from database
    # Set to true or false if receiving features vs testing defaults
    receive_features = True
    if receive_features == False:
        # Wait to Receive input
        # Example inputs from user/weather API
        features['casual_formal'] = 3  # 5 is very comfortable 1 is not comfortable
        features['comfort'] = 3        # 1 is not snowing 2 is light snow 3 is heavy snow
        features['snow'] = 1           # 1 is not raining 3 is raining (no medium)
        features['rain'] = 3
        # If user is spending their time mostly outside, set warmth to outsidewarmth. If not, set warmth
        features['warmth'] = 1
        features['outside_warmth'] = 4
        features['athletic'] = 1       # 1 is no 0 is yes
        snowstring = ''
        rainstring = ''
        athleticstring = ''
    else:
        FSM.features_server()

    upper_array = [None] * 14
    lower_array = [None] * 7
    outer_array = [None] * 3
    shoes_array = [None] * 4
    upper_prediction_array = []
    lower_prediction_array = []
    outer_prediction_array = []
    shoes_prediction_array = []

    warmth_att = Attribute.create_numeric("Warmth")
    comfort_att = Attribute.create_numeric("Comfort")
    casual_att = Attribute.create_numeric("Casual")
    rain_att = Attribute.create_numeric("Rain")
    snow_att = Attribute.create_numeric("Snow")
    athletic_att = Attribute.create_numeric("Athletic")
    upper_attributes = [warmth_att, casual_att, comfort_att, athletic_att]
    lower_attributes = [warmth_att, casual_att, comfort_att, athletic_att]
    outer_attributes = [warmth_att, casual_att, comfort_att, snow_att, rain_att]
    shoes_attributes = [casual_att, comfort_att, athletic_att]
    Instances.create_instances("upper_instances", upper_attributes, 0)
    Instances.create_instances("lower_instances", lower_attributes, 0)
    Instances.create_instances("outer_instances", outer_attributes, 0)
    Instances.create_instances("shoes_instances", shoes_attributes, 0)

    # Simulate their wardrobe: mark each garment type as owned (1) or not owned (0)
    # Upper
    upper_array[0] = 0 if len(upper_clothing['Tank Top']) == 0 else 1
    upper_array[1] = 0 if len(upper_clothing['T-Shirt']) == 0 else 1
    upper_array[2] = 0 if len(upper_clothing['Long-sleeved Shirt']) == 0 else 1
    upper_array[3] = 0 if len(upper_clothing['Athletic Top']) == 0 else 1
    upper_array[4] = 0 if len(upper_clothing['Button-down Shirt']) == 0 else 1
    upper_array[5] = 0 if len(upper_clothing['Polo Shirt']) == 0 else 1
    upper_array[6] = 0 if len(upper_clothing['Dress Shirt']) == 0 else 1
    upper_array[7] = 0 if len(upper_clothing['Suit Jacket']) == 0 else 1
    upper_array[8] = 0 if len(upper_clothing['Blazer']) == 0 else 1
    upper_array[9] = 0 if len(upper_clothing['Hoodie']) == 0 else 1
    upper_array[10] = 0 if len(upper_clothing['Sweater']) == 0 else 1
    upper_array[11] = 0 if len(upper_clothing['Blouse']) == 0 else 1
    upper_array[12] = 0 if len(upper_clothing['Day Dress']) == 0 else 1
    upper_array[13] = 0 if len(upper_clothing['Evening Dress']) == 0 else 1
    # Lower
    upper_array_comment = None  # (regular shorts follow)
    lower_array[0] = 0 if len(lower_clothing['Shorts']) == 0 else 1            # Regular Shorts
    lower_array[1] = 0 if len(lower_clothing['Athletic Shorts']) == 0 else 1
    lower_array[2] = 0 if len(lower_clothing['Athletic Pants']) == 0 else 1
    lower_array[3] = 0 if len(lower_clothing['Jeans']) == 0 else 1
    lower_array[4] = 0 if len(lower_clothing['Trousers']) == 0 else 1
    lower_array[5] = 0 if len(lower_clothing['Skirt']) == 0 else 1
    lower_array[6] = 0 if len(lower_clothing['Dress Pants']) == 0 else 1
    # Outer
    outer_array[0] = 0 if len(outer_clothing['Light Jacket']) == 0 else 1
    outer_array[1] = 0 if len(outer_clothing['Winter Jacket']) == 0 else 1     # Heavy Jacket
    outer_array[2] = 0 if len(outer_clothing['Rain Jacket']) == 0 else 1
    # Shoes
    shoes_array[0] = 0 if len(shoes_clothing['Casual Shoes']) == 0 else 1
    shoes_array[1] = 0 if len(shoes_clothing['Athletic Shoes']) == 0 else 1
    shoes_array[2] = 0 if len(shoes_clothing['Dress Shoes']) == 0 else 1
    shoes_array[3] = 0 if len(shoes_clothing['Business Casual Shoes']) == 0 else 1  # Dressy Casual Shoes

    upper_list = [features['outside_warmth'], features['casual_formal'], features['comfort'],
                  features['athletic']]
    lower_list = [features['outside_warmth'], features['casual_formal'], features['comfort'],
                  math.fabs(1 - features['athletic'])]
    outer_list = [features['outside_warmth'], features['casual_formal'], features['comfort'],
                  features['rain'], features['snow']]
    shoes_list = [features['casual_formal'], features['comfort'], math.fabs(1 - features['athletic'])]
    upper_instance = Instance.create_instance(upper_list, classname='weka.core.DenseInstance', weight=1.0)
    lower_instance = Instance.create_instance(lower_list, classname='weka.core.DenseInstance', weight=1.0)
    outer_instance = Instance.create_instance(outer_list, classname='weka.core.DenseInstance', weight=1.0)
    shoes_instance = Instance.create_instance(shoes_list, classname='weka.core.DenseInstance', weight=1.0)

    upper_path = '/home/leo/models/uppermodel2.model'
    lower_path = '/home/leo/models/lowermodel2.model'
    outer_path = '/home/leo/models/outermodel2.model'
    shoes_path = '/home/leo/models/shoesmodel7.model'
    upper_classifier = Classifier(jobject=serialization.read(upper_path))
    lower_classifier = Classifier(jobject=serialization.read(lower_path))
    outer_classifier = Classifier(jobject=serialization.read(outer_path))
    shoes_classifier = Classifier(jobject=serialization.read(shoes_path))
    upper_predictions = upper_classifier.distribution_for_instance(upper_instance)
    lower_predictions = lower_classifier.distribution_for_instance(lower_instance)
    outer_predictions = outer_classifier.distribution_for_instance(outer_instance)
    shoes_predictions = shoes_classifier.distribution_for_instance(shoes_instance)

    if features['rain'] == 1:
        rainstring = 'No'
    if features['rain'] == 3:
        rainstring = 'Yes'
    if features['snow'] == 1:
        snowstring = 'No'
    if features['snow'] == 3:
        snowstring = 'Yes'
    if features['athletic'] == 1:
        athleticstring = 'No'
    if features['athletic'] == 0:
        athleticstring = 'Yes'

    print "Features being Classified:"
    print "Outside Warmth:", features['outside_warmth'], "Inside-Outside:", features['inside_outside'], "Casual-Formal:", features['casual_formal'], "Comfort:", features['comfort'], "Athletic:", athleticstring, "Rain:", rainstring, "Snow:", snowstring

    # Remove Clothing Options User Doesn't Own
    for i in range(len(upper_array)):
        if upper_array[i] == 0:
            upper_prediction_array.append(0)
        else:
            upper_prediction_array.append(upper_predictions[i])
    for i in range(len(lower_array)):
        if lower_array[i] == 0:
            lower_prediction_array.append(0)
        else:
            lower_prediction_array.append(lower_predictions[i])
    for i in range(len(outer_array)):
        if outer_array[i] == 0:
            outer_prediction_array.append(0)
        else:
            outer_prediction_array.append(outer_predictions[i])
    for i in range(len(shoes_array)):
        if shoes_array[i] == 0:
            shoes_prediction_array.append(0)
        else:
            shoes_prediction_array.append(shoes_predictions[i])

    # Find the top options for each classifier: each pass picks the current maximum,
    # then zeroes it out so the next pass finds the runner-up
    max_index_upper1 = 0
    max_index_upper2 = 0
    max_index_upper3 = 0
    max_index_upper4 = 0
    max_index_upper5 = 0
    for i in range(1, len(upper_prediction_array)):
        if upper_prediction_array[i] > upper_prediction_array[max_index_upper1]:
            max_index_upper1 = i
    upper_prediction_array[max_index_upper1] = 0
    for i in range(1, len(upper_prediction_array)):
        if upper_prediction_array[i] > upper_prediction_array[max_index_upper2]:
            max_index_upper2 = i
    upper_prediction_array[max_index_upper2] = 0
    for i in range(1, len(upper_prediction_array)):
        if upper_prediction_array[i] > upper_prediction_array[max_index_upper3]:
            max_index_upper3 = i
    upper_prediction_array[max_index_upper3] = 0
    for i in range(1, len(upper_prediction_array)):
        if upper_prediction_array[i] > upper_prediction_array[max_index_upper4]:
            max_index_upper4 = i
    upper_prediction_array[max_index_upper4] = 0
    for i in range(1, len(upper_prediction_array)):
        if upper_prediction_array[i] > upper_prediction_array[max_index_upper5]:
            max_index_upper5 = i
    upper_indices = [max_index_upper1, max_index_upper2, max_index_upper3, max_index_upper4, max_index_upper5]

    max_index_lower1 = 0
    max_index_lower2 = 0
    max_index_lower3 = 0
    max_index_lower4 = 0
    max_index_lower5 = 0
    for i in range(1, len(lower_prediction_array)):
        if lower_prediction_array[i] > lower_prediction_array[max_index_lower1]:
            max_index_lower1 = i
    lower_prediction_array[max_index_lower1] = 0
    for i in range(1, len(lower_prediction_array)):
        if lower_prediction_array[i] > lower_prediction_array[max_index_lower2]:
            max_index_lower2 = i
    lower_prediction_array[max_index_lower2] = 0
    for i in range(1, len(lower_prediction_array)):
        if lower_prediction_array[i] > lower_prediction_array[max_index_lower3]:
            max_index_lower3 = i
    lower_prediction_array[max_index_lower3] = 0
    for i in range(1, len(lower_prediction_array)):
        if lower_prediction_array[i] > lower_prediction_array[max_index_lower4]:
            max_index_lower4 = i
    lower_prediction_array[max_index_lower4] = 0
    for i in range(1, len(lower_prediction_array)):
        if lower_prediction_array[i] > lower_prediction_array[max_index_lower5]:
            max_index_lower5 = i
    lower_indices = [max_index_lower1, max_index_lower2, max_index_lower3, max_index_lower4, max_index_lower5]

    max_index_outer1 = 0
    max_index_outer2 = 0
    max_index_outer3 = 0
    for i in range(1, len(outer_prediction_array)):
        if outer_prediction_array[i] > outer_prediction_array[max_index_outer1]:
            max_index_outer1 = i
    outer_prediction_array[max_index_outer1] = 0
    for i in range(1, len(outer_prediction_array)):
        if outer_prediction_array[i] > outer_prediction_array[max_index_outer2]:
            max_index_outer2 = i
    outer_prediction_array[max_index_outer2] = 0
    for i in range(1, len(outer_prediction_array)):
        if outer_prediction_array[i] > outer_prediction_array[max_index_outer3]:
            max_index_outer3 = i
    outer_indices = [max_index_outer1, max_index_outer2, max_index_outer3]

    max_index_shoes1 = 0
    max_index_shoes2 = 0
    max_index_shoes3 = 0
    max_index_shoes4 = 0
    for i in range(1, len(shoes_prediction_array)):
        if shoes_prediction_array[i] > shoes_prediction_array[max_index_shoes1]:
            max_index_shoes1 = i
    shoes_prediction_array[max_index_shoes1] = 0
    for i in range(1, len(shoes_prediction_array)):
        if shoes_prediction_array[i] > shoes_prediction_array[max_index_shoes2]:
            max_index_shoes2 = i
    shoes_prediction_array[max_index_shoes2] = 0
    for i in range(1, len(shoes_prediction_array)):
        if shoes_prediction_array[i] > shoes_prediction_array[max_index_shoes3]:
            max_index_shoes3 = i
    shoes_prediction_array[max_index_shoes3] = 0
    for i in range(1, len(shoes_prediction_array)):
        if shoes_prediction_array[i] > shoes_prediction_array[max_index_shoes4]:
            max_index_shoes4 = i
    shoes_indices = [max_index_shoes1, max_index_shoes2, max_index_shoes3, max_index_shoes4]

    print "Outer Indices:", outer_indices
    FSM.received_inputs()
    print "Exiting Program"