def pop_up_button1(self, i):
    if i.text() == "OK":
        self.msg1.close()
        MainWindow.close()
        self.time_to_wait_before = self.time_to_wait_combobox.currentText()
        pre_process(self.file_path, self.time_to_wait_before, self.audio_path)
def read_test_train():
    train_path = "fbpac-ads-en-US-train.csv"
    test_path = "fbpac-ads-en-US-test.csv"
    # data_path = "data/limited_sample.csv"
    data_train = pd.read_csv(train_path, error_bad_lines=False)
    data_test = pd.read_csv(test_path, error_bad_lines=False)

    # pre-processing all the documents [message: column 05]
    processed_docs = []
    for index, row in data_train.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    for index, row in data_test.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    print("Log: pre processing is done.")
    return processed_docs
def windowed_subdivs(model, input_ch, train_mean, train_std, padded_img, window_size, overlap_pct):
    """
    Create tiled overlapping patches.

    Returns:
        5D numpy array of shape = (
            nb_patches_along_X,
            nb_patches_along_Y,
            patches_resolution_along_X,
            patches_resolution_along_Y,
            nb_output_channels
        )

    Note:
        patches_resolution_along_X == patches_resolution_along_Y == window_size
    """
    step = int(window_size * (1 - overlap_pct))
    padx_len = padded_img.shape[0]
    pady_len = padded_img.shape[1]
    subdivs = []

    if input_ch == 3:
        for i in range(0, padx_len - window_size + 1, step):
            subdivs.append([])
            for j in range(0, pady_len - window_size + 1, step):
                patch = padded_img[i:i + window_size, j:j + window_size]
                patch = pre_process(patch, train_mean, train_std) / 255
                patch = cv2.merge((patch, patch, patch))
                subdivs[-1].append(patch)
    else:
        for i in range(0, padx_len - window_size + 1, step):
            subdivs.append([])
            for j in range(0, pady_len - window_size + 1, step):
                patch = padded_img[i:i + window_size, j:j + window_size]
                patch = pre_process(patch, train_mean, train_std) / 255
                patch = np.expand_dims(patch, axis=-1)
                subdivs[-1].append(patch)

    # The `gc.collect()` calls clear RAM between operations; removing them makes
    # the function run faster if enough memory is available.
    gc.collect()
    subdivs = np.array(subdivs)
    gc.collect()
    a, b, c, d, e = subdivs.shape
    subdivs = subdivs.reshape(a * b, c, d, e)
    gc.collect()

    subdivs = model.predict(subdivs)
    gc.collect()

    # Reshape the predictions back into the 5D layout described in the docstring.
    subdivs = subdivs.reshape(a, b, c, d, 1)
    gc.collect()

    return subdivs
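# A minimal, self-contained sketch of the tiling arithmetic used in
# windowed_subdivs (illustrative only: it just enumerates patch offsets and
# does not touch a model or an image; the function name is hypothetical).
def _tiling_demo(padded_len=512, window_size=256, overlap_pct=0.5):
    step = int(window_size * (1 - overlap_pct))
    starts = list(range(0, padded_len - window_size + 1, step))
    # With the defaults this prints: step = 128, patch start offsets = [0, 128, 256]
    print("step =", step, "patch start offsets =", starts)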
def read_seeds_data():
    all_docs = []
    docs_labels = []
    with open('data/seeds.json') as f:
        data = json.load(f)
    try:
        for item in data["not_political"]:
            all_docs.append(pre_process(item))
            docs_labels.append(0)
        for item in data["political"]:
            all_docs.append(pre_process(item))
            docs_labels.append(1)
    except:
        print("Error in reading data.")
    return all_docs, docs_labels
def main():
    # preprocess includes: splitting and using standard scaler to transform
    x_train, x_test, y_train, y_test = pre_process()

    ### feature selection with low variance
    # print(x_train.shape)
    x_train, x_test = delete_low_variance(x_train, x_test)
    # print(x_train.shape)
    # print(x_test.shape)
    # this deleted 218 features

    criteria = 'entropy'
    model = DecisionTreeClassifier(criterion=criteria)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    # visualize_tree(model, features)
    # print('the criteria is: ' + criteria)
    # print('the mean accuracy is: %.10f' % model.score(x_test, y_test))
    # print("F-1 score(micro) for test is: %.10f " % f1_score(y_test, prediction, average='micro'))
    print("F-1 score(weighted) for test is: %.10f " % f1_score(y_test, prediction, average='weighted'))
def main():
    # preprocess includes: splitting and using standard scaler to transform
    x_train, x_test, y_train, y_test = pre_process()

    ### feature selection with low variance
    # print(x_train.shape)
    x_train, x_test = delete_low_variance(x_train, x_test)
    # print(x_train.shape)
    # print(x_test.shape)
    # this deleted 218 features

    strength = 1
    model = LogisticRegression(C=strength, penalty='l1', solver="liblinear", multi_class="ovr")
    model.fit(x_train, y_train.values.ravel())
    prediction = model.predict(x_test)
    # print(model.get_params)
    # print("Accuracy score for test is: %.6f" % model.score(x_test, y_test))
    # print("Strength is: %.2f" % strength)
    # print("F-1 score(micro) for test is: %.10f " % f1_score(y_test, prediction, average='micro'))
    # print("F-1 score(macro) for test is: %.10f " % f1_score(y_test, prediction, average='macro'))
    print("F-1 score(weighted) for test is: %.10f " % f1_score(y_test, prediction, average='weighted'))
def using_bag_of_words(X):
    # preprocess; words need to be tokenized
    proc_data = pre_process(X, tokenize=True, stop_words=stop_words)
    # get BOW vector
    vec_X = bag_of_words(proc_data)
    return vec_X
def run_model():
    x_train, x_test, y_train, y_test = pre_process()
    # x_train, x_test = delete_low_variance(x_train, x_test)
    clf = RandomForestClassifier(criterion="entropy")
    clf.fit(x_train, y_train.values.ravel())
    y_pred = clf.predict(x_test)
    print("Weighted F-1 Score:", f1_score(y_test, y_pred, average="weighted"))
def docs_to_topics_vector(docs, lda_model, dictionary):
    """Given a list of documents and a trained topic model, return the topic
    vector representation of every document."""
    docs_topics_vectors = []
    for doc in docs:
        bow_vector = dictionary.doc2bow(pre_process(doc))
        docs_topics_vectors.append(lda_model[bow_vector])
    return docs_topics_vectors
def doc_topic_model(doc, lda_model, dictionary):
    """Given a sample document, a trained LDA model and its corresponding
    dictionary, print the topics of the document and the score associated
    with each topic."""
    print("\n")
    bow_vector = dictionary.doc2bow(pre_process(doc))
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
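# A self-contained sketch of the doc2bow -> lda_model[bow_vector] lookup used
# in the two helpers above (illustrative only: it trains a throwaway 2-topic
# model on a toy corpus instead of using the project's pre_process and saved
# model; requires gensim, and the function name is hypothetical).
def _lda_lookup_demo():
    import gensim
    toy_docs = [["vote", "election", "senate"],
                ["sale", "discount", "shoes"],
                ["election", "campaign", "vote"],
                ["shoes", "sale", "brand"]]
    dictionary = gensim.corpora.Dictionary(toy_docs)
    corpus = [dictionary.doc2bow(d) for d in toy_docs]
    lda = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=5)
    bow_vector = dictionary.doc2bow(["vote", "campaign"])
    for index, score in sorted(lda[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {:.3f}\t Topic: {}".format(score, lda.print_topic(index, 3)))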
def getDocBlocks(self, idx, blockSize=100, stride=20):
    doc = self.getContext(idx)[0]
    doc = pre_process(doc, stemming=False).split()
    i = 0
    docBlocks = []
    while i < len(doc):
        j = i + blockSize
        wordlist = doc[i:j]
        docBlocks.append(' '.join(wordlist))
        i += stride
    return docBlocks
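# A self-contained sketch of the block/stride windowing in getDocBlocks
# (illustrative only: it operates on a plain word list instead of pre_process
# output, and the function name is hypothetical).
def _block_demo(block_size=5, stride=2):
    words = "one two three four five six seven eight".split()
    blocks, i = [], 0
    while i < len(words):
        blocks.append(' '.join(words[i:i + block_size]))
        i += stride
    # Consecutive blocks overlap by (block_size - stride) words.
    return blocks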
def run_model():
    print("kNN MODEL RESULTS")
    p_val = [1, 2, 3]
    n_val = [3, 5]
    for n in n_val:
        for dist in p_val:
            x_train, x_test, y_train, y_test = pre_process()
            x_train, x_test = delete_low_variance(x_train, x_test)
            clf = KNeighborsClassifier(p=dist, n_neighbors=n)
            clf.fit(x_train, y_train.values.ravel())
            y_pred = clf.predict(x_test)
            print("kNN: p =", dist, "neighbors =", n)
            print("Weighted F-1 Score:", f1_score(y_test, y_pred, average="weighted"))
def transform(self, X, **transform_params):
    docs_topics_vectors = []
    lda_model = load_file("models/LDAbow_fbpac.pickle")
    lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
    for doc in X:
        try:
            bow_vector = lda_dictionary.doc2bow(pre_process(doc))
            docs_topics_vectors.append(lda_model[bow_vector])
        except Exception as e:
            print(e)
            print("Error in computing topic vector")
    n, nx, ny = np.array(docs_topics_vectors).shape
    d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
    # The flattened rows alternate (topic id, score); keep only the scores.
    return d2_all_docs[:, 1::2]
def create_topic_models():
    data_path = "data/fbpac-ads-en-US.csv"
    # data_path = "data/limited_sample.csv"
    data = pd.read_csv(data_path, error_bad_lines=False)

    # pre-processing all the documents [title: column 04 + message: column 05]
    processed_docs = []
    for index, row in data.iterrows():
        try:
            processed_record = pre_process(row[4] + " " + row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    print("Log: pre processing is done.")

    # creating a dictionary of all tokens in all documents
    dictionary = gensim.corpora.Dictionary(processed_docs)
    save_file('models/LDAdict.pickle', dictionary)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
    print("Log: dictionary is created and saved.")

    # creating bag-of-words and tf-idf corpora
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    tf_idf = models.TfidfModel(bow_corpus)
    corpus_tf_idf = tf_idf[bow_corpus]

    # creating LDA model using bag of words
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=2, workers=4)
    save_file('models/LDAbow.pickle', lda_model)
    print("Log: lda model [bow] is created and saved.")
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} | Words: {}'.format(idx, topic))

    # creating LDA model using tf-idf
    lda_model_tf_idf = gensim.models.LdaMulticore(corpus_tf_idf, num_topics=2, id2word=dictionary, passes=2, workers=4)
    save_file('models/LDAtfidf.pickle', lda_model_tf_idf)
    print("Log: lda model [tf-idf] is created and saved.")
    for idx, topic in lda_model_tf_idf.print_topics(-1):
        print('Topic: {} | Words: {}'.format(idx, topic))
def using_doc2vec(X):
    # preprocess
    proc_data = pre_process(X, tokenize=True, stop_words=[])

    # do not tokenize for doc2vec when creating the model
    model, vec_X = doc2vec_create_model(X, max_epochs=100, vec_size=10, alpha=0.025)

    # example of loading in a model later
    model = Doc2Vec.load("Doc2Vec//apnews_dbow//doc2vec.bin")

    # tokenize when using the model to convert other data
    vec_X = doc2vec_use_model(proc_data, model)
    return vec_X
def predict():
    '''
    For rendering results on HTML GUI
    '''
    questions = []
    questions.append([str(x) for x in request.form.values()])
    df = pd.DataFrame(questions, columns=['question1', 'question2'])
    df['id'] = 1
    df = pp.pre_process(df)
    df = fg.generate_features(df)
    X = df[['words_diff_q1_q2', 'word_common', 'word_total', 'word_share',
            'cosine_distance', 'cityblock_distance', 'jaccard_distance',
            'canberra_distance', 'euclidean_distance', 'minkowski_distance',
            'braycurtis_distance', 'fuzz_qratio', 'fuzz_WRatio',
            'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
            'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
            'fuzz_token_sort_ratio', 'common_bigrams', 'common_trigrams',
            'q1_readability_score', 'q2_readability_score']]
    prediction = 'duplicate' if (model_rf.predict(X)[0] == 1) else 'not duplicate'
    return render_template('index.html', prediction_text=f'The questions are {prediction}')
def read_main_data():
    data = pd.read_csv(data_path, error_bad_lines=False)

    # pre-processing all the documents [message: column 05]
    processed_docs = []

    # printing unique list of advertisers
    advertisers = data.iloc[:, 16].unique()
    np.savetxt('data/advertisers.txt', advertisers, fmt='%s')

    for index, row in data.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    print("Log: pre processing is done.")
    return processed_docs
def main():
    test_data_path = 'train.data.csv'
    test_scheme_path = 'wine.names.csv'
    # test_data_path = 'datasets/iris.data'
    # test_scheme_path = 'datasets/iris.names'

    data, attributes, value_type = read(test_data_path, test_scheme_path)
    random.shuffle(data)
    train_dataset = pre_process(data, attributes, value_type)

    cars = rule_generator(train_dataset, 0.22, 0.6)
    cars.prune_rules(train_dataset)
    cars.rules = cars.pruned_rules

    classifier_m1 = classifier_builder_m1(cars, train_dataset)
    # error_rate = get_error_rate(classifier_m1, train_dataset)

    total_car_number = len(cars.rules)
    # total_classifier_rule_num = len(classifier_m1.rule_list)

    # print("_______________________________________________________")
    # print(error_rate)
    # print("_______________________________________________________")
    # print(total_classifier_rule_num)

    print("_______________________________________________________")
    cars.print_rule()
    print("_______________________________________________________")
    cars.prune_rules(train_dataset)
    cars.print_pruned_rule()
    print("_______________________________________________________")
    print()
    classifier_m1.print()
    print("_______________________________________________________")
    print(total_car_number)
def minimum_cosine(query, data):
    query = pre_process(query)
    data = data.copy()
    data.append(query)

    # Transform all the answers and questions into tf-idf vectors
    TfidfVec = TfidfVectorizer()
    tfidf = TfidfVec.fit_transform(data)

    # Store all the cosine angles in theta
    theta = []

    # Compute the cosine similarity between the query and the data.
    for i in range(len(data) - 1):
        # cosine_similarity returns a 1x1 matrix; take the scalar before acos
        val = float(cosine_similarity(tfidf[-1], tfidf[i])[0][0])
        theta.append(math.acos(val))

    # Find the minimum angle and the index of that text from data and return it
    min_angle = min(theta)
    data_index = theta.index(min_angle)
    return min_angle, data_index
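# A self-contained sketch of the same tf-idf cosine-angle retrieval
# (illustrative only: it inlines a trivial lower-casing step in place of the
# project's pre_process; requires scikit-learn, and the function name is hypothetical).
def _cosine_angle_demo():
    import math
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = ["how do I reset my password",
              "what are the opening hours",
              "where can I change my password"]
    query = "password reset help".lower()

    tfidf = TfidfVectorizer().fit_transform(corpus + [query])
    angles = []
    for i in range(len(corpus)):
        sim = float(cosine_similarity(tfidf[-1], tfidf[i])[0][0])
        angles.append(math.acos(min(1.0, sim)))  # clamp to guard against rounding above 1
    best = angles.index(min(angles))
    print("closest document:", corpus[best])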
def cross_validate_m1_without_prune(data_path, scheme_path, minsup=0.01, minconf=0.5):
    data, attributes, value_type = read(data_path, scheme_path)
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    cba_rg_total_runtime = 0
    cba_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0

    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k + 1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        cba_rg_runtime = end_time - start_time
        cba_rg_total_runtime += cba_rg_runtime

        start_time = time.time()
        classifier_m1 = classifier_builder_m1(cars, training_dataset)
        end_time = time.time()
        cba_cb_runtime = end_time - start_time
        cba_cb_total_runtime += cba_cb_runtime

        error_rate = get_error_rate(classifier_m1, test_dataset)
        error_total_rate += error_rate
        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier_m1.rule_list)

        print("CBA's error rate without pruning: %.1lf%%" % (error_rate * 100))
        print("No. of CARs without pruning: %d" % len(cars.rules))
        print("CBA-RG's run time without pruning: %.2lf s" % cba_rg_runtime)
        print("CBA-CB M1's run time without pruning: %.2lf s" % cba_cb_runtime)
        print("No. of rules in classifier of CBA-CB M1 without pruning: %d" % len(classifier_m1.rule_list))

    print("\nAverage CBA's error rate without pruning: %.1lf%%" % (error_total_rate / 10 * 100))
    print("Average No. of CARs without pruning: %d" % int(total_car_number / 10))
    print("Average CBA-RG's run time without pruning: %.2lf s" % (cba_rg_total_runtime / 10))
    print("Average CBA-CB M1's run time without pruning: %.2lf s" % (cba_cb_total_runtime / 10))
    print("Average No. of rules in classifier of CBA-CB M1 without pruning: %d"
          % int(total_classifier_rule_num / 10))
def pop_up_button2(self, j):
    if j.text() == "OK":
        self.msg2.close()
        MainWindow.close()
        pre_process(self.file_path, self.time_to_wait_before, self.audio_path)
def cross_validate(data_path, scheme_path, class_first=False, minsup=0.1, minconf=0.6):
    data, attributes, value_type = read(data_path, scheme_path)
    if class_first:
        # move the class column from the front to the back
        for i in range(len(data)):
            a = data[i].pop(0)
            data[i].append(a)
        a = attributes.pop(0)
        attributes.append(a)
        b = value_type.pop(0)
        value_type.append(b)
        # print(data[0])
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    cba_rg_total_runtime = 0
    cba_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0
    acc_total = 0

    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k + 1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        cba_rg_runtime = end_time - start_time
        cba_rg_total_runtime += cba_rg_runtime

        start_time = time.time()
        classifier = classifier_builder_m1(cars, training_dataset)
        end_time = time.time()
        cba_cb_runtime = end_time - start_time
        cba_cb_total_runtime += cba_cb_runtime

        classifier.print()
        res = acc(classifier, test_dataset)
        acc_total += res
        error_rate = get_error_rate(classifier, test_dataset)
        error_total_rate += error_rate
        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier.rule_list)

        print("accuracy:", (res * 100))
        print("No. of CARs : ", len(cars.rules))
        print("CBA-RG's run time : s", cba_rg_runtime)
        print("CBA-CB M1's run time : s", cba_cb_runtime)
        print("No. of rules in classifier of CBA-CB: ", len(classifier.rule_list))

    print("\n Average CBA's accuracy :", (acc_total / 10 * 100))
    print("Average No. of CARs : ", (total_car_number / 10))
    print("Average CBA-RG's run time: ", (cba_rg_total_runtime / 10))
    print("Average CBA-CB run time: ", (cba_cb_total_runtime / 10))
    print("Average No. of rules in classifier of CBA-CB: ", (total_classifier_rule_num / 10))
# Params
max_word = 25
# Percent of data for train, validation and test
train_pct, val_pct, test_pct = 0.9, 0.0, 0.10
batch_size = 32
n_class = 2
n_epoch = 10
# Other
saving_dir = 'saved_model'

# Load Movie Review Data
df = load_movie_review_data()
set_params(max_word=max_word)
word_array, sentiment = pre_process(df)

# Divide data into train, validation and test set
len_data = word_array.shape[0]
n_train_data = int(len_data * train_pct)
train_input = word_array[:n_train_data]
train_target = sentiment[:n_train_data]

val_data_index = int(len_data * (train_pct + val_pct))
n_val_data = val_data_index - n_train_data
val_input = word_array[n_train_data:val_data_index]
val_target = sentiment[n_train_data:val_data_index]

n_test_data = len_data - (n_train_data + n_val_data)
test_input = word_array[val_data_index:]
test_target = sentiment[val_data_index:]

# free memory space
word_array, sentiment = [], []
def getQuestion(self, idx):
    return pre_process(self.trainData[idx]['question'], stemming=False)
def cross_validate_m1_without_prune(data_path, scheme_path, class_first=False, minsup=0.1, minconf=0.6):
    data, attributes, value_type = read(data_path, scheme_path)
    if class_first:
        # move the class column from the front to the back
        for i in range(len(data)):
            a = data[i].pop(0)
            data[i].append(a)
        a = attributes.pop(0)
        attributes.append(a)
        b = value_type.pop(0)
        value_type.append(b)
        # print(data[0])
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    apr_rg_total_runtime = 0
    apr_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0
    acc_total = 0

    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k + 1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        apr_rg_runtime = end_time - start_time
        apr_rg_total_runtime += apr_rg_runtime

        # group the CARs by the length of their condition set and sort each group
        arr = list(cars.rules_list)
        max_len = -1
        for i in range(len(arr)):
            if len(arr[i].cond_set) > max_len:
                max_len = len(arr[i].cond_set)
        T = [[] for i in range(max_len)]
        for i in range(len(arr)):
            T[len(arr[i].cond_set) - 1].append(arr[i])
        u = []
        for i in range(len(T)):
            T[i] = sort_dict(T[i])
            for j in T[i]:
                u.append(j)
        # print([u[i].cond_set for i in range(len(u))])

        start_time = time.time()
        # print("----------")
        classifier = classifier_builder_m1(cars, training_dataset, minsup, len(training_dataset), u)
        end_time = time.time()
        apr_cb_runtime = (end_time - start_time) / 10
        apr_cb_total_runtime += apr_cb_runtime

        classifier.print()
        res = acc(classifier, test_dataset)
        acc_total += res
        error_rate = get_error_rate(classifier, test_dataset)
        error_total_rate += error_rate
        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier.rule_list)

        print("accuracy:", (res * 100))
        print("No. of CARs : ", len(cars.rules_list))
        print("apr-RG's run time : s", apr_rg_runtime)
        print("apr-CB run time : s", apr_cb_runtime)
        print("No. of rules in classifier of apr: ", len(classifier.rule_list))

    print("\n Average APR's accuracy :", (acc_total / 10 * 100))
    print("Average No. of CARs : ", (total_car_number / 10))
    print("Average apr-RG's run time : ", (apr_rg_total_runtime / 10))
    print("Average apr-CB run time : ", (apr_cb_total_runtime / 10))
    print("Average No. of rules in classifier of apr: ", (total_classifier_rule_num / 10))
# Main method in this file: get the data list after processing, plus the scheme list.
# data_path: tells where the *.data file is stored.
# scheme_path: tells where the *.names file is stored.
def read(data_path, scheme_path):
    data = read_data(data_path)
    attributes, value_type = read_scheme(scheme_path)
    data = str2numerical(data, value_type)
    return data, attributes, value_type


# just for test
if __name__ == '__main__':
    import pre_processing

    test_data_path = 'zoo.data'
    test_scheme_path = 'zoo.names'
    test_data, test_attributes, test_value_type = read(test_data_path, test_scheme_path)
    # for i in range(len(test_data)):
    #     a = test_data[i].pop(0)
    #     test_data[i].append(a)
    # a = test_attributes.pop(0)
    # test_attributes.append(a)
    # b = test_value_type.pop(0)
    # test_value_type.append(b)
    # print(test_data[0])
    result_data = pre_processing.pre_process(test_data, test_attributes, test_value_type)
    print(result_data)
def get_details():
    path = args.input
    all_values = []
    for i in os.listdir(path):
        single_img = []
        if i.endswith(".jpg"):
            # filename = i
            image_path = f'{os.path.join(path, i)}'
            image = pre_process(image_path)
            with io.open(image_path, 'rb') as image_file:
                content = image_file.read()
            # construct an image instance
            image = vision.types.Image(content=content)
            # annotate the image; returns a TextAnnotation response
            response = client.text_detection(image=image)
            texts = response.text_annotations

            all_ = {}
            nums = []
            dates = []
            pattern = get_pattern()
            for text in texts:
                if re.match("^[A-Z]{2}[0-9]{1,2}[A-Z0-9]{1,3}[0-9]{3,4}$", re.sub('\W+', '', text.description)):
                    all_['Reg No'] = text.description
                if (re.match("^[^\\Wioq]{11,18}$", text.description)
                        and re.search("[0-9]{5,6}$", text.description)):
                    nums.append(text.description)
                if get_fields(pattern.fullmatch(text.description)):
                    dates.append(get_fields(pattern.fullmatch(text.description)))
                if re.match('^[0-9]{1,2}[-|\/]{1}[0-9]{1,2}[-|\/]{1}[0-9]{4}$', text.description):
                    dates.append(text.description)

            if len(nums) == 1:
                all_['VIN No/Chassis No'] = nums[0]
            elif len(nums) > 1 and len(nums) < 3:
                all_['VIN No/Chassis No'] = sorted(nums, key=len)[-1]
                all_['Engine No'] = sorted(nums, key=len)[-2]

            if len(dates) >= 1:
                for d in dates:
                    if len(d) == 2:
                        d['day'] = '01'
                D = []
                for j in dates:
                    try:
                        if len(j) == 3:
                            D.append(j['year'] + '-' + j['month'] + '-' + j['day'])
                            D.sort()
                            # print(j)
                    except KeyError:
                        D = D
                # for k in D:
                #     all_.append(k)
                if len(D) > 1:
                    all_['MFG DT'] = D[0]
                    all_['REG DT'] = D[1]
                else:
                    all_['MFG DT'] = dates[0]

            # print(all_)
            all_values.append(all_)

    with open('detailsss.txt', 'w+') as f:
        f.write(str(all_values))
    return all_values
# advertiser buckets by partisanship (the dems list is referenced below and is
# initialized here for completeness)
dems = []
gop = []
others = []
for index, row in advertiser_partisanship.iterrows():
    if row[2] == "Dem":
        dems.append(row[0])
    elif row[2] == "GOP":
        gop.append(row[0])
    elif row[2] == "nonpartisan" or row[2] == "other":
        others.append(row[0])

docs_topics_vectors = []
lda_model = load_file("models/LDAbow_fbpac.pickle")
lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
for doc in train_df['text']:
    try:
        bow_vector = lda_dictionary.doc2bow(pre_process(doc))
        docs_topics_vectors.append(lda_model[bow_vector])
    except Exception as e:
        print(e)
        print("Error in computing topic vector")

n, nx, ny = np.array(docs_topics_vectors).shape
d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
X = d2_all_docs[:, 1::2]

x_filtered = []
x_advertiser = []
for i in range(n):
    result = np.sort(X[i])
    if not (round(X[i][3], 3) == 0.2 and round(X[i][4], 3) == 0.2):
        if str(advertiser_df[i]) != 'nan':
            x_filtered.append([X[i][3], X[i][4]])
thresh = conf_matrix.max() / 2.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(j, i, format(conf_matrix[i, j], fmt),
                ha="center", va="center",
                color="white" if conf_matrix[i, j] > thresh else "black")
fig.tight_layout()
return ax


training_accs = []
test_accs = []
for leaf_nodes in range(2, 128):
    file_prefix = f"../output/max_leaf_nodes_{leaf_nodes}/"

    # Get data
    df = pre.pre_process()

    # Features for the training set
    X = df[["is_gender_female", "is_gender_male",
            "is_race_group A", "is_race_group B", "is_race_group C",
            "is_race_group D", "is_race_group E",
            "is_parent_education_associate's degree",
            "is_parent_education_bachelor's degree",
            "is_parent_education_high school",
            "is_parent_education_master's degree",
            "is_parent_education_some college",
            "is_parent_education_some high school",
            "is_lunch_free/reduced", "is_lunch_standard",
            "is_prepared_completed", "is_prepared_none"]]
    features = X.keys()
    X = np.array(X)

    # Targets for the training set
    y = df["student performance"]
    y = np.array(y)

    # Hold one out cross validation
for i in range(len(x)):
    # print(inx2word_dic[i])
    if i * x[i] != 0:
        x_inx.append(i)
print(x_inx)
sent = [inx2word_dic[i] for i in x_inx]
return ' '.join(sent)


# Random forest model; it is inherently a multi-class classifier
def random_forest_cla(training_data, model_name):
    X_train = training_data['X']
    y_train = training_data['y']
    forest_clf = RandomForestClassifier(random_state=42)
    forest_clf.fit(X_train, y_train)
    joblib.dump(forest_clf, model_name)


if __name__ == '__main__':
    print('Start to load training data...')
    data_dir_path = r'data'
    training_file = 'train.txt'
    training_data = pre_process(data_dir_path, training_file, min_freq=10, count_in_sent=False)
    print('Training data\'s shape is', training_data['X'].shape)

    print('Start to fit model...')
    saved_model_name = 'saved_model.pkl'
    random_forest_cla(training_data, model_name=saved_model_name)
    print('Model saved in:', os.path.abspath(saved_model_name))