def dTrees_predict():
    """Decision tree classification prediction."""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)
    data_set = fit_transform(data_set)
    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)
    test_set = fit_transform(test_set)
    column_x = get_column_x(data_set)
    column_y = get_column_y(data_set)
    dtrees = DTrees(data_set, test_set, column_x, column_y)
    train_x, train_y = dtrees.get_train_x_y()
    test_x, test_y = dtrees.get_test_x_y()
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    # Optional: visualize the fitted tree with graphviz.
    # dot_data = tree.export_graphviz(model, out_file=None,
    #                                 filled=True, rounded=True,
    #                                 special_characters=True)
    # graph = graphviz.Source(dot_data)
    # graph.render('example.gv', directory='.\\', view=True)
    predicted = model.predict(test_x)
    print("Decision tree accuracy:", accuracy_score(test_y, predicted))
def bayes_predict():
    """Naive Bayes classification prediction."""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)
    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)
    column_x = get_column_x(data_set)
    column_y = get_column_y(data_set)
    bayes = Bayes(data_set, column_x, column_y)
    # Single-sample prediction example:
    # column_x_value = bayes.set_test_x(Dates=23, DayOfWeek='Wednesday', PdDistrict='NORTHERN')
    # dict, result = bayes.predict(column_x_value)
    print('Starting...')
    p = bayes.predict_all(test_set)
    print(p)
def __init__(self, train_data_path, output_test_path, max_iter=50, max_time=10,
             C=9, tolerance=0.0001, kernel=SMO.linear_kernel):
    self.data = read_data(train_data_path)
    self.output_test_data = read_data(output_test_path)  # TODO: change to submit format
    self.training_data, self.testing_data = split_data(self.data)
    # The last column is the label; the remaining columns are features.
    self.train_X, self.train_Y = self.training_data[:, :-1], np.squeeze(self.training_data[:, -1:])
    self.test_X, self.test_Y = self.testing_data[:, :-1], np.squeeze(self.testing_data[:, -1:])
    # print(self.train_X.shape, self.train_Y.shape)
    # self.alphas = np.random.randn(len(self.train_X))
    self.alphas = np.zeros(len(self.train_X))  # Lagrange multipliers, one per training sample
    self.b = 0.0  # bias term
    self.m = len(self.train_X)
    self.max_iter = max_iter
    self.max_time = max_time
    self.kernel = kernel
    self.C = C
    self.tolerance = tolerance
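# The constructor above defaults to kernel=SMO.linear_kernel, which is not shown
# in this excerpt. Below is a minimal sketch of what such a kernel typically
# computes, assuming it takes two sample vectors and returns their inner product;
# the actual implementation in SMO may differ.
import numpy as np

def linear_kernel(x1, x2):
    """Linear kernel: K(x1, x2) = <x1, x2>."""
    return np.dot(x1, x2)

# Swapping in, say, an RBF kernel (exp(-gamma * ||x1 - x2||^2)) would only change
# how the SMO optimizer measures similarity between samples; the update rules stay the same.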
def neual_network_predict():
    """Neural network classification prediction."""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)
    data_set = fit_transform(data_set)
    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)
    test_set = fit_transform(test_set)
    train_x = get_data_set_x(data_set)
    train_y = get_data_set_y(data_set)
    test_x = get_data_set_x(test_set)
    test_y = get_data_set_y(test_set)
    labels_train = fit_bin_transform(train_y)
    print(labels_train)
    # Network layout: 3 input features, one hidden layer of 50 units,
    # and one output unit per encoded class.
    network = NeuralNetwork([3, 50, len(labels_train[0])])
    network.fit(train_x, labels_train, epochs=3000)
    a, b = network.predict_all(test_x, test_y)
    print(a, '\n', b)
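# A minimal sketch of the label-encoding step assumed above: fit_bin_transform(train_y)
# appears to turn class labels into one-hot rows, so that len(labels_train[0]) equals the
# number of classes (the output-layer size). sklearn's LabelBinarizer behaves this way for
# three or more classes; the project's own fit_bin_transform may be implemented differently.
from sklearn.preprocessing import LabelBinarizer

def one_hot_labels(y):
    """Return an (n_samples, n_classes) 0/1 matrix for class labels y."""
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(y)

# Example: one_hot_labels(['a', 'b', 'a', 'c']) ->
# [[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]]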
from __future__ import print_function

import collections
import math
import os
import random

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from data_process import maybe_download, read_data, build_dataset, vocabulary_size
from tensorflow.contrib.tensorboard.plugins import projector

filename = maybe_download(31344016)
vocabulary = read_data(filename)
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
print('Data size', len(vocabulary))
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
"""This module uses methods in data_process to process the dataset and trains
the Naive Bayes classifier:
1. Tf-idf unigram model: using filtered unigram data
2. Tf-idf bigram model: using filtered bigram data
3. Tf-idf bigram pmi model: based on 2, only keep the bigrams with positive
   mutual information as feature set
"""
import sys
from pickle import dump
from pickle import load

import data_process
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# split the data into training and test sets
targets, raw_docs = data_process.read_data("spam.csv")
docs = data_process.data_filter(raw_docs)
docs_train, docs_test, cate_train, cate_test = train_test_split(
    docs, targets, test_size=0.20, random_state=12)

# training and test sets for the bigram model
docs_bigram_train = [data_process.get_bigram(doc) for doc in docs_train]
docs_bigram_test = [data_process.get_bigram(doc) for doc in docs_test]

# use bigrams with positive mutual information as the training set
fre_uni = data_process.frequency(docs_train)
fre_big = data_process.frequency(docs_bigram_train)
docs_bigram_train_pmi = [
    data_process.filter_pmi(doc, fre_uni, fre_big)
    for doc in docs_bigram_train
]


def dummy(doc):
    """This is the dummy tokenizer for CountVectorizer,
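# A minimal sketch (not the module's actual training code, which is not shown in this
# excerpt) of how the Tf-idf unigram model from the module docstring could be wired
# together. It assumes docs_train holds pre-tokenized documents (lists of tokens), which
# is why a pass-through tokenizer like the `dummy` function above is handed to
# CountVectorizer; `identity` here is a hypothetical stand-in for it.
def identity(doc):
    return doc

count_vec = CountVectorizer(tokenizer=identity, preprocessor=identity, lowercase=False)
counts_train = count_vec.fit_transform(docs_train)

tfidf = TfidfTransformer()
tfidf_train = tfidf.fit_transform(counts_train)

clf = MultinomialNB()
clf.fit(tfidf_train, cate_train)

# Evaluation would transform docs_test with the same fitted count_vec and tfidf
# objects before calling clf.predict, so train and test share one vocabulary.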
import numpy as np
import phenome_classify as pc
import sub_string as sb
import data_process
import pre_process
import calc_target as ct

# Read data column-by-column from the label files.
root_mono = "labels/mono"
root_full = "labels/full"
file_list_mono = sb.traverse_dir(root_mono)
file_list_full = sb.traverse_dir(root_full)
sb.read_files(file_list_full)
sb.read_files_time(file_list_full)
sb.read_mono(file_list_mono)

# From the column-wise data, extract the required rows (one syllable nucleus per note).
data_process.read_data()

dir = "res/note_lines.npy"
dir_time = "res/note_time.npy"
dir_mono = "res/note_mono_lines.npy"

# Read the required features from the per-syllable row data and save them to all_train.npy.
pre_process.get_train_data(dir, dir_time)

# Compute targets from all_train.npy using the score time and the mono time,
# keeping only rows whose target lies in [-15, 14].
# The final shapes are those of target and data; adjust the model's input neurons accordingly.
ct.get_targets("res/note_time.npy", "res/note_mono_lines.npy", "res/all_train.npy")
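# A small sketch (not part of the original pipeline) for inspecting the intermediate
# .npy artifacts written by the steps above; the file paths are the ones used in this
# script, and the printed shapes are whatever those steps actually produced.
import numpy as np

for path in ("res/note_lines.npy", "res/note_time.npy",
             "res/note_mono_lines.npy", "res/all_train.npy"):
    arr = np.load(path, allow_pickle=True)
    print(path, getattr(arr, "shape", None))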
        out = model(x)
        loss = F.binary_cross_entropy(out, target.float())
        losses.append(loss.item())
        targets += list(target.numpy())
        out = out.view(-1).detach().numpy()
        outs += list(np.int64(out > 0.5))
    acc = accuracy_score(targets, outs)
    return acc, sum(losses) / len(losses)


if __name__ == "__main__":
    args = set_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Load the data
    data_path = './data/adult.data'
    data = read_data(data_path)
    train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(data)
    data_wide = train_data[0]
    train_data = (torch.from_numpy(train_data[0].values),
                  torch.from_numpy(train_data[1].values),
                  torch.from_numpy(train_data[2].values))
    train_data = MyDataSet(train_data)
    test_data = (torch.from_numpy(test_data[0].values),
                 torch.from_numpy(test_data[1].values),
                 torch.from_numpy(test_data[2].values))
    test_data = MyDataSet(test_data)
    trainloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
""" # -*- coding: utf-8 -*- # @File : predict.py # @Time : 2020/12/28 4:13 下午 # @Author : xiaolu # @Email : [email protected] # @Software: PyCharm """ import torch from model import WideDeep from data_process import read_data, feature_engine from config import set_args args = set_args() path = './data/adult.data' data = read_data(path) train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(data) data_wide = train_data[0] # 预测数据的输入格式,这里预测一条数据 t = (torch.from_numpy(train_data[0].values[0].reshape(-1, train_data[0].values.shape[1])), torch.from_numpy(train_data[1].values[0].reshape(-1, train_data[1].values.shape[1]))) # parameters setting deep_model_params = { 'deep_columns_idx': deep_columns_idx, 'embedding_columns_dict': embedding_columns_dict, 'hidden_size_list': args.hidden_size_list, 'dropouts': args.dropouts, 'deep_output_dim': args.deep_out_dim} wide_model_params = {
print "x size: ", len(x) print "y size: ", len(y) plt.scatter(x, y, c=color) plt.xlabel(xname) plt.ylabel(yname) # add legend classes = ['0', '1'] class_colours = ['r', 'g'] recs = [] for i in range(len(class_colours)): recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i])) plt.legend(recs, classes, loc='upper left') plt.show() train, test, features, features_non_numeric = data_process.read_data() train, test, features, features_non_numeric = data_process.process_data( train, test, features, features_non_numeric) tsize = 0.001 dtrain, dtest = cross_validation.train_test_split(train, test_size=tsize) #importance_feat(features) #Correlation_Matrix_plot(train) features = ['Customers', 'Sales', 'Promo'] data = dtest[features] Scatter_plot(data)