def standrand_data():
    """Z-score standardize the LRFMC features and persist them to Excel.

    Reads the LRFMC attribute table, standardizes every column to zero
    mean / unit variance, prefixes each column name with 'Z', and writes
    the result (without the index) to the configured output path.
    """
    lrmfc_data_path = get_path('section1', 'LRFMC_data_path')
    frame = pd.read_excel(lrmfc_data_path)
    # column-wise z-score: (x - mean) / std
    frame = (frame - frame.mean(axis=0)) / (frame.std(axis=0))
    frame.columns = ['Z' + name for name in frame.columns]
    zscored_data_path = get_path('section1', 'zscored_data_path')
    frame.to_excel(zscored_data_path, index=False)
def clean_data():
    """Filter the raw airline records and save the cleaned set to Excel.

    Keeps rows where both yearly fare columns are non-null, and where at
    least one fare is non-zero OR the row is a genuine zero-mileage /
    zero-discount record.
    """
    source_data_path = get_path('section1', 'source_data_path')
    clean_data_path = get_path('section1', 'clean_data_path')
    data = pd.read_csv(source_data_path, encoding='utf-8')
    # FIX: combine boolean masks with '&', not arithmetic '*' — same result
    # here, but '*' is non-idiomatic and fragile for boolean Series
    data = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]
    index1 = data['SUM_YR_1'] != 0
    index2 = data['SUM_YR_2'] != 0
    # keep rows where mileage and discount are BOTH exactly zero
    index3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
    data = data[index1 | index2 | index3]
    data.to_excel(clean_data_path)
def largrange_interpolation():
    """Fill missing cells column-by-column via Lagrange interpolation.

    Scans every cell; where a value is null, replaces it with the result
    of `ployinterp_column` (project helper) for that column/row, then
    writes the completed frame back to Excel.
    """
    inputfile = get_path('section1', 'inputfile')
    outputfile = get_path('section2', 'outputfile')
    # header=None: columns are integer positions, rows a default RangeIndex,
    # so label-based .loc below coincides with positional indexing
    data = pd.read_excel(inputfile, header=None)
    for col in data.columns:
        for row in range(len(data)):
            if data[col].isnull()[row]:
                # FIX: .loc instead of chained data[col][row] = ... —
                # chained assignment may silently write to a temporary
                # copy (pandas SettingWithCopy) rather than the frame
                data.loc[row, col] = ployinterp_column(data[col], row)
    data.to_excel(outputfile, header=None, index=False)
def explore_data():
    """Summarize null count, max and min per column; save the report.

    Uses describe() transposed (one row per original column), derives the
    null count as total rows minus non-null count, renames the columns to
    Chinese labels, and writes the table to Excel.
    """
    source_data_path = get_path('section1', 'source_data_path')
    result_data_path = get_path('section1', 'result_data_path')
    data = pd.read_csv(source_data_path, encoding='utf-8')
    explore = data.describe(percentiles=[], include='all').T  # T is transposition
    # describe()'s 'count' excludes NaN, so nulls = total rows - count
    explore['null'] = len(data) - explore['count']
    explore = explore[['null', 'max', 'min']]
    explore.columns = [u'空值数', u'最大值', u'最小值']
    # FIX: removed a no-op `explore.describe()` call whose result was discarded
    explore.to_excel(result_data_path)
def train_kmeans():
    """Fit a 5-cluster KMeans on the z-scored LRFMC data and persist it.

    The fitted model (centers in `cluster_centers_`, labels in `labels_`)
    is saved with joblib to the configured path.
    """
    zscored_data_path = get_path('section1', 'zscored_data_path')
    kmeans_file = get_path('section2', 'kmeans_file')
    data = pd.read_excel(zscored_data_path)
    # NOTE(review): n_jobs was deprecated in scikit-learn 0.23 and removed
    # in 0.25 — confirm the pinned sklearn version before upgrading
    kmodel = KMeans(n_clusters=5, n_jobs=4)
    kmodel.fit(data)
    # FIX: dropped a redundant kmodel.predict(data) whose result was
    # discarded — training labels are already available as kmodel.labels_
    joblib.dump(kmodel, kmeans_file)
def lm_model():
    """Evaluate the saved neural net on the held-out test split.

    Loads the Keras model, predicts classes for the test features and
    shows the confusion-matrix plot against the true labels.
    """
    net_file = get_path('section2', 'net_file')
    # FIX: the original called get_data() twice (once for an unused
    # `train`, once for `test`); each call re-reads and re-shuffles the
    # data, so the two splits could even overlap. One call suffices.
    test = get_data()[1]
    net = load_model(net_file)
    # columns 0..2 are features, column 3 is the label
    predict_result = net.predict_classes(test[:, :3]).reshape(len(test))
    cm_plot(test[:, 3], predict_result).show()
def get_data():
    """Load the model data, shuffle it, and split 80/20 into train/test.

    Returns:
        tuple: (train_data, test_data) numpy arrays; columns 0..2 are
        features and column 3 is the label (per the callers in this file).
    """
    model_data_path = get_path('section1', 'model_data_path')
    model_data = pd.read_excel(model_data_path)
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0 — .values is
    # the backward-compatible equivalent
    model_matrix = model_data.values
    shuffle(model_matrix)  # in-place shuffle of the rows
    p = 0.8  # ratio of training data
    split = int(len(model_matrix) * p)
    train_data = model_matrix[:split, :]
    test_data = model_matrix[split:, :]
    # FIX: removed a debug print() that dumped both full splits to stdout
    return train_data, test_data
def train_lm_classification():
    """Build, train and save a small MLP binary classifier (3 -> 10 -> 1).

    Uses a ReLU hidden layer and a sigmoid output, trained with Adam on
    binary cross-entropy for 1000 epochs, then saves the net to disk.
    (Keeps the legacy Keras keyword names already used by this file.)
    """
    net = Sequential()
    netfile = get_path('section1', 'lm_net_file')
    # input layer -> hidden layer of 10 units with ReLU
    net.add(Dense(input_dim=3, output_dim=10))
    net.add(Activation('relu'))
    # hidden layer -> single sigmoid unit for the binary label
    net.add(Dense(input_dim=10, output_dim=1))
    net.add(Activation('sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam')  ## use adam
    training_set = get_data()[0]
    # columns 0..2 are features, column 3 is the binary label
    net.fit(training_set[:, :3], training_set[:, 3],
            nb_epoch=1000, batch_size=1)
    net.save(netfile)
def train_cart_classification():
    """Fit a CART decision tree on the training split and persist it.

    Features come from columns 0..2, the label from column 3; the fitted
    tree is serialized with joblib to the configured path.
    """
    treefile = get_path('section1', 'tree_file')
    classifier = DecisionTreeClassifier()
    training_set = get_data()[0]
    classifier.fit(training_set[:, :3], training_set[:, 3])
    # persist the trained model via joblib
    joblib.dump(classifier, treefile)
def cart_model():
    """Load the saved CART tree and plot its confusion matrix on the train split.

    Loads the persisted DecisionTreeClassifier and shows the confusion
    matrix of its predictions against the training labels.
    """
    tree_file = get_path('section2', 'tree_file')
    # FIX: the original called get_data() twice and fetched a `test`
    # split it never used; each call re-shuffles, so the extra call was
    # pure waste. One call for the train split is enough.
    train = get_data()[0]
    cart = joblib.load(tree_file)
    # NOTE(review): this evaluates on TRAINING data while lm_model()
    # evaluates on the test split — consider switching to test here too
    cm_plot(train[:, 3], cart.predict(train[:, :3])).show()