def set_checkins(self, path, city, n, m, range):
    data_processor = data_process()
    data_processor.set_basic_info(path, city)
    self.pairs_path = data_processor.pairs_path
    self.path = data_processor.path
    self.city = data_processor.city
    self.range = range
    users = data_processor.checkins.uid.unique().tolist()
    users.sort(reverse=False)
    self.users = np.array(deepcopy(users))  # sort the user ids in ascending order
    del users
    gc.collect()
    self.M = m  # number of longitude grid cells
    self.N = n  # number of latitude grid cells
    grid_divider = grid_divide(data_processor.checkins.values.tolist(), self.N, self.M, self.range)
    self.checkins = grid_divider.divide_area_by_NN().ix[:, [0, 1, 2, 3, 4, 5]]  # check-ins with grid id
    self.latInterval = grid_divider.get_latInterval()
    self.lngInterval = grid_divider.get_lngInterval()
    # self.lons_per_km = 0.005202 * 2  # delta longitude per kilometre 0.005681
    # self.lons_per_km = 0.005681 * 2  # San Francisco
    # self.lats_per_km = 0.004492 * 2  # delta latitude per kilometre
    self.lons_per_km = 0.0059352 * 2  # NY
    self.lats_per_km = 0.0044966 * 2  # delta latitude per kilometre
    print("城市用户数为:", len(self.users))
def route_sorting():
    try:
        toPOST = request.args.get('toPOST')  # user input points
    except:
        print("GET failed")
        return jsonify({"message": "request body is not json."}), 400
    if toPOST != {}:
        print("RADIX SORT START!")
        # Process history data
        try:
            history = data_process(url)                # entire historical typhoon tracks
            point_data = history_point_data(history)   # P(i, j)
        except:
            return jsonify({"message": "Defective JMA API."}), 406
        # Process similarity model
        try:
            U = json.loads(toPOST)  # convert user inputs to dict
        except:
            return jsonify({"message": "Wrong format in points data."}), 400
        # Success
        return jsonify(radix_sort(history, point_data, U))
    else:
        print('NO USER INPUTS!')
        return jsonify({"message": "user input is empty."}), 400
def open_file(file, to_file):
    if os.path.isdir(file):  # check whether this is a directory
        files = dp.read_path(file)
        for file in files:
            data = dp.create_data(file)
            data = dp.data_process(data, 2)
            dp.insert_csv(data, to_file)
    else:
        print("csv文件生成失败!")
    return
def trainandshow():
    msg = '展示训练结果'
    s.send(msg.encode('utf-8'))
    print("send len is : [%d]" % len(msg))
    messagebox.showinfo(title='开始训练', message='开始训练')
    train_para.withdraw()
    global train_show
    train_show = Tk()
    train_show.geometry('400x450')
    train_show.title('SQuAD2.0问答系统--训练结果')
    title = Label(train_show, text="训练结果测评", font=("楷体", 20))
    title.place(x=130, y=50)
    data_use = data_process.data_process()
    model_para = server_gui.find_model()
    item = Label(train_show, text=("模型名称: " + model_para.name), font=("楷体", 12), bg="LightGreen")
    item.place(x=100, y=100)
    item1 = Label(train_show, text=("存储路径: " + model_para.output_dir), font=("楷体", 12), bg="LightGreen")
    item1.place(x=100, y=120)
    item2 = Label(train_show, text=("数据集名称: {num}".format(num=model_para.dataset_name)), font=("楷体", 12), bg="LightGreen")
    item2.place(x=100, y=140)
    item3 = Label(train_show, text=("batch_size: {num}".format(num=model_para.batch_size)), font=("楷体", 12), bg="LightGreen")
    item3.place(x=100, y=160)
    item3 = Label(train_show, text=("epoch: {num}".format(num=model_para.epoch)), font=("楷体", 12), bg="LightGreen")
    item3.place(x=100, y=180)
    item4 = Label(train_show, text=("lr: {num}".format(num=model_para.lr)), font=("楷体", 12), bg="LightGreen")
    item4.place(x=100, y=200)
    # experiment results
    result = data_use.read_result()
    res1 = Label(train_show, text=("EM: {num}".format(num=result["exact"])), font=("楷体", 12), bg="Wheat")
    res1.place(x=100, y=250)
    res2 = Label(train_show, text=("F1 score: {num}".format(num=result["f1"])), font=("楷体", 12), bg="Wheat")
    res2.place(x=100, y=270)
    res3 = Label(train_show, text=("HasAns_exact: {num}".format(num=result["HasAns_exact"])), font=("楷体", 12), bg="Wheat")
    res3.place(x=100, y=290)
    res4 = Label(train_show, text=("HasAns_f1: {num}".format(num=result["HasAns_f1"])), font=("楷体", 12), bg="Wheat")
    res4.place(x=100, y=310)
    res5 = Label(train_show, text=("NoAns_exact: {num}".format(num=result["NoAns_exact"])), font=("楷体", 12), bg="Wheat")
    res5.place(x=100, y=330)
    res6 = Label(train_show, text=("NoAns_f1: {num}".format(num=result["NoAns_f1"])), font=("楷体", 12), bg="Wheat")
    res6.place(x=100, y=350)
    exit_bt = Button(train_show, text='退出', command=sys_exit)
    exit_bt.place(x=350, y=400)
    train_show.mainloop()
def data_to_csv(file, to_file):
    if os.path.isdir(file):  # check whether this is a directory
        files = dp.read_path(file)
        for file in files:
            data = dp.create_data(file)
            for i in const.DATA_PART:
                to_files = to_file + i + '.csv'
                print(to_files)
                id = const.DATA_PART.index(i)
                datas = dp.data_process(data, id)
                dp.insert_csv(datas, to_files)
    else:
        print("创建csv文件失败!")
    return
def set_checkins(self, path, city):
    data_processor = data_process()
    data_processor.set_basic_info(path, city)
    data_processor.user_pairs()
    self.pairs_path = data_processor.pairs_path
    self.path = data_processor.path
    self.city = data_processor.city
    users = data_processor.checkins.uid.unique().tolist()
    users.sort(reverse=False)
    self.users = np.array(deepcopy(users))  # sort the user ids in ascending order
    del users
    gc.collect()
    self.checkins = data_processor.checkins
    print("城市用户数为:", len(self.users))
def train(file):
    train_data, train_label, vocab_size, output_size = data_process(file)
    # length, depth = np.shape(train_data)
    batch_size = 6
    # print(np.shape(train_data))
    batches = get_batches(train_data, train_label, batch_size)
    # data = np.reshape(train_data, [length//batch_size, batch_size, depth])  # data: (num_batches, B, T)
    # label = np.reshape(train_label, [length//batch_size, batch_size])       # label: (num_batches, B)
    max_epochs = 1000
    bilstm = BiLSTM(vocab_size, output_size)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    for epoch in range(max_epochs):
        batches = get_batches(train_data, train_label, batch_size)
        for data, label in batches:
            loss = bilstm.train(sess, data, label)
            # graph = bilstm.save_graph(sess)
            print(loss)
        if loss <= 1e-3:
            break
    # result = []
    y_pred, y_true = [], []
    test = get_batches(train_data, train_label, batch_size)
    for data, label in test:
        pred = bilstm.predict(sess, data)
        # result.extend(pred - label)
        y_pred.extend(pred)
        y_true.extend(label)
    # result = np.array(result)
    # result[result != 0] = 1
    # accuracy = 1 - np.sum(result)/length
    accuracy = accuracy_score(y_true, y_pred)
    print("the accuracy is %s" % accuracy)
    f1 = f1_score(y_true, y_pred, average='macro')
    print("macro-f1 score is %s" % f1)
    bilstm.save_graph(sess)
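# --- Hedged sketch (not from the original source): the training loop above relies
# on a get_batches helper that is not shown. A minimal batcher along these lines
# would match the way it is consumed, assuming train_data and train_label are
# array-likes of equal length; the name and behaviour are assumptions, not the
# original implementation.
import numpy as np

def get_batches(data, labels, batch_size, shuffle=True):
    """Yield (data, label) mini-batches; the last partial batch is dropped."""
    data, labels = np.asarray(data), np.asarray(labels)
    idx = np.arange(len(data))
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, len(data) - batch_size + 1, batch_size):
        batch = idx[start:start + batch_size]
        yield data[batch], labels[batch]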
def result_statistics(self, total_time):
    """
    Process the test results and write them to a log file.
    :param total_time: Program execution time
    :return: Average Requests Number Per Second
    """
    logger = init_logging(self.log_location, self.log_level)
    logger.info("===============================================")
    logger.info('URL: {}'.format(self.urls))
    logger.info('Total Requests Number: {}'.format(self.total_requests))
    logger.info('Concurrent Requests Number: {}'.format(self.concurrency))
    logger.info('Total Time Cost(seconds): {}'.format(total_time))
    logger.info('Average Time Per Request(seconds): {}'.format(total_time / self.total_requests))
    logger.info('Average Requests Number Per Second: {}'.format(self.total_requests / total_time))
    if hasattr(self, 'time_location'):
        try:
            result_dict = data_process(self.time_location)
            logger.info('Max Request Time(seconds): {}'.format(result_dict['max']))
            logger.info('Min Request Time(seconds): {}'.format(result_dict['min']))
            logger.info('Mean Request Time(seconds): {}'.format(result_dict['mean']))
            logger.info('Standard Deviation of Request Execution Time(seconds): {:5}'.format(result_dict['std']))
            logger.info('Execution Time for the First 25% of Request(seconds): {}'.format(result_dict['25.0%']))
            logger.info('Execution Time for the First 50% of Request(seconds): {}'.format(result_dict['50.0%']))
            logger.info('Execution Time for the First 75% of Request(seconds): {}'.format(result_dict['75.0%']))
        except UnboundLocalError:
            print('There is not a statistics result.')
    logger.info("===============================================")
    return self.total_requests / total_time
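# --- Hedged sketch (not from the original source): the data_process() call in
# result_statistics is expected to return summary statistics keyed exactly as
# logged above. One plausible implementation, assuming time_location is a text
# file holding one request duration (in seconds) per line:
import numpy as np

def data_process(time_location):
    with open(time_location) as f:
        times = np.array([float(line) for line in f if line.strip()])
    return {
        'max': times.max(),
        'min': times.min(),
        'mean': times.mean(),
        'std': times.std(),
        '25.0%': np.percentile(times, 25),
        '50.0%': np.percentile(times, 50),
        '75.0%': np.percentile(times, 75),
    }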
def set_checkins(self, path, city, n, m):
    data_processor = data_process()
    data_processor.set_basic_info(path, city)
    self.pairs_path = data_processor.pairs_path
    self.path = data_processor.path
    self.city = data_processor.city
    users = data_processor.checkins.uid.unique().tolist()
    users.sort(reverse=False)
    self.users = np.array(deepcopy(users))  # sort the user ids in ascending order
    del users
    gc.collect()
    self.m = m  # number of longitude grid cells
    self.n = n  # number of latitude grid cells
    grid_divider = grid_divide(
        data_processor.checkins.values.tolist(), self.n, self.m,
        [30.387953, -97.843911, 30.249935, -97.635460])
    self.checkins = grid_divider.divide_area_by_NN().ix[:, [0, 1, 2, 3, 4, 5]]  # check-ins with grid id
    self.is_resemble = []   # records pairs of similar users within a community
    self.clusterlist = []   # records community ids
    print("城市用户数为:", len(self.users))
import tensorflow as tf
from data_process import data_process

# Hyperparameters
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 15
decoding_embedding_size = 15

data_process = data_process()


def get_inputs():
    '''
    Model input tensors
    '''
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    # maximum length of the target sequence (target_sequence_length and
    # source_sequence_length are later passed in through feed_dict)
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    print("length==", target_sequence_length, " ", max_target_sequence_length, " ", source_sequence_length)
from data_process import data_process
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

adata, data_after_stop, lables = data_process()
data_tr, data_te, labels_tr, labels_te = train_test_split(adata, lables, test_size=0.2)

countVectorizer = CountVectorizer()
data_tr = countVectorizer.fit_transform(data_tr)
X_tr = TfidfTransformer().fit_transform(data_tr.toarray()).toarray()

data_te = CountVectorizer(vocabulary=countVectorizer.vocabulary_).fit_transform(data_te)
X_te = TfidfTransformer().fit_transform(data_te.toarray()).toarray()

model = GaussianNB()
model.fit(X_tr, labels_tr)
model.score(X_te, labels_te)
def cal_precision(testVec, label):
    precision = 0
    for i in range(len(testVec)):
        test_data = testVec[i]
        result = model.classify(myTree, featLabels, test_data)
        if result == label[i]:
            precision = precision + 1
    precision = precision / len(testVec)
    return precision


if __name__ == '__main__':
    # ---------------------- load and preprocess the data
    data = pd.read_csv("bank-additional-full-train.csv")
    bank_list = data.values.tolist()
    data_processor = process.data_process(bank_list=bank_list)
    # ---------------------- get the training and test samples
    data = data_processor.select(train_number, train_feature_number)
    # ---------------------- build the decision tree
    model = dt.decision_tree(select_data=data)
    featLabels = []
    myTree = model.createTree(dataSet=model.select_data[0],
                              labels=model.label(model.select_data[2], model.feature_list),
                              featLabels=featLabels)
    featLabels = featLabels[:train_feature_number]
    # print(myTree)
    createPlot(myTree)  # plot the decision tree
    # ---------------------- evaluate the decision tree
    testVec = data[3][:-1]
    label = [data[3][i][-1] for i in range(len(data[3]))]
import numpy as np
import pandas as pd
from tbats import TBATS
from data_process import data_process

if __name__ == "__main__":
    process_data = data_process()
    tbats_res = []
    p_pv = []

    temp = np.array(process_data[0])[0]
    estimator = TBATS(seasonal_periods=[7])
    fitted_model = estimator.fit(temp)
    y_1 = fitted_model.forecast(steps=7)

    temp = np.array(process_data[1])[0]
    estimator = TBATS(seasonal_periods=[7])
    fitted_model = estimator.fit(temp)
    y_2 = fitted_model.forecast(steps=7)

    for i in range(5):
        p_pv.append(0.65 * y_1[i] + 0.35 * y_2[i])
    p_6 = (temp[-2] + temp[-9]) * 0.5
    p_7 = (temp[-1] + temp[-8]) * 0.5
    p_pv.append(p_6)
    p_pv.append(p_7)
    tbats_res.append(p_pv)

    p_uv = []
    temp = np.array(process_data[2])[0]
    estimator = TBATS(seasonal_periods=[7])
                           str(BATCH_SIZE), str(EPOCHS), HISTORY_FORMAT)


# save the model and the training history
def save_model(model, history):
    if not gfile.Exists(MODEL_DIR):
        gfile.MakeDirs(MODEL_DIR)
    model.save(MODEL_FILE)
    if not gfile.Exists(HISTORY_DIR):
        gfile.MakeDirs(HISTORY_DIR)
    with open(HISTORY_FILE, 'wb') as f:
        pickle.dump(history.history, f)


if __name__ == "__main__":
    # input1 is the un-embedded data, input2 the embedded data
    input1, input2, Y_train = data_process()
    model = model(LOSS, OPT, dropout_ALPHA)
    history = model.fit([input1, input2], Y_train,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        verbose=2,
                        validation_split=0.01)
    save_model(model, history)
# ip = "10.1.114.125"
port = 6000
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
failed_count = 0
while True:
    try:
        print("start connect to server ")
        s.connect((ip, port))
        break
    except socket.error:
        failed_count += 1
        print("fail to connect to server %d times" % failed_count)
        if failed_count == 100:
            sys.exit(-11)

read_data = data_process.data_process()
info, data_ans, data_pred = read_data.read_json()

login = Tk()
# main_gui = Tk()
login.title('登录界面')
# set the window size
login.geometry('210x200')
# build the GUI login form
Label(login, text="用户登录").grid(row=0, column=0, columnspan=2)
Label(login, text="用户名").grid(row=1, column=0)
name = Entry(login)
name.grid(row=1, column=1)
Label(login, text="密码:").grid(row=2, column=0)
passWord = Entry(login, show='●')
import numpy as np
import pandas as pd
from tbats import TBATS
from params import params_0
from linear_model import model_fit_predict
from data_process import data_process

if __name__ == "__main__":
    # data preprocessing
    data_process()
    data_dir = ['p_pv', 'p_uv', 'r_pv', 'r_uv']

    # global linear model
    linear_res = []
    for idx, name in enumerate(data_dir):
        y_forecasted = model_fit_predict(name)
        linear_res.append(y_forecasted)
    linear_res = pd.DataFrame(list(linear_res)).T
    linear_res.columns = ['p_pv', 'p_uv', 'r_pv', 'r_uv']

    # classical decomposition model
    tbats_res = []
    for idx, name in enumerate(data_dir):
        data = pd.read_csv('processed_data/' + name + '.csv')[[name]].T
        print(idx)
        data = np.array(data)[0]
        estimator = TBATS(seasonal_periods=params_0['seasonal_periods'][idx])
        fitted_model = estimator.fit(data)
        y_forecasted = fitted_model.forecast(steps=7)
        y_forecasted = [x * params_0['after_rate'][idx] for x in y_forecasted]
#########
# The main loop
#########

# The sensor reads the fingerprint in the desired format
# and stores it in a file whose name is returned
print "Getting fingerprint..."
file = sensor.get("LES")
print "Fingerprint received"

# The data is processed differently for enrollment and query
if len(argv) > 2 and 'e' not in argv[1]:
    to_send = data_process.data_process(file)
else:
    to_send = data_process.data_process(file, True)

i = 0
# For every SMS to send
while i < len(to_send):
    print i + 1
    resend = False
    # Clear previous I/O stream
    tty.flush()
# Author: jingyile
# Date: 2020/4/3 2:10 p.m.
# Description: dataset splitting

from data_process import data_process
from random import sample

data = data_process()  # load the cleaned dataset


def trainTestSplit(data=data, n=1):
    data['realIP'] = data['realIP'].apply(str)  # cast the IP addresses to strings
    ipCount = data['realIP'].value_counts()     # count page views per user
    reaIP = ipCount[ipCount > n].index          # users with more than n page views
    ipTrain = sample(list(reaIP), int(len(reaIP) * 0.8))   # training-set users
    ipTest = [i for i in list(reaIP) if i not in ipTrain]  # test-set users
    index_tr = [i in ipTrain for i in data['realIP']]  # browsing-record index of training users
    index_te = [i in ipTest for i in data['realIP']]   # browsing-record index of test users
    dataTrain = data[index_tr]  # training data
    dataTest = data[index_te]   # test data
    return dataTrain, dataTest
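# Example call (mirroring the usage later in this collection): keep only users
# with more than three recorded page views and split them 80/20 by user.
dataTrain, dataTest = trainTestSplit(n=3)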
#########
# Use try-except to capture and terminate on 'Ctrl-C' signal
try:
    while True:
        # The sensor reads the fingerprint in the desired format
        # and stores it in a file whose name is returned
        print "Getting fingerprint..."
        file = sensor.get(form)
        print "Fingerprint received"

        # The data is processed according to the format
        to_send = data_process.data_process(file, form, method)
        to_send = to_send.split(' ')
        for i in to_send:
            while not send(i):
                continue
        # Send the data
        # data_transmit.send(tty, to_send, form)
        break
except KeyboardInterrupt:
    pass
def sendData_to_trama(DeviceId, DeviceSequence, Mac_slave):
    print("\n>>>>>>>>>>>>>>>>>>>sendData_to_trama!!!!\n")
    # time.sleep(5)
    trama = ""
    Mac_master = "24f7b5e350cc"  # MAC address of the MASTER; change it if the master has a different MAC address
    TramaType = "0"  # 0 - configuration, 1 - broadcast
    MasterId = DeviceId[0][0:5]
    SlaveId = DeviceId[0][5:8]
    print("MasterId: ", MasterId)
    print("SlaveId: ", SlaveId)

    # Read the DEVICE configuration values from the JSON file
    with open('DeviceConfiguration.json', 'r') as data_device:
        Device_data = json.load(data_device)
        SIZE_OFF_DEVICE_DATA = len(Device_data['DeviceConfiguration'])

    # Same for the SLAVE configurations
    with open('SlaveConfiguration.json', 'r') as data_slave:
        Slave_data = json.load(data_slave)
        SIZE_OFF_SLAVE_DATA = len(Slave_data['SlaveConfiguration'])

    # Get the ActuationParameter for the SLAVE in question:
    ActuationParameter = []
    for x in range(0, SIZE_OFF_SLAVE_DATA):
        if SlaveId == Slave_data['SlaveConfiguration'][x]['SlaveId'] and \
                MasterId == Slave_data['SlaveConfiguration'][x]['MasterId']:
            ActuationParameter.append(Slave_data['SlaveConfiguration'][x]['ActuationParameter'])
    str_actuation = str(ActuationParameter[-1])

    SlopeConvertion = []
    InterceptConvertion = []
    DeviceType = []
    ReadType = []
    trama = ""
    for i in range(0, len(DeviceId)):
        # print("DeviceID: ", DeviceId[i])
        Slope = []
        Intercept = []
        Device_type = []
        Read_type = []
        # Load every DEVICE configuration
        for x in range(0, SIZE_OFF_DEVICE_DATA):
            # Only use the most recent configurations:
            if DeviceId[i] == Device_data['DeviceConfiguration'][x]['DeviceId']:
                Slope.append(Device_data['DeviceConfiguration'][x]['SlopeConvertion'])
                Intercept.append(Device_data['DeviceConfiguration'][x]['InterceptConvertion'])
                Device_type.append(Device_data['DeviceConfiguration'][x]['DeviceType'])
                Read_type.append(Device_data['DeviceConfiguration'][x]['ReadType'])
        # Take the last position of each array, i.e. the most recent data
        if len(Slope) != 0:
            SlopeConvertion.append(Slope[-1])
        if len(Intercept) != 0:
            InterceptConvertion.append(Intercept[-1])
        if len(Device_type) != 0:
            DeviceType.append(Device_type[-1])
        if len(Read_type) != 0:
            ReadType.append(Read_type[-1])
    # print(SlopeConvertion)
    # print(InterceptConvertion)
    # print(DeviceType)
    # print(ReadType)

    # Concatenate the array values into strings, i.e. convert the parameters into the TRAMA (frame):
    str_Slope = ""
    str_Inter = ""
    str_DeviceType = ""
    str_ReadType = ""
    for j in range(0, len(SlopeConvertion)):
        str_Slope = str_Slope + SlopeConvertion[j]
    for j in range(0, len(InterceptConvertion)):
        str_Inter = str_Inter + InterceptConvertion[j]
    for j in range(0, len(DeviceType)):
        str_DeviceType = str_DeviceType + DeviceType[j]
    str_DeviceType = str_DeviceType.replace(" ", "")
    for j in range(0, len(ReadType)):
        str_ReadType = str_ReadType + ReadType[j]
    # print(str_DeviceType)

    # Zero-pad each field to its defined length:
    if len(DeviceSequence) != 60:
        DeviceSequence = zerofill(DeviceSequence, 60)
    if len(str_ReadType) != 15:
        str_ReadType = zerofill(str_ReadType, 15)
    if len(str_DeviceType) != 45:
        str_DeviceType = zerofill(str_DeviceType, 45)
    if len(str_Slope) != 150:
        str_Slope = zerofill(str_Slope, 150)
    if len(str_Inter) != 150:
        str_Inter = zerofill(str_Inter, 150)
    str_rulle = str_actuation
    if len(str_rulle) != 20:
        str_rulle = zerofill(str_rulle, 20)

    # configuration frame
    print("rulle: ", str_rulle)
    trama = Mac_slave + TramaType + MasterId + SlaveId + DeviceSequence + str_ReadType + \
        str_DeviceType + str_Slope + str_Inter + str_rulle
    print(len(trama), trama)
    nowd = datetime.now()
    nowd = nowd.strftime("%d/%m/%Y %H:%M:%S")
    print(">>>> READING SLAVES....", nowd)

    # Send the TRAMA to the MASTER microcontroller:
    # serial communication with the microcontroller, then receive whatever it has to send back
    response = TradeAntenna(trama)
    str_response = str(response)
    print(str_response)
    recv_trama = ""
    for x in range(0, len(response)):
        recv_trama = response[x]
    print("recv_trama:", recv_trama)
    print(len(str_response))
    if len(recv_trama) == 297:
        # Process all the information received from the SLAVE, raise alarms, broadcast,
        # and write the information to the DataSeries.json file
        data_process(recv_trama)
    else:
        print("SLAVE NOT RESPONDING -> SIZE: ", len(recv_trama))
    print("\nwaiting for next slave<<<<<<<<<<<<<\n")
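# --- Hedged sketch (not from the original source): sendData_to_trama pads every
# frame field with zeros via a zerofill helper that is not shown. A minimal
# version could look like this; whether the protocol pads on the left or the
# right is an assumption.
def zerofill(value, length):
    # Assumption: pad on the right with '0' up to the fixed field length
    # (use str(value).rjust(length, "0") instead if the frame expects left padding).
    return str(value).ljust(length, "0")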
parser.add_argument("--do_predict", type=ast.literal_eval, default=True, help="do predict")
parser.add_argument("--do_model", type=str, default="trigger", choices=["trigger", "role"], help="trigger or role")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest sequence.")
parser.add_argument("--eval_step", type=int, default=200, help="eval step")
parser.add_argument("--model_save_step", type=int, default=3000, help="model save step")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--add_crf", type=ast.literal_eval, default=True, help="add crf")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args()
# yapf: enable.

# preprocess the data first and save it to disk
train_data = data_process(args.train_data, args.do_model)  # process the training data
dev_data = data_process(args.dev_data, args.do_model)      # process the dev data
test_data = data_process(args.test_data, args.do_model)
predict_sents, predict_data = data_process(args.predict_data, args.do_model, is_predict=True)
write_by_lines("{}/{}_train.tsv".format(args.data_dir, args.do_model), train_data)
write_by_lines("{}/{}_dev.tsv".format(args.data_dir, args.do_model), dev_data)
write_by_lines("{}/{}_test.tsv".format(args.data_dir, args.do_model), test_data)
write_by_lines("{}/{}_predict.tsv".format(args.data_dir, args.do_model), predict_data)
schema_labels = schema_process(args.schema_path, args.do_model)
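# --- Hedged sketch (not from the original source): write_by_lines is used above
# to persist the processed splits. A plausible implementation simply writes one
# record per line in UTF-8; the exact original behaviour is an assumption.
def write_by_lines(path, data):
    with open(path, "w", encoding="utf-8") as outfile:
        for line in data:
            outfile.write(line + "\n")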
import requests
import numpy as np
import json
import csv
import time
import urllib3
import tensorflow as tf
from data_process import data_process
import boxx
import pickle

'''This predictor is a binary predictor and can predict which team will win the
match; its shortcoming is that it cannot give a win-rate prediction.'''

data_process_1 = data_process()

# create dicts
heros_id = np.arange(115)
heroes_info_dict = data_process_1.get_hero_data()
id_list = []
for i in range(115):
    id_list.append(int(heroes_info_dict[i]['id']))
id_dict = dict(zip(id_list, heros_id))
print(id_dict)

# read match data
heros_data, results_data = data_process_1.process_data('data_3.csv')
results_data = np.reshape(results_data, [40000, 2])

# map the heros data to a binary matrix
heros_features = data_process_1.map_heros_data_matrix(heros_data, id_dict)
print(heros_features)
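# --- Hedged sketch (not from the original source): map_heros_data_matrix() is
# assumed to one-hot encode each match's hero picks into a binary row, e.g. the
# first 115 columns for one team and the next 115 for the other. The
# (team_a_ids, team_b_ids) layout of heros_data is an assumption.
import numpy as np

def map_heros_data_matrix(heros_data, id_dict, n_heroes=115):
    features = np.zeros((len(heros_data), 2 * n_heroes), dtype=np.int8)
    for row, (team_a_ids, team_b_ids) in enumerate(heros_data):
        for hid in team_a_ids:
            features[row, id_dict[int(hid)]] = 1
        for hid in team_b_ids:
            features[row, n_heroes + id_dict[int(hid)]] = 1
    return features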
def typhoon_history():
    return jsonify(data_process(url))
        checkpoint, directory=config.configs_checkpoint()['directory'],
        max_to_keep=config.configs_checkpoint()['max_to_keep'])
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
    # iterative training
    for epoch in range(1, epochs + 1):
        start = time.time()
        loss = train_sample(inputs, labels, label_length, optimizer, model)
        end = time.time()
        te = end - start
        # save a checkpoint and print the loss every 5 epochs
        if epoch % 5 == 0:
            manager.save()
            print("Epoch %d/%d" % (epoch, epochs))
            print("%d/%d [==============================] - %ds %dms/step - loss: %.4f"
                  % (inputs.shape[0], inputs.shape[0], te, te * 1000 / inputs.shape[0], loss))


if __name__ == "__main__":
    model = DS2()
    epochs = config.configs_train()["train_epochs"]
    data_path = config.configs_train()["data_path"]
    batch_size = config.configs_train()["batch_size"]
    inputs, labels, label_length = data_process(data_path=data_path,
                                                batch_size=batch_size,
                                                if_train_or_test='train')
    optimizer = tf.keras.optimizers.Adam()
    train(model, optimizer, inputs, labels, label_length, epochs)
import sys
import os
import data_process

if __name__ == "__main__":
    filepath = "/Users/mengqwang/Documents/tMall/clean_t_alibaba_data.csv"
    ratio0 = int(sys.argv[1])
    ratio1 = int(sys.argv[2])
    ratio2 = int(sys.argv[3])
    ratio3 = int(sys.argv[4])
    ratio4 = int(sys.argv[5])
    dp = data_process.data_process(filepath, ratio0, ratio1, ratio2, ratio3, ratio4)
    dp.action2rate()
    dp.dataOutput()
def classify(inputTree, featLabels, testVec):
    firstStr = next(iter(inputTree))   # root node of the decision tree
    secondDict = inputTree[firstStr]   # next dictionary level
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


if __name__ == '__main__':
    data = pd.read_csv("bank-additional-full.csv")
    bank_list = data.values.tolist()
    data_processor = process.data_process(bank_list=bank_list)
    data = data_processor.select(10000, 2)
    dataSet, labels = data[0], label(data[2], feature_list)
    featLabels = []
    myTree = createTree(dataSet, labels, featLabels)
    print(myTree)
    testVec = data[3][:-1]
    label = [data[3][i][-1] for i in range(len(data[3]))]
    precision = 0
    for i in range(len(testVec)):
        result = classify(myTree, featLabels, testVec[i])
        if result == label[i]:
            print('a')
            precision = precision + 1
    precision = precision / len(testVec)
# te.to_excel('./te.xlsx')

# === build the item similarity matrix ======
cor = jaccard(te)  # Jaccard similarity coefficient
cor = pd.DataFrame(cor, index=urlTrain, columns=urlTrain)
# cor.to_excel('./cor.xlsx')

# build a browsing dictionary for test-set users
ipTest = list(set(data_te['realIP']))
dic_te = {ip: list(data_te.loc[data_te['realIP'] == ip, 'fullURL']) for ip in ipTest}

rem = pd.DataFrame(index=range(len(data_te)),
                   columns=['IP', 'url', 'rec1', 'rec2', 'rec3', 'rec4', 'rec5',
                            'recall', 'precision', 'R', 'anum'])
rem['IP'] = list(data_te['realIP'])
rem['url'] = list(data_te['fullURL'])

index = data_process()['fullURL'].value_counts()
for i in rem.index:
    rnum = 0                              # number of recommended pages the user is genuinely interested in
    anum = len(dic_te[rem.loc[i, 'IP']])  # number of pages the user actually visited, i.e. is actually interested in
    rem.loc[i, 'anum'] = anum
    if rem.loc[i, 'url'] in urlTrain:
        rem.loc[i, 'R'] = 1
        index1 = cor.loc[rem.loc[i, 'url']].argmax()
        rem.loc[i, 'rec1'] = cor.index[index1]  # recommended URL 1
        index2 = cor.loc[rem.loc[i, 'url']].argmax() - 1
        rem.loc[i, 'rec2'] = cor.index[index2]  # recommended URL 2
        index3 = cor.loc[rem.loc[i, 'url']].argmax() - 2
        rem.loc[i, 'rec3'] = cor.index[index3]  # recommended URL 3
        index4 = cor.loc[rem.loc[i, 'url']].argmax() - 3
        rem.loc[i, 'rec4'] = cor.index[index4]  # recommended URL 4
        index5 = cor.loc[rem.loc[i, 'url']].argmax() - 4
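# --- Hedged sketch (not from the original source): the jaccard() call above is
# assumed to take a binary user-by-item matrix te and return an item-by-item
# Jaccard similarity matrix (which is then labelled with urlTrain).
import numpy as np

def jaccard(te):
    m = (np.asarray(te) > 0).astype(int)  # users x items, binarised
    inter = m.T @ m                       # co-visit counts for every item pair
    col_sums = m.sum(axis=0)
    union = col_sums[:, None] + col_sums[None, :] - inter
    return inter / np.maximum(union, 1)   # |A ∩ B| / |A ∪ B|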
file_utils.mkdir_path(log_foloder_name)
log_utils.log_config(log_foloder_name + LOG_NAME)

# 1. initialise the basic runtime configuration
config.init_config(root_path, data_type, get_type)
config_dict = config.get_config(root_path, data_type, curr_date)

# 2. start the data collection
run_result = data_list_collection.data_collection(config_dict)
if run_result < 1:
    logging.error("[1]执行不成功,终止程序运行:%s" % (run_result))
    # clear today's folder
    data_list_folder_name = config_dict["data_list_folder_name"]
    if file_utils.clear_folder(data_list_folder_name):
        logging.info("清空文件夹文件:%s" % (data_list_folder_name))
    logging.error("今天[%s]数据采集不成功,请重新运行采集程序!" % (curr_date))
    sys.exit(0)

# 3. data analysis
data_process.data_process(config_dict)

# 4. collect the newly added data
run_result = data_collection_get_new_data.get_new_data(config_dict)
if run_result < 1:
    logging.error("[2]执行不成功,终止程序运行:%s" % (run_result))
    sys.exit(0)

# 5. check the collected data
check_data.check_data(curr_date)
def main():
    filename = 'Eur-Lex'
    path = '../' + filename + '/' + filename

    # small checks
    # al = min_max_variance_active_learner(path)
    # al.active_select()
    # return
    """with open(path+'.seed.0','r') as fp:
        line=fp.readline()
        print len(line.strip().split(' '))
    return
    l=learner(path)
    l.read_sparse_data(path+'.train.norm',nfeatures=1837)
    sparse_list_data=[1,2,3,4,5,6,7,8]
    sparse_list_row=[0,1,2,3,4,5,6,7]
    sparse_list_col=[0,1,0,1,0,1,0,1]
    Q=csr_matrix((np.array(sparse_list_data),(np.array(sparse_list_row),np.array(sparse_list_col))),shape=(8,2))
    print Q.todense()
    Q=normalize(Q, axis=0, norm='l2')
    print Q.todense()
    return
    """

    # change tradeoff function calls
    if len(sys.argv) > 1:
        input_text = sys.argv[1]
        if input_text in ['tradeoff', 'check']:
            input_method = sys.argv[2]
    else:
        input_text = 'generate_seed'  # 'tradeoff'  # 'random'  # 'check_minmax_l2distance'  # 'preprocess'
        input_method = 'random'       # 'multiple_seed'  # 'maxquery_vary_delta'

    # **********************
    # ***Tradeoff Section***
    # **********************
    if input_text in 'tradeoff':
        seed_file_idx = ['0']  # [str(i) for i in range(5)]
        seed_files = [path + '.seed.' + idx for idx in seed_file_idx]
        max_iter = 2
        # **********************************
        if input_method in 'max_query_vary_delta':
            set_of_delta = list(np.arange(.1, .2, .01))
            for fseed in seed_files:
                for delta in set_of_delta:
                    active_learner_general(max_query_active_learner(path, delta=delta),
                                           fseed,
                                           fsave_ext='.delta_' + str(delta),
                                           max_iter=max_iter)
            return
        # *********************************
        if input_method in 'minmax':
            al = min_max_inner_prod(path)
        if input_method in 'minsum':
            al = min_sum_inner_prod(path)
        if input_method in 'minmin':
            al = min_min_inner_prod(path)
        if input_method in 'maxquery':
            delta = .1
            save_fig_interval = 50
            al = max_query_active_learner(path, delta=delta, save_fig_interval=save_fig_interval)
        if input_method in 'minmaxvar':
            al = min_max_variance_active_learner(path, count=5)
        if input_method in 'random':
            al = random_active_learner(path)
        run_tradeoff_exp(al, seed_files, max_iter=max_iter)

    # **********************************************************
    # ****************CHECK*************************************
    # **********************************************************
    if input_text in 'check':
        if input_method in 'tuning':
            l = learner(path)
            l.tune()
        if input_method in 'popular_n_rare':
            l = learner(' ')
            acc = l.train_test()
            with open('acc', 'a') as fp:
                fp.write(' '.join([str(acc_val) for acc_val in acc]) + '\n')
            # d=data_process()
            # d.get_seed(frac=.5,ftrain='train')
            # d.get_label()
            # l.read_out_file()
            # l.decide_popular_n_rare_labels()
            # l.get_acc()
            return
        """
        X,Y,Q=generate_sparse_data(nsamples=5,nlabels=2,nfeatures=2,density=.75)
        al=min_max_inner_prod(path)
        al.Xtrain=X
        al.Ytrain=Y
        al.Q=Q
        for i in range(5):
            al.active_select()
        return"""
        if input_method in 'min_max_l2distance':
            check_min_max_dist_outer(path)
        if input_method in 'ball_tree':
            X, Y, Q = generate_sparse_data(nsamples=8, nlabels=2, nfeatures=2, density=.75)
            al = min_max_l2distance_lower(path, leaf_size=2)
            al.Xtrain = X
            al.Ytrain = Y
            al.Q = Q
            al.create_ball_tree()
            al.brute_force_l2_norm(X, np.array(range(X.shape[0])))
            print('************\n\ndoing active select\n\n *******************')
            al.active_select()
            # al.show_ball_tree_n_points()
        if input_method in 'inner_prod':  # checking is left
            k = 1
            l = learner(path)
            l.read_metadata()
            # change ftrain to ftrain_src
            l.ftrain = l.ftrain_src
            # run train and test: create Q, create out file, decide a rank value
            l.train_test(k=k, acc_method='popular_n_rare')  # define rank, default k=1
            # read test file
            l.Ytest, l.Xtest = l.read_sparse_data(l.ftest, l.nfeatures)
            # compute out file values
            pred_computed = l.compute_k_scores(l.Xtest, l.Q, k=k)
            # change the output if required later
            Ytest_comp = []
            score_comp = []
            for sample_score in pred_computed:
                Ytest_comp.append(sample_score[0][0])
                score_comp.append(sample_score[0][1])
            # compare and report the result
            Ytest_pred, score_pred = l.read_out_file()
            for label1, label2, score1, score2 in zip(Ytest_comp, Ytest_pred, score_comp, score_pred):
                print('label1:' + str(label1) + ',label2:' + str(label2) +
                      ',score1:' + str(score1) + ',score2:' + str(score2) + '\n')

    # **********************************************************
    # ****************PLOT**************************************
    # **********************************************************
    if input_text in 'plot':
        if input_method in 'active_score_per_iter':
            path = '../results/exp7rep/bibtex.maxquery.count.'
            nfiles = 40
            plot_active_score_per_iter(path, nfiles)
        if input_method in 'samples per labels':
            l = learner(path)
            l.read_metadata()
            Ytrain = l.read_sparse_data(l.ftrain_src, l.nfeatures)[0]
            l.find_samples_per_label(l.nlabels, Ytrain)
        if input_method in 'single_seed':
            plot_single_seed()
        if input_method in 'multiple_seed':
            seed_set = [str(i) for i in range(5)]
            # seed_set=['']
            path = '../results/testing/bibtex.acc.'
            method_list = ['minmaxdev', 'random']
            dp = data_process()
            dp.readfiles_outer(path, method_list, seed_set, num_of_acc=2, fsave=path)  # modify
        if input_method in 'ad-hoc':
            path = '../results/exp5/e/bibtex.acc.'
            method_list = ['maxquery', 'random']
            file_list = [path + m for m in method_list]
            plot_your_choice(2, file_list, method_list)

    # **********************************************************
    # ****************OTHER*************************************
    # **********************************************************
    if input_text in 'preprocess':
        l = learner(path)
        list_of_files = [path + ext for ext in ['.train', '.test', '.heldout']]
        l.read_metadata()
        l.read_normalize_write_sparse_files(list_of_files, l.nfeatures)

    if input_text in 'min_max_l2distance':
        print('minmax l2 distance')
        leaf_size = 20
        bound = 'lower'
        minmaxl2 = min_max_l2distance(path, leaf_size, bound)
        fseed = path + '.seed.0'
        active_learner_general(minmaxl2, fseed)
        # active_learner_general(random_al, frac_seed)

    if input_text in 'generate_seed':
        # print 'seed generation'
        start_idx = 0
        nseed_required = 1  # 10
        gen_seed = data_process()
        frac_seed = .1
        ftrain = path + '.train.norm'
        for i in range(nseed_required):
            fseed = path + '.seed.' + str(i + start_idx)
            # print fseed
            # list_of_labels, nlabels = gen_seed.get_label(ftrain)
            seeds = gen_seed.get_seed(frac_seed, ftrain)  # how to get nlabels
            # print len(seeds)
            # with open(fseed, 'w') as fp:
            #     fp.write(' '.join(str(s) for s in seeds))
            # seed_generator.get_seed(frac_seed, fseed)

    if input_text in 'changing_seed':  # incomplete
        frac_l = 0
        frac_u = 0
        frac_diff = 0
        gen_seed = data_process()
        for frac in range(frac_l, frac_u, frac_diff):
            seeds = gen_seed.get_seed(frac_seed, ftrain)
            with open(fseed, 'w') as fp:
                fp.write(' '.join(str(s) for s in seeds))
from data_process import data_process
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

vectorizer = CountVectorizer()    # count-vectoriser
transformer = TfidfTransformer()  # tf-idf transformer

adata, labels, data_after_stop = data_process(number_p=10000, number_n=10000)  # load the data
data_tr, data_te, labels_tr, labels_te = \
    train_test_split(adata, labels, test_size=0.2)  # split the data

word_vec_tr = vectorizer.fit_transform(data_tr)  # process the training samples
tfidf_tr = transformer.fit_transform(word_vec_tr)
word_vec_te = CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(data_te)
tfidf_te = transformer.fit_transform(word_vec_te)  # process the test samples

model = MultinomialNB().fit(tfidf_tr, labels_tr)  # train the model
pre = model.predict(tfidf_te)
sum(labels_te == pre) / len(labels_te)
classification_report(labels_te, pre)
confusion_matrix(labels_te, pre)
# prediction
def predict(test, pre_model):
    predict_result = pre_model.predict(test)
    predict_result = list(predict_result.reshape(len(predict_result), ))
    # per the competition requirements, labels must be 0 or 1
    # result = []
    # for i in predict_result:
    #     if i < 0.5:
    #         i = 0
    #     else:
    #         i = 1
    #     result.append(i)
    return predict_result


# save as csv
def save_csv(sid, result):
    result_table = pd.DataFrame({"sid": sid, "label": result})
    result_table.to_csv("predict.csv", index=False)


if __name__ == "__main__":
    pre_model = load_model()
    input1, input2, sid = data_process(train=False)
    result = predict([input1, input2], pre_model)
    save_csv(sid, result)
from data_process import data_process

data_tr, data_te = trainTestSplit(n=3)

# === get the training-set users' IPs and browsed URLs ======
ipTrain = list(set(data_tr['realIP']))
urlTrain = list(set(data_tr['fullURL']))

# build a browsing dictionary for test-set users
ipTest = list(set(data_te['realIP']))
dic_te = {
    ip: list(data_te.loc[data_te['realIP'] == ip, 'fullURL'])
    for ip in ipTest
}

index = data_process()
index2 = index['fullURL'].value_counts()  # popular URLs, a <class 'pandas.core.series.Series'>

rem = pd.DataFrame(index=range(len(data_te)),
                   columns=['IP', 'url', 'rec1', 'rec2', 'rec3', 'rec4', 'rec5',
                            'recall', 'precision'])
rem['IP'] = list(data_te['realIP'])
rem['url'] = list(data_te['fullURL'])
for i in rem.index:
    index3 = index2  # the current user must not interfere with the other users
    rnum = 0                              # number of recommended pages the user is genuinely interested in
    anum = len(dic_te[rem.loc[i, 'IP']])  # number of pages the user actually visited, i.e. is actually interested in
    if rem.loc[i, 'url'] in index3:  # check whether the current URL is in the candidate set
        # del index3[rem.loc[i, 'url']]  # deleting here would also remove the entry from index2...
#########
# Use try-except to capture and terminate on 'Ctrl-C' signal
try:
    while True:
        # The sensor reads the fingerprint in the desired format
        # and stores it in a file whose name is returned
        print "Getting fingerprint..."
        file = sensor.get(form)
        print "Fingerprint received"

        # The data is processed according to the format
        to_send = data_process.data_process(file, form, method)
        to_send = to_send.split(" ")
        for i in to_send:
            while not send(i):
                continue
        # Send the data
        # data_transmit.send(tty, to_send, form)
        break
except KeyboardInterrupt:
    pass