def do_persistence(seed, id_set):
    """
    Persist the crawled records.
    :param seed: the record (dict) to persist
    :param id_set: set of record ids that have already been persisted
    """
    data_list = seed.get('data')
    for each in data_list:
        if each[0] not in id_set:
            # Build the output fields first
            ctx = []
            ctx.append(seed.get('brand_id'))
            ctx.append(seed.get('brand'))
            ctx.append(seed.get('serise_id'))
            ctx.append(seed.get('serise'))
            ctx.append(seed.get('p_type'))
            ctx.extend(each)
            ctx.append(seed.get('date'))
            ctx.append(datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
            ctx.append(str(seed.get('epoh')))
            # Write the record to the local data file
            text = '\u0001'.join(ctx)
            write_2_file(data_file, text)
            # Write the record to HDFS
            append_2_hdfs(text)
            # Record the new token (id) so it is not persisted again
            write_2_file(token, each[0])
            del ctx
        else:
            print('Record id already exists')
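# Illustrative sketch only: the shape of `seed` assumed by do_persistence, inferred
# from the fields read above. The values are made up, not real data:
# example_seed = {
#     'brand_id': '33', 'brand': 'Audi',
#     'serise_id': '692', 'serise': 'A4L',
#     'p_type': 'new', 'date': '2018-06-01', 'epoh': 1,
#     'data': [['10001', 'field_a', 'field_b']],  # each record starts with its unique id
# }
# do_persistence(example_seed, id_set=set())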
def lets_fuck_recycle(self, data):
    """Re-crawl records whose first fetch failed (recycle the leftovers)."""
    info = []
    recycle = []
    info_file = config.info_file.get(self.name)
    url_info = config.url_info_dict.get(self.name)
    headers_info = config.headers_info
    headers_info.update(config.headers_info_dict.get(self.name))
    api = RequestAPI()
    for each in data:
        params_info = self.construct_params_info(each[-1])
        json_text = api.receive_and_request(url=url_info, headers=headers_info,
                                            params=params_info, method='GET')
        # Verify that the response actually contains data
        js_dict = self.verify_json_text(json_text)
        if js_dict is not None:
            # Collect the data for the caller
            info.append(js_dict)
            # Persist the data
            write_2_file(info_file, dumps_json(js_dict))
        else:
            self.feedback()
            self.pop_captcha_info()
            recycle.append(each)
        del params_info
    return info, recycle
def fire(self, data):
    self.loggings.log('Fire')
    rec_array = []
    new_msgs_dict = self.platform.get_new_msgs()
    meta = self.default_meta()
    meta['platform'] = self.platform.class_name
    for convo_name in new_msgs_dict:
        '''
        Expect these types of data (aka medium) to record:
            - text
            - img
            - meta
        For GroupMe_web:
            - Images are expected when there is an empty line with perhaps just a space character.
            - Emoticons don't appear at all; just the username, and its avatar precedes it.
            - Message sent times are grouped and always say AM or PM. It assumes the closest
              specified day of the week or it uses the actual date. ALL caps.
            - A line is a message if it is preceded by a username.
                TUE, 4:15 PM
                Avatar
                User 1 Name
                testing message 2, blah
        '''
        # Iterate the array of new messages from oldest to newest
        meta['convo_name'] = convo_name
        for line in new_msgs_dict[convo_name]:
            if self.platform.is_timestamp(line):
                meta['sent_time'] = self.platform.parse_timestamp(line)
                continue
            if self.platform.is_username(line):
                meta['sender'] = self.alias_dict[line]
                continue
            if line == 'Avatar':
                continue
            rec_time = utils.get_str_time()
            rec_time_str = utils.datetime2str(rec_time)
            meta['rec_time'] = rec_time_str.split('-')
            if line == ' ':
                meta['medium'] = 'img'
                # TODO: handle image capture
            else:
                meta['medium'] = 'txt'
                f = utils.mktxt('../database/files/', rec_time_str + '-raw_text.txt', log=0)
                utils.write_2_file(f, line, no_time=True)
            utils.pickle_sto('../database/files/', rec_time_str + '-meta.pkl', meta)
            self.loggings.log(str(meta))
            rec_array.append(rec_time)
    self.skull.publish(self.name, rec_array)
def parase_html(self, html):
    info = []
    total_num = 1
    js_dict = loads_json(html)
    if js_dict is not None:
        data = js_dict.get('data')
        if isinstance(data, list) and data != []:
            for i in data:
                for each in i.get('result'):
                    info.append(each)
                    # Persist each result record
                    write_2_file(self.baidu_list, dumps_json(each))
                # Also need to check whether there is a next page
                total_num = i.get('dispNum', 1)
    return info, total_num
def insert_seed_save(url, brand, serise, city, date, brand_id, serise_id, p_type):
    """Inject a seed and save it to the seed file."""
    seed = deepcopy(seed_demo)
    seed.update({'url': url,
                 'brand_id': brand_id,
                 'brand': brand,
                 'serise_id': serise_id,
                 'serise': serise,
                 'check_city': city,
                 'date': date,
                 'p_type': p_type})
    write_2_file(seed_file, dumps_json(seed))
    del seed
    return
def lets_get_cookie(cookies):
    # `result` flags whether the login cookies were found, i.e. whether the login succeeded
    result = False
    index_list = ['userid', 'guaZiUserInfo', 'GZ_TOKEN']
    user_info = {}
    for each in cookies:
        for i in index_list:
            if each.get('name') == i:
                result = True
                user_info.update({i: each.get('value')})
    if result:
        # Save the account info
        write_2_file('DB/user_info.txt', dumps_json(user_info))
        print('User info has been saved')
    else:
        print('Login failed, could not save the user cookie info')
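# Usage sketch (assumption): `cookies` is a list of {'name': ..., 'value': ...} dicts,
# e.g. what a Selenium driver's get_cookies() returns after a successful login.
# The values below are placeholders:
# example_cookies = [
#     {'name': 'userid', 'value': '123456'},
#     {'name': 'guaZiUserInfo', 'value': '{...}'},
#     {'name': 'GZ_TOKEN', 'value': 'abcdef'},
# ]
# lets_get_cookie(example_cookies)  # writes DB/user_info.txt when the login cookies are present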
def __delete_cookie(self, user_id):
    """
    Delete a user's cookie: iterate over cookie_list and rewrite every
    cookie except the one belonging to the user being deleted.
    :param user_id: the user_id to delete
    """
    cookie_list = self.load_cookies_list()
    new_cookie_list = []
    for cookie in cookie_list:
        cookie = loads_json(cookie)
        if cookie.get('userid') != user_id:
            new_cookie_list.append(cookie)
    # Rewrite the file
    initial_file(config.user_info_file)
    if new_cookie_list != []:
        for each in new_cookie_list:
            write_2_file(config.user_info_file, dumps_json(each))
    return
def quiet(self, i, msg):
    utils.write_2_file(self.files[i], msg)
def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):
    # Collect the training and test corpus files
    raw_train_file = [corpus_train_path + os.sep + type_path + os.sep + type_file
                      for type_path in os.listdir(corpus_train_path)
                      for type_file in os.listdir(corpus_train_path + os.sep + type_path)]
    raw_test_file = [corpus_test_path + os.sep + type_path + os.sep + type_file
                     for type_path in os.listdir(corpus_test_path)
                     for type_file in os.listdir(corpus_test_path + os.sep + type_path)]

    if flag == 4:
        # 4-tag scheme; 0 is the padding label
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'M', 3: 'E', 4: 'S', 5: 'Unk'}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:
        # 6-tag scheme
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'B2', 3: 'B3', 4: 'M', 5: 'E', 6: 'S', 7: 'Unk'}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')

    class_label_count = len(label_2_index)
    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')

    # Build the lexicon (character dictionary)
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents + test_documents)
    print(len(lexicon), len(lexicon_reverse))  # number of characters in the lexicon
    print(len(test_documents))   # number of test sentences
    print(len(train_documents))  # number of training sentences

    # Pre-trained character embeddings
    # embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-50.m')   # size = 50
    # embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005.m')      # size = 100
    # embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-150.m')  # size = 150
    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')
    embedding_size = embedding_model.vector_size
    print(embedding_size)
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)

    train_data_list, train_label_list, train_index_list = utils.create_matrix(train_documents, lexicon, label_2_index)
    test_data_list, test_label_list, test_index_list = utils.create_matrix(test_documents, lexicon, label_2_index)
    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))
    # print(train_data_list[0])
    # print(train_label_list[0])

    # Optionally inspect the sentence-length distribution
    # visualization.plot_sentence_length(train_data_list + test_data_list,
    #                                    train_label_list + test_label_list)

    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)
    # if max_len > 64:
    #     max_len = 64
    print('maxlen:', max_len)

    # Pad to a fixed length; the results are 2-D arrays
    train_data_array, train_label_list_padding = utils.padding_sentences(train_data_list, train_label_list, max_len)
    test_data_array, test_label_list_padding = utils.padding_sentences(test_data_list, test_label_list, max_len)
    print(train_data_array.shape)
    print(test_data_array.shape)
    # print(train_data_array[0])

    # One-hot encode the labels for multi-class classification; the result is 3-D:
    # (num_sentences, sentence_length, class_label_count)
    train_label_array = np_utils.to_categorical(train_label_list_padding, class_label_count).reshape(
        (len(train_label_list_padding), len(train_label_list_padding[0]), -1))
    test_label_array = np_utils.to_categorical(test_label_list_padding, class_label_count).reshape(
        (len(test_label_list_padding), len(test_label_list_padding[0]), -1))
    print(train_label_array.shape)
    print(test_label_array.shape)

    # Build the model (model_type is assumed to be a module-level setting)
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count, embedding_weights,
                           embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()
    model_name = 'model_%d.png' % model_type
    # plot_model(model, to_file=model_name, show_shapes=True, show_layer_names=True)

    # train_data_array is train_data_list after zero-padding; its entries are the
    # lexicon indices of each character. Use 90% of the rows for training and hold
    # out 10% for validation to guard against overfitting.
    train_nums = len(train_data_array)
    train_array, val_array = train_data_array[:int(train_nums * 0.9)], train_data_array[int(train_nums * 0.9):]
    train_label, val_label = train_label_array[:int(train_nums * 0.9)], train_label_array[int(train_nums * 0.9):]

    checkpointer = ModelCheckpoint(filepath='train_model_pku_100_m6.hdf5', verbose=1,
                                   save_best_only=True, monitor='val_loss', mode='auto')
    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1,
                     validation_data=(val_array, val_label), callbacks=[checkpointer])

    # Save the model weights
    model.save_weights('train_model_pku_100_m6.hdf5')
    print(hist.history['val_loss'])  # validation loss recorded for each epoch
    best_model_epoch = np.argmin(hist.history['val_loss'])
    print('best_model_epoch:', best_model_epoch)

    # Visualize loss / accuracy
    # visualization.plot_acc_loss(hist)
    # visualization.plot_acc(hist)
    # visualization.plot_loss(hist)
    print(hist.history)

    model.load_weights('train_model_pku_100_m6.hdf5')
    # Predict in batches; the result is a numpy array of shape
    # (num_test_sentences, sentence_length, class_label_count)
    test_y_pred = model.predict(test_data_array, batch_size=256, verbose=1)
    print("test_y_pred.shape:")
    print(test_y_pred.shape)
    # pred_label holds the predicted tag indices, shape (num_test_sentences, sentence_length)
    pred_label = np.argmax(test_y_pred, axis=2)

    # Save the lexicon and related metadata
    pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label],
                open('lexicon_pku_100_m6.pkl', 'wb'))
    K.clear_session()  # clear cached data in the Keras session

    # Generate the output documents.
    # Arguments: lexicon_reverse {index: char}, the padded test data array, the
    # predicted labels, the padded gold labels, the row indices, and the label count.
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text(
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding,
        test_index_list, class_label_count)

    # Write the real and predicted segmentations to files
    utils.write_2_file(real_text_list, pred_text_list)

    # Score: prf_score returns the average P/R/F
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file, model_type,
                        best_model_epoch, class_label_count)
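# Usage sketch: the corpus paths and prf output file below are placeholders, and
# `model_type` is assumed to be defined at module level (it is referenced inside
# process_train above):
# process_train('corpus/train', 'corpus/test', 'prf_result.txt', flag=6)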
def save_data(data, id_set):
    for each in data:
        if each[0] not in id_set:
            ctx = '\t'.join(each)
            write_2_file(brands_file, ctx)
def save_serise_data(bid, bname, data, id_set):
    """Save the car-series data."""
    for each in data:
        if each[0] not in id_set:
            ctx = '\t'.join([bid, bname, each[0], each[1], each[2]])
            write_2_file(serise_file, ctx)