예제 #1
0
def do_persistence(seed, id_set):
    """
    Persist every record of a seed whose id has not been seen before.

    For each record in ``seed['data']`` whose first element is not in
    ``id_set``, one line is built from the seed-level columns, the record's
    own columns and three bookkeeping columns, joined with the U+0001
    control character. The line is written to ``data_file`` and appended to
    HDFS, and the record id is written to ``token``. ``data_file`` and
    ``token`` are names defined outside this function.

    :param seed: the seed (dict-like) to persist
    :param id_set: ids that are already stored; matching records are skipped
    """
    seed_keys = ('brand_id', 'brand', 'serise_id', 'serise', 'p_type')

    for record in seed.get('data'):
        record_id = record[0]
        if record_id in id_set:
            print('数据id已存在')
            continue

        # Seed-level columns first, then the record itself, then the
        # bookkeeping columns (seed date, current time, epoch).
        fields = [seed.get(key) for key in seed_keys]
        fields += record
        fields += [
            seed.get('date'),
            datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
            str(seed.get('epoh')),
        ]
        line = '\u0001'.join(fields)

        # Local data file
        write_2_file(data_file, line)
        # HDFS
        append_2_hdfs(line)
        # Remember the id as a new token
        write_2_file(token, record_id)
예제 #2
0
    def lets_fuck_recycle(self, data):
        """
        Request every item in ``data`` and split the items by outcome.

        (Original author's note: "the job of this function is to re-fry the
        twice-cooked pork" -- presumably meaning that recycled items are
        processed again.)

        :param data: iterable of items; the last element of each item is
            passed to ``self.construct_params_info`` to build the request
            parameters
        :return: ``(info, recycle)`` -- the list of verified JSON dicts and
            the list of items whose response did not pass
            ``self.verify_json_text``
        """
        info = []
        recycle = []

        info_file = config.info_file.get(self.name)
        url_info = config.url_info_dict.get(self.name)

        # Work on a copy: updating config.headers_info in place would leak
        # the headers of this site into every later user of the shared
        # config object. A missing per-site entry means "no extra headers".
        headers_info = dict(config.headers_info)
        headers_info.update(config.headers_info_dict.get(self.name) or {})

        api = RequestAPI()

        for each in data:
            params_info = self.construct_params_info(each[-1])
            json_text = api.receive_and_request(url=url_info,
                                                headers=headers_info,
                                                params=params_info,
                                                method='GET')
            # The response has to be checked for actual data.
            js_dict = self.verify_json_text(json_text)
            if js_dict is None:
                # No usable data: run the failure hooks and keep the item
                # so the caller can try it again.
                self.feedback()
                self.pop_captcha_info()
                recycle.append(each)
                continue

            # Hand the data back to the caller ...
            info.append(js_dict)
            # ... and record it on disk.
            write_2_file(info_file, dumps_json(js_dict))

        return info, recycle
예제 #3
0
 def fire(self, data):
     """
     Record each newly received message and publish the record times.

     New messages come from ``self.platform.get_new_msgs()``, which is
     iterated as a mapping of conversation name to message lines. Timestamp
     and username lines only update the running metadata and ``'Avatar'``
     lines are skipped. Every other line is stored: a single-space line is
     tagged as an image (storing the image itself is still TODO), anything
     else is written to a raw-text file. The metadata is pickled and logged
     for every stored line.

     :param data: not used by this method
     """
     self.loggings.log('Fire')
     files_dir = '../database/files/'
     recorded = []
     fresh_msgs = self.platform.get_new_msgs()
     meta = self.default_meta()
     meta['platform'] = self.platform.class_name

     # Expected kinds of data (aka medium) to record:
     #   - text
     #   - img
     #   - meta
     #
     # For GroupMe_web:
     #   - images are expected when there is an empty line with perhaps
     #     just a space character.
     #   - emoticons don't appear at all; just the username and its avatar
     #     precede it.
     #   - message sent times are grouped and always say AM or PM. It
     #     assumes the closest specified day of the week or it uses the
     #     actual date. ALL caps.
     #   - a line is a message if it is preceded by a username.
     #
     # Sample input:
     #     TUE, 4:15 PM
     #     Avatar
     #     User 1 Name
     #     testing
     #     message 2, blah
     for convo_name in fresh_msgs:
         meta['convo_name'] = convo_name
         # iterate the array of new messages from oldest to newest
         for line in fresh_msgs[convo_name]:
             if self.platform.is_timestamp(line):
                 meta['sent_time'] = self.platform.parse_timestamp(line)
             elif self.platform.is_username(line):
                 meta['sender'] = self.alias_dict[line]
             elif line == 'Avatar':
                 pass
             else:
                 rec_time = utils.get_str_time()
                 stamp = utils.datetime2str(rec_time)
                 meta['rec_time'] = stamp.split('-')

                 is_image = line == ' '
                 meta['medium'] = 'img' if is_image else 'txt'
                 if not is_image:
                     txt_file = utils.mktxt(files_dir,
                                            stamp + '-raw_text.txt',
                                            log=0)
                     utils.write_2_file(txt_file, line, no_time=True)
                 # TODO: nothing is stored for the image itself yet.

                 utils.pickle_sto(files_dir, stamp + '-meta.pkl', meta)
                 self.loggings.log(str(meta))
                 recorded.append(rec_time)
     self.skull.publish(self.name, recorded)
예제 #4
0
 def parase_html(self, html):
     """
     Parse a JSON response body and collect all of its result records.

     The decoded JSON is expected to hold a list under ``'data'``; each item
     of that list may hold its records under ``'result'`` and a count under
     ``'dispNum'``. Every record is also written to ``self.baidu_list``.

     :param html: the JSON text to parse
     :return: ``(info, total_num)`` -- the collected records and the
         ``'dispNum'`` value of the last item in ``'data'`` (``1`` when the
         response could not be parsed, has no data, or the last item has no
         ``'dispNum'``)
     """
     info = []
     total_num = 1

     js_dict = loads_json(html)
     if js_dict is None:
         return info, total_num

     data = js_dict.get('data')
     if not isinstance(data, list):
         return info, total_num

     for group in data:
         # A group without a 'result' entry contributes no records instead
         # of raising TypeError when iterating over None.
         for each in group.get('result') or []:
             info.append(each)
             # Persist the record.
             write_2_file(self.baidu_list, dumps_json(each))
         # Needed by the caller to decide whether there is a next page.
         total_num = group.get('dispNum', 1)
     return info, total_num
예제 #5
0
def insert_seed_save(url, brand, serise, city, date, brand_id, serise_id, p_type):
    """
    Fill a copy of the seed template with the given values and save it.

    A deep copy of ``seed_demo`` is updated with the arguments, serialized
    with ``dumps_json`` and written to ``seed_file``. Note that ``city`` is
    stored under the key ``'check_city'``. Returns ``None``.
    """
    overrides = {
        'url': url,
        'brand_id': brand_id,
        'brand': brand,
        'serise_id': serise_id,
        'serise': serise,
        'check_city': city,
        'date': date,
        'p_type': p_type,
    }
    seed = deepcopy(seed_demo)
    seed.update(overrides)
    write_2_file(seed_file, dumps_json(seed))
예제 #6
0
def lets_get_cookie(cookies):
    """
    Extract the login cookies and save them when at least one is present.

    Each cookie is read through ``.get('name')`` / ``.get('value')``. Finding
    any of the wanted cookie names counts as a successful login, in which
    case the collected values are written to ``DB/user_info.txt``.

    :param cookies: iterable of dict-like cookies
    """
    wanted = ('userid', 'guaZiUserInfo', 'GZ_TOKEN')

    user_info = {}
    for cookie in cookies:
        name = cookie.get('name')
        if name in wanted:
            user_info[name] = cookie.get('value')

    # An empty dict means none of the wanted cookies was found, which is
    # how a failed login is detected.
    if not user_info:
        print('登录失败,未能保存用户cookie信息')
        return

    # Save the account
    write_2_file('DB/user_info.txt', dumps_json(user_info))
    print('该用户信息已经保存')
예제 #7
0
 def __delete_cookie(self, user_id):
     """
     Remove the cookie record of one user from the user-info file.

     The approach: walk through the stored cookie list once, keep every
     record except those of the user to delete, then write the kept
     records back to the file.

     :param user_id: the ``userid`` whose record is to be removed
     """
     survivors = [
         record
         for record in (loads_json(raw) for raw in self.load_cookies_list())
         if record.get('userid') != user_id
     ]

     # Rewrite the file: initial_file() is called first (presumably it
     # resets the file -- confirm), then one line is written per record.
     initial_file(config.user_info_file)
     for record in survivors:
         write_2_file(config.user_info_file, dumps_json(record))
예제 #8
0
 def quiet(self, i, msg):
     """Write ``msg`` to the file stored at ``self.files[i]``."""
     target = self.files[i]
     utils.write_2_file(target, msg)
예제 #9
0
def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):
    """
    Train the CNN-BiLSTM-CRF tagger, predict on the test corpus and score it.

    Steps, in order:

    1. Collect every file found two directory levels below each corpus path
       and convert them into ``train.data`` / ``test.data``.
    2. Build the lexicon from the training and test documents and load the
       pre-trained Word2Vec model ``model_embedding_pku_100.m``.
    3. Turn documents into index matrices, pad them to the longest training
       sentence and one-hot encode the labels.
    4. Build the model, fit it on the first 90% of the training rows
       (validating on the rest) and save the weights to
       ``train_model_pku_100_m6.hdf5``.
    5. Predict on the test data, pickle the lexicon to
       ``lexicon_pku_100_m6.pkl``, write the real/predicted text and call
       ``score.prf_score``.

    :param corpus_train_path: directory whose sub-directories hold the
        training files
    :param corpus_test_path: directory whose sub-directories hold the test
        files
    :param prf_file: passed through to ``score.prf_score``
    :param base_model_weight: not used by this function
    :param flag: ``4`` selects the 4-tag label set; any other value selects
        the 6-tag label set
    :return: ``None``

    NOTE(review): ``model_type`` is neither a parameter nor a local of this
    function; it has to be defined at module level elsewhere.
    """

    # Training corpus
    raw_train_file = [corpus_train_path + os.sep + type_path + os.sep + type_file
                      for type_path in os.listdir(corpus_train_path)
                      for type_file in os.listdir(corpus_train_path + os.sep + type_path)]

    # Test corpus
    raw_test_file = [corpus_test_path + os.sep + type_path + os.sep + type_file
                     for type_path in os.listdir(corpus_test_path)
                     for type_file in os.listdir(corpus_test_path + os.sep + type_path)]

    if flag == 4:  # 4-tag scheme; label 0 is the padding label
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'M', 3: 'E', 4: 'S', 5: 'Unk'}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:  # 6-tag scheme
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'B2', 3: 'B3', 4: 'M', 5: 'E', 6: 'S', 7: 'Unk'}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')

    class_label_count = len(label_2_index)

    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')
    # Build the lexicon
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents+test_documents)
    # Number of characters in the lexicon
    print(len(lexicon), len(lexicon_reverse))

    print(len(test_documents))  # number of sentences the test corpus was split into
    print(len(train_documents)) # number of sentences the training corpus was split into

    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-50.m') #size = 50
    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005.m') #size = 100
    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-150.m') #size = 150

    # Original note: size = 200. NOTE(review): the file name suggests 100;
    # the size actually used is read from the model on the next line.
    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')
    embedding_size = embedding_model.vector_size
    print(embedding_size)

    # Pre-trained embedding weights
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)

    train_data_list, train_label_list, train_index_list=utils.create_matrix(train_documents,lexicon,label_2_index)
    test_data_list, test_label_list, test_index_list=utils.create_matrix(test_documents,lexicon,label_2_index)


    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))
    # print(train_data_list[0])
    # print(train_label_list[0])
    # Inspect the sentence-length distribution
    #print("查看句子长度分布")
    #visualization.plot_sentence_length(train_data_list+test_data_list,train_label_list+test_label_list)

    # Everything is padded to the longest TRAINING sentence.
    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)
    # (disabled) cap of the maximum length at 64:
    #if max_len > 64:
    #    max_len = 64
    print('maxlen:', max_len)

    # Fixed-length sequences; per the original note both results are 2-D.
    train_data_array, train_label_list_padding = utils.padding_sentences(train_data_list, train_label_list, max_len)
    test_data_array, test_label_list_padding = utils.padding_sentences(test_data_list, test_label_list, max_len)

    print(train_data_array.shape)
    print(test_data_array.shape)
    #print(train_data_array[0])

    # One-hot encode the labels for multi-class classification; the result
    # is reshaped to (sentences, sentence length, -1), i.e. 3-D.
    train_label_array = np_utils.to_categorical(
        train_label_list_padding, class_label_count
    ).reshape((len(train_label_list_padding), len(train_label_list_padding[0]), -1))

    test_label_array = np_utils.to_categorical(
        test_label_list_padding, class_label_count
    ).reshape((len(test_label_list_padding), len(test_label_list_padding[0]), -1))
    print(train_label_array.shape)
    print(test_label_array.shape)

    # model
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count, embedding_weights, embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()
    # Only needed by the disabled plot_model call below.
    model_name = 'model_%d.png'%model_type
    #plot_model(model, to_file=model_name, show_shapes=True, show_layer_names=True)

    # Original note: train_data_array is train_data_list after zero padding,
    # i.e. the lexicon indices of the characters, padded with 0.
    train_nums = len(train_data_array)

    # First 90% of the rows are used for training, the remaining 10% as
    # validation data (original note: "to guard against over-fitting").
    train_array, val_array = train_data_array[:int(train_nums * 0.9)], train_data_array[int(train_nums * 0.9):]
    train_label, val_label = train_label_array[:int(train_nums * 0.9)], train_label_array[int(train_nums * 0.9):]

    checkpointer = ModelCheckpoint(filepath='train_model_pku_100_m6.hdf5', verbose=1,
                                   save_best_only=True, monitor='val_loss', mode='auto')

    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1,validation_data=(val_array,val_label),callbacks=[checkpointer])

    # save model
    # NOTE(review): this writes to the same path the checkpoint callback
    # above uses, so whatever the callback saved is replaced by the weights
    # of the final epoch -- confirm that this is intended.
    model.save_weights('train_model_pku_100_m6.hdf5')

    print(hist.history['val_loss'])  # validation loss recorded for each epoch
    best_model_epoch = np.argmin(hist.history['val_loss'])
    print('best_model_epoch:', best_model_epoch)

    # Visualize loss / accuracy
    #visualization.plot_acc_loss(hist)
    #visualization.plot_acc(hist)
    #visualization.plot_loss(hist)

    print(hist.history)

    model.load_weights('train_model_pku_100_m6.hdf5')
    # Original note: test_data_array is (number of test sentences) x
    # (sentence length after zero padding), holding lexicon indices.
    # predict() processes the input batch by batch and returns the
    # predictions as a numpy array.
    test_y_pred = model.predict(test_data_array,batch_size=256,verbose=1)
    # Original note: (test sentences) x (sentence length) x 5.
    print("test_y_pred.shape:")
    print(test_y_pred.shape)
    # pred_label holds the predicted label indices, e.g. [0,0,....,1,2,3,1];
    # argmax over axis 2 leaves (test sentences) x (sentence length).
    pred_label = np.argmax(test_y_pred,axis=2)

    # save lexicon
    # Use a context manager so the file is flushed and closed right away
    # instead of relying on garbage collection of the handle.
    with open('lexicon_pku_100_m6.pkl', 'wb') as lexicon_file:
        pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label], lexicon_file)

    K.clear_session()  # clear the data cached in the session
    # Build the output documents.
    # Original notes on the arguments: lexicon_reverse is {index: char};
    # test_data_array is (test sentences) x (padded sentence length);
    # pred_label is (test sentences) x (sentence length);
    # test_label_list_padding holds the zero-padded labels;
    # test_index_list holds the index of each line.
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text(
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding, test_index_list, class_label_count)
    # Write to file
    utils.write_2_file(real_text_list, pred_text_list)
    # score (original note: prf_score returns the average value)
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file,model_type, best_model_epoch,class_label_count)
예제 #10
0
def save_data(data, id_set):
    """
    Write every row whose id is not yet known to ``brands_file``.

    :param data: iterable of rows (sequences of strings); the first element
        of a row is its id
    :param id_set: ids that are already stored; matching rows are skipped
    """
    for row in data:
        if row[0] in id_set:
            continue
        line = '\t'.join(row)
        write_2_file(brands_file, line)
예제 #11
0
def save_serise_data(bid, bname, data, id_set):
    """
    Save the car-series rows whose id is not yet known to ``serise_file``.

    Each written line consists of ``bid``, ``bname`` and the first three
    elements of the row, separated by tabs.

    :param bid: value written as the first column of every line
    :param bname: value written as the second column of every line
    :param data: iterable of rows; the first element of a row is its id
    :param id_set: ids that are already stored; matching rows are skipped
    """
    for row in data:
        if row[0] in id_set:
            continue
        columns = (bid, bname, row[0], row[1], row[2])
        line = '\t'.join(columns)
        write_2_file(serise_file, line)