def gen_post_put(post_item):
    """Build the put that inserts a post into the post table.

    Keys of ``post_item`` are routed by prefix:

    * keys starting with ``'_'`` are skipped;
    * keys starting with ``'picture'`` go to the ``picture`` column family:
      one ``<key>_num`` column holding the number of entries, followed by
      one ``<key><index>`` column per entry;
    * every other key becomes a column in the ``info`` family whose value
      is ``str(value)`` encoded to bytes.

    :param post_item: mapping with at least ``post_id`` and ``author_id``.
    :return: ``(row_key, put)`` where ``row_key`` comes from ``gen_row_key``
        applied to the MD5 codes of the author id and the post id.
    """
    info_family = b'info'
    picture_family = b'picture'

    post_id = post_item['post_id']
    author_id = post_item['author_id']
    row_key = gen_row_key(MD5Utils.md5_code(author_id),
                          MD5Utils.md5_code(post_id))

    cells = []
    for field in post_item:
        if field.startswith('picture'):
            pictures = post_item[field]
            # Store the count first so the per-index columns can be
            # enumerated later.
            cells.append(TColumnValue(picture_family,
                                      (field + '_num').encode(),
                                      str(len(pictures)).encode()))
            cells.extend(
                TColumnValue(picture_family,
                             (field + str(index)).encode(),
                             str(picture).encode())
                for index, picture in enumerate(pictures)
            )
        elif not field.startswith('_'):
            cells.append(TColumnValue(info_family,
                                      field.encode(),
                                      str(post_item[field]).encode()))

    put = TPut(row_key, cells)
    return row_key, put
def gen_fans_put(fans_item):
    """Build the put that inserts a fans/follow relation.

    Every key of ``fans_item`` becomes a column in the ``info`` family
    whose value is ``str(value)`` encoded to bytes.

    :param fans_item: mapping with at least ``fans`` and ``follow``.
    :return: ``(row_key, put)`` where ``row_key`` comes from ``gen_row_key``
        applied to the MD5 codes of ``fans`` and ``follow``.
    """
    info_fml = b'info'
    fans = fans_item['fans']
    follow = fans_item['follow']
    row_key = gen_row_key(MD5Utils.md5_code(fans), MD5Utils.md5_code(follow))
    # Values are read from fans_item; the previous code referenced an
    # undefined ``comment_item`` and raised NameError on the first key.
    column_values = [
        TColumnValue(info_fml, key.encode(), str(fans_item[key]).encode())
        for key in fans_item
    ]
    put = TPut(row_key, column_values)
    return row_key, put
def gen_start_spider_info():
    """Build the put that records the time at which the spider started.

    :return: ``(row_key, put)``; the put carries the spider name in
        ``spider_name:name`` and the start timestamp in ``time:start_time``.
    """
    started_at = time.time()
    spider_name = 'bbs_sohu'

    # The key suffix is 10**12 minus the timestamp, so it shrinks as time
    # advances; the intent is for the newest rows to come first.
    reversed_stamp = str(10**12 - started_at)
    row_key = gen_row_key(MD5Utils.md5_code(spider_name), reversed_stamp)

    put = TPut(row_key, [
        TColumnValue(b'spider_name', b'name', spider_name.encode()),
        TColumnValue(b'time', b'start_time', str(started_at).encode()),
    ])
    return row_key, put
def gen_author_put(author_item):
    """Build the put that inserts an author into the author table.

    Keys starting with ``'_'`` are skipped; every other key becomes a
    column in the ``info`` family whose value is ``str(value)`` encoded to
    bytes.

    :param author_item: mapping with at least ``author_id``.
    :return: ``(row_key, put)`` where ``row_key`` is the encoded MD5 code
        of the author id.
    """
    info_fml = b'info'
    author_id = author_item['author_id']
    row_key = MD5Utils.md5_code(author_id).encode()
    # ``.encode()`` must apply to the stringified value, not to the
    # TColumnValue object (the previous parenthesis placement did the latter).
    column_values = [
        TColumnValue(info_fml, key.encode(), str(author_item[key]).encode())
        for key in author_item
        if not key.startswith('_')
    ]
    put = TPut(row_key, column_values)
    return row_key, put