def get_record():
    """Load every bazhuayu article together with its comments.

    Articles are read from ``bazhuayu_article`` in pages of 100 rows; for each
    article the rows of ``bazhuayu_comment`` with a matching ``article_id``
    are fetched with a separate query.

    Returns:
        A list with one entry per article, shaped
        ``[title, time, content, read_num, comments]`` where ``comments`` is a
        list of ``[user, comment, comment_time, like_num]`` lists.
    """
    print('[{}]--start save word'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    result = []
    # NOTE(review): assumes sql.queryone returns the count as a plain number
    # (the previous code divided it directly) -- confirm against the helper.
    total = int(sql.queryone("select count(*) from bazhuayu_article"))
    step = 100
    count = 0
    # ``LIMIT offset, row_count`` takes a zero-based offset and a page size.
    # The earlier (count + 1, count + step) arguments skipped the first row and
    # requested ever-larger, overlapping pages; range(0, total, step) also
    # covers the final partial page that int(num / step) used to drop.
    for offset in range(0, total, step):
        array = sql.queryall("select * from bazhuayu_article limit %s, %s", (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            comment_array = sql.queryall(
                "select * from bazhuayu_comment where article_id = %s", item.get('id'))
            comment = [
                [
                    item1.get('user'),
                    item1.get('comment'),
                    item1.get('comment_time'),
                    item1.get('like_num'),
                ]
                for item1 in comment_array
            ]
            result.append([
                item.get('title'),
                item.get('time'),
                item.get('content'),
                item.get('read_num'),
                comment,
            ])
    print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return result
def get_all_data():
    """Split the flat ``origin_data`` table into article and comment rows.

    One article is produced per distinct ``title``; its fields are taken from
    the first row returned for that title. Every row of that title whose
    ``comment`` text has not been seen yet (within the same title) becomes
    one comment row linked to the article through a freshly generated UUID.

    Returns:
        ``(article, comment)`` where each article is
        ``[article_id, title, time, content, read_num]`` and each comment is
        ``[user, comment, comment_time, like_num, article_id, comment_id]``.
    """
    title_lst = sql.queryall("select title from origin_data GROUP BY title")
    article = []
    comment = []
    for title in title_lst:
        # Rows returned by sql.queryall are read with ``.get`` below, so the
        # entries of title_lst are rows too; bind the title *value*, not the
        # whole row, as the query parameter.
        title_value = title.get('title') if isinstance(title, dict) else title
        lst = sql.queryall("select * from origin_data where title = %s", title_value)
        if not lst:
            # Nothing matched (e.g. rows removed in between); avoid lst[0].
            continue
        first = lst[0]
        article_id = str(uuid.uuid4())
        article.append([
            article_id,
            first.get('title'),
            first.get('time'),
            first.get('content'),
            first.get('read_num'),
        ])
        # Set membership keeps the duplicate check O(1) per row.
        seen_comments = set()
        for item in lst:
            text = item.get('comment')
            if text in seen_comments:
                continue
            seen_comments.add(text)
            comment.append([
                item.get('user'),
                text,
                item.get('comment_time'),
                item.get('like_num'),
                article_id,
                str(uuid.uuid4()),
            ])
    return article, comment
def sina_record():
    """Load every ``sj_sina_article`` row together with its comments.

    Articles are read in pages of 100 rows; for each article the rows of
    ``sj_sina_comment`` with a matching ``article_id`` are fetched with a
    separate query.

    Returns:
        A list with one entry per article, shaped
        ``[id, nickname, post_time, post_content_txt, reposts_count,
        comments_count, attitudes_count, comments]`` where ``comments`` is a
        list of ``[comment_nickname, comment_content,
        comment_attitudes_count, comment_time, comment_source]`` lists.
    """
    print('[{}]--start save word'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    result = []
    # NOTE(review): assumes sql.queryone returns the count as a plain number
    # (the previous code divided it directly) -- confirm against the helper.
    total = int(sql.queryone("select count(*) from sj_sina_article"))
    step = 100
    count = 0
    # ``LIMIT offset, row_count``: zero-based offset plus a fixed page size.
    # The earlier (count + 1, count + step) arguments skipped the first row,
    # fetched overlapping pages and never read the last partial page.
    for offset in range(0, total, step):
        array = sql.queryall("select * from sj_sina_article limit %s, %s",
                             (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            comment_array = sql.queryall(
                "select * from sj_sina_comment where article_id = %s",
                item.get('id'))
            comment = [
                [
                    item1.get('comment_nickname'),
                    item1.get('comment_content'),
                    item1.get('comment_attitudes_count'),
                    item1.get('comment_time'),
                    item1.get('comment_source'),
                ]
                for item1 in comment_array
            ]
            result.append([
                item.get('id'),
                item.get('nickname'),
                item.get('post_time'),
                item.get('post_content_txt'),
                item.get('reposts_count'),
                item.get('comments_count'),
                item.get('attitudes_count'),
                comment,
            ])
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return result
def sina_data():
    """Rebuild ``sj_sina_article`` / ``sj_sina_comment`` from the ``sina`` table.

    Both target tables are truncated first. Up to 200 rows of ``sina`` that
    are not long-text, not reposts and have a non-empty ``comments`` value
    are then read; each becomes one article row with a new UUID, and every
    element of its JSON-decoded ``comments`` field becomes one comment row
    pointing back at that article. Progress is reported on stdout.
    """
    def stamp():
        # Timestamp used as prefix of every progress message.
        return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

    article_fields = (
        'url', 'post_time', 'nickname', 'post_time', 'post_content_txt',
        'reposts_count', 'comments_count', 'attitudes_count', 'topic',
    )
    comment_fields = (
        'comment_userid', 'comment_nickname', 'comment_content',
        'comment_attitudes_count', 'comment_time', 'comment_source',
    )
    article_insert = (
        'insert into sj_sina_article(id, url, time, nickname, post_time, post_content_txt,'
        ' reposts_count, comments_count, attitudes_count, topic)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    comment_insert = (
        'insert into sj_sina_comment(id, article_id, comment_userid, comment_nickname,'
        ' comment_content, comment_attitudes_count, comment_time, comment_source)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )

    print("[{}]--start process sina!".format(stamp()))
    for table_reset in ("truncate table sj_sina_article",
                        "truncate table sj_sina_comment"):
        sql.execute(table_reset)
    print("[{}]--truncate table finally!".format(stamp()))

    article = []
    comment = []
    rows = sql.queryall(
        "select * from sina where isLongText = 'False' and is_repost = 'false' and comments <> '[]' limit %s",
        200)
    for row in rows:
        article_id = str(uuid.uuid4())
        article.append([article_id] + [row.get(key) for key in article_fields])
        for entry in json.loads(row.get('comments')):
            comment_row = [str(uuid.uuid4()), article_id]
            comment_row.extend(entry.get(key) for key in comment_fields)
            comment.append(comment_row)
    print("[{}]--data integration finally!".format(stamp()))

    # Insert the collected rows into the database.
    inserted = sql.insertmany(article_insert, article)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        stamp(), inserted))
    inserted = sql.insertmany(comment_insert, comment)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        stamp(), inserted))
def get_record():
    """Collect tianya articles, their tag-stripped comments and an index.

    ``sj_tianya_article`` is read in pages of 100 rows. For every article the
    ``question_answer_content`` of each matching ``sj_tianya_comment`` row is
    passed through ``FilterTag.stripTagSimple`` and concatenated, each piece
    followed by ``'\\r'``.

    Returns:
        ``(article_comment, article, excel)``:

        * ``article_comment`` -- ``[question_detail, comment_text]`` per article
        * ``article`` -- ``[question_detail]`` per article
        * ``excel`` -- ``[txt_id, question_link, publish_time, question_topics]``
          per article, where ``txt_id`` is a running number zero-padded to
          five digits and ``publish_time`` is ``question_publish_time``
          formatted as local ``%Y-%m-%d %H:%M:%S``.
    """
    filters = filter_tags_util.FilterTag()
    print('[{}]--start save word'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    article_comment = []
    article = []
    excel = []
    # NOTE(review): assumes sql.queryone returns the count as a plain number
    # (the previous code divided it directly) -- confirm against the helper.
    total = int(sql.queryone("select count(*) from sj_tianya_article"))
    step = 100
    count = 0
    seq = 0
    # ``LIMIT offset, row_count``: the second value is a page size. Passing
    # count + step made every page larger than the last (duplicated rows), and
    # int(num / step) never reached the final partial page.
    for offset in range(0, total, step):
        array = sql.queryall("select * from sj_tianya_article limit %s, %s", (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            seq += 1
            txt_id = str(seq).zfill(5)
            excel_temp = [
                txt_id,
                item.get('question_link'),
                time.strftime("%Y-%m-%d %H:%M:%S",
                              time.localtime(int(item.get('question_publish_time')))),
                item.get('question_topics'),
            ]
            detail = item.get('question_detail')
            comment_array = sql.queryall(
                "select * from sj_tianya_comment where article_id = %s", item.get('id'))
            comment_text = ''.join(
                filters.stripTagSimple(item1.get('question_answer_content')) + '\r'
                for item1 in comment_array
            )
            # Article only. A separate list object is required here: sharing
            # one list with article_comment made the comment text leak into
            # this "article only" collection as well.
            article.append([detail])
            # Article plus comments.
            article_comment.append([detail, comment_text])
            # Index row for the excel sheet.
            excel.append(excel_temp)
    print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article_comment, article, excel
def tianya_data():
    """Rebuild ``sj_tianya_article`` / ``sj_tianya_comment`` from ``tianya``.

    Both target tables are truncated first. Up to 200 rows of ``tianya`` are
    then read; each becomes one article row with a new UUID, and every
    element of its JSON-decoded ``question_answer`` field becomes one comment
    row pointing back at that article. Progress is reported on stdout.
    """
    def stamp():
        # Timestamp used as prefix of every progress message.
        return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

    article_fields = (
        'question_title', 'get_time', 'question_detail', 'question_author',
        'question_publish_time', 'question_topics', 'question_link',
    )
    answer_fields = (
        'question_answer_content', 'question_answer_author',
        'question_answer_agree_count', 'question_answer_publish_time',
    )
    article_insert = (
        'insert into sj_tianya_article(id, question_title, get_time, question_detail,'
        ' question_author, question_publish_time, question_topics, question_link)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )
    comment_insert = (
        'insert into sj_tianya_comment(id, article_id, question_answer_content, '
        'question_answer_author, question_answer_agree_count, question_answer_publish_time)'
        ' values(%s, %s, %s, %s, %s, %s)'
    )

    print("[{}]--start process tianya!".format(stamp()))
    for table_reset in ("truncate table sj_tianya_article",
                        "truncate table sj_tianya_comment"):
        sql.execute(table_reset)
    print("[{}]--truncate table finally!".format(stamp()))

    article = []
    comment = []
    # NOTE(review): the filter tests question_link although it is the
    # question_answer column that gets JSON-decoded below -- confirm that
    # this is the intended column.
    rows = sql.queryall(
        "select * from tianya where question_link <> '[]' limit %s", 200)
    for row in rows:
        article_id = str(uuid.uuid4())
        article.append([article_id] + [row.get(key) for key in article_fields])
        for answer in json.loads(row.get('question_answer')):
            comment_row = [str(uuid.uuid4()), article_id]
            comment_row.extend(answer.get(key) for key in answer_fields)
            comment.append(comment_row)
    print("[{}]--data integration finally!".format(stamp()))

    # Insert the collected rows into the database.
    inserted = sql.insertmany(article_insert, article)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        stamp(), inserted))
    inserted = sql.insertmany(comment_insert, comment)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        stamp(), inserted))
def get_record():
    """Read ``origin_data`` page by page and group its rows by title.

    At most 21000 rows are read, 1000 per query. The first row seen for a
    title contributes one article entry; every row (including that first one)
    contributes its comment fields to the per-title list.

    Returns:
        ``(result, comment)`` where ``result`` holds one
        ``[title, time, content, readNum]`` list per distinct title in
        first-seen order, and ``comment`` is a ``defaultdict(list)`` mapping
        each title to a flat list
        ``[user, comment, commentTime, likeNum, user, comment, ...]``.
    """
    seen_titles = set()  # set membership instead of scanning a list per row
    result = []
    comment = defaultdict(list)
    # Row budget is hard-coded; the live alternative would be
    # sql.queryone("select count(*) from origin_data").
    num = 21000
    step = 1000
    count = 0
    # ``LIMIT offset, row_count``: zero-based offset plus a fixed page size.
    # The earlier (count + 1, count + step) arguments skipped the first row
    # and fetched growing, overlapping pages, duplicating comment entries.
    for offset in range(0, num, step):
        array = sql.queryall("select * from origin_data limit %s, %s", (offset, step))
        if not array:
            # Table exhausted before the hard-coded budget was reached.
            break
        count += len(array)
        print("%s条数据读取完毕。。。。。。" % count)
        for item in array:
            key = item.get('title')
            # The very first row is handled here like any other unseen title,
            # so no special case (and no array[0] on an empty page) is needed.
            if key not in seen_titles:
                seen_titles.add(key)
                result.append([
                    key,
                    item.get('time'),
                    item.get('content'),
                    item.get('readNum'),
                ])
            comment[key].extend((
                item.get('user'),
                item.get('comment'),
                item.get('commentTime'),
                item.get('likeNum'),
            ))
    return result, comment
def get_record():
    """Collect tag-stripped souhu article bodies plus an index row for each.

    At most 200 rows of ``souhu`` are read, 100 per query.

    Returns:
        ``(article, excel)`` where ``article`` holds one
        ``[stripped_article_content]`` list per row and ``excel`` holds one
        ``[txt_id, article_link, time, article_category]`` list per row.
        ``txt_id`` is a running number zero-padded to five digits and
        ``time`` is the row's ``time`` value formatted as local
        ``%Y-%m-%d %H:%M:%S``.
    """
    filters = filter_tags_util.FilterTag()
    print('[{}]--start save word'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    article = []
    excel = []
    # Row budget is hard-coded rather than taken from a count(*) query.
    num = 200
    step = 100
    count = 0
    seq = 0
    # ``LIMIT offset, row_count``: the second value is a page size. Passing
    # count + step made the second page 200 rows long, overshooting the
    # 200-row budget.
    for offset in range(0, num, step):
        array = sql.queryall("select * from souhu limit %s, %s", (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            seq += 1
            excel_temp = [
                str(seq).zfill(5),
                item.get('article_link'),
                time.strftime("%Y-%m-%d %H:%M:%S",
                              time.localtime(item.get('time'))),
                item.get('article_category'),
            ]
            # Article text only.
            article.append([filters.stripTagSimple(item.get('article_content'))])
            # Index row for the excel sheet.
            excel.append(excel_temp)
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article, excel
def get_data():
    """Fetch up to 3000 ``AMiner_Author`` rows, print them and return them.

    The columns selected are ``index_id, pc, cn, hi, pi, upi``.
    """
    # A wider projection that was used previously, kept for reference:
    #   "select n, a , pc, cn, hi, pi, upi, t from AMiner_Author limit 1000"
    rows = sql.queryall(
        "select index_id, pc, cn, hi, pi, upi from AMiner_Author limit 3000")
    print(rows)
    return rows
count = 0 for content in self._data_list: count += 1 split(stop_word_list, content) if count % 100 == 0: print("processed %d records" % count) print("process the %d thread, records total num is %d" % (self._num, count)) end_writer_file = time.time() - start_writer_file print("split the {:.0f} thread, records time {:.0f}m {:.0f}s".format( self._num, end_writer_file // 60, end_writer_file % 60)) if __name__ == '__main__': stop_word_list = get_stop_word() num = sql_util.queryone("select count(*) from origin_data") limit = 1000 count = 0 for temp in range(0, num, limit): data = sql_util.queryall("select content from origin_data limit %s,%s", (temp, count + limit)) count += limit step = 100 temp_list = [data[i:i + step] for i in range(0, len(data), step)] thr_list = [MyThread(i, temp_list[i]) for i in range(len(temp_list))] [thr.start() for thr in thr_list] [thr.join() for thr in thr_list]
def get_data_from_sql():
    """Return up to 100000 ``AMiner_Author`` rows with no zero metric.

    Selects ``pc, cn, hi, pi, upi`` and keeps only rows where all five
    columns are non-zero.
    """
    query = (
        "select pc, cn, hi, pi, upi from AMiner_Author "
        "where pc<>0 and cn<>0 and hi <>0 and pi<>0 and upi <> 0 "
        "limit 100000"
    )
    return sql_util.queryall(query)