def sina_data():
    """Rebuild ``sj_sina_article`` and ``sj_sina_comment`` from the ``sina`` table.

    Steps, in order:

    1. Truncate both target tables.
    2. Query at most 200 ``sina`` rows that are not long texts, are not
       reposts and whose ``comments`` column differs from ``'[]'``.
    3. Give every row a fresh UUID, turn it into one article record and
       turn each entry of its JSON ``comments`` payload into one comment
       record that points back at the article through that UUID.
    4. Bulk-insert the articles, then the comments.

    A timestamped progress line is printed after each step.  Nothing is
    returned.
    """
    stamp_format = "%Y-%m-%d %H-%M-%S"

    def announce(template, *extra):
        # Every progress line starts with the current local time.
        print(template.format(time.strftime(stamp_format, time.localtime()), *extra))

    announce("[{}]--start process sina!")
    sql.execute("truncate table sj_sina_article")
    sql.execute("truncate table sj_sina_comment")
    announce("[{}]--truncate table finally!")

    source_rows = sql.queryall(
        "select * from sina where isLongText = 'False' and is_repost = 'false' and comments <> '[]' limit %s",
        200)

    article_rows = []
    comment_rows = []
    for row in source_rows:
        article_id = str(uuid.uuid4())
        # NOTE(review): post_time feeds both the `time` and the `post_time`
        # column - confirm that this is intended.
        article_rows.append([
            article_id,
            row.get('url'),
            row.get('post_time'),
            row.get('nickname'),
            row.get('post_time'),
            row.get('post_content_txt'),
            row.get('reposts_count'),
            row.get('comments_count'),
            row.get('attitudes_count'),
            row.get('topic'),
        ])
        # `comments` holds a JSON array; each element becomes one comment row.
        comment_rows.extend(
            [
                str(uuid.uuid4()),
                article_id,
                reply.get('comment_userid'),
                reply.get('comment_nickname'),
                reply.get('comment_content'),
                reply.get('comment_attitudes_count'),
                reply.get('comment_time'),
                reply.get('comment_source'),
            ]
            for reply in json.loads(row.get('comments'))
        )
    announce("[{}]--data integration finally!")

    # Write the collected rows to the database.
    article_insert = (
        'insert into sj_sina_article(id, url, time, nickname, post_time, post_content_txt,'
        ' reposts_count, comments_count, attitudes_count, topic)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    comment_insert = (
        'insert into sj_sina_comment(id, article_id, comment_userid, comment_nickname,'
        ' comment_content, comment_attitudes_count, comment_time, comment_source)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )

    inserted = sql.insertmany(article_insert, article_rows)
    announce("[{}]--article data insert to sql success. data count is {}.", inserted)
    inserted = sql.insertmany(comment_insert, comment_rows)
    announce("[{}]--comment data insert to sql success. data count is {}.", inserted)
def insert_data(article, comment):
    """Reload the ``article`` and ``comment`` tables from the given rows.

    Both tables are truncated first; afterwards the rows are bulk-inserted,
    articles before comments.

    Args:
        article: Rows for ``article``, each ordered as
            ``(id, title, time, content, read_num)``.
        comment: Rows for ``comment``, each ordered as
            ``(user, comment, comment_time, like_num, article_id, id)``.
    """
    article_insert = (
        "insert into article(id, title, time, content, read_num) "
        "values (%s, %s, %s, %s, %s)"
    )
    comment_insert = (
        "insert into comment(user, comment, comment_time, like_num, article_id, id) "
        "values (%s, %s, %s, %s, %s, %s)"
    )

    # Empty both tables before loading the fresh rows.
    for statement in ("truncate table article", "truncate table comment"):
        sql.execute(statement)

    sql.insertmany(article_insert, article)
    sql.insertmany(comment_insert, comment)
def insert_to_sql(data, batch_size=100):
    """Replace the contents of ``AMiner_Author2Paper`` with *data*.

    The table is truncated first; the rows are then written through
    ``sql_util.insertmany`` in consecutive batches, and the running total
    of the values returned by ``insertmany`` is printed after each batch.

    Args:
        data: Sliceable sequence of rows (it is measured with ``len`` and
            sliced), each row ordered as
            ``(index_id, author_id, paper_id, position)``.
        batch_size: Maximum number of rows handed to one ``insertmany``
            call.  Defaults to 100.
    """
    sql_util.execute("truncate table AMiner_Author2Paper")
    sql_str = "insert into AMiner_Author2Paper(index_id, author_id, paper_id, position) values(%s, %s, %s, %s)"
    num = 0
    for start in range(0, len(data), batch_size):
        # A slice end beyond the sequence is clamped to its length, so the
        # final, shorter batch needs no special case (and no arithmetic on
        # `data` itself, which fails for a plain list).
        batch = data[start:start + batch_size]
        # presumably insertmany returns the number of rows written - it is
        # summed as a number here; verify against sql_util.
        result = sql_util.insertmany(sql_str, batch)
        num += result
        print("[{}]--insert data num is {}...".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), num))
def insert_to_sql(data, batch_size=100):
    """Replace the contents of ``AMiner_Author`` with *data*.

    The table is truncated first; the rows are then written through
    ``sql_util.insertmany`` in consecutive batches, and the value returned
    by ``insertmany`` for each batch is printed.

    Args:
        data: Sliceable sequence of rows (it is measured with ``len`` and
            sliced), each row ordered as
            ``(index_id, n, a, pc, cn, hi, pi, upi, t)``.
        batch_size: Maximum number of rows handed to one ``insertmany``
            call.  Defaults to 100.
    """
    sql_util.execute("truncate table AMiner_Author")
    sql_str = "insert into AMiner_Author(index_id, n, a, pc, cn, hi, pi, upi, t) values(%s, %s, %s, %s, %s, %s, %s, " \
              "%s, %s)"
    for start in range(0, len(data), batch_size):
        # A slice end beyond the sequence is clamped to its length, so the
        # final, shorter batch needs no special case (and no arithmetic on
        # `data` itself, which fails for a plain list).
        batch = data[start:start + batch_size]
        result = sql_util.insertmany(sql_str, batch)
        print("[{}]--insert data num is {}...".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), result))
def insert_to_sql(data, batch_size=100):
    """Replace the contents of ``AMiner_Coauthor`` with *data*.

    The table is truncated first; the rows are then written through
    ``sql_util.insertmany`` in consecutive batches, and the running total
    of the values returned by ``insertmany`` is printed after each batch.

    Args:
        data: Sliceable sequence of rows (it is measured with ``len`` and
            sliced), each row ordered as
            ``(one_author, another_author, collaborations)``.
        batch_size: Maximum number of rows handed to one ``insertmany``
            call.  Defaults to 100.
    """
    sql_util.execute("truncate table AMiner_Coauthor")
    sql_str = "insert into AMiner_Coauthor(one_author, another_author, collaborations) values(%s, %s, %s)"
    num = 0
    for start in range(0, len(data), batch_size):
        # A slice end beyond the sequence is clamped to its length, so the
        # final, shorter batch needs no special case (and no arithmetic on
        # `data` itself, which fails for a plain list).
        batch = data[start:start + batch_size]
        # presumably insertmany returns the number of rows written - it is
        # summed as a number here; verify against sql_util.
        result = sql_util.insertmany(sql_str, batch)
        num += result
        print("[{}]--insert data num is {}...".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), num))
def tianya_data():
    """Rebuild ``sj_tianya_article`` and ``sj_tianya_comment`` from ``tianya``.

    Steps, in order:

    1. Truncate both target tables.
    2. Query at most 200 ``tianya`` rows whose ``question_link`` differs
       from ``'[]'``.
    3. Give every row a fresh UUID, turn it into one article record and
       turn each entry of its JSON ``question_answer`` payload into one
       comment record that points back at the article through that UUID.
    4. Bulk-insert the articles, then the comments.

    A timestamped progress line is printed after each step.  Nothing is
    returned.

    Rows whose ``question_answer`` is missing, empty or decodes to JSON
    ``null`` contribute an article but no comments.

    Raises:
        json.JSONDecodeError: If a non-empty ``question_answer`` value is
            not valid JSON.
    """
    stamp_format = "%Y-%m-%d %H-%M-%S"

    def announce(template, *extra):
        # Every progress line starts with the current local time.
        print(template.format(time.strftime(stamp_format, time.localtime()), *extra))

    announce("[{}]--start process tianya!")
    sql.execute("truncate table sj_tianya_article")
    sql.execute("truncate table sj_tianya_comment")
    announce("[{}]--truncate table finally!")

    article = []
    comment = []
    # NOTE(review): the filter is on question_link although the JSON that is
    # parsed below lives in question_answer - confirm which column was meant.
    result = sql.queryall(
        "select * from tianya where question_link <> '[]' limit %s", 200)
    for item1 in result:
        article_id = str(uuid.uuid4())
        article.append([
            article_id,
            item1.get('question_title'),
            item1.get('get_time'),
            item1.get('question_detail'),
            item1.get('question_author'),
            item1.get('question_publish_time'),
            item1.get('question_topics'),
            item1.get('question_link'),
        ])

        # The query does not guarantee that question_answer is populated.
        # The target tables are already truncated at this point, so a
        # missing payload is treated as "no answers" instead of aborting
        # the whole reload inside json.loads.
        raw_answers = item1.get('question_answer')
        answers = (json.loads(raw_answers) if raw_answers else None) or []
        for item in answers:
            comment.append([
                str(uuid.uuid4()),
                article_id,
                item.get('question_answer_content'),
                item.get('question_answer_author'),
                item.get('question_answer_agree_count'),
                item.get('question_answer_publish_time'),
            ])
    announce("[{}]--data integration finally!")

    # Write the collected rows to the database.
    insert_article_sql = 'insert into sj_tianya_article(id, question_title, get_time, question_detail,' \
                         ' question_author, question_publish_time, question_topics, question_link)' \
                         ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    insert_comment_sql = 'insert into sj_tianya_comment(id, article_id, question_answer_content, ' \
                         'question_answer_author, question_answer_agree_count, question_answer_publish_time)' \
                         ' values(%s, %s, %s, %s, %s, %s)'

    cnt = sql.insertmany(insert_article_sql, article)
    announce("[{}]--article data insert to sql success. data count is {}.", cnt)
    cnt = sql.insertmany(insert_comment_sql, comment)
    announce("[{}]--comment data insert to sql success. data count is {}.", cnt)
# NOTE(review): the `return arrays` below is the tail of a definition that
# starts before the visible part of this file; it is kept untouched.
    return arrays


# Collect file paths reachable from dir_path into file_list and return that
# same list.
#   dir_path  -- a path; a regular file is recorded as-is, a directory is
#                walked recursively, anything else is ignored.
#   file_list -- list that receives the paths; it is mutated in place.
def get_file_list(dir_path, file_list):
    if os.path.isfile(dir_path):
        file_list.append(dir_path)
    elif os.path.isdir(dir_path):
        for s in os.listdir(dir_path):
            new_dir = os.path.join(dir_path, s)
            # The recursive call appends to the shared list, so its return
            # value is not needed here.
            get_file_list(new_dir, file_list)
    return file_list


# Script entry point: empty origin_data, then load every Excel / CSV file
# found under the hard-coded data directory into it.
if __name__ == '__main__':
    sql = "truncate table origin_data"
    sql_util.execute(sql)
    sql_str = "insert into origin_data(id, title, time, content, read_num, user, comment, comment_time, like_num) " \
              "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    file_list = get_file_list("/Users/red/Desktop/temp/news/data", [])
    for file_path in file_list:
        # The text after the last "." is used as the file extension; the
        # comparison below is case-sensitive.
        suffix = str.split(file_path, ".")
        if suffix[-1] == "xlsx" or suffix[-1] == "xls":
            print("处理excel文件%s" % file_path)
            # presumably read_xls_file returns rows matching the nine columns
            # of sql_str - verify against its definition.
            arrays = read_xls_file(file_path)
            result = sql_util.insertmany(sql_str, arrays)
            print("插入数据%s条" % result)
        elif suffix[-1] == "csv":
            print("处理csv文件%s" % file_path)
            # presumably read_csv_file returns rows in the same layout -
            # verify against its definition.
            arrays = read_csv_file(file_path)
            # NOTE(review): unlike the Excel branch, the insert result is
            # not printed here.
            result = sql_util.insertmany(sql_str, arrays)