Пример #1
0
def sina_data():
    """Rebuild the ``sj_sina_article`` and ``sj_sina_comment`` tables.

    Both target tables are truncated first. Up to 200 rows are then read from
    ``sina`` (not long text, not a repost, ``comments`` different from
    ``'[]'``). Every row yields one article record identified by a fresh UUID,
    and every entry decoded from the row's JSON ``comments`` value yields one
    comment record that carries the article's id. The two record lists are
    bulk-inserted, and a timestamped progress line is printed after each stage.
    """

    def stamp():
        # Timestamp that prefixes every progress line.
        return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

    # Source keys in target-column order. 'post_time' is listed twice on
    # purpose: it fills both the `time` and the `post_time` columns.
    article_keys = (
        'url', 'post_time', 'nickname', 'post_time', 'post_content_txt',
        'reposts_count', 'comments_count', 'attitudes_count', 'topic',
    )
    comment_keys = (
        'comment_userid', 'comment_nickname', 'comment_content',
        'comment_attitudes_count', 'comment_time', 'comment_source',
    )

    print("[{}]--start process sina!".format(stamp()))
    for table_reset in ("truncate table sj_sina_article",
                        "truncate table sj_sina_comment"):
        sql.execute(table_reset)
    print("[{}]--truncate table finally!".format(stamp()))

    article = []
    comment = []
    rows = sql.queryall(
        "select * from sina where isLongText = 'False' and is_repost = 'false' and comments <> '[]' limit %s",
        200)
    for row in rows:
        article_id = str(uuid.uuid4())
        article.append([article_id] + [row.get(key) for key in article_keys])
        # The comments column holds JSON text; each decoded entry is one comment.
        for entry in json.loads(row.get('comments')):
            comment.append(
                [str(uuid.uuid4()), article_id]
                + [entry.get(key) for key in comment_keys])
    print("[{}]--data integration finally!".format(stamp()))

    # Insert the collected data into the database.
    insert_article_sql = (
        'insert into sj_sina_article(id, url, time, nickname, post_time, post_content_txt,'
        ' reposts_count, comments_count, attitudes_count, topic)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    insert_comment_sql = (
        'insert into sj_sina_comment(id, article_id, comment_userid, comment_nickname,'
        ' comment_content, comment_attitudes_count, comment_time, comment_source)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )

    inserted = sql.insertmany(insert_article_sql, article)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        stamp(), inserted))
    inserted = sql.insertmany(insert_comment_sql, comment)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        stamp(), inserted))
Пример #2
0
def insert_data(article, comment):
    """Replace the contents of the ``article`` and ``comment`` tables.

    Both tables are truncated, then ``article`` and ``comment`` are handed to
    ``sql.insertmany`` with their respective insert statements.

    :param article: parameter rows for the ``article`` insert; each row
        supplies id, title, time, content, read_num in that order.
    :param comment: parameter rows for the ``comment`` insert; each row
        supplies user, comment, comment_time, like_num, article_id, id in
        that order.
    """
    # Clear both tables before loading the new rows.
    for truncate_stmt in ("truncate table article", "truncate table comment"):
        sql.execute(truncate_stmt)

    article_insert = (
        "insert into article(id, title, time, content, read_num) "
        "values (%s, %s, %s, %s, %s)"
    )
    comment_insert = (
        "insert into comment(user, comment, comment_time, like_num, article_id, id) "
        "values (%s, %s, %s, %s, %s, %s)"
    )
    sql.insertmany(article_insert, article)
    sql.insertmany(comment_insert, comment)
Пример #3
0
def insert_to_sql(data):
    """Replace the contents of ``AMiner_Author2Paper`` with ``data``.

    The table is truncated, then ``data`` is inserted in batches of 100 rows
    through ``sql_util.insertmany``. After each batch a timestamped line with
    the running total is printed.

    :param data: sliceable sequence of parameter rows; each row supplies
        index_id, author_id, paper_id, position in that order.
    """
    sql_util.execute("truncate table AMiner_Author2Paper")
    sql_str = "insert into AMiner_Author2Paper(index_id, author_id, paper_id, position) values(%s, %s, %s, %s)"
    batch_size = 100
    num = 0
    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of the sequence, so the final short batch
        # needs no special case. The previous `data[i:len(data - i)]` branch
        # raised TypeError for list input because a list cannot be subtracted
        # from.
        batch = data[start:start + batch_size]
        # insertmany's return value is summed as a count, as the original did
        # -- presumably the number of inserted rows; verify against sql_util.
        num += sql_util.insertmany(sql_str, batch)
        print("[{}]--insert data num is {}...".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), num))
Пример #4
0
def insert_to_sql(data):
    """Replace the contents of ``AMiner_Author`` with ``data``.

    The table is truncated, then ``data`` is inserted in batches of 100 rows
    through ``sql_util.insertmany``. One timestamped line with the total
    across all batches is printed once the load is complete.

    :param data: sliceable sequence of parameter rows; each row supplies
        index_id, n, a, pc, cn, hi, pi, upi, t in that order.
    """
    sql_util.execute("truncate table AMiner_Author")
    sql_str = (
        "insert into AMiner_Author(index_id, n, a, pc, cn, hi, pi, upi, t) values(%s, %s, %s, %s, %s, %s, %s, "
        "%s, %s)"
    )
    batch_size = 100
    # Running total across batches. Starting at 0 also keeps the final report
    # valid when `data` is empty; previously `result` was unbound in that case.
    total = 0
    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of the sequence, so the final short batch
        # needs no special case. The previous `data[i:len(data - i)]` branch
        # raised TypeError for list input.
        batch = data[start:start + batch_size]
        # insertmany's return value is treated as a count -- presumably the
        # number of inserted rows; verify against sql_util.
        total += sql_util.insertmany(sql_str, batch)
    # Report the total rather than only the last batch's result.
    print("[{}]--insert data num is {}...".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), total))
Пример #5
0
def insert_to_sql(data):
    """Replace the contents of ``AMiner_Coauthor`` with ``data``.

    The table is truncated, then ``data`` is inserted in batches of 100 rows
    through ``sql_util.insertmany``. After each batch a timestamped line with
    the running total is printed.

    :param data: sliceable sequence of parameter rows; each row supplies
        one_author, another_author, collaborations in that order.
    """
    sql_util.execute("truncate table AMiner_Coauthor")
    sql_str = "insert into AMiner_Coauthor(one_author, another_author, collaborations) values(%s, %s, %s)"
    batch_size = 100
    num = 0
    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of the sequence, so the final short batch
        # needs no special case. The previous `data[i:len(data - i)]` branch
        # raised TypeError for list input because a list cannot be subtracted
        # from.
        batch = data[start:start + batch_size]
        # insertmany's return value is summed as a count, as the original did
        # -- presumably the number of inserted rows; verify against sql_util.
        num += sql_util.insertmany(sql_str, batch)
        print("[{}]--insert data num is {}...".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), num))
Пример #6
0
def tianya_data():
    """Rebuild the ``sj_tianya_article`` and ``sj_tianya_comment`` tables.

    Both target tables are truncated first. Up to 200 rows are then read from
    ``tianya`` (``question_link`` different from ``'[]'``). Every row yields
    one article record identified by a fresh UUID, and every entry decoded
    from the row's JSON ``question_answer`` value yields one comment record
    that carries the article's id. The two record lists are bulk-inserted,
    and a timestamped progress line is printed after each stage.
    """

    def stamp():
        # Timestamp that prefixes every progress line.
        return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

    # Source keys in target-column order.
    article_keys = (
        'question_title', 'get_time', 'question_detail', 'question_author',
        'question_publish_time', 'question_topics', 'question_link',
    )
    answer_keys = (
        'question_answer_content', 'question_answer_author',
        'question_answer_agree_count', 'question_answer_publish_time',
    )

    print("[{}]--start process tianya!".format(stamp()))
    for table_reset in ("truncate table sj_tianya_article",
                        "truncate table sj_tianya_comment"):
        sql.execute(table_reset)
    print("[{}]--truncate table finally!".format(stamp()))

    article = []
    comment = []
    rows = sql.queryall(
        "select * from tianya where question_link <> '[]' limit %s", 200)
    for row in rows:
        article_id = str(uuid.uuid4())
        article.append([article_id] + [row.get(key) for key in article_keys])
        # The question_answer column holds JSON text; each decoded entry is
        # stored as one comment row.
        for answer in json.loads(row.get('question_answer')):
            comment.append(
                [str(uuid.uuid4()), article_id]
                + [answer.get(key) for key in answer_keys])
    print("[{}]--data integration finally!".format(stamp()))

    # Insert the collected data into the database.
    insert_article_sql = (
        'insert into sj_tianya_article(id, question_title, get_time, question_detail,'
        ' question_author, question_publish_time, question_topics, question_link)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )
    insert_comment_sql = (
        'insert into sj_tianya_comment(id, article_id, question_answer_content, '
        'question_answer_author, question_answer_agree_count, question_answer_publish_time)'
        ' values(%s, %s, %s, %s, %s, %s)'
    )

    inserted = sql.insertmany(insert_article_sql, article)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        stamp(), inserted))
    inserted = sql.insertmany(insert_comment_sql, comment)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        stamp(), inserted))
Пример #7
0
    return arrays


def get_file_list(dir_path, file_list):
    """Collect the paths of all regular files found at or below ``dir_path``.

    :param dir_path: path of a file or directory. A file is recorded
        directly; a directory is walked recursively; anything else is ignored.
    :param file_list: list that found paths are appended to (mutated in place).
    :return: the same ``file_list`` object that was passed in.
    """
    if os.path.isfile(dir_path):
        file_list.append(dir_path)
        return file_list
    if not os.path.isdir(dir_path):
        # Neither a file nor a directory (e.g. a missing path): nothing to add.
        return file_list
    for entry in os.listdir(dir_path):
        get_file_list(os.path.join(dir_path, entry), file_list)
    return file_list


if __name__ == '__main__':
    sql = "truncate table origin_data"
    sql_util.execute(sql)

    sql_str = "insert into origin_data(id, title, time, content, read_num, user, comment, comment_time, like_num) " \
              "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    file_list = get_file_list("/Users/red/Desktop/temp/news/data", [])
    for file_path in file_list:
        suffix = str.split(file_path, ".")
        if suffix[-1] == "xlsx" or suffix[-1] == "xls":
            print("处理excel文件%s" % file_path)
            arrays = read_xls_file(file_path)
            result = sql_util.insertmany(sql_str, arrays)
            print("插入数据%s条" % result)
        elif suffix[-1] == "csv":
            print("处理csv文件%s" % file_path)
            arrays = read_csv_file(file_path)
            result = sql_util.insertmany(sql_str, arrays)