Пример #1
0
def get_record():
    """Load every bazhuayu article together with its comments.

    Articles are read from ``bazhuayu_article`` in pages of 100 rows; for each
    article the matching rows of ``bazhuayu_comment`` are fetched by
    ``article_id``.

    Returns:
        list: one entry per article, shaped
        ``[title, time, content, read_num, comments]``, where ``comments`` is
        a list of ``[user, comment, comment_time, like_num]`` lists.
    """
    print('[{}]--start save word'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    result = []
    num = int(sql.queryone("select count(*) from bazhuayu_article"))
    step = 100
    count = 0
    # ``LIMIT offset, row_count``: the offset is 0-based and the row count is
    # the page size, so it must stay constant. Stepping the range by ``step``
    # also visits the final partial page when ``num`` is not a multiple of it.
    # NOTE(review): without an ORDER BY the page order is up to the database.
    for offset in range(0, num, step):
        array = sql.queryall("select * from bazhuayu_article limit %s, %s", (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            comment_array = sql.queryall("select * from bazhuayu_comment where article_id = %s", item.get('id'))
            comment = [
                [
                    item1.get('user'),
                    item1.get('comment'),
                    item1.get('comment_time'),
                    item1.get('like_num'),
                ]
                for item1 in comment_array
            ]
            result.append([
                item.get('title'),
                item.get('time'),
                item.get('content'),
                item.get('read_num'),
                comment,
            ])
    print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return result
Пример #2
0
def get_all_data():
    """Split the flat ``origin_data`` table into article and comment rows.

    One article row is produced per distinct title (taken from the first row
    returned for that title) and one comment row per distinct comment text
    within that title. Fresh UUID4 strings are generated as identifiers.

    Returns:
        tuple: ``(article, comment)`` where each article is
        ``[article_id, title, time, content, read_num]`` and each comment is
        ``[user, comment, comment_time, like_num, article_id, comment_id]``.
    """
    title_lst = sql.queryall("select title from origin_data GROUP BY title")
    article = []
    comment = []
    for title in title_lst:
        # NOTE(review): ``title`` is bound as-is, exactly as before; if
        # queryall yields dict rows, confirm the helper unwraps the value.
        lst = sql.queryall("select * from origin_data where title = %s", title)
        if not lst:
            # e.g. a NULL title group: ``title = NULL`` never matches, and
            # indexing lst[0] below would raise IndexError.
            continue
        article_id = str(uuid.uuid4())
        first = lst[0]
        article.append([
            article_id,
            first.get('title'),
            first.get('time'),
            first.get('content'),
            first.get('read_num'),
        ])

        # A set gives O(1) duplicate checks instead of rescanning a list.
        seen_comments = set()
        for item in lst:
            text = item.get('comment')
            if text in seen_comments:
                continue
            seen_comments.add(text)
            comment.append([
                item.get('user'),
                text,
                item.get('comment_time'),
                item.get('like_num'),
                article_id,
                str(uuid.uuid4()),
            ])
    return article, comment
Пример #3
0
def sina_record():
    """Load every ``sj_sina_article`` row together with its comments.

    Articles are read in pages of 100 rows; for each article the matching
    rows of ``sj_sina_comment`` are fetched by ``article_id``.

    Returns:
        list: one entry per article, shaped
        ``[id, nickname, post_time, post_content_txt, reposts_count,
        comments_count, attitudes_count, comments]``, where ``comments`` is a
        list of ``[comment_nickname, comment_content,
        comment_attitudes_count, comment_time, comment_source]`` lists.
    """
    print('[{}]--start save word'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    result = []
    num = int(sql.queryone("select count(*) from sj_sina_article"))
    step = 100
    count = 0
    # ``LIMIT offset, row_count``: 0-based offset plus a constant page size.
    # Stepping the range by ``step`` also covers the final partial page.
    # NOTE(review): without an ORDER BY the page order is up to the database.
    for offset in range(0, num, step):
        array = sql.queryall("select * from sj_sina_article limit %s, %s",
                             (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            comment_array = sql.queryall(
                "select * from sj_sina_comment where article_id = %s",
                item.get('id'))
            comment = [
                [
                    item1.get('comment_nickname'),
                    item1.get('comment_content'),
                    item1.get('comment_attitudes_count'),
                    item1.get('comment_time'),
                    item1.get('comment_source'),
                ]
                for item1 in comment_array
            ]
            result.append([
                item.get('id'),
                item.get('nickname'),
                item.get('post_time'),
                item.get('post_content_txt'),
                item.get('reposts_count'),
                item.get('comments_count'),
                item.get('attitudes_count'),
                comment,
            ])
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return result
Пример #4
0
def sina_data():
    """Rebuild ``sj_sina_article`` / ``sj_sina_comment`` from the ``sina`` table.

    Both target tables are truncated first. Up to 200 source rows are then
    read, each is turned into one article row with a fresh UUID4 id, and the
    JSON list stored in its ``comments`` column is expanded into comment rows
    that reference that id. Finally both row sets are bulk-inserted and the
    counts returned by ``sql.insertmany`` are printed.
    """
    def _stamp():
        # Timestamp used as the prefix of every progress line.
        return time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())

    article_fields = (
        'url', 'post_time', 'nickname', 'post_time', 'post_content_txt',
        'reposts_count', 'comments_count', 'attitudes_count', 'topic',
    )
    comment_fields = (
        'comment_userid', 'comment_nickname', 'comment_content',
        'comment_attitudes_count', 'comment_time', 'comment_source',
    )

    print("[{}]--start process sina!".format(_stamp()))
    for statement in ("truncate table sj_sina_article",
                      "truncate table sj_sina_comment"):
        sql.execute(statement)
    print("[{}]--truncate table finally!".format(_stamp()))

    source_rows = sql.queryall(
        "select * from sina where isLongText = 'False' and is_repost = 'false' and comments <> '[]' limit %s",
        200)

    article_rows = []
    comment_rows = []
    for post in source_rows:
        article_id = str(uuid.uuid4())
        article_rows.append(
            [article_id] + [post.get(field) for field in article_fields])
        for reply in json.loads(post.get('comments')):
            comment_rows.append(
                [str(uuid.uuid4()), article_id]
                + [reply.get(field) for field in comment_fields])
    print("[{}]--data integration finally!".format(_stamp()))

    # Write the collected rows to the database.
    insert_article_sql = (
        'insert into sj_sina_article(id, url, time, nickname, post_time, post_content_txt,'
        ' reposts_count, comments_count, attitudes_count, topic)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    )
    insert_comment_sql = (
        'insert into sj_sina_comment(id, article_id, comment_userid, comment_nickname,'
        ' comment_content, comment_attitudes_count, comment_time, comment_source)'
        ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    )

    inserted = sql.insertmany(insert_article_sql, article_rows)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        _stamp(), inserted))
    inserted = sql.insertmany(insert_comment_sql, comment_rows)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        _stamp(), inserted))
Пример #5
0
def get_record():
    """Collect tianya articles, their tag-stripped comments and an index.

    Articles are read from ``sj_tianya_article`` in pages of 100 rows; for
    each one the ``question_answer_content`` of every matching
    ``sj_tianya_comment`` row is passed through ``FilterTag.stripTagSimple``
    and concatenated, each followed by ``'\\r'``.

    Returns:
        tuple: ``(article_comment, article, excel)`` where

        * ``article_comment`` holds ``[question_detail, comments_text]``,
        * ``article`` holds ``[question_detail]`` only,
        * ``excel`` holds ``[txt_id, question_link, publish_time,
          question_topics]`` with ``txt_id`` a running number zero-padded to
          five digits and ``publish_time`` formatted from
          ``question_publish_time`` (assumed to be epoch seconds - TODO
          confirm).
    """
    filters = filter_tags_util.FilterTag()

    print('[{}]--start save word'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    article_comment = []
    article = []
    excel = []
    num = int(sql.queryone("select count(*) from sj_tianya_article"))
    step = 100
    count = 0
    seq = 0  # running article number; avoids shadowing the builtin ``sum``
    # ``LIMIT offset, row_count``: the row count is the page size and must
    # stay constant, otherwise later pages overlap earlier ones. Stepping the
    # range by ``step`` also covers the final partial page.
    for offset in range(0, num, step):
        array = sql.queryall("select * from sj_tianya_article limit %s, %s", (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            seq += 1
            txt_id = str(seq).zfill(5)
            excel_temp = [txt_id, item.get('question_link'),
                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(item.get('question_publish_time')))),
                          item.get('question_topics')]
            detail = item.get('question_detail')

            comment_array = sql.queryall("select * from sj_tianya_comment where article_id = %s", item.get('id'))
            temp_filter = ''.join(
                filters.stripTagSimple(item1.get('question_answer_content')) + '\r'
                for item1 in comment_array
            )

            # Article only. A separate list is required: sharing one list
            # object with article_comment would leak the comments in here.
            article.append([detail])
            # Article plus comments.
            article_comment.append([detail, temp_filter])
            # Index row for the Excel sheet.
            excel.append(excel_temp)
    print('[{}]--data process finally'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article_comment, article, excel
Пример #6
0
def tianya_data():
    """Rebuild ``sj_tianya_article`` / ``sj_tianya_comment`` from ``tianya``.

    Both target tables are truncated first. Up to 200 source rows are then
    read, each is turned into one article row with a fresh UUID4 id, and the
    JSON list stored in its ``question_answer`` column is expanded into
    comment rows that reference that id. Finally both row sets are
    bulk-inserted and the counts returned by ``sql.insertmany`` are printed.
    """
    print("[{}]--start process tianya!".format(
        time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())))
    sql.execute("truncate table sj_tianya_article")
    sql.execute("truncate table sj_tianya_comment")
    print("[{}]--truncate table finally!".format(
        time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())))

    article = []
    comment = []
    # NOTE(review): the filter is on question_link, not question_answer -
    # confirm this is intended; it does not exclude rows without answers.
    result = sql.queryall(
        "select * from tianya where question_link <> '[]' limit %s", 200)
    for item1 in result:
        article_id = str(uuid.uuid4())
        article.append([
            article_id,
            item1.get('question_title'),
            item1.get('get_time'),
            item1.get('question_detail'),
            item1.get('question_author'),
            item1.get('question_publish_time'),
            item1.get('question_topics'),
            item1.get('question_link'),
        ])

        # json.loads(None) raises TypeError and json.loads('') raises
        # JSONDecodeError; either would abort the run after the target tables
        # were already truncated, so treat a missing value as "no answers".
        raw_answers = item1.get('question_answer')
        answers = json.loads(raw_answers) if raw_answers else []
        for item in answers:
            comment.append([
                str(uuid.uuid4()), article_id,
                item.get('question_answer_content'),
                item.get('question_answer_author'),
                item.get('question_answer_agree_count'),
                item.get('question_answer_publish_time'),
            ])
    print("[{}]--data integration finally!".format(
        time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())))
    # Write the collected rows to the database.
    insert_article_sql = 'insert into sj_tianya_article(id, question_title, get_time, question_detail,' \
                         ' question_author, question_publish_time, question_topics, question_link)' \
                         ' values(%s, %s, %s, %s, %s, %s, %s, %s)'
    insert_comment_sql = 'insert into sj_tianya_comment(id, article_id, question_answer_content, ' \
                         'question_answer_author, question_answer_agree_count, question_answer_publish_time)' \
                         ' values(%s, %s, %s, %s, %s, %s)'

    cnt = sql.insertmany(insert_article_sql, article)
    print("[{}]--article data insert to sql success. data count is {}.".format(
        time.strftime("%Y-%m-%d %H-%M-%S", time.localtime()), cnt))
    cnt = sql.insertmany(insert_comment_sql, comment)
    print("[{}]--comment data insert to sql success. data count is {}.".format(
        time.strftime("%Y-%m-%d %H-%M-%S", time.localtime()), cnt))
Пример #7
0
def get_record(num=21000, step=1000):
    """Group ``origin_data`` rows into unique articles and per-title comments.

    Args:
        num: upper bound on the number of rows to read (the original
            hard-coded value is kept as the default).
        step: page size used for each query.

    Returns:
        tuple: ``(result, comment)`` where ``result`` has one
        ``[title, time, content, readNum]`` list per distinct title, in
        first-seen order, and ``comment`` is a ``defaultdict(list)`` mapping
        each title to a flat list of
        ``user, comment, commentTime, likeNum`` values appended row by row.
    """
    seen_titles = set()  # O(1) membership test instead of scanning a list
    result = []
    comment = defaultdict(list)
    count = 0
    # ``LIMIT offset, row_count``: 0-based offset plus a constant page size,
    # so consecutive pages neither skip the first row nor overlap.
    for offset in range(0, num, step):
        array = sql.queryall("select * from origin_data limit %s, %s",
                             (offset, step))
        if not array:
            # The table holds fewer rows than ``num``; nothing left to read.
            break
        count += len(array)
        print("%s条数据读取完毕。。。。。。" % count)

        for item in array:
            title = item.get('title')
            # The first row of each title supplies the article fields; no
            # special case is needed for the very first row of the table.
            if title not in seen_titles:
                seen_titles.add(title)
                result.append([
                    title,
                    item.get('time'),
                    item.get('content'),
                    item.get('readNum'),
                ])
            comment[title].extend((
                item.get('user'),
                item.get('comment'),
                item.get('commentTime'),
                item.get('likeNum'),
            ))
    return result, comment
Пример #8
0
def get_record():
    """Collect tag-stripped souhu article bodies and a matching index.

    Reads at most 200 rows of ``souhu`` in pages of 100.

    Returns:
        tuple: ``(article, excel)`` where ``article`` holds one
        ``[stripped_article_content]`` list per row and ``excel`` holds
        ``[txt_id, article_link, time, article_category]`` with ``txt_id`` a
        running number zero-padded to five digits and ``time`` formatted via
        ``time.localtime`` from the row's ``time`` value (assumed to be epoch
        seconds - TODO confirm).
    """
    filters = filter_tags_util.FilterTag()

    print('[{}]--start save word'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    article = []
    excel = []
    num = 200
    step = 100
    count = 0
    seq = 0  # running article number; avoids shadowing the builtin ``sum``
    # ``LIMIT offset, row_count``: the row count is the page size and must
    # stay constant; a growing value makes the second page return 200 rows.
    for offset in range(0, num, step):
        array = sql.queryall("select * from souhu limit %s, %s",
                             (offset, step))
        count += len(array)
        print("[{}]--{} data has get from databases......".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))
        for item in array:
            seq += 1
            txt_id = str(seq).zfill(5)
            excel_temp = [
                txt_id,
                item.get('article_link'),
                time.strftime("%Y-%m-%d %H:%M:%S",
                              time.localtime(item.get('time'))),
                item.get('article_category'),
            ]
            # Article text only.
            article.append([filters.stripTagSimple(item.get('article_content'))])

            # Index row for the Excel sheet.
            excel.append(excel_temp)
    print('[{}]--data process finally'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return article, excel
def get_data():
    """Fetch up to 3000 ``AMiner_Author`` rows, print them and return them.

    Selected columns: ``index_id, pc, cn, hi, pi, upi``.
    """
    # Earlier variant of the query, kept for reference:
    #   select n, a , pc, cn, hi, pi, upi, t from AMiner_Author limit 1000
    rows = sql.queryall(
        "select index_id, pc, cn, hi, pi, upi from AMiner_Author limit 3000")
    print(rows)
    return rows
Пример #10
0
        count = 0
        for content in self._data_list:
            count += 1
            split(stop_word_list, content)
            if count % 100 == 0:
                print("processed %d records" % count)
        print("process the %d thread, records total num is %d" %
              (self._num, count))
        end_writer_file = time.time() - start_writer_file
        print("split the {:.0f} thread, records  time  {:.0f}m {:.0f}s".format(
            self._num, end_writer_file // 60, end_writer_file % 60))


if __name__ == '__main__':
    # Script entry point: read ``origin_data.content`` in batches and hand
    # each batch to MyThread workers in chunks, waiting for all workers of a
    # batch before fetching the next one.
    # NOTE(review): MyThread is presumably a threading.Thread subclass taking
    # (index, rows) - its definition is not visible here.
    stop_word_list = get_stop_word()

    num = sql_util.queryone("select count(*) from origin_data")

    limit = 1000  # rows fetched per query
    step = 100    # rows handed to each worker
    for offset in range(0, num, limit):
        # ``LIMIT offset, row_count``: the row count is the batch size and
        # must stay constant; a growing value makes batches overlap so rows
        # get processed more than once.
        data = sql_util.queryall("select content from origin_data limit %s,%s",
                                 (offset, limit))

        temp_list = [data[i:i + step] for i in range(0, len(data), step)]
        thr_list = [MyThread(index, chunk)
                    for index, chunk in enumerate(temp_list)]
        for thr in thr_list:
            thr.start()
        for thr in thr_list:
            thr.join()
Пример #11
0
def get_data_from_sql():
    """Return up to 100000 ``AMiner_Author`` rows with all metrics non-zero.

    Selected columns: ``pc, cn, hi, pi, upi``; rows where any of them equals
    0 are excluded by the query.
    """
    query = (
        "select pc, cn, hi, pi, upi from AMiner_Author where pc<>0 and cn<>0 and hi <>0 and pi<>0 and upi <> 0 "
        "limit 100000"
    )
    return sql_util.queryall(query)