Example #1
# All examples assume these imports, plus project-local helpers
# (connectDB, parsingContent, minorRandomPause/majorRandomPause, lw.log_writer,
# and the module_news_* taggers) that are sketched after the relevant examples below.
import requests
from bs4 import BeautifulSoup


def main():
    print('Program initiating ... ...')
    print()

    status = True

    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://finance.people.com.cn/')
            # The page is GBK-encoded; r.encoding only affects r.text, so pass
            # the encoding to BeautifulSoup instead of setting it on the response.
            soup = BeautifulSoup(r.content, features="html.parser", from_encoding="GBK")
            result = []

            # Front-page headline
            try:
                pageMainHeadURL = 'http://finance.people.com.cn/' + soup.find('div', {'class': 'title mt15'}).find('a').get('href')
                result.append(pageMainHeadURL)
            except Exception:
                print('Main page heading extraction error')
                print()

            # News box group
            newsBoxList = soup.findAll('div', {'class': 'news_box'})
            for item in newsBoxList:
                sectionHeadURL = 'http://finance.people.com.cn/' + item.find('a').get('href')
                result.append(sectionHeadURL)
                # Walk the links inside this news_box
                minorList = item.findAll('a')
                for a in minorList:
                    minorHeadURL = 'http://finance.people.com.cn/' + a.get('href')
                    result.append(minorHeadURL)

            # Rotating ('qiehuan') news group
            qiehuanList = soup.findAll('div', {'class': 'headingNews qiehuan1_c'})
            for qiehuan in qiehuanList:
                minorList = qiehuan.findAll('div', {'class': 'hdNews clearfix'})
                for item in minorList:
                    minorHeadURL = item.find('a').get('href')
                    if 'http' not in minorHeadURL:
                        minorHeadURL = 'http://finance.people.com.cn/' + minorHeadURL
                    
                    result.append(minorHeadURL)

            print('This round collected ', len(result), ' items')
            print()

            # ======== Check the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    # Parameterized query: safer than string concatenation and
                    # robust to quotes in the URL.
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' already exists')
                        print()
                except Exception:
                    print('Error while checking link against the database')
                    print()
                    break

            print('This round has ', len(new_result), ' new items')
            print()

            # ======== Check the database for duplicates END =========
            # ======== Insert new data =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Executed successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = 'INSERT INTO ttd.news (news_id, news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                        rst = parsingContent(link)
                        # ======= Government / company tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government / company tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']), str(rst['news_date']), str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Error while parsing or inserting article')
                        print()
                        break

                # ======== Insert new data END =========
            print('This round has ended, closing connection')
            print()
            mydb.close()

        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()   # mydb may be unset if connectDB() itself failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
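
All of these examples lean on project helpers that are never shown. The sketch below is a hypothetical reconstruction of connectDB and the two pause helpers, assuming MySQL accessed through mysql-connector-python; the host, credentials, and pause ranges are placeholders, not the project's real values.

import random
import time

import mysql.connector


def connectDB():
    # Open a connection to the 'ttd' schema (placeholder credentials).
    return mysql.connector.connect(host='localhost',
                                   user='ttd_user',
                                   password='ttd_password',
                                   database='ttd')


def minorRandomPause():
    # Short pause between article fetches so the source site is not hammered.
    time.sleep(random.uniform(5, 15))


def majorRandomPause():
    # Long pause between polling rounds when nothing new was found.
    time.sleep(random.uniform(300, 900))
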
Example #2
def main():
    print('Program initiating ... ...')
    print()

    status = True

    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://www.sasac.gov.cn/n2588025/index.html')  # target URL
            soup = BeautifulSoup(r.content, features="html.parser")
            result = []
            # Front page, left column
            links = soup.findAll('div', {'class': 'wrz-alist'})
            for link in links[:-1]:
                tmp = link.findAll('li')
                for item in tmp:
                    if 'http' in item.find('a').get('href'):
                        result.append(item.find('a').get('href'))
                    else:
                        result.append('http://www.sasac.gov.cn/' + item.find('a').get('href').replace('..', ''))

            print('This round collected ', len(result), ' items')
            print()
            # ======== Check the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' already exists')
                        print()
                except Exception:
                    print('Error while checking link against the database')
                    print()
                    break

            print('This round has ', len(new_result), ' new items')
            print()

            # ======== Check the database for duplicates END =========
            # ======== Insert new data =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Executed successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = 'INSERT INTO ttd.news (news_id, news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                        rst = parsingContent(link)
                        # ======= Government / company tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government / company tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']), str(rst['news_date']), str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Error while parsing or inserting article')
                        print()
                        break

                # ======== Insert new data END =========
            print('This round has ended, closing connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()   # mydb may be unset if connectDB() itself failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
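
Examples #1 and #2 build absolute URLs by concatenating a hard-coded prefix (plus a replace('..', '') hack), which breaks whenever an href is already absolute or climbs more than one directory level. urllib.parse.urljoin handles all of these cases; a minimal sketch, using the page each example already requests as the base:

from urllib.parse import urljoin


def absolutize(base_url, href):
    # urljoin resolves absolute, root-relative ('/x'), relative ('../x'),
    # and protocol-relative ('//host/x') hrefs in one call.
    return urljoin(base_url, href)


# For instance:
# absolutize('http://www.sasac.gov.cn/n2588025/index.html', '../n2588030/c1/content.html')
# -> 'http://www.sasac.gov.cn/n2588030/c1/content.html'
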
Example #3
def main():
    print('Xinhuanet Finance')
    print()

    # ============= Test connection =============

    mydb = connectDB()
    mycursor = mydb.cursor()

    # ============= Test connection END =============

    try:
        r = requests.get('http://xinhuanet.com/fortunepro/')
        soup = BeautifulSoup(r.content, features='html.parser')

    except:
        return

    # ============== Main page scrape ================

    news_list = soup.find('ul', {'class': 'silder_nav clearfix'}).findAll('li')
    news_list_item = {}

    for i in news_list:
        try:
            # Map href -> headline text
            news_list_item[i.find('a').get('href')] = i.find('a').text
        except Exception:
            pass

    print('Total', len(news_list_item), 'results')
    print()
    # ============== Main page scrape END ===============

    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            # news_list_item maps href -> title here, so compare the key to
            # news_link and the value to news_title (the original query had
            # the two columns swapped); parameterized for safety.
            sql = ('SELECT news_title, news_link FROM ttd.news '
                   'WHERE news_link = %s OR news_title = %s;')
            mycursor.execute(sql, (str(a), str(news_list_item[a])))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(a)
        except Exception:
            lw.log_writer('Xinhuanet Finance: error while checking a news item')

    lw.log_writer('Xinhuanet Finance: ' + str(len(confirmed_new)) + ' new items this round')

    # ============== Database comparison END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link, news_list_item[link])

            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Xinhuanet Finance: insert failed')

            lw.log_writer('Xinhuanet Finance: inserted ' + str(mycursor.rowcount) + ' item(s)')
            minorRandomPause()

        lw.log_writer('Xinhuanet Finance: round finished')
        mydb.close()
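
parsingContent is not shown in any example, but its contract can be inferred from the call sites: it fetches an article page and returns a dict with the fields that feed the INSERT. A hypothetical stub of that contract (the per-site field extraction is omitted; only the returned shape is inferred from usage):

def parsingContent(link, title=None):
    # Stub inferred from the call sites: fetch `link`, extract the article
    # fields, and return them keyed as below. Example #3 also passes the
    # headline in as a second argument.
    return {
        'news_title': title or '',
        'news_source': '',
        'news_date': '',
        'news_content': '',
        'news_link': link,
    }
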
Example #4
def main():
    print('Central Political and Legal Affairs Commission')
    print()

    # ============= Test connection =============

    mydb = connectDB()
    mycursor = mydb.cursor()

    # ============= Test connection END =============

    try:
        r = requests.get(
            'http://www.chinapeace.gov.cn/chinapeace/c100008/list2020.shtml')
        soup = BeautifulSoup(r.content, features='html.parser')

    except:
        return

    # ============== Main page scrape ================

    news_list = soup.find('div', {
        'class': "w1200 bgfff"
    }).find('div', {
        'class': 'list_box_left'
    }).findAll('li')
    news_list_item = {}

    for i in news_list:
        currentLi = i.findAll('a')
        for a in currentLi:
            news_list_item[a.text] = a.get('href')

    print('Total', len(news_list_item), 'results')
    print()
    # ============== Main page scrape END ===============

    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            sql = ('SELECT news_title, news_link FROM ttd.news '
                   'WHERE news_title = %s OR news_link = %s;')
            mycursor.execute(sql, (str(a), str(news_list_item[a])))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(news_list_item[a])
        except Exception:
            lw.log_writer('Central Political and Legal Affairs Commission: error while checking a front-page item')

    lw.log_writer('Central Political and Legal Affairs Commission: ' + str(len(confirmed_new)) + ' new items this round')

    # ============== Database comparison END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link)

            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Central Political and Legal Affairs Commission: insert failed')

            lw.log_writer('Central Political and Legal Affairs Commission: inserted ' + str(mycursor.rowcount) + ' item(s)')
            minorRandomPause()

        lw.log_writer('Central Political and Legal Affairs Commission: round finished')
        mydb.close()
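
Several examples log through lw.log_writer, which is also not shown. A minimal sketch, assuming it appends a timestamped line to a plain-text log file (the file name and timestamp format are assumptions):

# Hypothetical module 'lw'
import datetime


def log_writer(message, path='scraper.log'):
    # Append one timestamped line per message.
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(path, 'a', encoding='utf-8') as f:
        f.write(stamp + ' ' + message + '\n')
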
Example #5
def main():
    print('Program initiating ... ...')
    print()

    status = True

    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://caijing.chinadaily.com.cn/')  # target URL
            soup = BeautifulSoup(r.content, features="html.parser")
            result = []  # collected links

            # Collect every headline link on this page

            # Front page, right column
            topRightLinks = soup.find('div', {
                'class': 'yaowen'
            }).findAll('a')[1:]
            for link in topRightLinks:
                href = link.get('href')
                if href.startswith('//'):       # protocol-relative link
                    result.append('https:' + href)
                elif href.startswith('http'):   # already absolute
                    result.append(href)

            # Multinationals section

            left_liebiao_1 = soup.find('div', {
                'class': 'left-liebiao'
            }).findAll('div', {'class': 'busBox1'})
            for link in left_liebiao_1:
                href = link.find('a').get('href')
                result.append('https:' + href if href.startswith('//') else href)

            # Industry & companies section

            left_liebiao_2 = soup.findAll(
                'div',
                {'class': 'left-liebiao'})[1].findAll('div',
                                                      {'class': 'busBox1'})
            for link in left_liebiao_2:
                href = link.find('a').get('href')
                result.append('https:' + href if href.startswith('//') else href)

            print('This round collected ', len(result), ' items')
            print()

            # ======== Check the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' already exists')
                        print()
                except Exception:
                    print('Error while checking link against the database')
                    print()
                    break

            print('This round has ', len(new_result), ' new items')
            print()

            # ======== Check the database for duplicates END =========
            # ======== Insert new data =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute(
                            'SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;'
                        )
                        print('Executed successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = 'INSERT INTO ttd.news (news_id, news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                        rst = parsingContent(link)
                        # ======= Government / company tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(
                            mycursor, str(rst['news_title']),
                            str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(
                            mycursor, str(rst['news_title']),
                            str(rst['news_content']))
                        # ======= Government / company tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']),
                               str(rst['news_source']), str(rst['news_date']),
                               str(rst['news_content']), str(rst['news_link']),
                               gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Error while parsing or inserting article')
                        print()
                        break

                # ======== Insert new data END =========
            print('This round has ended, closing connection')
            print()
            mydb.close()

        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()   # mydb may be unset if connectDB() itself failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
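
Examples #1, #2, #5, #6, and #8 compute the next news_id with SELECT ... ORDER BY news_id DESC LIMIT 1 plus one, which costs an extra round trip and races if two scrapers insert at the same time. If the schema may be changed, an AUTO_INCREMENT key lets MySQL assign the id; a sketch, assuming news_id is already the table's primary key:

# Hypothetical one-time schema change: let MySQL assign news_id.
#   ALTER TABLE ttd.news MODIFY news_id INT NOT NULL AUTO_INCREMENT;

def insert_article(mydb, mycursor, rst, gov_tag, com_tag):
    # Insert without choosing news_id client-side; no race, no extra SELECT.
    sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
           'news_content, news_link, gov_tag, com_tag) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
           str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag)
    mycursor.execute(sql, val)
    mydb.commit()
    return mycursor.lastrowid  # id assigned by the server
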
Example #6
def main():
    print('Program initiating ... ...')
    print()

    status = True

    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://www.caixin.com/')
            soup = BeautifulSoup(r.content, features='html.parser')
            result = []  # collected links

            # Main page
            main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')

            for item in main_list:
                result.append(item.find('dd').find('p').find('a').get('href'))

            print('This round has ', len(result), ' items')
            print()

            # Check the database for duplicates
            new_result = []
            for link in result:
                try:
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        new_result.append(link)

                except Exception:
                    print('Error while checking link against the database')
                    print()
                    break

            print('This round has ', len(new_result), ' new items')
            print()

            if len(new_result) == 0:
                print('No new items found, restarting DB connection')
                print()
                mydb.close()
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute(
                            'SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;'
                        )
                        print('Executed successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = 'INSERT INTO ttd.news (news_id, news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                        rst = parsingContent(link)
                        # ======= Tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(
                            mycursor, str(rst['news_title']),
                            str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(
                            mycursor, str(rst['news_title']),
                            str(rst['news_content']))
                        # ======= Tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']),
                               str(rst['news_source']), str(rst['news_date']),
                               str(rst['news_content']), str(rst['news_link']),
                               gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Error while parsing or inserting article')
                        print()
                        break

            print('This round has ended, closing connection')
            print()
            mydb.close()

        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()   # mydb may be unset if connectDB() itself failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
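
None of the requests.get calls above sets a timeout, so one stalled server hangs the polling loop forever, and none sends a User-Agent, which some of these sites filter. A small fetch helper the examples could share (the header string and timeout value are arbitrary choices, not project settings):

import requests


def fetch(url, timeout=15):
    # Fail fast instead of hanging, and present an explicit User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; ttd-news-scraper)'}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # surface HTTP errors to the caller's except block
    return resp
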
Example #7
def main():
    print('Caixin')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    # ============= Test connection END =============

    r = requests.get('http://www.caixin.com/')
    soup = BeautifulSoup(r.content, features='html.parser')

    main_page_item = {}  # holds every item scraped from this page

    # ============= Main page scrape =============
    main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')

    for item in main_list:
        a = item.find('dd').find('p').find('a')
        main_page_item[a.text] = a.get('href')

    print('This round has ', len(main_page_item), ' items')
    print()
    # ============= Main page scrape END =============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            print('Error while checking a news item')
            print()
    print('New items this round:', len(confirmed_new))
    # ============== Database comparison END =================

    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            try:
                sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                rst = parsingContent(link)
                # ======= Tags - added 12.15 ==========
                gov_tag = module_news_govTag.tagGov(mycursor,
                                                    str(rst['news_title']),
                                                    str(rst['news_content']))
                com_tag = module_news_comTag.tagCom(mycursor,
                                                    str(rst['news_title']),
                                                    str(rst['news_content']))
                topic_tag = module_news_topicTag.tagTopic(
                    mycursor, str(rst['news_title']), str(rst['news_content']))
                # ======= Tags - added 12.15 END ==========
                val = (str(rst['news_title']), str(rst['news_source']),
                       str(rst['news_date']), str(rst['news_content']),
                       str(rst['news_link']), gov_tag, com_tag, topic_tag)
                mycursor.execute(sql, val)
                mydb.commit()
                lw.log_writer('Caixin: inserted ' + str(mycursor.rowcount) + ' item(s)')
                minorRandomPause()
            except Exception:
                print('Error while parsing or inserting article')
                print()
                break

    lw.log_writer('Caixin script: round finished')
    mydb.close()
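
Note that Example #7 keys main_page_item by the link text, so two articles with the same headline collapse into one entry, and the duplicate check above only matches on news_title. Keying by href, as Example #3 does, avoids both problems; a self-contained sketch of the adjusted scrape loop:

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('http://www.caixin.com/').content, 'html.parser')
main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')

main_page_item = {}  # href -> headline
for item in main_list:
    a = item.find('dd').find('p').find('a')
    # The href is unique per article even when headlines repeat.
    main_page_item[a.get('href')] = a.text
# The duplicate check can then filter on news_link instead of news_title.
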
Example #8
def main():
    print('Program initiating ... ...')
    print()

    status = True

    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://finance.china.com.cn/')
            soup = BeautifulSoup(r.content, features="html.parser")
            results = []  # collected links
            # Collect every headline link on this page

            # Front page, left column
            headLinks = soup.find('div', {'class': 'hot c'}).find('div', {'fl hot-lf'}).findAll('a')
            for link in headLinks:
                if 'http' in link.get('href') and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            # Finance & capital section

            f_and_c = soup.find('div', {'class': 'mt20'}).find('div', {'class': 'fl f-list'}).findAll('a')
            for link in f_and_c:
                if len(link.get('href')) > 26 and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            # Industry & companies section

            i_and_c = soup.find('div', {'class': 'indus mt20'}).find('div', {'class': 'fl f-list pt10'}).findAll('a')
            for link in i_and_c:
                if 'finance' in link.get('href') and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            print('This round collected ', len(results), ' items')

            # ======== Check the database for duplicates =========
            new_results = []
            for link in results:
                try:
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_results.append(link)
                    else:
                        print(link, ' already exists')
                        print()
                except Exception:
                    print('Error while checking link against the database')
                    print()
                    break

            print('This round has ', len(new_results), ' new items')
            print()

            # ======== Check the database for duplicates END =========
            # ======== Insert new data =========
            if len(new_results) == 0:
                majorRandomPause()
            else:
                for link in new_results:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Executed successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = 'INSERT INTO ttd.news (news_id, news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                        rst = parsingContent(link)
                        # ======= Government / company tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government / company tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']), str(rst['news_date']), str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Error while parsing or inserting article')
                        print()
                        break

                # ======== Insert new data END =========
            print('This round has ended, closing connection')
            print()
            mydb.close()


        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()   # mydb may be unset if connectDB() itself failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
Example #9
def main():
    print('Tiantian Fund (Eastmoney) news')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============

    r = requests.get('http://fund.eastmoney.com/a/cjjyw.html')
    soup = BeautifulSoup(r.content, features='html.parser')

    # ============== Main page scrape ==============
    main_list = soup.find('div', {'class': 'mainCont'}).findAll('ul')  # this block holds the page's four <ul> lists
    main_page_item = {}  # holds every item scraped from this page
    
    for i in main_list:
        currentUl = i.findAll('a')
        for a in currentUl:
            main_page_item[a.text] = a.get('href')

    print('Total', len(main_page_item), 'results')
    print()
    # ============== Main page scrape END ==============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('Eastmoney script: error while checking a front-page item')

    lw.log_writer('Eastmoney script: ' + str(len(confirmed_new)) + ' new items this round')
    # ============== Database comparison END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']), str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Eastmoney script: insert failed')
            lw.log_writer('Eastmoney script: inserted ' + str(mycursor.rowcount) + ' item(s)')
            minorRandomPause()
        
        lw.log_writer('Eastmoney script: round finished')
        mydb.close()
Example #10
def main():
    print('China Securities Journal')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============

    r = requests.get('http://www.cs.com.cn/')
    soup = BeautifulSoup(r.content, features='html.parser')

    # ============== Main page scrape ==============
    main_page_item = {}

    top_part = soup.find('div', {
        'class': 'box410 ch_focus space_l1'
    }).findAll('li')
    for i in top_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except:
            pass

    mid_part = soup.find('div', {'class': 'box_l1'}).findAll('li')
    for i in mid_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except:
            pass

    print('Total', len(main_page_item), 'results')
    print()
    # ============== Main page scrape END ==============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('China Securities Journal: error while checking a front-page item')

    print('New items this round:', len(confirmed_new))
    # ============== Database comparison END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link[1:])  # the stored hrefs are relative ('./...'); strip the leading dot
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            mycursor.execute(sql, val)
            mydb.commit()
            lw.log_writer('China Securities Journal: inserted ' + str(mycursor.rowcount) + ' item(s)')
            minorRandomPause()

        lw.log_writer('China Securities Journal script: round finished')
        mydb.close()