Example #1
File: main.py Project: eddiezzz/crawler
    def get_url(self):
        """Collect article URLs from every configured news source."""
        hu = huxiu()
        tmt = tmtpost()
        zao = zaodu()
        chan = chanpin()
        pmt = pmtoo()
        woshi = woshipm()
        url_list = (hu.get_url() + tmt.get_url() + zao.get_url()
                    + chan.get_url() + pmt.get_url() + woshi.get_url())
        return url_list
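The hard-coded instantiations above grow with every new source. A minimal registry-based sketch, assuming each spider class (huxiu, tmtpost, etc., from the project) exposes the same get_url() interface; the SPIDER_CLASSES list is hypothetical, not part of the project:

    # Hypothetical registry; each entry must expose get_url().
    SPIDER_CLASSES = [huxiu, tmtpost, zaodu, chanpin, pmtoo, woshipm]

    def get_url(self):
        """Collect article URLs from every registered source."""
        url_list = []
        for spider_cls in SPIDER_CLASSES:
            url_list += spider_cls().get_url()
        return url_list

Adding a seventh source then means appending one entry to the list instead of editing the method body.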
Example #2
File: main.py Project: eddiezzz/crawler
    def get_news(self, url):
        """
        :param url: URL of the page whose content should be crawled
        :return: news, or None if the URL matches no known source

        Dispatch is done on a single character at a fixed offset of the
        URL (url[11] or url[12]): 'c' -> chanpin, 'h' -> huxiu,
        'p' -> pmtoo, 'm' -> tmtpost, 'z' -> zaodu, 'o' -> woshipm.
        """
        if url:
            if url[11] == 'c':
                chan = chanpin()
                return chan.get_news(url)

            elif url[12] == 'h':
                hu = huxiu()
                return hu.get_news(url)

            elif url[11] == 'p':
                pmt = pmtoo()
                return pmt.get_news(url)

            elif url[12] == 'm':
                tmt = tmtpost()
                return tmt.get_news(url)

            elif url[12] == 'z':
                zao = zaodu()
                return zao.get_news(url)

            elif url[12] == 'o':
                woshi = woshipm()
                return woshi.get_news(url)

            else:
                print(self.time_now(), '\tUnexpected url=', url, '\n')
                return None
        else:
            print(self.time_now(), '\tUnexpected url= None\n')
            return None
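Indexing a fixed character offset breaks as soon as the scheme or host length changes, and raises IndexError on short strings. A sketch of a sturdier dispatch keyed on the parsed hostname; the SPIDERS_BY_HOST mapping and its entries are assumptions for illustration, and the spider classes are taken from the project context:

    from urllib.parse import urlparse

    # Hypothetical host -> spider-class mapping; extend one entry per source.
    SPIDERS_BY_HOST = {
        'www.huxiu.com': huxiu,
        'www.tmtpost.com': tmtpost,
        'www.woshipm.com': woshipm,
    }

    def get_news(self, url):
        host = urlparse(url).hostname if url else None
        spider_cls = SPIDERS_BY_HOST.get(host)
        if spider_cls is None:
            print(self.time_now(), '\tUnexpected url=', url, '\n')
            return None
        return spider_cls().get_news(url)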
Example #3
    cursor.executemany(INSERT_TERM_REATIONSHIPS,
                       [(last_id, last_taxonomy_id, 0), (last_id, term_id, 0)])

    # Try to commit to the database; roll back on failure
    try:
        conn.commit()
        print(time, '\tNews written to MySQL successfully @SKYNE\n')
        return True
    except Exception:
        conn.rollback()
        print(time, '\tFailed to write news to MySQL @SKYNE\n')
        return False
    finally:
        conn.close()
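A variant of the same commit-or-rollback pattern that also guarantees the connection is closed on every path, using contextlib.closing. This is a sketch: the write_news name and the connection parameters are placeholders, and INSERT_TERM_REATIONSHIPS is reused from the snippet above:

    from contextlib import closing
    import pymysql

    def write_news(rows, time):
        # Placeholder credentials; substitute the project's real settings.
        with closing(pymysql.connect(host='127.0.0.1', user='root',
                                     password='secret', db='crawler')) as conn:
            try:
                with conn.cursor() as cursor:
                    cursor.executemany(INSERT_TERM_REATIONSHIPS, rows)
                conn.commit()
                print(time, '\tNews written to MySQL successfully @SKYNE\n')
                return True
            except Exception:
                conn.rollback()
                print(time, '\tFailed to write news to MySQL @SKYNE\n')
                return False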


if __name__ == '__main__':
    from spider.huxiu import huxiu
    # Sample payload kept for reference; it is overwritten by the live fetch below.
    news = {
        'url': 'https://www.huxiu.com/article/227432.html',
        'link': 'https://m.huxiu.com/article/227432.html',
        'title': 'vfkhvbjkhbjkhgbhjmgkjh',
        'text': 'asdasdqwdqwdasdqwdqwdqwdqwd',
        'author': '虎嗅网',
        'labels': '金融地产',
        'service': 'Article.AddArticle'
    }
    hu = huxiu()
    news = hu.get_news("https://www.huxiu.com/article/236527.html")
    write(news)
Example #4
    def main(self):
        print(self.time_now(), '\tStarting up, please wait......\n')
        hour_counter = 1

        # Open the database connection
        cfg = configparser.ConfigParser()
        cfg.read("conf.ini")
        db_host = cfg.get("database", "host")
        db_port = cfg.getint("database", "port")
        db_name = cfg.get("database", "dbname")
        db_user = cfg.get("database", "user")
        db_pass = cfg.get("database", "pass")
        pref_write_file = cfg.getint("preference", "writefile")

        while True:
            print(self.time_now(), '\t-------- Starting scheduling round {}! --------\n'.format(hour_counter))

            db = pymysql.connect(host=db_host, user=db_user, password=db_pass, db=db_name,
                                 port=db_port, use_unicode=True, charset="utf8")
            cur = db.cursor()
            sql_select_from_web_src = "select id,name,platform_id,url,img from 91_web_src"
            cur.execute(sql_select_from_web_src)
            result_data = cur.fetchall()
            for id, name, platform_id, url, img in result_data:
                print(" Record found in table 91_web_src:", id, name, platform_id, url)
                print("\n Processing; see the log file for details......\n")
                if url.startswith("https://www.huxiu.com"):
                    hu = huxiu()
                    inner_url_list = hu.get_inner_url_list_new(url)
                    for inner_url in inner_url_list:
                        if is_url_processed(inner_url['link']):
                            continue
                        news = hu.get_news(url=inner_url['link'])
                        if pref_write_file == 1:
                            write(news)
                        writeIntoMysql(news, id, name, platform_id, inner_url['img'], inner_url['desc'])
                elif url.startswith(("https://36kr.com", "http://36kr.com")):
                    kr36 = kr()
                    inner_url_list = kr36.get_inner_url_list_new(url)
                    for inner_url in inner_url_list:
                        if is_url_processed(inner_url['link']):
                            continue
                        news = kr36.get_news(url=inner_url['link'], title=inner_url['title'],
                                             summary=inner_url['desc'])
                        if pref_write_file == 1:
                            write(news)
                        writeIntoMysql(news, id, name, platform_id, inner_url['img'], inner_url['desc'])

            cur.close()
            db.close()

            print(self.time_now(), '\t======== Scheduling round {} finished! ========\n'.format(hour_counter))

            # Scan the database every two hours
            time.sleep(7200)
            hour_counter += 1
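The main() loop above reads its database settings from conf.ini via configparser. A minimal sketch of that file, matching the sections and keys read above; all values are placeholders, not the project's real settings:

    [database]
    host = 127.0.0.1
    port = 3306
    dbname = crawler
    user = root
    pass = secret

    [preference]
    ; 1 = also write each news item to a file via write()
    writefile = 1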