def get_inner_url_list(self, url):
        """
        : param
        : return: url_list  返回从频道首页通栏中的文章的url
        """
        writelog("huxiu开始解析原始URL:" + url)

        selector = self.parser(url)
        url_tmp_list = list(
            set(
                selector.xpath(
                    '//*[@id="index"]//div[@class="mod-info-flow"]//div[@class="mob-ctt channel-list-yh"]//a[@class="transition msubstr-row2"]/@href'
                )))
        """
        : param: url {'/article/220006.html', '/article/219989.html', '/article/220008.html'}
        添加url,获取完整的url地址。
        """
        url_list = []
        for url_tmp in url_tmp_list:
            full_url_tmp = self.base_url + url_tmp
            url_list.append(full_url_tmp)

        writelog("huxiu返回inner_url_list内容如下:\n" + json.dumps(url_list) + "\n")

        return url_list
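
The loop above simply prefixes base_url to each relative href. A minimal standalone sketch of that step, using the sample hrefs from the comment above; base_url is an assumed value, and urljoin is used so that both relative and already-absolute hrefs come out right:

# Minimal sketch of the relative-to-absolute URL step, independent of the
# spider class. base_url is an assumed value for illustration.
from urllib.parse import urljoin

base_url = 'https://www.huxiu.com'
url_tmp_list = ['/article/220006.html', '/article/219989.html', '/article/220008.html']

url_list = [urljoin(base_url, href) for href in url_tmp_list]
print(url_list)
# ['https://www.huxiu.com/article/220006.html', ...]
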
Example #2
    def get_inner_url_list(self, url):
        """
        : param
        : return: url_list  返回从频道首页通栏中的文章的url
        """
        writelog("tmtpost开始解析原始URL:" + url)

        selector = self.parser(url)
        url_tmp_list = list(
            set(
                selector.xpath(
                    '/html/body//section/div[1]/div//div[3]/a/@href')))
        """
        : param: url {'/article/220006.html', '/article/219989.html', '/article/220008.html'}
        添加url,获取完整的url地址。
        """
        url_list = []
        for url_tmp in url_tmp_list:
            full_url_tmp = self.base_url + url_tmp
            url_list.append(full_url_tmp)

        writelog("tmtpost返回inner_url_list内容如下:\n" + json.dumps(url_list) +
                 "\n")

        return url_list
Example #3
def is_url_processed(url):
    # The article id is the MD5 hex digest of its URL; writeIntoMysql below
    # uses the same convention.
    hashlibMd5 = hashlib.md5()
    hashlibMd5.update(url.encode(encoding='utf-8'))
    article_id = hashlibMd5.hexdigest()
    # Open the database connection
    cfg = configparser.ConfigParser()
    cfg.read("conf.ini")
    db_host = cfg.get("database", "host")
    db_port = cfg.getint("database", "port")
    db_name = cfg.get("database", "dbname")
    db_user = cfg.get("database", "user")
    db_pass = cfg.get("database", "pass")

    db = pymysql.connect(host=db_host,
                         user=db_user,
                         password=db_pass,
                         db=db_name,
                         port=db_port,
                         use_unicode=True,
                         charset="utf8")
    cur = db.cursor()
    sql_select_from_article = "select 1 as cnt from 91_article where id=%s"
    values = (article_id,)
    result_data = cur.execute(sql_select_from_article, values)

    cur.close()
    db.close()

    if result_data > 0:
        writelog("该url已存在数据库中:{}\n".format(url))
        return True

    return False
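
is_url_processed and writeIntoMysql (Example #8 below) share one convention: an article's primary key is the MD5 hex digest of its URL. A minimal sketch of that convention; url_to_id is a hypothetical helper name, not part of the project:

# Hypothetical helper illustrating the id convention shared by
# is_url_processed and writeIntoMysql: id = md5(url).
import hashlib

def url_to_id(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()

# The same URL always maps to the same 32-character hex id, so the crawler
# can look this id up in 91_article before inserting a duplicate.
print(url_to_id('https://www.huxiu.com/article/220006.html'))
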
Example #4
    def get_inner_url_list_new(self, url):
        writelog("====>>>> huxiu开始解析原始URL:{}\n".format(url))
        inner_url_list = []
        selector = self.parser(url)
        for sel in selector.xpath(
                "//*[@id='index']//div[@class='mod-info-flow']/div[@class='mod-b mod-art clearfix']"
        ):
            item = {}
            title_datas = sel.xpath(
                "div[@class='mob-ctt channel-list-yh']/h2/a/text()")
            item['title'] = '' + title_datas[0]
            link_datas = sel.xpath(
                "div[@class='mob-ctt channel-list-yh']/h2/a/@href")
            item['link'] = self.base_url + link_datas[0]
            desc_datas = sel.xpath(
                "div[@class='mob-ctt channel-list-yh']/div[@class='mob-sub']/text()"
            )
            item['desc'] = '' + desc_datas[0]
            img_datas = sel.xpath(
                "div[@class='mod-thumb pull-left ']/a[@class='transition']/img[@class='lazy']/@data-original"
            )

            # When the lead image is a video thumbnail the xpath is more
            # involved, so such entries are skipped for now.
            if not img_datas:
                continue

            item['img'] = '' + img_datas[0]
            inner_url_list.append(item)

        # writelog("huxiu返回inner_url_list:{}\n".format(json.dumps(inner_url_list)))
        return inner_url_list
Example #5
    def get_inner_url_list_new(self, url):
        writelog("====>>>> 36kr开始解析原始URL:{}\n".format(url))
        inner_url_list = []
        get_payload = {
            'per_page': 20,
            'page': 1,
            '_': str(int(time.time() * 1000))
        }
        headers = {
            'Host':
            '36kr.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }
        data = self._craw('get', url, headers, get_payload)

        data = json.loads(data)
        articles = data['data']['items']
        for article in articles:
            # Each article entry gets its own dict
            item = {}
            item['title'] = article['title']
            item['link'] = 'https://36kr.com/p/{}.html'.format(article['id'])
            item['desc'] = article['summary']
            item['img'] = article['cover']
            inner_url_list.append(item)

        # writelog("36kr返回inner_url_list内容如下:\n" + json.dumps(inner_url_list) + "\n")

        return inner_url_list
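
self._craw is the project's own HTTP helper; below is a rough standalone equivalent of this column-API call using requests directly. The endpoint comes from the __main__ block in Example #23 and the response layout from the loop above; treat the whole sketch as an assumption about an API that may have changed:

# Rough equivalent of the 36kr column-API call, using requests instead of the
# project's _craw helper. Endpoint, params and the data['data']['items']
# layout are assumptions taken from the snippets in this document.
import json
import time
import requests

url = 'https://36kr.com/api/search-column/23'
params = {'per_page': 20, 'page': 1, '_': str(int(time.time() * 1000))}
headers = {
    'Host': '36kr.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

resp = requests.get(url, params=params, headers=headers, timeout=10)
data = json.loads(resp.text)
for article in data['data']['items']:
    print(article['title'], 'https://36kr.com/p/{}.html'.format(article['id']))
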
Example #6
    def _excepthook(self, exc_type, exc_value, exc_trace):
        # Global exception hook; register the bound method via sys.excepthook
        # so uncaught exceptions get logged (see the sketch after this example).
        print(self.time_now(), "捕获到全局异常,类型:{},值:{}\n".format(str(exc_type), str(exc_value)))
        print(self.time_now(), "按任意键继续")
        os.system('pause')

        err_msg = '\n ======================== 捕获到全局异常 ======================== \n'
        err_msg += ''.join(traceback.format_exception(exc_type, exc_value, exc_trace))
        writelog(err_msg)

        sys.__excepthook__(exc_type, exc_value, exc_trace)
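
The hook above only takes effect once it is assigned to sys.excepthook. A minimal sketch of that registration; the Spider class and its writelog stand-in are assumptions, not the project's real names:

# Minimal sketch of installing a global exception hook as a bound method.
import sys
import traceback

def writelog(msg):          # stand-in for the project's logger
    print(msg)

class Spider:
    def _excepthook(self, exc_type, exc_value, exc_trace):
        err_msg = ''.join(traceback.format_exception(exc_type, exc_value, exc_trace))
        writelog(err_msg)
        sys.__excepthook__(exc_type, exc_value, exc_trace)

spider = Spider()
# Registering the bound method means Python supplies `self`, so sys still
# calls the hook with the usual (type, value, traceback) arguments.
sys.excepthook = spider._excepthook

raise RuntimeError("demo: this uncaught error now goes through _excepthook")
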
Example #7
    def parser(self, url):
        """
        :param url: URL of the page to fetch and parse
        :return: selector -- an lxml HTML element, or None when the request fails
        """
        response = self.session.get(url=url, headers=self.headers, verify=True)
        if response.status_code == 200:
            selector = html.fromstring(response.text)
            return selector
        else:
            writelog("woshipm,网络请求出现异常,请检查! url:" + url)
            return None
Example #8
def writeIntoMysql(news, web_src_id, web_src_name, web_platform_id, img, desc):
    # Open the database connection
    cfg = configparser.ConfigParser()
    cfg.read("conf.ini")
    db_host = cfg.get("database", "host")
    db_port = cfg.getint("database", "port")
    db_name = cfg.get("database", "dbname")
    db_user = cfg.get("database", "user")
    db_pass = cfg.get("database", "pass")

    db = pymysql.connect(host=db_host,
                         user=db_user,
                         password=db_pass,
                         db=db_name,
                         port=db_port,
                         use_unicode=True,
                         charset="utf8")
    cur = db.cursor()

    try:
        # The primary key is the MD5 hex digest of the article URL, matching
        # the id checked by is_url_processed.
        hashlibMd5 = hashlib.md5()
        hashlibMd5.update(news['url'].encode(encoding='utf-8'))
        article_id = hashlibMd5.hexdigest()

        # Insert the 91_article record
        sql_insert1 = "insert into 91_article(id,create_by,create_date,title,keywords,image,websrc_id,websrc_name,web_platform_id,description) values(%s,%s,now(),%s,%s,%s,%s,%s,%s,%s)"
        values1 = (article_id, "webcrawler", news['title'], news['labels'], img,
                   web_src_id, web_src_name, web_platform_id, desc)
        cur.execute(sql_insert1, values1)

        # Insert the 91_article_data record
        sql_insert2 = "insert into 91_article_data(id,content,copyfrom) values(%s,%s,%s)"
        values2 = (article_id, news['text'], news['author'])
        cur.execute(sql_insert2, values2)
        # Commit both inserts together
        db.commit()
    except Exception as e:
        # Roll back on any failure
        db.rollback()
        writelog("数据库写入失败!异常原因:{}\n".format(e))
        return False
    finally:
        cur.close()
        db.close()

    writelog("数据库写入成功!\n")
    return True
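
Both is_url_processed and writeIntoMysql read the same [database] section from conf.ini. A small sketch that writes a conf.ini with the expected keys; every value below is a placeholder assumption:

# Writes a conf.ini with the [database] keys that is_url_processed and
# writeIntoMysql look up. All values are placeholders.
import configparser

cfg = configparser.ConfigParser()
cfg["database"] = {
    "host": "127.0.0.1",
    "port": "3306",
    "dbname": "crawler_db",
    "user": "crawler",
    "pass": "change-me",
}
with open("conf.ini", "w") as f:
    cfg.write(f)
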
Example #9
    def get_news(self, url, title, summary):

        writelog("36kr,即将处理url:" + url)
        driver = self.create_phantomJS()
        driver.get(url)
        # Wait for the page to finish loading (implicit wait of up to 30 s)
        driver.implicitly_wait(30)
        page_src_code = driver.page_source
        # print(page_src_code)
        try:
            news = {}
            news['url'] = url
            news['author'] = u"36Kr"
            news['title'] = driver.title
            news['content'] = summary
            news['labels'] = "36kr默认标签"

            full_content = u""
            pattern = re.compile(
                r'<section class="textblock">([\s\S]*?)<\/section>',
                re.RegexFlag.S)
            items_withtag = re.findall(pattern, page_src_code)
            for item in items_withtag:
                full_content += item
            # print(full_content)
            news['text'] = full_content

            textblock_element = driver.find_element_by_xpath(
                '//section[@class="textblock"]')
            # print(textblock_element.text)
            # news['text'] = textblock_element.text

            driver.quit()

            writelog("36kr,处理正常结束!url:" + url)
            return news
        except Exception as e:
            writelog("36kr,解析出现异常!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** 异常堆栈如下:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            # Close the browser on failure as well, so PhantomJS processes
            # do not pile up when parsing throws.
            driver.quit()
            return None
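
The regex above pulls every <section class="textblock"> body out of the rendered page source. A small self-contained check of what it captures; the HTML string is invented for illustration, only the pattern comes from the snippet above:

# Demonstrates the <section class="textblock"> regex on a toy HTML string.
import re

page_src_code = (
    '<section class="textblock"><p>First block.</p></section>'
    '<div>ads</div>'
    '<section class="textblock"><p>Second block.</p></section>'
)

pattern = re.compile(r'<section class="textblock">([\s\S]*?)<\/section>', re.S)
items_withtag = re.findall(pattern, page_src_code)
full_content = ''.join(items_withtag)
print(full_content)
# -> <p>First block.</p><p>Second block.</p>
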
Example #10
    def get_inner_url_list(self, url):
        """
        :param url: channel homepage URL to parse
        :return: url_list -- article URLs collected from the channel homepage listing
        """
        writelog("woshipm开始解析原始URL:" + url)

        selector = self.parser(url)
        url_tmp_list = list(
            set(selector.xpath('//h2[@class="post-title"]/a/@href')))
        # woshipm hrefs are already absolute URLs, so they are copied over as-is
        # (no base_url prefix is needed here).
        url_list = []
        for url_tmp in url_tmp_list:
            url_list.append(url_tmp)

        writelog("woshipm返回inner_url_list内容如下:\n" + json.dumps(url_list) +
                 "\n")

        return url_list
Example #11
    def get_inner_url_list(self, url):
        """
        : param
        : return: url_list  返回从频道首页通栏中的文章的url
        """
        writelog("chanpin100开始解析原始URL:" + url)

        selector = self.parser(url)
        url_tmp_list = list(
            set(selector.xpath('//h4[@class="media-heading"]/a/@href')))
        """
        : param: url {'/article/220006.html', '/article/219989.html', '/article/220008.html'}
        添加url,获取完整的url地址。
        """
        url_list = []
        for url_tmp in url_tmp_list:
            full_url_tmp = self.base_url + url_tmp
            url_list.append(full_url_tmp)

        writelog("chanpin100返回inner_url_list内容如下:\n" + json.dumps(url_list) +
                 "\n")

        return url_list

Example #12
    def get_inner_url_list(self, url):
        """
        :param url: channel homepage URL to parse
        :return: url_list -- article URLs collected from the channel homepage listing
        """

        writelog("leiphone开始解析原始URL:" + url)

        selector = self.parser(url)
        # url_tmp_list = list(set(selector.xpath('//div[@class="lph-picShow idx-picShow clr"]//a/@href')))
        url_tmp_list = list(set(
            selector.xpath('//div[@class="img"]//a/@href')))
        """
        : param: url {'/article/220006.html', '/article/219989.html', '/article/220008.html'}
        添加url,获取完整的url地址。
        """

        url_list = []
        for url_tmp in url_tmp_list:
            if ("javascript" in url_tmp):
                continue
            # full_url_tmp = self.base_url + url_tmp
            full_url_tmp = url_tmp

            # html = requests.get(url)
            # soup = BeautifulSoup(html.text, 'lxml')
            # a = soup.findAll(name = 'a')
            # for a_ in a:
            #    writelog("leiphone,找到一个href链接:" + a_.get('href'))

            url_list.append(full_url_tmp)

        writelog("leiphone返回inner_url_list内容如下:\n" + json.dumps(url_list) +
                 "\n")

        return url_list
Example #13
    def get_news(self, url):
        """
        : param
        url : 需要进行获取信息的url地址
        flag : 标志位,判断是否抓取成功
        news : 字典,存储各信息
        : return: news 正常返回news,错误返回 -1
        """
        writelog("tmtpost,即将解析url:" + url)
        news = {}
        flag = None
        try:
            news['url'] = url
            # Rebuild the desktop URL as its m.tmtpost.com mobile counterpart by
            # splicing "m" in place of "www" (see the sketch after this example).
            news['link'] = url[0:7:1] + "m" + url[10:-1:1] + url[-1]
            selector = self.parser(url)
            news['author'] = u'钛媒体'
            title = selector.xpath('/html/head/title/text()')[0]

            tmp = ""
            for i in title:
                if i == '-' or i == '|':
                    break
                tmp += i
            # selector.xpath('/html/body/div[5]//div[1]/div[1]/a/span/text()')[0]
            news['title'] = tmp
            content = selector.xpath('//div[@class="inner"]//p')

            article = ""
            temp = []
            for i in content:
                img_url = i.xpath('img/@src')
                temp.append(i.text)
                temp.append(img_url)

            temp.pop()

            for i in temp:
                if i:
                    if type(i) == list:
                        article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                    else:
                        article = article + i + "\n\n"

            summary = ""
            for i in temp:
                if len(summary) > 400:
                    break
                else:
                    if i:
                        if type(i) == list:
                            pass
                        else:
                            summary = summary + i + "\n"

            news['content'] = summary

            full_article = self.getContent(url)
            news['text'] = full_article
            # news['text'] = article

            cover_list = selector.xpath('//img[@class="aligncenter"]/@src')
            if cover_list:
                news['cover'] = cover_list[0]
            else:
                writelog("tmtpost,无法解析封面!url=" + url)
                news['cover'] = "钛媒体默认封面"
            labels_list = selector.xpath(
                '/html/body//section//span[1]/a/text()')
            if labels_list:
                news['labels'] = '' + labels_list[0]
            else:
                writelog("tmtpost,无法解析标签!url=" + url)
                news['labels'] = "钛媒体默认标签"
            news['service'] = 'Article.AddArticle'
        except Exception as e:
            writelog("tmtpost,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("tmtpost,处理正常结束!url=" + url)
            return news
        else:
            return None
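
news['link'] above rebuilds the desktop URL as its mobile (m.) counterpart by string slicing, the same trick the huxiu example further below documents in a comment. A short check of what the slice produces on a sample tmtpost URL (the article id is invented), plus the equivalent str.replace form for URLs of exactly this shape:

# Shows what url[0:7] + "m" + url[10:-1] + url[-1] produces for a
# http://www.tmtpost.com/... style URL. The article id is an invented example.
url = 'http://www.tmtpost.com/3050638.html'

link = url[0:7:1] + "m" + url[10:-1:1] + url[-1]
print(link)                            # http://m.tmtpost.com/3050638.html

# For URLs of exactly this shape the splice is equivalent to:
print(url.replace("www.", "m.", 1))    # http://m.tmtpost.com/3050638.html
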
Example #14
                news['labels'] = '' + labels_list[0]
            else:
                writelog("zaodula,无法解析标签!url=" + url)
                news['labels'] = "早读课默认标签"
        except Exception as e:
            writelog("zaodula,解析时异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** 异常堆栈:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("zaodula,处理正常结束!url=" + url)
            return news
        else:
            return None


if __name__ == '__main__':
    spider = zaodu()

    url = 'https://www.zaodula.com'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("早读课inner_url_list:" + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("早读课url_title:" + spider.get_news(inner_url)['title'])

Example #15
    def get_news(self, url):
        """
        :param url: article URL to fetch and parse
        flag : status flag recording whether parsing succeeded
        news : dict holding the extracted fields
        :return: news on success, None on failure
        """
        writelog("woshipm,即将处理url:" + url)
        news = {}
        flag = None
        try:
            news['url'] = url
            selector = self.parser(url)
            title = selector.xpath('/html/head/title/text()')[0]
            news['author'] = u'人人都是产品经理'

            tmp = ""
            for i in title:
                if i == '|':
                    break
                tmp += i
            news['title'] = tmp

            content = selector.xpath('//div[@class="grap"]//p')
            article = ""
            temp = []
            cover_list = []
            for i in content:
                img_url = i.xpath('img/@src')
                temp.append(i.text)
                temp.append(img_url)
                cover_list.append(img_url)

            # Strip the redundant trailing entries
            for i in range(6):
                temp.pop()

            for i in temp:
                if i:
                    if type(i) == list:
                        article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                    else:
                        article = article + i + "\n\n"

            summary = ""
            for i in temp:
                if len(summary) > 400:
                    break
                else:
                    if i:
                        if type(i) == list:
                            pass
                        else:
                            summary = summary + i + "\n"

            news['content'] = summary

            full_article = self.getContent(url)
            news['text'] = full_article
            # news['text'] = article

            for i in cover_list:
                if i:
                    news['cover'] = i[0]
                    break

            news['labels'] = u"产品项目"
        except Exception as e:
            writelog("woshipm,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("woshipm,处理正常结束!url=" + url)
            return news
        else:
            return None

Example #16
                    news['cover'] = i[0]
                    break

            news['labels'] = u"产品项目"
        except Exception as e:
            writelog("woshipm,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("woshipm,处理正常结束!url=" + url)
            return news
        else:
            return None


if __name__ == '__main__':
    spider = woshipm()

    url = 'http://www.woshipm.com/category/pmd'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("人人都是产品经理inner_url_list:" + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("人人都是产品经理news:" + json.dumps(spider.get_news(inner_url)))
Example #17
    def get_news(self, url):
        """
        : param
        url : 需要进行获取信息的url地址
        flag : 标志位,判断是否抓取成功
        news : 字典,存储各信息
        : return: news 正常返回news,错误返回 -1
        """
        writelog("zaodula,即将处理url:" + url)
        news = {}
        flag = None
        try:
            news['url'] = url
            selector = self.parser(url)
            title = selector.xpath('/html/head/title/text()')[0]
            news['author'] = u'早读课'

            tmp = ""
            for i in title:
                if i == '-' or i == '丨' or i == '—':
                    break
                tmp += i
            news['title'] = tmp

            content = selector.xpath('//div[@class="single-content"]//p')

            article = ""
            temp = []
            cover_list = []
            for a_tag in content:
                href_url = a_tag.xpath('a/@href')
                temp.append(a_tag.text)
                temp.append(href_url)
                cover_list.append(href_url)
            # Strip the redundant trailing entries
            for i in range(6):
                temp.pop()

            for i in temp:
                if i:
                    if type(i) == list:
                        article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                    else:
                        article = article + i + "\n\n"

            summary = ""
            for i in temp:
                if len(summary) > 400:
                    break
                else:
                    if i:
                        if type(i) == list:
                            pass
                        else:
                            summary = summary + i + "\n"

            news['content'] = summary

            full_article = self.getContent(url)
            news['text'] = full_article
            # news['text'] = article

            for i in cover_list:
                if i:
                    news['cover'] = i[0]
                    break

            labels_list = selector.xpath('//div[@class="single-cat"]/a/text()')
            if labels_list:
                news['labels'] = '' + labels_list[0]
            else:
                writelog("zaodula,无法解析标签!url=" + url)
                news['labels'] = "早读课默认标签"
        except Exception as e:
            writelog("zaodula,解析时异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** 异常堆栈:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("zaodula,处理正常结束!url=" + url)
            return news
        else:
            return None
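
Every get_news variant above assembles article and summary from the same temp list, which interleaves paragraph text (strings) with xpath results (lists of image or link URLs). A hedged refactor of that repeated block into two helpers; build_article and build_summary are names of my own, not the project's:

# Hedged refactor of the repeated article/summary loops. temp mixes paragraph
# strings and xpath result lists, exactly as in the methods above.
def build_article(temp):
    article = ""
    for i in temp:
        if not i:
            continue
        if isinstance(i, list):
            article += "\n![](" + i[0] + ")\n\n"   # first URL as a Markdown image
        else:
            article += i + "\n\n"
    return article

def build_summary(temp, limit=400):
    summary = ""
    for i in temp:
        if len(summary) > limit:
            break
        if i and not isinstance(i, list):          # plain text only, skip images
            summary += i + "\n"
    return summary

temp = ["First paragraph.", ["http://example.com/cover.jpg"], None, "Second paragraph."]
print(build_article(temp))
print(build_summary(temp))
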
Example #18
            else:
                writelog("chanpin100,无法解析标签!url=" + url)
                news['labels'] = "产品100默认标签"
        except Exception as e:
            writelog("chanpin100,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("chanpin100,处理正常结束!url=" + url)
            return news
        else:
            return None


if __name__ == '__main__':
    spider = chanpin()

    url = 'http://www.chanpin100.com/pm'
    inner_url_list = spider.get_inner_url_list(url)

    for inner_url in inner_url_list:
        writelog("产品100, news:" + json.dumps(spider.get_news(inner_url)))
Example #19
    def get_news(self, url):
        """
        : param
        url : 需要进行获取信息的url地址
        flag : 标志位,判断是否抓取成功
        news : 字典,存储各信息
        : return: news 正常返回news,错误返回 -1
        """
        writelog("huxiu,即将处理url:" + url)
        news = {}
        flag = None
        try:
            # Rebuild the URL as an https://m.huxiu.com/ style mobile address.
            news['url'] = url
            news['link'] = url[0:8:1] + "m" + url[11:-1:1] + url[-1]
            selector = self.parser(url)
            news['title'] = selector.xpath(
                '/html/head/title/text()')[0].replace('-虎嗅网', '')
            news['author'] = u'虎嗅网'
            # selector.xpath('//div[3]/div[1]/div[2]/a[1]/text()')[0].strip()

            content = selector.xpath('//div[@class="article-content-wrap"]//p')

            article = ""
            temp = []
            for i in content:
                img_url = i.xpath('img/@src')
                temp.append(i.text)
                temp.append(img_url)

            for i in temp:
                if i:
                    if type(i) == list:
                        article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                    else:
                        article = article + i + "\n\n"

            summary = ""
            for i in temp:
                if len(summary) > 400:
                    break
                else:
                    if i:
                        if type(i) == list:
                            pass
                        else:
                            summary = summary + i + "\n"

            news['content'] = summary.replace('\xa0', '')

            full_article = self.getContent(url)
            # full_article_list = selector.xpath('//div[@class="article-content-wrap"]')
            # for tmp_full_article in full_article_list:
            #    tmp_article_value = tmp_full_article.string()
            #    writelog(tmp_article_value)

            # news['text'] = article
            news['text'] = full_article
            # Note: this lookup stores an image URL in news['labels'] and is
            # overwritten by the column-link-box lookup just below.
            labels_list = selector.xpath(
                '//div[@class="article-img-box"]/img/@src')
            if labels_list:
                news['labels'] = "" + labels_list[0]
            else:
                writelog("huxiu,无法解析标签!url=" + url)
                news['labels'] = "虎嗅网默认标签"
            labels_list = selector.xpath(
                '//div[@class="column-link-box"]/a/text()')
            if labels_list:
                news['labels'] = ""
                for label in labels_list:
                    news['labels'] += (" " + label)
            else:
                news['labels'] = "虎嗅网默认标签"
            news['service'] = 'Article.AddArticle'
        except Exception as e:
            writelog("huxiu,解析出现异常!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** 异常堆栈如下:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("huxiu,处理正常结束!url:" + url)
            return news
        else:
            return None

Example #20
    def get_news(self, url):
        """
        :param url: article URL to fetch and parse
        flag : status flag recording whether parsing succeeded
        news : dict holding the extracted fields
        :return: news on success, None on failure
        """
        writelog("leiphone,即将处理url:" + url)
        news = {}
        flag = None
        try:
            news['url'] = url
            news['link'] = url[0:8:1] + "m" + url[11:-1:1] + url[-1]
            selector = self.parser(url)
            news['author'] = u'雷锋网'
            news['title'] = selector.xpath('/html/head/title/text()')[0]

            # selector.xpath('/html/body//section/div/article//a/text()')[0].strip()

            content = selector.xpath('//div[@class="lph-article-comView"]//p')
            article = ""
            temp = []
            flag1 = True
            for i in content:
                img_url = i.xpath('img/@src')

                if (img_url):
                    writelog("leiphone,即将处理img链接:" + json.dumps(img_url))

                # Use the first image found as the cover image, then stop looking
                if flag1 and img_url:
                    news['cover'] = img_url[0]
                    flag1 = False

                temp.append(i.text)
                temp.append(img_url)

            # Strip the redundant trailing entries
            temp.pop()
            temp.pop()

            for i in temp:
                if i:
                    if type(i) == list:
                        article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                    else:
                        article = article + i + "\n\n"

            summary = ""
            for i in temp:
                if len(summary) > 400:
                    break
                else:
                    if i:
                        if type(i) == list:
                            pass
                        else:
                            summary = summary + i + "\n"

            news['content'] = summary

            full_article = self.getContent(url)
            news['text'] = full_article
            news['labels'] = '雷锋网默认标签'
            # news['text'] = article
            news['service'] = 'Article.AddArticle'
        except Exception as e:
            writelog("leiphone,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("leiphone,处理正常结束!url=" + url)
            return news
        else:
            return None

Example #21
            news['text'] = full_article
            news['labels'] = '雷锋网默认标签'
            # news['text'] = article
            news['service'] = 'Article.AddArticle'
        except Exception as e:
            writelog("leiphone,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("leiphone,处理正常结束!url=" + url)
            return news
        else:
            return None


if __name__ == '__main__':
    leiphone = leiphone()

    url = 'https://www.leiphone.com/category/ai'
    inner_url_list = leiphone.get_inner_url_list(url)
    writelog("雷锋网inner_url_list:" + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("雷锋网news:" + json.dumps(leiphone.get_news(inner_url)))
Example #22
                    news['cover'] = i[0]
                    break

            news['labels'] = u"产品中国"
        except Exception as e:
            writelog("pmtoo,解析时出现异常,请检查!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** print_exception:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            flag = 1

        if flag == None:
            writelog("pmtoo,处理正常结束!url=" + url)
            return news
        else:
            return None


if __name__ == '__main__':
    spider = pmtoo()

    url = 'http://www.pmtoo.com/article/category/产品经理'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("产品中国inner_url_list:" + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("产品中国news:" + json.dumps(spider.get_news(inner_url)))
Example #23
            driver.quit()

            writelog("36kr,处理正常结束!url:" + url)
            return news
        except Exception as e:
            writelog("36kr,解析出现异常!url=" + url)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            writelog("*** 异常堆栈如下:")
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=5,
                                      file=sys.stdout)
            writelog("-" * 100)
            return None


if __name__ == '__main__':

    # writelog(kr.get_news(url)['text'])

    kr = kr()
    url = 'https://36kr.com/api/search-column/23'
    inner_url_list = kr.get_inner_url_list_new(url)
    writelog(json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        news = kr.get_news(inner_url['link'], inner_url['link'],
                           inner_url['desc'])
        writelog("抓取到36kr文章:\n" + json.dumps(news))