Example #1
    def __init__(self):
        PathUtils.initDir()  # create the output directories
        # database access
        dbUtils = DbUtils('config_3_series')
        # self.queryItems = dbUtils.select({"id": 511})
        # self.queryItems = dbUtils.select(None)
        self.queryItems = dbUtils.selectByPage(None, 0, 1)
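DbUtils itself never appears in these results, but every example constructs it with a collection name and calls select(None), select({"id": 511}) or selectByPage(query, skip, limit). A minimal sketch that matches those call sites, assuming MongoDB via pymongo; the host, port, and database name 'autohome' are assumptions, not taken from the project:

import pymongo


class DbUtils:
    # wrap a single MongoDB collection; the collection name comes from the caller
    def __init__(self, collectionName):
        client = pymongo.MongoClient('mongodb://localhost:27017/')  # assumed host/port
        self.collection = client['autohome'][collectionName]  # assumed database name

    # select(None) returns every document in the collection
    def select(self, query):
        return self.collection.find(query or {})

    # selectByPage pages through the result set with skip/limit
    def selectByPage(self, query, skip, limit):
        return self.collection.find(query or {}).skip(skip).limit(limit)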
Example #2
                                 callback=self.articleCommentNum,
                                 meta={"item": article_item})

    # get the number of comments on an article
    def articleCommentNum(self, response):
        article_item = response.meta['item']
        response_json = json.loads(response.text)
        article_item['comment_num'] = response_json['result']['objcounts'][0][
            'replycountall']
        yield article_item

    # parse the current breadcrumb location
    def getLocation(self, originList):
        locationList = []
        for item in originList:
            if len(item.strip()) > 0:
                locationList.append(item.strip())
        # join every non-empty segment with '->'
        return '->'.join(locationList)


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_2_article'])
    dbUtils = DbUtils('news_2_article')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_2_article', list(queryItems))
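One caveat about these __main__ blocks, which the later examples repeat: scrapy.cmdline.execute normally ends by raising SystemExit, so in stock Scrapy the export lines after it never run in the same process. A sketch that keeps crawl and export in one run, using Scrapy's CrawlerProcess instead:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from autohome.spiders.utils.DbUtils import DbUtils
from autohome.spiders.utils.ExcelUtils import ExcelUtils

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl('news_2_article')
    process.start()  # blocks until the crawl finishes, then returns

    # the export now runs in the same process once the crawl is done
    dbUtils = DbUtils('news_2_article')
    queryItems = dbUtils.select(None)
    ExcelUtils().generateExcel('news', 'news_2_article', list(queryItems))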
Example #3
    def __init__(self):
        # database access
        dbUtils = DbUtils('news_1_main')
        self.queryItems = dbUtils.selectByPage(None, 0, 100)
Example #4
import json

import scrapy
from scrapy.cmdline import execute
from autohome.items import BrandItem
from autohome.spiders.utils.DbUtils import DbUtils
from autohome.spiders.utils.ExcelUtils import ExcelUtils


class BrandSpider(scrapy.Spider):
    name = 'config_1_brand'
    start_urls = [
        'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=1'
    ]

    def parse(self, response):
        responseBody = response.body.decode(response.encoding)
        brandItems = json.loads(responseBody)['result']['branditems']
        for item in brandItems:
            brandItem = BrandItem()
            brandItem['_id'] = item['id']
            brandItem['name'] = item['name']
            brandItem['bfirstletter'] = item['bfirstletter']
            brandItem['logo'] = item['logo']
            yield brandItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_1_brand'])
    dbUtils = DbUtils('config_1_brand')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_1_brand', list(queryItems))
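Taken together with the factory and spec spiders below, AjaxIndexCarFind.ashx looks like one endpoint parameterized by type (type=1 brands, type=3 factories under a brand, type=5 specs under a series) plus a value= parent id. A quick way to eyeball one of these payloads outside Scrapy, assuming the endpoint still answers the same way (requests is not used by the project itself):

import json

import requests

resp = requests.get('https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx',
                    params={'type': 1})
print(json.loads(resp.text)['result']['branditems'][0])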
Example #5
    # Scrapy hook: build the middleware with the proxy pool loaded from MongoDB
    @classmethod
    def from_crawler(cls, crawler):
        dbUtils = DbUtils('ip_pool')
        queryItems = dbUtils.select(None)
        return cls(ip_pool=list(queryItems))
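from_crawler here is evidently a Scrapy downloader-middleware hook, but the snippet leaves out the class around it. A minimal sketch of the rest of such a middleware; the class name and the 'ip'/'port' field names in the ip_pool documents are assumptions:

import random


class RandomProxyMiddleware:  # class name is an assumption
    def __init__(self, ip_pool):
        self.ip_pool = ip_pool

    # from_crawler (shown above) returns cls(ip_pool=...) built from the ip_pool collection

    def process_request(self, request, spider):
        # attach a random proxy to each outgoing request;
        # 'ip' and 'port' are assumed field names in the ip_pool documents
        proxy = random.choice(self.ip_pool)
        request.meta['proxy'] = 'http://{}:{}'.format(proxy['ip'], proxy['port'])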
Example #6
            item['content'] = jsonItem['RContent']
            item['userImgUrl'] = "https:" + jsonItem['RUserHeaderImage']
            item['userName'] = jsonItem['RMemberName']
            item['time'] = jsonItem['replydate']
            item['floor'] = jsonItem['RFloor']
            item['id'] = re.search('&id=([0-9]*)', response.url).group(1)
            yield item
        # check whether there is another page of data
        if receiveLen < commentCount:
            # the first request carries no 'page' in meta, so default to 1 and advance
            page = response.meta.get('page', 1) + 1
            url = 'https://reply.autohome.com.cn/api/comments/show.json?count=50&page={}&id={}&appid=1&datatype=jsonp&order=0&replyid=0'.format(
                page, item['id'])
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={
                                     "page": page,
                                     "receiveLen": receiveLen
                                 })


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_3_comment'])
    dbUtils = DbUtils('news_3_comment')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_3_comment', list(queryItems))
Example #7
    def __init__(self):
        # database access
        dbUtils = DbUtils('news_2_article')
Example #8
    # save the Excel file
    def save(self):
        PathUtils.initDir()
        rootPath = PathUtils.getRootPath()
        self.workBook.save(rootPath + '/output/config/config_6_config.xlsx')

    # generate the Excel workbook
    def generateExcel(self, resultList):
        for i in range(len(resultList)):
            resultItem = {}

            # parse the data
            self.resolveJson(resultList[i], resultItem)

            # create the header row
            self.createHeader(resultItem)

            # write the content rows
            self.createContent(resultItem)

            # save the Excel file
            self.save()


if __name__ == "__main__":
    # database access
    dbUtils = DbUtils('config_6_config')
    queryItems = dbUtils.selectByPage(None, 0, 3)
    excel = Excel()
    excel.generateExcel(list(queryItems))
Example #9
            columnWidth = self.strLen((list(resultList[0].keys()))[i])
            # fetch one column of values
            columnList = self.getColumnList(resultList, i)
            # find the widest value in this column
            for j in range(len(columnList)):
                currentWidth = self.strLen(str(columnList[j]))
                columnWidth = self.getMax(columnWidth, currentWidth)
            # set the column width (xlwt uses 256 width units per character)
            if 10 < columnWidth < 30:
                workSheet.col(i).width = 256 * (columnWidth + 1)

    # collect the i-th value from every row
    def getColumnList(self, resultList, i):
        columnList = []
        for k in range(len(resultList)):
            columnList.append((list(resultList[k].values()))[i])
        return columnList

    # return the larger of two values
    def getMax(self, a, b):
        if a > b:
            return a
        return b


if __name__ == "__main__":
    dbUtils = DbUtils('3_series')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', '3_series', list(queryItems))
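strLen is called in this example but never shown. Because its result feeds xlwt's convention of 256 width units per character, a plausible reconstruction is a display-width count that treats non-ASCII (e.g. CJK) characters as two units; this is an assumption about the missing helper:

    # display width of a string: non-ASCII characters count as two units
    # (assumed behaviour of the helper missing from this result)
    def strLen(self, value):
        width = 0
        for ch in str(value):
            width += 2 if ord(ch) > 127 else 1
        return width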
Example #10
    def __init__(self):
        PathUtils.initDir()  # initialize the output folders
        dbUtils = DbUtils('config_5_spec')  # open the database collection
        self.queryItems = dbUtils.select(None)  # query all rows
Example #11
    def __init__(self):
        # database access
        dbUtils = DbUtils('config_1_brand')
        self.queryItems = dbUtils.select(None)
Example #12
File: 5_spec.py Project: xzmeng/autohome
    def __init__(self):
        # database access
        dbUtils = DbUtils('config_3_series')
        self.queryItems = dbUtils.select(None)

    def start_requests(self):
        for item in self.queryItems:
            url = 'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=5&value=%s' % item['id']
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        responseBody = response.body.decode(response.encoding)
        yearItems = json.loads(responseBody)['result']['yearitems']
        for yearItem in yearItems:
            for item in yearItem['specitems']:
                resultItem = YearItem()
                resultItem['id'] = item['id']
                resultItem['name'] = item['name']
                resultItem['parentId'] = yearItem['id']
                yield resultItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_5_spec'])
    dbUtils = DbUtils('config_5_spec')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_5_spec', list(queryItems))
Example #13
    def __init__(self):
        # database access
        dbUtils = DbUtils('config_1_brand')
        self.queryItems = dbUtils.select(None)

    def start_requests(self):
        for item in self.queryItems:
            url = 'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=3&value=%s' % item['_id']
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        parentId = str(response.url).strip().split("value=")[-1]
        responseBody = response.body.decode(response.encoding)
        factoryItems = json.loads(responseBody)['result']['factoryitems']
        for item in factoryItems:
            factoryItem = FactoryItem()
            factoryItem['id'] = item['id']
            factoryItem['name'] = item['name']
            factoryItem['firstLetter'] = item['firstletter']
            factoryItem['parentId'] = parentId
            yield factoryItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_2_factory'])
    dbUtils = DbUtils('config_2_factory')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_2_factory', list(queryItems))
Example #14
# coding=utf-8
import json
import os
import re

from selenium import webdriver

from autohome.items import ConfigItem
from autohome.spiders.utils.DbUtils import DbUtils
from autohome.spiders.utils.ExcelUtils import ExcelUtils
from autohome.spiders.utils.PathUtils import PathUtils

if __name__ == "__main__":

    # database access
    dbUtils = DbUtils('config_6_config')
    # project root directory
    rootPath = PathUtils.getRootPath()
    # JavaScript to inject: capture CSS rules into a string instead of applying them
    injectJs = (
        "let rules = '';"
        "document.createElement = function() {"
        "      return {"
        "              sheet: {"
        "                      insertRule: function(rule, i) {"
        "                              if (rules.length == 0) {"
        "                                      rules = '#' + rule;"
        "                              } else {"
        "                                      rules = rules + '#' + rule;"
        "                              }"
        "                      }"
Example #15
File: 1_main.py Project: xzmeng/autohome
                            './/img/@src').extract()[0]
                    article_short_item['title'] = each_short_article.xpath(
                        './/h3/text()').extract()[0]
                    article_short_item[
                        'publicTime'] = each_short_article.xpath(
                            './/span[@class="fn-left"]/text()').extract()[0]
                    article_short_item['readNum'] = each_short_article.xpath(
                        './/span[@class="fn-right"]//em[1]/text()').extract(
                        )[0]
                    article_short_item['shortContent'] = ''.join(
                        each_short_article.xpath(
                            './/p/text()').extract()).strip()
                    yield article_short_item

        # request the next page
        next_url_part = response.xpath(
            '//div[@id="channelPage"]/a[@class="page-item-next"]/@href'
        ).extract()[0]
        if next_url_part != '':
            article_next_url = 'http://www.autohome.com.cn{}'.format(
                next_url_part)
            yield scrapy.http.Request(article_next_url, callback=self.parse)


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_1_main'])
    dbUtils = DbUtils('news_1_main')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_1_main', list(queryItems))