Example #1
class CarnextItem(scrapy.Item):
    _newPrice_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _newPrice_currencyCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _body = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _transmission = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _numberOfDoors = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _numberOfSeats = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _options = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=Join(),
    )
    _customerRef = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _power_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _power_unit = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _horsePower = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _paint_baseColor = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _paint_lightness = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _paint_type = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _maintenanceHistory = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=Join(),
    )
    _retailLocationOptions = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=Join(),
    )
    _parkingLocation = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _vehicleType = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _transferFee = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOptions = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=Join(),
    )
    _isFreeHomeDelivery = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _inStockDate = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _redirectUrl = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _towingWeightUnbraked = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _towingWeightBraked = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _id = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _make = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _model = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _type = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _mileage_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _mileage_unit = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _minimalMonthlyPrice_additionalMonthlyFee = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _minimalMonthlyPrice_downpaymentPercentage = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _minimalMonthlyPrice_isVatExcluded = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _minimalMonthlyPrice_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _minimalMonthlyPrice_currencyCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_regionalPrice = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_administrativeCosts = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_isVatDeductible = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_isVatExcluded = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _salePrice_currencyCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _yearOfConstruction = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _fuel = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _hexonRef = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _retailLocationCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _retailLocationId = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _registrationDate = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _reservationStatus = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _vehicleCondition = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _lifeCycleStatus = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _co2Emission = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _euroNormClass = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _countryCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _isFromForeignCountry = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _isFavorite = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _isLeaseable = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _isBuyable = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOption_numberOfMonths = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOption_maxMileage = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOption_amountExVat = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOption_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _leaseOption_currencyCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _autoManagerRef = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _licensePlate = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _vin = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _avgFuelConsumption = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _roadTax_min = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _roadTax_max = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _engineDisplacement = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _availableCarsInGroup = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _origin = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _carError = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _favoriteLoaded = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseOption_numberOfMonths = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseOption_maxMileage = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseOption_amountExVat = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseOption_amount = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseOption_currencyCode = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
    _selectedLeaseExcludingVat = scrapy.Field(
        input_processor=MapCompose(foo),
        output_processor=TakeFirst(),
    )
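Every field in CarnextItem uses the same placeholder input processor, foo. A minimal sketch of what such a cleaning function might look like (the name foo and its behavior are assumptions, not part of the source):

def foo(value):
    # Hypothetical cleaner: strip whitespace and drop empty strings.
    # MapCompose ignores values for which the function returns None.
    if isinstance(value, str):
        value = value.strip()
    return value or None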
Example #2
def parse_keywords(response):
    il = ItemLoader(item=TnaWebsiteItem(), response=response)
    il.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    il.default_output_processor = Join()
    il.add_xpath('KEYWORDS', '//meta[@name="keywords"]/@content')
    return il.load_item()
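Here the default input processor splits each extracted string into tokens and applies replace_escape_chars (from w3lib.html) to each one; Join() then reassembles the tokens with single spaces, normalizing all internal whitespace in the keywords.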
Example #3
class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
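A hypothetical use of this loader (the CSS selectors and ExampleItem fields are assumptions):

loader = ExampleLoader(response=response)
loader.add_css('name', 'h1::text')          # output: TakeFirst() keeps the first value
loader.add_css('description', 'p::text')    # output: Join() overrides the default for this field
item = loader.load_item()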
Example #4
class newsItem(Item):
    description = Field(input_processor=MapCompose(remove_tags,
                                                   replace_entities,
                                                   replace_escape_chars,
                                                   str.strip),
                        output_processor=Join())
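The three cleaning functions chained here normally come from w3lib (an assumption about this example's imports; all three live in w3lib.html):

from w3lib.html import remove_tags, replace_entities, replace_escape_chars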
Example #5
class ZhipinItem(scrapy.Item):
    # Job-posting fields scraped from Boss Zhipin (boss直聘)
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(input_processor=MapCompose(get_city))
    work_years = scrapy.Field(input_processor=MapCompose(get_experience))
    degree_need = scrapy.Field(input_processor=MapCompose(get_degree))
    publish_time = scrapy.Field(input_processor=MapCompose(remove_chinaese))
    job_advantage = scrapy.Field(
        input_processor=MapCompose(Null_if),
        output_processor=Join(),
    )
    job_desc = scrapy.Field(
        input_processor=MapCompose(Null_if),
        output_processor=Join(),
    )
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),  # strip HTML tags from the address
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(input_processor=Join(","))
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into job(title, url, url_object_id, salary, job_city, work_years, degree_need, publish_time, 
            job_advantage, job_desc, job_addr, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self["salary"],
            self["job_city"],
            self["work_years"],
            self["degree_need"],
            self["publish_time"],
            self["job_advantage"],
            self["job_desc"],
            self["job_addr"],
            self["company_name"],
            self["company_url"],
            self["tags"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params

    def save_to_es(self):
        job = LagouType()
        job.title = self['title']
        if "create_date" in self:
            job.create_date = self["create_date"]
        job.url = self["url"]
        job.meta.id = self["url_object_id"]
        if "salary" in self:
            job.salary = self["salary"]
        job.job_city = self["job_city"]
        if "work_years" in self:
            job.work_years = self["work_years"]
        job.degree_need = self["degree_need"]
        job.tags = self["tags"]
        job.publish_time = self["publish_time"]
        job.job_advantage = self["job_advantage"]
        job.job_desc = self["job_desc"]
        job.job_addr = self["job_addr"]
        job.company_name = self["company_name"]
        job.company_url = self["company_url"]
        job.crawl_time = self["crawl_time"]
        job.suggest = gen_suggests(LagouType._doc_type.index,
                                   ((job.title, 10), (job.tags, 7)))
        job.save()
        redis_cli.incr("jobbole_count")
        return
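A sketch of how get_insert_sql() is typically consumed by a MySQL item pipeline (the pipeline class and cursor wiring are assumptions):

def process_item(self, item, spider):
    insert_sql, params = item.get_insert_sql()
    self.cursor.execute(insert_sql, params)  # parameterized query; the driver handles escaping
    return item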
Example #6
class EntryLoader(ItemLoader):
    content_in = MapCompose(str.strip)
    content_out = Join()
Example #7
class ChinaTechLoader(NewsLoader):
    """继承NewsLoader"""
    # Join()将列表拼接成字符串,lambda xxxx: 去掉前后空白字符
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
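Unlike MapCompose, which applies each function to every value in the extracted list, Compose passes the whole list through its functions in order: Join() first flattens the list into one string, then the lambda strips it.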
Example #8
class CrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field(output_processor=Join())
    date = scrapy.Field()
Example #9
class BeritaLoader(ItemLoader):

    default_output_processor = Join()

    name_in = MapCompose(str.title)  # title-case each extracted name
    name_out = Join()
Example #10
class Sx8672Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    amount = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    rate = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    period = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    start = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    end = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    invest_records = scrapy.Field(output_processor=Join())
    pay_type = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    loaner_info = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    loan_using = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    loan_info = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    progress = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    code = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )
    web_name = scrapy.Field(output_processor=Join())
    url = scrapy.Field(output_processor=Join())
    web_code = scrapy.Field(output_processor=Join())
    item_code = scrapy.Field(output_processor=Join())
    a = scrapy.Field()
    b = scrapy.Field()
Example #11
    def parse_content(self, response):
        print(response.url)

        def deal_img_urls(img_urls_raw):
            img_url_list = []
            for one_img_url in img_urls_raw:
                # skip tracking pixels and PayPal badge images
                if 'paypal_cn' in one_img_url or 'pixel.gif' in one_img_url:
                    continue
                # resolve relative URLs against the site root
                if 'http' not in one_img_url and 'www' not in one_img_url:
                    url_img = urljoin('http://www.tibetanyouthcongress.org/',
                                      one_img_url)
                    img_url_list.append(url_img)

            return img_url_list

        def deal_next_page_url(response):
            next_page_url_list = response.xpath('//p[@class="pages"]/a')
            if '&page=' in response.url:
                try:
                    page_now_split = response.url.split('&page=')
                    page_now_int = int(page_now_split[1])
                    next_page_url = page_now_split[0] + '&page=' + str(
                        page_now_int + 1)
                except (ValueError, IndexError):
                    page_now_int = 1
                    next_page_url = response.url.split('&page')[0] + '&page=' + str(
                        page_now_int + 1)

            else:
                page_now_int = 1
                next_page_url = response.url + '&page=' + str(page_now_int + 1)

            if next_page_url_list:
                if page_now_int < len(next_page_url_list):
                    return next_page_url
                else:
                    return None
            else:
                return None

        content_loader = ItemLoader(response=response,
                                    item=YfspiderspeakItem())
        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())
        content_loader.add_value('id', response.url.strip('/').split('=')[1])

        content_loader.add_xpath(
            'title',
            '//div[@id="content"]//div[@class="entry_title_box"]//div[@class="entry_title"]//text()'
        )
        content_loader.add_xpath(
            'content', '//div[@class="entry"]/div[@id="entry"]//text()',
            Join())
        content_loader.add_value('publish_time', '1111-11-11 11:11:11')
        content_loader.add_xpath(
            'img_urls', '//div[@class="entry"]/div[@id="entry"]//@src',
            deal_img_urls)
        content_loader.add_xpath('video_urls', '//embed/@src')

        item1 = content_loader.load_item()

        next_page_url = deal_next_page_url(response)
        if not next_page_url:
            return item1
        else:
            return scrapy.Request(url=next_page_url,
                                  meta={'pre_data': item1},
                                  headers=self.headers,
                                  callback=self.deal_next_page)
Example #12
class TechLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
Example #13
 def test_join(self):
     proc = Join()
     self.assertRaises(TypeError, proc, [None, '', 'hello', 'world'])
     self.assertEqual(proc(['', 'hello', 'world']), u' hello world')
     self.assertEqual(proc(['hello', 'world']), u'hello world')
     self.assertIsInstance(proc(['hello', 'world']), str)
Example #14
 class TestItemLoader(ItemLoader):
     default_item_class = TestItem
     name_out = Compose(Join(), float)
Example #15
class LagouJobItem(scrapy.Item):
    # Job-posting fields scraped from Lagou (拉勾网)
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary_min = scrapy.Field()
    salary_max = scrapy.Field()
    job_city = scrapy.Field(input_processor=MapCompose(remove_splash), )
    work_years_min = scrapy.Field(input_processor=MapCompose(remove_splash), )
    work_years_max = scrapy.Field(input_processor=MapCompose(remove_splash), )
    degree_need = scrapy.Field(input_processor=MapCompose(remove_splash), )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(input_processor=MapCompose(remove_tags,
                                                       handle_jobaddr), )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(input_processor=Join(","))
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary_min, salary_max, job_city, work_years_min, work_years_max, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary_min=VALUES(salary_min), salary_max=VALUES(salary_max), job_desc=VALUES(job_desc)
        """

        match_obj1 = re.match(r"经验(\d+)-(\d+)年", self['work_years_min'])
        match_obj2 = re.match(r"经验应届毕业生", self['work_years_min'])
        match_obj3 = re.match(r"经验不限", self['work_years_min'])
        match_obj4 = re.match(r"经验(\d+)年以下", self['work_years_min'])
        match_obj5 = re.match(r"经验(\d+)年以上", self['work_years_min'])

        if match_obj1:
            self['work_years_min'] = match_obj1.group(1)
            self['work_years_max'] = match_obj1.group(2)
        elif match_obj2:
            self['work_years_min'] = 0.5
            self['work_years_max'] = 0.5
        elif match_obj3:
            self['work_years_min'] = 0
            self['work_years_max'] = 0
        elif match_obj4:
            self['work_years_min'] = 0
            self['work_years_max'] = match_obj4.group(1)
        elif match_obj5:
            self['work_years_min'] = match_obj5.group(1)
            self['work_years_max'] = int(match_obj5.group(1)) + 100
        else:
            self['work_years_min'] = 999
            self['work_years_max'] = 999

        match_salary = re.match(r"(\d+)[Kk]-(\d+)[Kk]", self['salary_min'])
        if match_salary:
            self['salary_min'] = match_salary.group(1)
            self['salary_max'] = match_salary.group(2)
        else:
            self['salary_min'] = 666
            self['salary_max'] = 666
        match_time1 = re.match(r"(\d+):(\d+).*", self["publish_time"])
        match_time2 = re.match(r"(\d+)天前.*", self["publish_time"])
        match_time3 = re.match(r"(\d+)-(\d+)-(\d+)", self["publish_time"])
        if match_time1:
            today = datetime.datetime.now()
            hour = int(match_time1.group(1))
            minute = int(match_time1.group(2))
            publish_dt = datetime.datetime(today.year, today.month, today.day,
                                           hour, minute)
            self["publish_time"] = publish_dt.strftime(SQL_DATETIME_FORMAT)
        elif match_time2:
            days_ago = int(match_time2.group(1))
            today = datetime.datetime.now() - datetime.timedelta(days=days_ago)
            self["publish_time"] = today.strftime(SQL_DATETIME_FORMAT)
        elif match_time3:
            year = int(match_time3.group(1))
            month = int(match_time3.group(2))
            day = int(match_time3.group(3))
            today = datetime.datetime(year, month, day)
            self["publish_time"] = today.strftime(SQL_DATETIME_FORMAT)
        else:
            self["publish_time"] = datetime.datetime.now().strftime(
                SQL_DATETIME_FORMAT)

        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self["salary_min"],
            self["salary_max"],
            self["job_city"],
            self["work_years_min"],
            self["work_years_max"],
            self["degree_need"],
            self["job_type"],
            self["publish_time"],
            self["job_advantage"],
            self["job_desc"],
            self["job_addr"],
            self["company_name"],
            self["company_url"],
            self["tags"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )

        return insert_sql, params
Example #16
class ProductItemLoader(BaseItemLoader):
    """Product item loader."""

    price_in = MapCompose(clean_text, float)
    category_out = Join(' > ')
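As an illustration of Join(' > ') (behavior of the processor, not data from the source), a category list like ['Home', 'Audio', 'Headphones'] loads as the single string 'Home > Audio > Headphones'.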
Example #17
class ChinaLoader(NewsLoader):
    '''
    A subclass of ItemLoader that makes Item extraction configurable; note the multiple levels of inheritance.
    '''
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
Example #18
class ChinaLoader(ItemLoader):
    default_output_processor = TakeFirst()
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
Example #19
class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole", add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        output_processor=TakeFirst()
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            parise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums),
            front_image_url=VALUES(front_image_url), front_image_path(front_image_path), content=VALUES(content),
            parise_nums=VALUES(praise_nums), comment_nums=VALUES(comment_nums), tags=VALUES(tags)
        """
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"])

        return insert_sql, params

    def save_to_es(self):
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']

        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))

        article.save()

        redis_cli.incr("jobbole_count")

        return
Example #20
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Compose, MapCompose, Join, TakeFirst
from datetime import date
import re

clean_text = Compose(MapCompose(lambda v: v.strip()), Join())
to_int = Compose(TakeFirst(), int)
to_date = Compose(TakeFirst(), date.fromisoformat)  # parses 'YYYY-MM-DD'; bare date() would require (year, month, day) args


class NewsItem(scrapy.Item):
    titulo = scrapy.Field()
    texto = scrapy.Field()
    url = scrapy.Field()
    fecha = scrapy.Field()
    strategy = scrapy.Field()


class NewsItemLoader(ItemLoader):
    default_item_class = NewsItem
    titulo_out = clean_text
    texto_out = clean_text
    fecha_out = to_date
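With these processors, ['  Hello', 'world  '] loads as 'Hello world' through clean_text (strip each value, then Join with spaces), and a 'YYYY-MM-DD' string becomes a datetime.date through to_date.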
Example #21
class ChinaDigiLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
Example #22
 def parse(self, response):
     l = ItemLoader(item=articleItem(), response=response)  # create the loader; the calls below populate its fields
     l.add_xpath('title',        '//*[@id="firstHeading"]/text()')
     l.add_xpath('publish_date', '//div[@id="catlinks"]/div[@id="mw-normal-catlinks"]/ul/li/a/text()', re=r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[\s,]*(?:\d{1,2})[\s,]*(?:\d{4})')
     l.add_xpath('content',      '//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/p/text()|//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/p/child::a/text()|//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/ul/li/text()', Join(' '))
     l.add_xpath('categories',   '//div[@id="catlinks"]/div[@id="mw-normal-catlinks"]/ul/li/a/text()')
     l.add_xpath('sources_url',  '//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/ul/li/span/a/@href')
     l.add_xpath('sources_wiki_page_url', '//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/ul/li/span/i/span/a/@href')
     l.add_value('article_url', response.request.url)
     l.add_value('scraped_at', datetime.today().strftime('%Y-%m-%d'))
     yield l.load_item()  # yield lets this callback emit the item alongside any further requests
Example #23
class Century21OfficeLoader(ItemLoader):
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    officePhone_in = MapCompose(serialize_number)
    officeAddress_out = Join(', ')
Example #24
    def populate_item(self, response):
        l = ItemLoader(item=O2Item(), response=response)
        l.default_input_processor = MapCompose(str.strip)
        l.default_output_processor = Join(' ')

        l.add_css('tarif', ".tariffProductName::attr(value)")

        # grundpreis_euro = response.xpath('//span[contains(., "monatlich")]/following-sibling::*[contains(@class, "sum")]//text()').extract()
        # grundpreis_cent = response.xpath('//span[contains(., "monatlich")]/following-sibling::*[contains(@class, "suffix")]//text()').extract()
        # grundpreis = str(grundpreis_euro) + ',' + str(grundpreis_cent) + '€'
        #l.add_value('grundpreis', grundpreis)
        # l.add_xpath('grundpreis', '//span[contains(., "monatlich")]/following-sibling::*[contains(@class, "sum")]//text()')
        # l.add_xpath('grundpreis', '//span[contains(., "monatlich")]/following-sibling::*[contains(@class, "suffix")]//text()')

        grundpreis_wert = response.xpath(
            '//*[contains(concat(" ", normalize-space(@class), " "), " tariff-details-property ")]/div[@class="tariff-information-table"]//*[contains(., "Monatliche Grundgebühr")]/following-sibling::*/div/span/text()'
        ).re("[0-9,.]+")

        # for n in grundpreis_wert:
        #     n.replace(',','.')

        # for i in grundpreis_wert:
        #     i = float(i)

        l.add_value('grundpreis', grundpreis_wert)
        # l.add_xpath('grundpreis', '//*[contains(concat(" ", normalize-space(@class), " "), " tariff-details-property ")]/div[@class="tariff-information-table"]//*[contains(., "Monatliche Grundgebühr")]/following-sibling::*/div/span/text()')

        #l.add_xpath('bereitstellungspreis', '//*[contains(concat(" ", normalize-space(@class), " "), " tariff-details-property ")]/div[@class="tariff-information-table"]//*[contains(., "Anschlusspreis")]/following-sibling::*/following-sibling::*//span/text()')
        bereitstellungspreis_wert = response.xpath(
            '//*[contains(concat(" ", normalize-space(@class), " "), " tariff-details-property ")]/div[@class="tariff-information-table"]//*[contains(., "Anschlusspreis")]/following-sibling::*/following-sibling::*//span/text()'
        ).re("[0-9,.]+")

        l.add_value('bereitstellungspreis', bereitstellungspreis_wert)

        # l.add_xpath('dauer', '//*[@class="row"]/table//tr[contains(., "Dauer")]/td[2]/strong/text()')

        #minute_wert = response.xpath('string(//*[@class="tariff-description"]/article[4]/div/div/table/tbody/tr[1]/td[2]/strong/text())').re("[0-9,.€ ]+")
        minute_wert = response.xpath(
            'string(//*[contains(., "Gespräche:")]/following-sibling::*/strong/text())'
        ).re("[0-9,.]+")
        l.add_value('minute', minute_wert)

        sms_wert = response.xpath(
            'string(//*[contains(., "SMS:")]/following-sibling::*/strong/text())'
        ).re("[0-9,.]+")
        l.add_value('sms', sms_wert)

        mms_wert = response.xpath(
            'string(//*[contains(., "MMS:")]/following-sibling::*/text())').re(
                "[0-9,.]+")
        l.add_value('mms', mms_wert)

        # l.add_xpath('datennutzung', '//*[@class="row"]/table//tr[contains(., "Datennutzung")]/td[2]/strong/text()')
        # l.add_xpath('geschwindigkeit_down', '//*[@class="row"]/table//tr[contains(., "Download")]/td[2]/strong/text()')
        # l.add_xpath('geschwindigkeit_up', '//*[@class="row"]/table//tr[contains(., "Upload")]/td[2]/strong/text()')
        l.add_xpath(
            'datenvolumen',
            '//div[@id="tariff-carousel"]//div[@data-ng-bind-html="tariff.tariffFeatures.feature1 | coUnsafeHtml"]//span/text()'
        )

        yield l.load_item()
Example #25
    def parse(self, response):
        def deal_publish_time(publish_time_raw=None):
            # Note: extraction may fail; in that case the argument may simply
            # not be passed (rather than arriving as None), hence the default
            # value in the signature.
            if publish_time_raw:
                month_str_dict = {
                    'January': '01',
                    'February': '02',
                    'March': '03',
                    'April': '04',
                    'May': '05',
                    'June': '06',
                    'July': '07',
                    'August': '08',
                    'September': '09',
                    'October': '10',
                    'November': '11',
                    'December': '12',
                }
                publish_month = publish_time_raw.split(' ')
                if str(publish_month[0].strip()) in month_str_dict.keys():
                    try:
                        month_num_str = month_str_dict[str(
                            publish_month[0].strip())]
                        publish_time = str(publish_month[2].strip(
                        )) + '-' + month_num_str + '-' + str(publish_month[1])
                        publish_date = publish_time.strip(',') + ' 00:00:00'
                        time_tuple = time.strptime(publish_date,
                                                   '%Y-%m-%d %H:%M:%S')
                        publish_time = time.mktime(time_tuple)
                        return str(int(publish_time))
                    except Exception as e:
                        print(e)
                else:
                    return publish_month

            else:
                return None

            else:
                return None

        def deal_publisher(html_raw):
            response_publisher = scrapy.http.HtmlResponse(
                url='thisIsJavaScript', body=str(html_raw))
            publish_user = response_publisher.xpath(
                './/span[@class="byline-author"]/text()').extract_first(
                    default=None)
            publish_user = publish_user.split(',')[0].split('by')[1].split(
                'and')

            print(publish_user)
            return publish_user

        for i in response.xpath('//*[@id="Blog1"]/div[@class="post"]'):
            article_loader = ItemLoader(item=ThreatcollectItem(), selector=i)
            article_loader.add_xpath('title', './/h2/a/text()', TakeFirst())
            article_loader.add_xpath('url', './/h2/a/@href', TakeFirst())
            article_loader.add_xpath(
                'publish_time',
                './/div[@class="post-header"]/div[@class="published"]/span/text()',
                Join(), deal_publish_time)
            article_loader.add_xpath(
                'content',
                './/div[@class="post-body"]/div[contains(@class,"post-content")]/script/text()',
                MapCompose(remove_tags))
            article_loader.add_xpath('article_id', './/@data-id')
            article_loader.add_value('img_urls', i.re(r'src="(.*?)"'))
            article_loader.add_value('spider_time', time.time() * 1000)
            article_loader.add_xpath(
                'publisher',
                './/div[@class="post-body"]/div[contains(@class,"post-content")]/script/text()',
                deal_publisher)
            article_loader.add_value('html', i.extract())

            item1 = article_loader.load_item()
            yield item1
            # yield response.follow(url=item1['url'], headers=self.headers, meta={'item': item1}, callback=self.parse_item)

        nexturl = response.xpath(
            '//*[@id="Blog1_blog-pager-older-link"]/@href').extract()  # older-posts link; extracted but not followed here
Example #26
class FictionItem(scrapy.Item):
    title = scrapy.Field(input_processor=MapCompose(ma),
                         output_processor=Join())
    content = scrapy.Field(input_processor=MapCompose(en),
                           output_processor=Bytejoin())
    next = scrapy.Field(output_processor=Join())
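Bytejoin is a custom output processor that is not defined in this snippet. A plausible sketch of it (an assumption, not the source's definition), concatenating byte chunks produced by the en() input processor:

class Bytejoin:
    # Hypothetical: join a list of bytes values into a single bytes object.
    def __call__(self, values):
        return b''.join(values)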
Example #27
class TaobaoSpiderLoader(ItemLoader):
    default_item_class = TaobaoItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
Example #28
class NtuTimetablesLoader(ModifyLoader):
    timetable_in = MapCompose()
    timetable_out = Identity()

    remark_in = MapCompose(lambda x: x + u'\n')
    remark_out = Join('')
Example #29
 class TakeFirstItemLoader(TestItemLoader):
     name_out = Join()
Example #30
class ElFaroDeCeutaLoader(ItemLoader):
    default_output_processor = TakeFirst()
    content_out = Join()