import base64
import html
import json
import re
from datetime import datetime

import jsonpath
from lxml.html.clean import Cleaner
from scrapy.loader.processors import Join, TakeFirst
from w3lib.html import remove_tags


def _convert(data):
    # `t` (the conversion type) and `inf` (the field config dict) come from
    # the enclosing scope; parse_date is a project-local helper.
    if t not in ('join', 'list') and isinstance(data, list):
        data = TakeFirst()(data)
    if isinstance(data, str):
        data = data.strip()
    elif isinstance(data, (int, float, datetime)):
        data = str(data)
    else:
        return data
    if t == 'join':
        sep = inf.get('sep', ' ')
        return Join(sep)(data)
    elif t == 'list':
        sep = inf.get('sep', ' ')
        return remove_tags(Join(sep)(data)).strip()
    elif t == 'text':
        return remove_tags(data).strip()
    elif t == 'clean':
        cleaner = Cleaner(style=True, scripts=True, javascript=True,
                          links=True, meta=True)
        return cleaner.clean_html(data)
    elif t == 'unesc':
        return html.unescape(data)
    elif t == 'base64':
        return base64.b64decode(data)
    elif t == 'sub':
        frm = inf.get('from')
        to = inf.get('to')
        return re.sub(frm, to, data)
    elif t == 'jpath':
        qs = inf.get('query')
        return jsonpath.jsonpath(json.loads(data), qs)
    elif t == 'map':
        m = inf.get('map')
        d = inf.get('default')
        return m.get(data, d)
    elif t == 'int':
        return int(float(data))
    elif t == 'float':
        return float(data)
    elif t == 'date':
        fmt = inf.get('fmt', 'auto')
        tz = inf.get('tz', '+00:00')
        return parse_date(data, fmt, tz)
    elif t == 'cst':
        fmt = inf.get('fmt', 'auto')
        return parse_date(data, fmt, '+08:00')
    else:
        return data
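
# A minimal sanity check for _convert, assuming it is defined at module level
# so that `t` and `inf` resolve as globals; the field configs below are
# hypothetical illustrations, not part of the original project.
if __name__ == '__main__':
    t, inf = 'sub', {'from': r'\s+', 'to': ' '}
    print(_convert('hello   world'))  # -> 'hello world'
    t, inf = 'int', {}
    print(_convert('3.7'))            # -> 3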
class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader.
    default_output_processor = TakeFirst()
class UfItemLoader(ItemLoader):
    default_item_class = UfItem
    default_output_processor = TakeFirst()
class CtripItermLoader(ItemLoader):
    default_output_processor = TakeFirst()
class LagouRedisItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    tags_out = Identity()
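
# Why tags_out = Identity(): the default TakeFirst() output processor would
# collapse a multi-valued tags field to its first element, while Identity()
# returns the collected list untouched. A quick sketch with made-up values:
from scrapy.loader.processors import Identity, TakeFirst

print(TakeFirst()(['python', 'redis']))  # -> 'python'
print(Identity()(['python', 'redis']))   # -> ['python', 'redis']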
class AticDataItem(ItemLoader):
    default_output_processor = TakeFirst()
class StrandbooksscraperItem(scrapy.Item):
    # Rich text format - no special chars.
    organization = scrapy.Field(output_processor=TakeFirst())
    # Rich text format - no special chars.
    title = scrapy.Field(
        input_processor=Compose(TakeFirst(), name_filter),
        output_processor=TakeFirst(),
    )
    description = scrapy.Field(output_processor=TakeFirst())
    # Full link! Hard-code http://... if missing!
    eventWebsite = scrapy.Field(output_processor=TakeFirst())
    # Rich text format - no special chars.
    street = scrapy.Field(output_processor=TakeFirst())
    # Rich text format - no special chars.
    city = scrapy.Field(output_processor=TakeFirst())
    # Rich text format - no special chars.
    state = scrapy.Field(output_processor=TakeFirst())
    # Numerical format required: xxxxx
    zip = scrapy.Field(output_processor=TakeFirst())
    # Only acceptable format is dd/mm/yyyy, e.g. 19/12/2017
    dateFrom = scrapy.Field(
        input_processor=Compose(TakeFirst(), date_converter),
        output_processor=TakeFirst(),
    )
    # Only acceptable format is hh:mm am/pm, e.g. 07:45 pm
    startTime = scrapy.Field(
        input_processor=Compose(TakeFirst(), time_converter),
        output_processor=TakeFirst(),
    )
    # Should be empty; will code that later.
    In_group_id = scrapy.Field(output_processor=Compose(lambda v: v[0]))
    # Full link! Hard-code http://... if missing!
    ticketUrl = scrapy.Field(output_processor=TakeFirst())
    # Full link! Hard-code http://... if missing! Leave empty if the event
    # image is missing.
    eventImage = scrapy.Field(output_processor=TakeFirst())
    # Required format: dd/mm/yyyy
    dateTo = scrapy.Field(
        input_processor=Compose(TakeFirst(), date_converter),
        output_processor=TakeFirst(),
    )
    # Required format: hh:mm am/pm
    endTime = scrapy.Field(
        input_processor=Compose(TakeFirst(), time_converter),
        output_processor=TakeFirst(),
    )
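
# A hedged usage sketch for the item above: with
# Compose(TakeFirst(), date_converter) as the input processor, the raw
# extracted list is first reduced to a single value and then passed through
# the project's date_converter. The XPath expressions here are hypothetical.
from scrapy.loader import ItemLoader

def load_event(response):
    loader = ItemLoader(item=StrandbooksscraperItem(), response=response)
    loader.add_xpath('title', '//h1/text()')          # hypothetical selector
    loader.add_xpath('dateFrom', '//time/@datetime')  # hypothetical selector
    loader.add_value('eventWebsite', response.url)
    return loader.load_item()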
class EwgScraperProduct(scrapy.Item):
    # Define the fields for Products.
    url = scrapy.Field(output_processor=TakeFirst())
    product_id = scrapy.Field(output_processor=TakeFirst())
    product_name = scrapy.Field(output_processor=TakeFirst())
    product_score = scrapy.Field(output_processor=TakeFirst())
    product_type = scrapy.Field(output_processor=TakeFirst())
    data_availability = scrapy.Field(output_processor=TakeFirst())
    overall_hazard_score = scrapy.Field(output_processor=TakeFirst())
    cancer_score = scrapy.Field(output_processor=TakeFirst())
    dev_reprod_tox_score = scrapy.Field(output_processor=TakeFirst())
    allergy_imm_tox_score = scrapy.Field(output_processor=TakeFirst())
    use_restrict_score = scrapy.Field(output_processor=TakeFirst())
    ingredient_list = scrapy.Field(output_processor=Identity())
class MoreTicketsEventLoader(ItemLoader):
    default_output_processor = TakeFirst()
    id_in = MapCompose(lambda x: re.sub(r'/content/', '', x))
    desc_in = MapCompose(lambda x: re.sub(r'\n', '', x))
    url_in = MapCompose(lambda x: 'https://www.moretickets.com' + x)
class PiaoNiuEventLoader(ItemLoader):
    default_output_processor = TakeFirst()
    id_in = MapCompose(lambda x: re.sub(r'\D', '', x))
    desc_in = MapCompose(lambda x: re.sub(r'\n', '', x))
    url_in = MapCompose(lambda x: 'https:' + x)
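
# MapCompose applies each function to every extracted value individually, so
# url_in above turns each protocol-relative href into an absolute URL. A
# quick check with a made-up path:
from scrapy.loader.processors import MapCompose

to_abs = MapCompose(lambda x: 'https:' + x)
print(to_abs(['//example.com/event/1']))  # -> ['https://example.com/event/1']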
class Company(Item):
    report_id = Field(output_processor=TakeFirst())
    company_name = Field(output_processor=TakeFirst())
    stock = Field(output_processor=TakeFirst())
    company_participants = Field(output_processor=TakeFirst())
    external_participants = Field(output_processor=TakeFirst())
    published_quarter = Field(output_processor=TakeFirst())
    article_url = Field(output_processor=TakeFirst())
    date_published = Field(output_processor=TakeFirst())
    earning_call_talk = Field(output_processor=TakeFirst())
    question_answers = Field(output_processor=TakeFirst())
    article_title = Field(output_processor=TakeFirst())
    audio_call_url = Field(output_processor=TakeFirst())
class HrTencentItem(ItemLoader):
    default_output_processor = TakeFirst()
class ProductoFybeca(scrapy.Item):
    titulo = scrapy.Field()
    imagen = scrapy.Field(
        input_processor=MapCompose(transformar_url_imagen),
        output_processor=TakeFirst(),
    )
def price_field():
    return scrapy.Field(
        input_processor=MapCompose(
            lambda value: value.replace('$', '') if isinstance(value, str) else value,
            DataUtils.remove_html,
            float,
        ),
        output_processor=TakeFirst(),
    )
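
# Sketch of how price_field behaves, assuming DataUtils.remove_html strips
# markup: MapCompose runs a value like '$12.99' through the '$'-stripping
# lambda, the HTML cleaner, and float(); TakeFirst then emits 12.99 on output.
class ProductItem(scrapy.Item):  # hypothetical item using the factory
    price = price_field()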
def parse(self, response):
    # All data must be extracted using XPath queries. This path should return
    # a list of HTML blocks, each containing the information for one listing.
    items = response.xpath("//article[contains(@class,'property-row')]")
    for item in items:
        l = ItemLoader(item=RentslamItem(), response=response)
        image_url = item.xpath('.//img/@src').extract_first()
        url = item.xpath('.//a/@href').extract_first()
        price = item.xpath(
            './/span[contains(@class,"property-row-meta-item-price")]/strong/text()'
        ).extract_first()
        bedrooms = item.xpath(
            './/span[contains(@class,"property-row-meta-item-beds")]/strong/text()'
        ).extract_first()
        size = item.xpath(
            './/span[contains(@class,"property-row-meta-item-area")]/strong/text()'
        ).extract_first()
        address = item.xpath('.//h2/a/text()').extract_first()
        text = item.xpath(
            './/div[@class="property-row-body"]/p/text()').extract_first()
        city = item.xpath(
            './/div[@class="property-row-location"]/a/text()').extract_first()
        # In this example there is no furnishing info; it can be left empty.
        # furnishing = item.xpath('').extract_first()

        # Full URL. Only the first image is required.
        l.add_value('ImageUrl', image_url)
        # Full URL.
        l.add_value('Url', url)
        # Price must not include a currency symbol, dot, or comma. Decimals
        # must be filtered out. Example: € 1.348,77 --> 1348
        l.add_value('Price', price, Join(''), re=r'\d+')
        # Number.
        l.add_value('Bedrooms', bedrooms)
        # Size must include only the number; things like "m2" must be
        # filtered out. Example: 90 m2 --> 90
        l.add_value('Size', size, TakeFirst(), re=r'\d+')
        # The address must contain the street name and the house number (if
        # present). It must not contain the city name or the postcode.
        l.add_value('Address', address)
        # The description of the listing.
        l.add_value('Text', text)
        # You can copy the email address from the website here.
        l.add_value('ContactEmailAddress', '*****@*****.**')
        # You can copy the phone number from the website here.
        l.add_value('ContactPhoneNumber', '085 - 273 67 30')
        # In this example there is no furnishing info; it can be left empty.
        # l.add_value('Furnishing', furnishing)
        # Name of the city. Sometimes it can have a literal value, like
        # "Amsterdam", if the website only contains listings from Amsterdam.
        l.add_value('City', city)
        yield l.load_item()
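
# Note on the `re` keyword used above: add_value applies the regex before any
# processors, replacing each value with its list of matches. A minimal check
# via get_value, mirroring the Size field:
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

l = ItemLoader(item=RentslamItem())
print(l.get_value('90 m2', TakeFirst(), re=r'\d+'))  # -> '90'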
class ArticleItemLoader(ItemLoader):
    """Custom ItemLoader."""
    # Assign an output_processor to every field.
    default_output_processor = TakeFirst()
class AuthorLoader(ItemLoader):
    default_output_processor = TakeFirst()
import datetime
import re
import string
from urllib.parse import urljoin, urlparse


class FirmwareLoader(ItemLoader):
    @staticmethod
    def find_product(text):
        match = re.search(r"(?:model[:. #]*([\w-][\w.-]+))",
                          " ".join(text).replace(u"\xa0", " ").strip(),
                          flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_version(text):
        match = re.search(
            r"(?:version[:. ]*([\w-][\w.-]+)|ve?r?s?i?o?n?[:. ]*([\d-][\w.-]+))",
            " ".join(text).replace(u"\xa0", " ").strip(),
            flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_build(text):
        match = re.search(
            r"(?:build[:. ]*([\w-][\w.-]+)|bu?i?l?d?[:. ]*([\d-][\w.-]+))",
            " ".join(text).replace(u"\xa0", " ").strip(),
            flags=re.IGNORECASE)
        return next((x for x in match.groups() if x), None) if match else None

    @staticmethod
    def find_version_period(text):
        match = re.search(r"((?:[0-9])(?:[\w-]*\.[\w-]*)+)",
                          " ".join(text).replace(u"\xa0", " ").strip())
        return next((x for x in match.groups()
                     if x and "192.168." not in x.lower()), None) if match else None

    def find_date(self, text):
        for fmt in self.context.get("date_fmt", []):
            # Turn the strptime format into a regex by escaping it and then
            # substituting a pattern for each supported directive.
            pattern = re.escape(fmt)
            for directive, regexp in (("%b", "[a-zA-Z]{3}"),
                                      ("%B", "[a-zA-Z]+"),
                                      ("%m", r"\d{1,2}"), ("%d", r"\d{1,2}"),
                                      ("%y", r"\d{2}"), ("%Y", r"\d{4}")):
                pattern = pattern.replace(re.escape(directive), regexp)
            match = re.search("(" + pattern + ")", "".join(text).strip())
            res = [x for x in match.groups() if x] if match else None
            if res:
                return res[0]
        return None

    def clean(s):
        return "".join(filter(lambda x: x in string.printable, s)) \
            .replace("\r", "").replace("\n", "").replace(u"\xa0", " ").strip()

    def fix_url(url, loader_context):
        if not urlparse(url).netloc:
            return urljoin(loader_context.get("response").url, url)
        return url

    def parse_date(date, loader_context):
        for fmt in loader_context.get("date_fmt", []):
            try:
                return datetime.datetime.strptime(date, fmt)
            except ValueError:
                pass
        return None

    def remove_html(s):
        return re.sub(r"<[a-zA-Z0-9\"/=: ]+>", "", s)

    default_output_processor = TakeFirst()
    product_in = MapCompose(clean)
    vendor_in = Identity()
    description_in = MapCompose(remove_html, clean)
    version_in = MapCompose(clean)
    build_in = MapCompose(clean)
    date_in = MapCompose(clean, parse_date)
    mib_in = MapCompose(fix_url)
    sdk_in = MapCompose(fix_url)
    url_in = MapCompose(fix_url)
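
# Quick checks for the static finder helpers; the sample strings below are
# invented for illustration.
print(FirmwareLoader.find_version(["Firmware Version: 1.2.3"]))  # -> '1.2.3'
print(FirmwareLoader.find_product(["Model: DIR-850L rev A"]))    # -> 'DIR-850L'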
class Form(scrapy.Item):
    url = scrapy.Field()
    action = scrapy.Field(output_processor=TakeFirst())
    inputs = scrapy.Field()
class ArticleItemLoader(ItemLoader):
    # TakeFirst returns the first non-empty element.
    default_output_processor = TakeFirst()
class InputLoader(ItemLoader):
    default_item_class = Input
    default_output_processor = TakeFirst()
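
# With default_item_class set, the loader can be instantiated without an
# item= argument; load_item() builds an Input itself. A minimal sketch,
# assuming Input defines a 'name' field:
loader = InputLoader()
loader.add_value('name', ['email', 'ignored'])
print(loader.load_item())  # -> Input with name='email' (TakeFirst applied)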
class JokeItem(scrapy.Item):
    # Define the fields for your item here, like:
    joke_text = scrapy.Field(
        input_processor=MapCompose(remove_tags, remove_whitespace),
        output_processor=TakeFirst(),
    )
class HexunFundDetailItemLoader(ItemLoader):
    # Custom ItemLoader.
    default_output_processor = TakeFirst()
class YunqiItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
def parse_course(self, response):
    l = ItemLoader(item=ConcordiaCourseItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('institution_name', 'Concordia University')
    l.add_xpath('course_code',
                '//div[@class="container"]//div[@class="ccode"]/text()')
    l.add_xpath('course_name', '//section[@id]//h1/text()')
    l.add_value('url', response.url)
    l.add_value('faculty', 'School of Continuing Studies')
    l.add_xpath(
        'description',
        '//div[@class="container"]//span[@class="xlarge-text"]/div[@class="ccode"]/following-sibling::text()[normalize-space()]'
    )
    l.add_value('location', '')
    l.add_value('subject', '')
    # Get all blocks of course data.
    course_data = response.xpath(
        '//div[@class="course-section xlarge-text"]').getall()
    # Get all prices.
    prices = [re.search(r'\$([^\s]+)', block) for block in course_data]
    prices = [price.group(1) if price else '0.0' for price in prices]
    l.add_value('price', prices)
    # Get all days.
    days_in_blocks = [
        re.findall(r'([\w ]+) +\(', block) for block in course_data
    ]
    l.add_value('days', days_in_blocks)
    l.add_value('program', response.meta['program'])
    # Get all course time intervals.
    time_in_blocks = [
        re.findall(r'\d{1,2}:\d{1,2}', block) for block in course_data
    ]
    l.add_value('duration_hours', time_in_blocks)
    l.add_value('duration_days_week', l.get_collected_values('days'))
    l.add_xpath('duration_months', '//h3[@class="date burgundy"]/text()')
    l.add_value('duration_as_string', [
        l.get_collected_values('duration_hours'),
        l.get_collected_values('duration_days_week'),
        l.get_collected_values('duration_months'),
    ])
    hours_site = re.search(r'Duration[^\d]+(\d+)', course_data[0])
    if hours_site:
        hours_site = hours_site.group(1)
    else:
        hours_site = 0
    l.add_value('total_hours', [
        l.get_collected_values('duration_hours'),
        l.get_collected_values('duration_days_week'),
        hours_site,
    ])
    l.add_value('delivery_types', l.get_collected_values('duration_hours'))
    yield l.load_item()
class SszeLoaderItem(ItemLoader):
    """Custom loader: take the first value from each field's list."""
    default_output_processor = TakeFirst()
class RiLab01Loader(ItemLoader):
    default_output_processor = TakeFirst()
    text_out = Join()
class LagouJobItemLoader(ItemLoader):
    # Custom ItemLoader.
    default_output_processor = TakeFirst()
class RiLab01CommentLoader(ItemLoader):
    default_output_processor = TakeFirst()
class NewLoader(ItemLoader):
    """Override ItemLoader to take the first value by default."""
    default_output_processor = TakeFirst()
class MyLoader(ItemLoader):
    # Custom ItemLoader.
    default_output_processor = TakeFirst()