class IGAProductItemLoader(product_item_loader.ProductItemLoader):
    default_input_processor = MapCompose(
        remove_tags,
        replace_escape_chars,
        replace_entities,
        input_processor_helper.InputProcessorHelper.remove_latin_space,
        lambda x: ' '.join(x.split()))
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(processor)
class DefaultedItemLoader(NameItemLoader):
    default_input_processor = MapCompose(lambda v: v[:-1])
class IdentityDefaultedItemLoader(DefaultedItemLoader):
    name_in = MapCompose()
class ChildDefaultedItemLoader(DefaultedItemLoader):
    name_in = MapCompose(
        DefaultedItemLoader.default_input_processor,
        six.text_type.swapcase)
def price_field():
    return scrapy.Field(
        input_processor=MapCompose(
            lambda value: value.replace('$', '') if isinstance(value, str) else value,
            DataUtils.remove_html,
            float),
        output_processor=TakeFirst())
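# Hedged usage sketch (not in the original source): wiring price_field() into
# an Item definition so the MapCompose chain runs when raw values are
# collected. 'ProductItem' and the sample value are illustrative assumptions.
class ProductItem(scrapy.Item):
    price = price_field()

loader = ItemLoader(item=ProductItem())
loader.add_value('price', '$12.99')  # '$' stripped, HTML removed, cast to float
item = loader.load_item()            # TakeFirst() -> item['price'] == 12.99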
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(lambda v: v.lower())
class PropertyItem(Item):
    internal_id = Field(input_processor=MapCompose(strip_spaces, remove_tags),
                        output_processor=TakeFirst())
    name = Field(input_processor=MapCompose(strip_spaces, remove_accents),
                 output_processor=TakeFirst())
    publication_date = Field(input_processor=MapCompose(strip_spaces),
                             output_processor=TakeFirst())
    property_type = Field(input_processor=MapCompose(remove_tags, strip_spaces, remove_accents),
                          output_processor=TakeFirst())
    link = Field(input_processor=MapCompose(strip_spaces),
                 output_processor=TakeFirst())
    city = Field(input_processor=MapCompose(strip_spaces, remove_accents),
                 output_processor=Join())
    price = Field(input_processor=MapCompose(extract_digits, float),
                  output_processor=TakeFirst())
    bedrooms = Field(input_processor=MapCompose(extract_digits, parse_int),
                     output_processor=TakeFirst())
    bathrooms = Field(input_processor=MapCompose(remove_tags, extract_digits, parse_int),
                      output_processor=TakeFirst())
    parking_spots = Field(input_processor=MapCompose(remove_tags, extract_digits, parse_int),
                          output_processor=TakeFirst())
    surface = Field(input_processor=MapCompose(extract_float),
                    output_processor=TakeFirst())
    neighborhood = Field(input_processor=MapCompose(strip_spaces, remove_accents),
                         output_processor=Join())
    status = Field(input_processor=MapCompose(remove_accents),
                   output_processor=TakeFirst())
    location = Field(input_processor=MapCompose(remove_accents),
                     output_processor=TakeFirst())
    description = Field(input_processor=Compose(strip_spaces, convert_lower, remove_accents),
                        output_processor=Join())
    responsible = Field(input_processor=MapCompose(strip_spaces, remove_accents, remove_tags),
                        output_processor=TakeFirst())
    stratum = Field(input_processor=MapCompose(extract_digits, parse_int),
                    output_processor=TakeFirst())
    features = Field()
    other_features = Field()
    floor_location = Field(input_processor=MapCompose(extract_digits, parse_int),
                           output_processor=TakeFirst())
    total_levels = Field(input_processor=MapCompose(extract_digits, parse_int),
                         output_processor=TakeFirst())
    antiquity = Field(input_processor=MapCompose(remove_accents, strip_spaces),
                      output_processor=TakeFirst())
    contact_info = Field()
    contact_phone = Field(input_processor=MapCompose(extract_digits),
                          output_processor=Join())
class ExampleItemLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
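# Hedged usage sketch (assumed; 'ExampleItem' fields are illustrative): the
# class-level defaults above strip every incoming value and take the first
# collected value on output, while description_out overrides the default so
# all description values are joined into a single string.
loader = ExampleItemLoader()
loader.add_value('title', '  First  ')
loader.add_value('title', 'Second')
loader.add_value('description', ['part one ', ' part two'])
item = loader.load_item()
# item['title'] == 'First'; item['description'] == 'part one part two'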
class LagouItemLoader(ItemLoader):
    # default_input_processor =
    # default_item_class = scrapy.Item
    default_output_processor = TakeFirst()
    title_in = MapCompose(remove_tags)
    title_out = Join()
class WbparserItem(scrapy.Item):
    # define the fields for your item here, like:
    _id = scrapy.Field()
    name = scrapy.Field(output_processor=TakeFirst())
    features = scrapy.Field(input_processor=Compose(gen_features))
    photos = scrapy.Field(input_processor=MapCompose(cleaner_photo))
class BaseNoInputReprocessingLoader(ItemLoader):
    title_in = MapCompose(str.upper)
    title_out = TakeFirst()
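# Hedged sketch (assumed) of the "no input reprocessing" behaviour the class
# name refers to: values already stored on an item that is passed into a new
# loader are kept as-is rather than being run through title_in a second time.
# 'NoInputReprocessingItem' is an illustrative item class.
loader = BaseNoInputReprocessingLoader(item=NoInputReprocessingItem())
loader.add_value('title', 'foo')    # title_in uppercases -> 'FOO'
item = loader.load_item()           # TakeFirst -> item['title'] == 'FOO'
reloaded = BaseNoInputReprocessingLoader(item=item).load_item()
assert reloaded['title'] == 'FOO'   # untouched on the second pass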
class ChildItemLoader(TestItemLoader):
    name_in = MapCompose(TestItemLoader.name_in, str.swapcase)
def parse_details(self, response):
    jsonresponse = json.loads(response.text.strip())
    print('----------------------------------------------------------------')
    print('---------------------------------------------------------------- Pageno: ', self.page_no)

    ## Calculate Total Pages
    if self.is_first_page:
        total_records = jsonresponse['Total']
        if total_records is not None:
            if int(total_records) > 20:
                total_pages = int(total_records) / int(20)
                if (total_pages).is_integer():
                    total_pages = int(total_pages)
                else:
                    total_pages = int(total_pages) + int(1)
            else:
                total_pages = int(1)
        else:
            total_records = 0
            total_pages = int(1)  # avoid NameError below when 'Total' is missing
        if total_pages == 1:
            self.page_no_list = []
        else:
            for page in range(2, int(total_pages) + 1):
                self.page_no_list.append(str(page))

    for i in range(0, len(jsonresponse['Data'])):
        name = jsonresponse['Data'][i]['BusinessName']
        if name is not None and name.strip() != '':
            company_name = self._getDBA(name)[0]
            company_name = re.sub(r'[\(\[].*?[\)\]]', '', company_name)
            company_name = company_name.replace('-DBA', '').replace(
                '-Dba', '').replace('-dba', '').strip()
            dba_name = self._getDBA(name)[1]
        else:
            company_name = ''
            dba_name = ''
        if company_name is not None and company_name.strip() != '':
            company_name = company_name.strip()
        elif dba_name is not None and dba_name.strip() != '':
            company_name = dba_name.strip()
        else:
            company_name = ''

        other_name = jsonresponse['Data'][i]['OtherNames']
        if other_name is not None and other_name.strip() != '':
            other_name = re.sub(r'[\(\[].*?[\)\]]', '', other_name)
            other_name = re.sub(r' DBA| dba| Dba|dba |DBA', '', other_name)
            other_name = re.sub(r'\s+', ' ', other_name.strip())
            other_name = other_name.replace(' ;', ';').replace(
                '-DBA', '').replace('-Dba', '').replace('-dba', '').strip()
        else:
            other_name = ''

        if 'Non-Profit' in jsonresponse['Data'][i]['EntityType']:
            indicator = 'Yes'
        else:
            indicator = ''

        naics_code1 = jsonresponse['Data'][i]['NAICSCode1']
        naics_code2 = jsonresponse['Data'][i]['NAICSCode2']
        naics_code3 = jsonresponse['Data'][i]['NAICSCode3']
        naics_code123 = ((naics_code1 if naics_code1 else '')
                         + ('; ' if naics_code1 and naics_code2 else '')
                         + (naics_code2 if naics_code2 else '')
                         + ('; ' if naics_code2 and naics_code3 else '')
                         + (naics_code3 if naics_code3 else ''))

        address = jsonresponse['Data'][i]['AddressLine1']
        if address is not None and address.strip() != '' and str(
                address.strip()).lower() != 'null':
            address = address.replace('N/A, N/A', '').replace(
                'N/A', '').replace('n/a', '').replace('N/a', '').replace(
                'NONE', '').replace('None', '').replace('none', '').replace(
                'NONE SHOWN', '').replace('NONE AT THIS TIME', '').strip()
        else:
            address = ''
        city = jsonresponse['Data'][i]['City']
        if city is not None and city.strip() != '' and str(
                city.strip()).lower() != 'null':
            city = city.replace('N/A', '').replace('n/a', '').replace(
                'N/a', '').replace('none', '').strip()
        else:
            city = ''
        state = jsonresponse['Data'][i]['StateCode']
        if state is not None and state.strip() != '' and str(
                state.strip()).lower() != 'null':
            state = state.replace('N/A', '').replace('n/a', '').replace('N/a', '').strip()
        else:
            state = 'MS'
        zip_code = jsonresponse['Data'][i]['PostalCode']
        if zip_code is not None and zip_code.strip() != '' and str(
                zip_code.strip()).lower() != 'null':
            zip_code = zip_code.replace('N/A', '').replace(
                'n/a', '').replace('N/a', '').replace('none', '').strip()
        else:
            zip_code = ''
        address = self.format_address(address, city, state, zip_code)

        if jsonresponse['Data'][i]['FormationDate'] is not None:
            TimestampUtc = jsonresponse['Data'][i]['FormationDate']
            TimestampUtc = re.split(r'\(|\)', TimestampUtc)[1]
            # .NET JSON date: milliseconds since the Unix epoch
            creation_date = datetime.datetime(1970, 1, 1) + datetime.timedelta(
                seconds=(int(TimestampUtc) / 1000))
            creation_date = self.format_date(creation_date)
        else:
            creation_date = ''

        il = ItemLoader(item=MsSosSpiderItem(), response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value('sourceName', 'MS_SOS')
        il.add_value('url', 'https://corp.sos.ms.gov/corpreporting/Corp/BusinessSearch3')
        il.add_value('entity_id', jsonresponse['Data'][i]['BusinessId'])
        il.add_value('company_name', company_name)
        il.add_value('dba_name_', dba_name)
        il.add_value('dba_name', other_name)
        il.add_value('company_subtype', jsonresponse['Data'][i]['EntityType'])
        il.add_value('non_profit_indicator', indicator)
        il.add_value('domiciletype', jsonresponse['Data'][i]['DomicileType'])
        il.add_value('status', jsonresponse['Data'][i]['Status'])
        il.add_value('naics', naics_code123)
        il.add_value('location_address_string', address)
        il.add_value('county', jsonresponse['Data'][i]['County'])
        il.add_value('creation_date', str(creation_date))
        il.add_value('permit_type', 'business_license')
        yield il.load_item()

    if len(self.page_no_list) > 0:
        self.is_first_page = False
        next_page = self.page_no_list.pop(0)
        self.page_no = str(next_page)
        form_data = {
            'sort': 'BusinessName-asc',
            'page': str(next_page),
            'pageSize': '20',
            'group': '',
            'filter': ''
        }
        yield scrapy.FormRequest(url=self.post_url, method="POST",
                                 formdata=form_data,
                                 callback=self.parse_details,
                                 dont_filter=True)
class OrganizationLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
class SchemeLoader(ItemLoader):
    default_input_processor = MapCompose(strip_dashes, replace_escape_chars)
    default_output_processor = TakeFirst()
def numeric_field():
    return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html),
                        output_processor=TakeFirst())
class SpiderInstanceLoader(ItemLoader):
    # Subclasses ItemLoader and overrides output_processor: ItemLoader stores
    # collected values as lists by default, so override output_processor to
    # pick a single value instead.
    default_output_processor = TakeFirst()  # take the first element of the list by default
    spider_in = MapCompose(select_spider)  # the foreign key field
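# Hedged sketch (assumed; 'select_spider' and the Django-style 'Spider' model
# are illustrative): a MapCompose processor that resolves a spider name into
# the model instance stored in the foreign-key field above.
def select_spider(value):
    # Return the matching Spider row, or None when there is no match;
    # MapCompose silently drops None results.
    return Spider.objects.filter(name=value).first()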
def custom_field():
    return scrapy.Field(input_processor=MapCompose(DataUtils.remove_html),
                        output_processor=Join())
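# Hedged usage sketch (assumed; 'ListingItem' is illustrative): the two field
# factories above share the same HTML-stripping input chain and differ only
# in the output processor, so one keeps the first cleaned value and the other
# joins them all.
class ListingItem(scrapy.Item):
    year_built = numeric_field()  # ['<b>1998</b>', '2001'] -> '1998'
    amenities = custom_field()    # ['<i>pool</i>', 'garage'] -> 'pool garage'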
class ZhihuQuestionItem(scrapy.Item):
    # Zhihu question item
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field(
        input_processor=MapCompose(exclude_none),
    )
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        # SQL statement for inserting into the zhihu_question table
        insert_sql = """
            insert into zhihu_question (
                `comments_num`, `answer_num`, `click_num`, `zhihu_id`,
                `watch_user_num`, `url`, `title`, `topics`, `content`)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                content=VALUES(content), answer_num=VALUES(answer_num),
                comments_num=VALUES(comments_num),
                watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        comments_num = extract_num("".join(self["comments_num"]))
        try:
            topics = ",".join(self["topics"])
        except Exception:
            topics = ""
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0
        try:
            if len(self["watch_user_num"]) == 2:
                watch_user_num = int(self["watch_user_num"][0])
                click_num = int(self["watch_user_num"][1])
            else:
                watch_user_num = int(self["watch_user_num"][0])
                click_num = 0
        except Exception:
            click_num = 0
            watch_user_num = 0
        # crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        params = (comments_num, answer_num, click_num, zhihu_id,
                  watch_user_num, url, title, topics, content)
        return insert_sql, params

    def save_to_es(self):
        try:
            if len(self["watch_user_num"]) == 2:
                watch_user_num = int(self["watch_user_num"][0])
                click_num = int(self["watch_user_num"][1])
            else:
                watch_user_num = int(self["watch_user_num"][0])
                click_num = 0
            topics = ",".join(self["topics"])
        except Exception:
            watch_user_num = 0
            click_num = 0
        try:
            topics = ",".join(self["topics"])
        except Exception:
            topics = ""
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0
        zhihu_id = self["zhihu_id"][0]
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        comments_num = extract_num("".join(self["comments_num"]))
        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)

        article = ZhihuQuestionType()
        article.meta.id = zhihu_id
        article.topics = topics
        article.url = url
        article.title = title
        article.content = content
        article.answer_num = answer_num
        article.comments_num = comments_num
        article.watch_user_num = watch_user_num
        article.click_num = click_num
        article.crawl_time = crawl_time
        article.suggest = gen_suggest(ZhihuQuestionType._doc_type.index,
                                      ((article.title, 10), (article.content, 9)))
        redis_cli.incr("zhihu_count")
        article.save()
        return
class ChildChildItemLoader(ChildItemLoader):
    url_in = MapCompose(lambda v: v.upper())
    summary_in = MapCompose(lambda v: v)
class LagouJobItem(scrapy.Item):
    # Lagou.com job posting item
    title = scrapy.Field()
    url = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        input_processor=MapCompose(replace_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(replace_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(replace_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field(
        input_processor=MapCompose(handle_strip),
    )
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field(
        input_processor=MapCompose(handle_strip),
    )
    company_url = scrapy.Field()
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into lagou_job(title, url, salary, job_city, work_years,
                degree_need, job_type, publish_time, job_advantage, job_desc,
                job_addr, company_url, company_name, job_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary),
                job_city=VALUES(job_city), work_years=VALUES(work_years),
                degree_need=VALUES(degree_need), job_type=VALUES(job_type),
                publish_time=VALUES(publish_time), tags=VALUES(tags),
                job_advantage=VALUES(job_advantage), job_desc=VALUES(job_desc),
                crawl_time=VALUES(crawl_time)
        """
        job_id = extract_num(self["url"])
        params = (self["title"], self["url"], self["salary"], self["job_city"],
                  self["work_years"], self["degree_need"], self["job_type"],
                  self["publish_time"], self["job_advantage"], self["job_desc"],
                  self["job_addr"], self["company_url"], self["company_name"],
                  job_id)
        return insert_sql, params

    def save_to_es(self):
        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        job_id = extract_num(self["url"])
        article = LagouType()
        article.meta.id = job_id
        article.title = self['title']
        article.url = self['url']
        article.salary = self['salary']
        article.job_city = self['job_city']
        article.work_years = self['work_years']
        article.degree_need = self['degree_need']
        article.job_type = self['job_type']
        article.publish_time = self['publish_time']
        article.job_advantage = self['job_advantage']
        article.job_desc = self['job_desc']
        article.job_addr = self['job_addr']
        article.company_name = self['company_name']
        article.crawl_time = crawl_time
        article.crawl_update_time = crawl_time
        article.suggest = gen_suggest(LagouType._doc_type.index,
                                      ((article.title, 10),
                                       (article.company_name, 9),
                                       (article.job_desc, 8),
                                       (article.job_addr, 7)))
        redis_cli.incr("lagou_count")
        article.save()
class ChildItemLoader(TestItemLoader):
    name_in = MapCompose(
        TestItemLoader.name_in, six.text_type.swapcase)
class ReviewerItemLoader(ItemLoader):
    """Reviewer item loader"""
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(clean_text)
    reviewDate_in = MapCompose(clean_text, parse_date)
class ChildItemLoader(TestItemLoader):
    url_in = MapCompose(processor_with_args, key=u'val')
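# Hedged sketch (assumed) of a processor compatible with the call above:
# keyword arguments passed to MapCompose become its default loader context,
# and any wrapped function declaring a 'loader_context' parameter receives it.
# The body shown here is illustrative.
def processor_with_args(value, loader_context=None):
    # Prefer the context-supplied value when the 'key' entry is present.
    if loader_context and 'key' in loader_context:
        return loader_context['key']
    return value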
def parse_item(self, response):
    # agency table
    l = ItemLoader(item=AgentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
    l.add_xpath("telephone", '//div[@class="sttelct2 sttelct"]/text()',
                MapCompose(lambda x: "".join(x.split())))
    l.item.setdefault("company", None)
    l.add_xpath("company", '//li[@class="st14 stb starial"]//text()')
    l.add_xpath("address", '//div[@class="xflilist"]/div[3]//text()', re=r':(\w+)')
    l.add_xpath("register_date", '//div[@class="jbfx"]/text()', re=r'登记日期:([\d/]+)')
    l.add_value("city_name", self.city_name)
    l.add_value("dist_name", self.dist_name)
    l.add_value("category_name", self.category_name)
    l.add_value("station_name", self.station_name)
    l.add_xpath("subdist_name", '(//div[@class="xx_xq_l200"])[2]/text()',
                re='区域:(?:昆山)?(\\w+)')
    # housekeeping
    l.add_value("source", response.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("dt", datetime.datetime.utcnow())
    item = l.load_item()
    if not item.get("subdist_name"):
        self.logger.critical(
            "subdistrict name was not scraped; saving response as a file")
        with open("failed_html/html_%s.html"
                  % parse_qs(urlparse(response.url).query).get("id")[0],
                  'w', encoding='utf8') as f:
            f.write(response.text)
        # return Request(url=response.url)
    yield item

    # properties table
    l = ItemLoader(item=PropertyItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath('title', '//div[@class="xxview_title"]/text()')
    l.add_value("url", response.url)
    l.add_xpath("price", '//div[@class="xx_xq_l200"]/span[@class="st22 '
                'sthuangs stb starial"]/text()')
    l.add_xpath("address", '//div[@class="wydzleft"]/text()',
                MapCompose(lambda x: x.strip()),
                re=r'物业地址:([^\x01-\x1f]+)')
    l.add_xpath("agent_name", '//div[@class="sthys3"]/text()', re=r":(\w+)")
    l.item.setdefault("agent_company", None)
    l.add_xpath("agent_company", '//li[@class="st14 stb starial"]//text()')
    l.add_xpath('agent_phone', '//div[@class="sttelct2 sttelct"]/text()',
                MapCompose(lambda x: "".join(x.split())))
    l.add_xpath("recent_activation", '//div[@class="fyfbtime"]/text()',
                re='查看人次:(\\d+)')
    l.add_value("city_name", self.city_name)
    l.add_value("dist_name", self.dist_name)
    l.add_value('station_name', self.station_name)
    l.add_value("category_name", self.category_name)
    l.add_xpath("subdist_name", '(//div[@class="xx_xq_l200"])[2]/text()',
                re='区域:(?:昆山)?(\\w+)')
    # housekeeping
    l.add_value("source", response.request.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("dt", datetime.datetime.utcnow())
    yield l.load_item()
class TestItemLoader(NameItemLoader):
    name_in = MapCompose(lambda v: v.title())
class ScrapingTestingLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
class TestItemLoader(ItemLoader):
    default_item_class = TestItem
    name_in = MapCompose(float)
class A123phimSpider(scrapy.Spider):
    name = "123phim"
    allowed_domains = ["123phim.vn"]
    start_urls = [
        'http://www.123phim.vn/phim/',
        'http://www.123phim.vn/phim/sap-chieu/',
    ]
    mc = MapCompose(lambda i: urlparse.urljoin('http://123phim.vn', i))

    def parse(self, response):
        # Get item URLs and yield Requests
        url_selector = response.xpath('//*[@class="block-base movie"]/a[1]/@href')
        for url in url_selector.extract():
            yield Request(self.mc(url)[0], callback=self.parse_item)

    def parse_item(self, response):
        """ This function parses a movie page

        @url http://www.123phim.vn/phim/840-lich-chieu-doctor-strange.html
        @returns items 1
        @scrapes name namevi rating description premiereDate duration actors
        @scrapes url project spider server date
        """
        l = ItemLoader(item=MovieItem(), response=response)
        l.add_xpath('movie_id', '//*[@name="film_id"]', TakeFirst(), re=r'\d+')
        l.add_xpath('name', '//*[@class="filmDescription"]/h3/text()')
        l.add_xpath('namevi', '//*[@class="filmDescription"]/h2/text()')
        l.add_xpath('rating', '//*[@class="icon imdb"]/strong/text()', TakeFirst())
        l.add_xpath('description', '//*[@class="filmShortDesc"]/text()')
        l.add_xpath('premiereDate', '//*[@class="publish-date-block"]/span/text()',
                    TakeFirst(), re=r'(\d+/\d+)')
        l.add_xpath('duration', '//*[@class="filmInfo"]/span[1]/text()',
                    TakeFirst(), re=r'(\d+)')
        l.add_xpath('actors', '//*[@class="titleItemSmall"]/a/text()')

        # Information fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        # Get all movie times
        movie_id = int(l.get_xpath('//*[@name="film_id"]', TakeFirst(), re=r'\d+'))
        l.add_value('movie_times', self.parse_movie_times(movie_id))
        return l.load_item()

    def parse_movie_times(self, movie_id):
        date_time = datetime.datetime.now().strftime('%Y-%m-%d')
        # There are 21 cities to crawl
        cities = list(range(1, 22))
        ajax_url = ['http://www.123phim.vn/default/ajax/?method=Session.getListGroupByCinemaNew&'
                    'locationId={0}&filmId={1}&date={2}&pcinemaId=0'.format(
                        city, movie_id, date_time) for city in cities]
        movie_times = [json.loads(requests.get(url).text) for url in ajax_url]
        return movie_times