class ParkLoader(ItemLoader):
    """Item loader producing ``Park`` items.

    Fields use ``TakeFirst()`` on output unless one of the explicit
    ``*_out`` processors below overrides it.
    """

    default_item_class = Park
    default_output_processor = TakeFirst()

    # Fields whose collected values are passed through unchanged.
    description_out = Identity()
    features_out = Identity()
    images_out = Identity()
class ArticleLoader(XPathItemLoader):
    """Used for easier construction of ArticleItem."""

    def is_string(string):
        """Return ``string`` stripped, or None if it is blank or not a string."""
        if not isinstance(string, (str, unicode)):
            return None
        stripped = string.strip()
        if stripped != "":
            return stripped
        return None

    def separate_tags(tags_string):
        """Split a tag string on both ';' and ',' separators."""
        comma_separated = tags_string.replace(";", ",")
        return comma_separated.split(",")

    # Applied to every field that has no dedicated processor below.
    default_input_processor = MapCompose(is_string)
    default_output_processor = TakeFirst()

    publishers_in = MapCompose(is_string)
    publishers_out = Identity()

    title_in = MapCompose(is_string, unicode.title)
    title_out = TakeFirst()

    time_published_in = MapCompose(is_string)
    time_published_out = Identity()

    summary_in = MapCompose(is_string)
    summary_out = TakeFirst()

    tags_in = MapCompose(is_string, separate_tags)
    tags_out = Identity()
class DefaultItemLoader(ItemLoader):
    """Loader whose default input converts each value with ``unicode`` and strips it."""

    # Good for string input: convert to unicode first, then strip.
    default_input_processor = MapCompose(unicode, unicode.strip)
    default_output_processor = Identity()

    # Image URLs are run through ``canonicalize`` and kept as a list.
    image_urls_in = MapCompose(canonicalize)
    image_urls_out = Identity()

    # ``postinfo`` bypasses the string-oriented default input processor.
    postinfo_in = Identity()
    postinfo_out = Identity()
class ActivityItemLoader(XPathItemLoader):
    """Item loader for activity items.

    Fields use ``TakeFirst()`` on output unless a ``<field>_out`` processor
    is declared below.
    """

    default_output_processor = TakeFirst()

    name_out = TakeFirst()
    categories_out = Identity()

    address_in = MapCompose(unicode.strip)
    address_out = JoinAddress()

    description_in = MapCompose(replace_nbrs)
    description_out = Join('\n')

    time_needed_out = TakeFirst()
    price_out = TakeFirst()

    image_urls_out = Identity()
    # Processors are looked up by the ``<field>_in`` / ``<field>_out``
    # naming convention, so the ``images`` processor needs the ``_out``
    # suffix to be applied.
    images_out = Identity()
class EventLoader(ItemLoader):
    """Item loader for events, with dedicated team, date and market processors."""

    # Used if fields don't specify one.
    default_input_processor = Strip()
    default_output_processor = TakeFirst()

    teams_in = MapCompose(unicode.strip, unicode.title)
    teams_out = Identity()  # don't apply default

    # Works on the whole value list: first value -> date -> formatted string.
    dateTime_in = Compose(take_first, parse_str2date, parse_date2str)

    markets_in = MapCompose(
        strip_mkt_name,
        strip_odds,
        convert_odds,
        format_runners,
    )
    markets_out = Identity()  # don't apply default
class CategoryItemLoader(ItemLoader):
    """Item loader for category items.

    The helper functions below are plain functions used as ``MapCompose``
    steps; each returns None when its pattern is not found.
    """

    def catIDfromURL(url):
        """Return the numeric category id at the end of ``url``, or None.

        The id is taken from a capture group, so every casing the pattern
        accepts ("CatId=", "catid=", ...) yields a clean integer.
        """
        # needs to be put in a utility file, because this is also used in
        # td_spider.py
        match = re.search(r'[Cc]at[Ii]d=([0-9]+)$', url)
        if match:
            return int(match.group(1))
        return None

    def catLevelfromURL(url):
        """Return 1 for a top-level ('t') category URL, 2 for a second-level
        ('s') one, and None when the URL carries no level marker."""
        # needs to be put in a utility file, because this is also used in
        # td_spider.py
        match = re.search(r'category_([st])lc', url)
        if match:
            return {'t': 1, 's': 2}[match.group(1)]
        return None

    def linkToMfgID(link):
        """Return the manufacturer id found in the link's ``onclick``
        attribute as an int, or None when there is none."""
        onclick = link.xpath("@onclick").extract()
        if not onclick:
            return None
        match = re.search(r'[Mm]fr[Ii]d=([0-9]+)"', onclick[0])
        if match:
            return int(match.group(1))
        return None

    default_input_processor = Identity()
    default_output_processor = Join()

    categoryName_in = TakeFirst()
    categoryName_out = Join()

    tdCategoryID_in = MapCompose(catIDfromURL)
    tdCategoryID_out = TakeFirst()

    tdCategoryParent_in = Identity()
    tdCategoryParent_out = Identity()

    tdCategoryLevel_in = MapCompose(catLevelfromURL)
    tdCategoryLevel_out = TakeFirst()

    manufacturers_in = MapCompose(linkToMfgID)
    manufacturers_out = Identity()
class MfgItemLoader(ItemLoader):
    """Item loader for manufacturer items (name and id)."""

    def cleanString(string):
        """Keep only letters, digits, spaces and the characters . " , ' ! -"""
        return ''.join(re.findall(r'[A-Za-z0-9 .",\'!-]', string))

    def linkToMfgID(link):
        """Return the manufacturer id from the link's ``onclick`` attribute
        as a UTF-8 encoded string, or None when there is none.

        The digits come from a capture group, so every casing the pattern
        accepts yields just the id, never a leftover "mfrid=" prefix.
        """
        onclick = link.xpath("@onclick").extract()
        if not onclick:
            return None
        match = re.search(r'[Mm]fr[Ii]d=([0-9]+)"', onclick[0])
        if match:
            return match.group(1).encode('utf-8')
        return None

    def parsemfgName(link):
        """Return the value UTF-8 encoded and stripped."""
        return link.encode('utf-8').strip()

    default_input_processor = Identity()
    default_output_processor = Join()

    mfgName_in = MapCompose(parsemfgName, cleanString)
    mfgName_out = Join()

    mfgID_in = MapCompose(linkToMfgID)
    mfgID_out = Join()
class PriceItemLoader(ItemLoader):
    """Item loader for price items (sale price and final price)."""

    def cleanString(string):
        """Drop every character outside letters, digits, space and . " , ' ! -"""
        return re.sub('[^A-Za-z0-9 .",\'!-]', '', string)

    def parseSalePrice(priceIn):
        """Strip the value and remove spaces and dollar signs."""
        trimmed = priceIn.strip()
        without_spaces = trimmed.replace(" ", "")
        return without_spaces.replace("$", "")

    def parseRebateAmount(rebateIn):
        """Strip the value and remove line breaks, spaces and dollar signs."""
        cleaned = rebateIn.strip()
        for unwanted in ("\n", "\r", " ", "$"):
            cleaned = cleaned.replace(unwanted, "")
        return cleaned

    def parseFinalPrice(priceIn):
        """Return only the digits and dots of ``priceIn``, or None if it has none."""
        kept = re.findall('[0-9\\.]', priceIn)
        if not kept:
            return None
        return ''.join(kept)

    default_input_processor = Identity()
    default_output_processor = Join()

    salePrice_in = MapCompose(parseSalePrice)
    salePrice_out = Join()

    finalPrice_in = MapCompose(parseFinalPrice)
    finalPrice_out = Join()
class DaywatchLoader(ItemLoader):
    """Base ItemLoader.

    Users can create a custom loader to handle certain fields of the
    project item model.  Input values are passed through unchanged and
    output uses ``TakeFirst()``.
    """

    default_output_processor = TakeFirst()
    default_input_processor = Identity()
class PostLoader(ItemLoader):
    """Item loader for ``Post`` items."""

    default_item_class = Post
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    def ctime_in(self, values):
        """Yield each '%Y-%m-%d %H:%M' timestamp converted to UTC and
        formatted with ``TIME_FORMAT``.

        The source timestamps are local to America/Anguilla (UTC -4).
        """
        source_tz = pytz.timezone('America/Anguilla')  # UTC -4
        for s in values:
            naive = datetime.strptime(s.strip(), '%Y-%m-%d %H:%M')
            # pytz zones must be attached with localize(); passing them to
            # datetime.replace(tzinfo=...) applies the zone's historical
            # local-mean-time offset instead of the real UTC -4 offset.
            localized = source_tz.localize(naive)
            yield localized.astimezone(pytz.utc).strftime(TIME_FORMAT)

    def cover_uri_in(self, values):
        """Yield absolute cover URLs.

        Values that are already absolute URLs pass through.  Other values
        are expected in the form ``init~<host>~<path>~<title...>`` and are
        rebuilt as ``http://<host>/<path>``; anything else is dropped.
        """
        for s in values:
            if s.startswith(('http://', 'https://')):
                yield s
                continue
            parts = s.split('~')  # title may contain ~
            if len(parts) >= 4 and parts[0] == 'init':
                yield 'http://%s/%s' % (parts[1], parts[2])

    def rating_in(self, values):
        """Yield ``parse_rating`` applied to each value."""
        for s in values:
            yield parse_rating(s)
class SpeakerLoader(ItemLoader):
    """Item loader for ``Speaker`` items.

    Each input value goes through ``remove_tags``, ``unquote_markup`` and
    ``unicode.strip``; collected values are joined on output by default.
    """

    default_item_class = Speaker
    default_input_processor = MapCompose(
        remove_tags,
        unquote_markup,
        unicode.strip,
    )
    default_output_processor = Join()

    # The name is joined first and then passed to ``_cleanup_name``.
    name_out = Compose(Join(), _cleanup_name)
    image_urls_out = Identity()
class AppInfoItemLoader(ItemLoader):
    """Item loader for ``AppInfoItem``.

    Input values are stripped by default and output uses ``TakeFirst()``
    unless a field declares its own processor below.
    """

    default_item_class = AppInfoItem
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    # Fields kept as lists.
    screenshots_out = Identity()
    tags_out = Identity()
    permissions_out = Identity()

    # Fields joined into one string.
    intro_out = Join('<br>')
    permissions_str_out = Join(';')

    # ``instance`` bypasses the stripping default input processor.
    instance_in = Identity()
class SearchResultPostLoader(ItemLoader):
    """Item loader for ``SearchResultPost`` items."""

    default_item_class = SearchResultPost
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    def date_in(self, values):
        """Yield each date value with surrounding whitespace removed."""
        for raw_date in values:
            trimmed = raw_date.strip()
            yield trimmed
class PostLoader(ItemLoader):
    """Item loader for ``Post`` items."""

    default_item_class = Post
    default_input_processor = Identity()
    default_output_processor = TakeFirst()

    def ctime_in(self, values):
        """Yield each value with its first five characters removed."""
        prefix_length = 5
        for raw in values:
            yield raw[prefix_length:]
class DealLoaderBase(XPathItemLoader):
    """Base loader for deal items.

    Values are stripped on input and joined on output unless a field
    declares its own processor below.
    """

    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = Join()

    # Text fields.
    title_in = MapCompose(unicode.strip)
    description1_in = MapCompose(unicode.strip)
    description1_out = Join('\n')
    description2_in = MapCompose(unicode.strip)
    description2_out = Join('\n')

    # Fields passed through ``strip_alpha``.
    days_in = MapCompose(strip_alpha)
    hours_in = MapCompose(strip_alpha)
    minutes_in = MapCompose(strip_alpha)
    initial_price_in = MapCompose(strip_alpha)
    sell_price_in = MapCompose(strip_alpha)
    discount_in = MapCompose(strip_alpha)
    saving_in = MapCompose(strip_alpha)
    nbr_buyers_in = MapCompose(strip_alpha)

    validity_in = MapCompose(unicode.strip, sanitize_validity)

    # Fields kept as lists.
    cities_in = MapCompose(unicode.strip)
    cities_out = Identity()
    image_urls_out = Identity()
class HistDataItem(scrapy.Item):
    """Scrapy item whose fields declare their own input/output processors.

    Every field outputs via ``TakeFirst()``.  ``url`` is passed through
    ``urllib2.unquote`` on input; all other fields use ``Identity()``.
    """

    url = scrapy.Field(
        input_processor=MapCompose(urllib2.unquote),
        output_processor=TakeFirst(),
    )
    tk = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
    date = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
    datemonth = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
    platform = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
    timeframe = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
    fxpair = scrapy.Field(
        input_processor=Identity(), output_processor=TakeFirst())
class ResultsItemLoader(ItemLoader):
    """Item loader for ``ResultsItem``.

    The default output takes the first collected value, converts it with
    ``unicode`` and strips it.  Most fields also run ``removeunichars``
    afterwards; money fields additionally run ``tidytomoney``.
    """

    default_item_class = ResultsItem
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)

    # Prize money fields.
    pm1_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm2_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm3_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm4_out = Compose(default_output_processor, removeunichars, tidytomoney)
    pm5_out = Compose(default_output_processor, removeunichars, tidytomoney)
    prizemoney_out = Compose(default_output_processor, removeunichars,
                             tidytomoney)
    prizemoney_in = Compose(default_output_processor, removeunichars,
                            tidytomoney)

    # Plain text fields.
    racename_out = Compose(default_output_processor, removeunichars)
    gear_out = Compose(default_output_processor, removeunichars)
    OR_out = Compose(default_output_processor, removeunichars)
    TS_out = Compose(default_output_processor, removeunichars)
    RPR_out = Compose(default_output_processor, removeunichars)
    damsire_out = Compose(default_output_processor, removeunichars,
                          cleandamsire)
    jockeyname_out = Compose(default_output_processor, removeunichars)
    trainername_out = Compose(default_output_processor, removeunichars)
    sire_out = Compose(default_output_processor, removeunichars)
    dam_out = Compose(default_output_processor, removeunichars)
    horsename_out = Compose(default_output_processor, removeunichars)

    # Race dates.  Processors are looked up as ``<field>_out``, so these
    # need the suffix to be applied, matching the ``L*comment_out`` fields.
    L1racedate_out = Compose(default_output_processor, removeunichars)
    L2racedate_out = Compose(default_output_processor, removeunichars)
    L3racedate_out = Compose(default_output_processor, removeunichars)
    L4racedate_out = Compose(default_output_processor, removeunichars)
    L5racedate_out = Compose(default_output_processor, removeunichars)
    L6racedate_out = Compose(default_output_processor, removeunichars)

    # Race comments.
    L1comment_out = Compose(default_output_processor, removeunichars)
    L2comment_out = Compose(default_output_processor, removeunichars)
    L3comment_out = Compose(default_output_processor, removeunichars)
    L4comment_out = Compose(default_output_processor, removeunichars)
    L5comment_out = Compose(default_output_processor, removeunichars)
    L6comment_out = Compose(default_output_processor, removeunichars)

    currentodds_out = Compose(default_output_processor, decimalizeodds)

    # ``horse`` skips the unicode/strip default: first value, unchanged.
    horse_out = Compose(TakeFirst(), Identity())
    horse_in = Compose(TakeFirst(), Identity())
class ProyectoItemLoader(XPathItemLoader):
    """Item loader for ``ProyectoItem``.

    Every input value is passed through ``fix_space`` and ``unicode.strip``;
    the field processors below append a normaliser to that chain.
    """

    default_item_class = ProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()

    tipo_in = MapCompose(fix_space, unicode.strip, normalize_tipo_proyecto)
    origen_in = MapCompose(fix_space, unicode.strip,
                           normalize_proyecto_origen)

    # Originating chamber.
    camara_origen_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    camara_origen_expediente_in = MapCompose(
        fix_space, unicode.strip, normalize_codigo_expediente)

    reproduccion_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))

    # Reviewing chamber; these normalisers are called with allow_empty=True.
    camara_revisora_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_camara, allow_empty=True))
    camara_revisora_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))

    ley_numero_in = MapCompose(fix_space, unicode.strip, digits_only)
    mensaje_codigo_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_mensaje, allow_empty=True))

    # Publication.
    publicacion_en_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_publicacion_en, allow_empty=True))
    publicacion_fecha_in = MapCompose(fix_space, unicode.strip, spanish_date)
    # Output is the ISO representation of the first collected value.
    publicacion_fecha_out = Compose(lambda v: v[0].isoformat())

    # Committee lists are kept as lists.
    comisiones_diputados_out = Identity()
    comisiones_senadores_out = Identity()
class TimetableLoader(XPathItemLoader):
    """Item loader for flight timetable rows.

    Values are stripped on input and joined on output unless a field
    declares its own processor below.
    """

    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = Join()

    # Flight number, type and status.
    flight_in = Compose(flight_handler)
    flight_type_in = Identity()
    flight_type_out = Compose(return_first)
    flight_status_in = Compose(flight_status_handler)
    flight_status_out = Compose(return_first)

    # Place names are passed to ``title`` on output.
    city_of_departure_out = Compose(title)
    city_of_arrival_out = Compose(title)
    airport_of_departure_out = Compose(title)
    airport_of_arrival_out = Compose(title)

    airline_in = Compose(airline_handler)
    airline_out = Compose(title)
class ItemItemLoader(ItemLoader):
    """Item loader that extracts the item number from a URL."""

    default_input_processor = Identity()
    default_output_processor = Join()

    def itemIDfromURL(url):
        """Return the alphanumeric value following ``Item=`` in ``url``.

        Returns None when the URL contains no ``Item=`` parameter, instead
        of raising IndexError on the empty match list.
        """
        match = re.search(r'Item=([a-zA-Z0-9]*)', url)
        if match is None:
            return None
        return match.group(1)

    itemNo_in = MapCompose(itemIDfromURL)
    itemNo_out = Join()
class ArrayField(PredefinedField):
    """Predefined field holding a list of values.

    Defaults are seeded from the wrapped class and then overridden by this
    class's own ``defaults``.
    """

    # NOTE(review): the ``[]`` below is a single list object that ends up in
    # every field's defaults (``dict.update`` copies references) -- confirm
    # that ``PredefinedField`` never mutates ``default_value`` in place.
    defaults = {
        'output_processor': Identity(),
        'default_value': [],
    }

    def __init__(self, field_or_item, **kwargs):
        """Build the field for ``field_or_item``.

        ``field_or_item`` must be a class: a ``scrapy.Item`` subclass gets
        an input processor mapping each value through ``dict``; a
        ``PredefinedField`` subclass contributes a copy of its ``defaults``.
        Extra keyword arguments are forwarded to the parent constructor.
        """
        if issubclass(field_or_item, scrapy.Item):
            merged = {'input_processor': MapCompose(dict)}
        elif issubclass(field_or_item, PredefinedField):
            merged = field_or_item.defaults.copy()
        else:
            merged = {}

        # This class's own defaults take precedence over the seeded ones.
        merged.update(self.defaults)
        super(ArrayField, self).__init__(merged, **kwargs)
class IrrAdvertisementLoader(ItemLoader):
    """
    Defines input and output processors and actions for iir.ru
    advertisement data.
    """

    default_output_processor = TakeFirst()
    # Strip string values; hand any other type through untouched.
    default_input_processor = MapCompose(
        lambda txt: txt.strip() if isinstance(txt, (unicode, str)) else txt)

    # Digit-only fields.
    foreign_id_in = MapCompose(only_digits)
    views_in = MapCompose(only_digits)
    mileage_in = MapCompose(only_digits)
    volume_in = MapCompose(only_digits)
    release_year_in = MapCompose(only_digits)
    horsepower_in = MapCompose(only_digits)

    # Letter-only fields.
    seller_in = MapCompose(only_letters)
    mileage_units_in = MapCompose(only_letters)
    volume_units_in = MapCompose(only_letters)

    price_in = MapCompose(only_price)
    published_in = MapCompose(datetime_interpretation)

    # String values shorter than 10 characters are replaced with None.
    photos_in = MapCompose(
        lambda x: None if isinstance(x, (unicode, str)) and len(x) < 10 else x)
    photos_out = Identity()
class LalaItemLoader(TestItemLoader):
    """``TestItemLoader`` variant whose default output processor is ``Identity()``."""

    default_output_processor = Identity()
class IdentityDefaultedItemLoader(DefaultedItemLoader):
    """``DefaultedItemLoader`` variant that sets ``name_in`` to ``Identity()``."""

    name_in = Identity()
def test_identity(self):
    """Identity must return its input list unchanged, falsy entries included."""
    values = [None, '', 'hello', 'world']
    identity = Identity()
    self.assertEqual(identity(values), [None, '', 'hello', 'world'])
class RootItemLoader(XPathItemLoader):
    """Root loader for ``projectItems``: values pass through unchanged on
    both input and output."""

    default_item_class = projectItems
    default_input_processor = Identity()
    # Spelled ``default_output_processor`` so the loader actually reads it.
    default_output_processor = Identity()
class ItemItemLoader(ItemLoader):
    """Item loader for product items: name, item/model numbers, category id
    and specifications."""

    def cleanString(string):
        """Keep only letters, digits, spaces and the characters . " , ' ! -"""
        return ''.join(re.findall(r'[A-Za-z0-9 .",\'!-]', string))

    def parseItemNo(itemin):
        """Return ``itemin`` without line breaks, '|' separators,
        non-breaking spaces and surrounding whitespace."""
        text = itemin.strip()
        # The non-breaking space must be a unicode literal: in a Python 2
        # byte string "\u00a0" is six literal characters and never matches.
        # NOTE(review): assumes unicode input, as ``productName_in``
        # (``unicode.title``) already does -- confirm for these fields.
        for unwanted in ("\n", "|", "\r", u"\u00a0"):
            text = text.replace(unwanted, "")
        return text.strip()

    # Model numbers are normalised exactly like item numbers.
    parseModelNo = parseItemNo

    def catIDfromURL(url):
        """Return the digits following ``CatId=`` in ``url`` as a string,
        or None when the URL has no category id.

        The id is taken from a capture group, so every casing the pattern
        accepts yields just the digits.
        """
        match = re.search(r'[Cc]at[Ii]d=([0-9]+)', url)
        if match:
            return match.group(1)
        return None

    def parseSpecification(specificationKV):
        """Return the specification processed by ``cleanKV``.

        kv because it comes in as key/value dict.
        """
        # NOTE(review): ``cleanKV`` is not defined in this class; it must be
        # provided at module level or this raises NameError -- confirm.
        return cleanKV(specificationKV)

    default_input_processor = Identity()
    default_output_processor = Join()

    productName_in = MapCompose(unicode.title, cleanString)
    productName_out = Join()

    itemNo_in = MapCompose(parseItemNo)
    itemNo_out = Join()

    modelNo_in = MapCompose(parseModelNo)
    modelNo_out = Join()

    tdCategoryID_in = MapCompose(catIDfromURL)
    tdCategoryID_out = Identity()

    specifications_in = MapCompose(parseSpecification)
    specifications_out = Identity()
def __init__(cls, name, bases, dct):
    """Customizing __init__ because it has the cls ready.

    Attaches the ``Images`` input/output processors to the class being
    created, then delegates to the parent initialiser.
    """
    setattr(cls, 'Images_in', MapCompose(filter_js_func_call))
    setattr(cls, 'Images_out', Identity())
    super(CarItemLoaderMeta, cls).__init__(name, bases, dct)
class DealLoader(DealLoaderBase):
    """Deal loader that adds supplier phone number handling."""

    supplier_phones_in = MapCompose(unicode.strip, sanitize_phones)
    # Processors are looked up as ``<field>_out``; with the suffix spelled
    # correctly the phone numbers are kept as a list.
    supplier_phones_out = Identity()
class ArtLoader(ItemLoader):
    """Item loader for ``Art`` items: input values pass through unchanged
    and output uses ``TakeFirst()``."""

    default_item_class = Art
    default_output_processor = TakeFirst()
    default_input_processor = Identity()