def parse_search(self, response):
    """Handle a product-search JSON response for one catalog row.

    Three outcomes: re-query by CDS code when the first (code-less) search
    came back empty, fall back to CSV data when the coded search is also
    empty, or build the item from the first returned product.
    """
    row = response.meta['row']
    try:
        products = json.loads(response.body)['products']
    except ValueError:
        # Body was not valid JSON -- drop this row silently.
        return
    if not products:
        if 'code' not in response.meta:
            # First pass returned nothing: retry the search keyed by the
            # row's CDS code, skipping rows with no code or a blacklisted one.
            code = catalog_codes[row]
            if (str(code) == 'nan') or (str(code) in bad_products):
                return
            else:
                url = 'http://www.product-config.net/catalog3/service?o=keys&d=altra.bostongear&cdskeys={}&unit=english/'.format(
                    code)
                return scrapy.Request(url=url, meta={
                    'row': row,
                    'code': code
                }, callback=self.parse_search, dont_filter=True)
        elif str(catalog_descriptions[row]) != 'nan' or str(
                catalog_documents[row]) != 'nan':
            # Coded retry also empty: fall back to whatever the CSV already
            # holds for this row ('nan' marks a blank CSV cell).
            item = UniversalItem()
            item['ids'] = catalog_ids[row]
            item['catalog_number'] = row
            item['code'] = catalog_codes[row]
            item['image'] = ''
            item['documents'] = catalog_documents[row]
            item['additional_description'] = catalog_descriptions[row]
            return item
    else:
        # Only the first product of the result set is used.
        products = products[0]
        item = UniversalItem()
        item['ids'] = catalog_ids[row]
        item['catalog_number'] = row
        item['code'] = catalog_codes[row]
        item['image'] = 'https:' + str(products['imageURL'])
        if str(catalog_descriptions[row]) == 'nan':
            item['additional_description'] = self.get_descriptions(
                products)
        else:
            item['additional_description'] = catalog_descriptions[row]
        if str(catalog_documents[row]) == 'nan':
            # Harvest literature links from the attribute values; the
            # matching attribute label becomes the document name.
            docs = {}
            for i, doc in enumerate(products['attributeValues']):
                if 'www.altraliterature.com' in doc:
                    docs[doc] = products['attributes'][i]['label']
            return self.make_items(item, docs)
        else:
            item['documents'] = catalog_documents[row]
            return item
def parse_product(self, response):
    """Match Ringspann catalog numbers against the page <h1> and schedule
    CAD-page requests.

    Two passes over the CSV: first by ``key1`` (matched as ' <key>' inside
    the heading), then by ``key2`` (matched with either a leading or a
    trailing space).  Each not-yet-seen match yields a Request to the CAD
    link with the key's digits carried in ``meta``.

    Fixes over the original: ``.re()`` was called on a plain CSV string
    (that is a Selector-only method), the Request URL was a SelectorList
    instead of a string, the callback was the string ``'cad_page'`` rather
    than the bound method, and the second (key2) pass looked the digits up
    in ``catalog_key1``.  Unused lookup dicts were removed.
    """
    import re  # local import so the block stays self-contained

    hxs = HtmlXPathSelector(response)
    data = pandas.read_csv(
        "mro/spiders/csv_data/Ringspanncorp/ringspanncorp.csv", sep=',')
    catalog = list(data.catalog_number)
    ids = list(data.id)
    key1 = list(data.key1)
    key2 = list(data.key2)
    catalog_key1 = dict(zip(catalog, key1))
    catalog_key2 = dict(zip(catalog, key2))
    catalog_id = dict(zip(catalog, ids))

    # Loop-invariant page data: the heading text and the CAD link.
    heading = response.xpath('//h1').extract_first()
    cad_url = response.xpath(
        '//a[@class="cad link_grey"]/@href').extract_first()

    for catalog_n in catalog:
        key = catalog_key1[catalog_n]
        name = ' ' + str(key)
        if name in heading:
            if catalog_n not in self.items:
                item = UniversalItem()
                item['ids'] = catalog_id[catalog_n]
                item['catalog_number'] = catalog_n
                key_digits = re.findall(r'\d+', str(catalog_key1[catalog_n]))
                self.items.append(catalog_n)
                yield Request(url=cad_url,
                              meta={'item': item, 'key': key_digits},
                              callback=self.cad_page)
    for catalog_n in catalog:
        key = catalog_key2[catalog_n]
        name = ' ' + str(key)
        name2 = str(key) + ' '
        if name in heading or name2 in heading:
            if catalog_n not in self.items:
                item = UniversalItem()
                item['ids'] = catalog_id[catalog_n]
                item['catalog_number'] = catalog_n
                key_digits = re.findall(r'\d+', str(catalog_key2[catalog_n]))
                self.items.append(catalog_n)
                yield Request(url=cad_url,
                              meta={'item': item, 'key': key_digits},
                              callback=self.cad_page)
def parse_search(self, response):
    """Assemble a UniversalItem for *row* from a product-search JSON reply.

    CSV values take precedence; the service response only fills gaps
    ('nan' marks an empty CSV cell).
    """
    row = response.meta['row']
    data = json.loads(response.body)
    if data.get('error'):
        return
    # Use the CSV additional description when present, else the service's.
    add_descr = data.get('longDescription', '')\
        if str(catalog_add_descriptions[row]) == 'nan'\
        else catalog_add_descriptions[row]
    image = catalog_images[row]
    # Replace placeholder image URLs with the service image (if any).
    # NOTE(review): 'dafault' looks like a typo for 'default' -- confirm
    # against the values actually stored in the images CSV before changing.
    if 'dafault' in image or str(catalog_ids[row]) in image:
        image = 'https:' + data.get('imageURL') if data.get(
            'imageURL') else ''
    attr = catalog_attributes[row]
    if str(attr) == "nan":
        attr = ''
    # Append visible, non-attachment/non-image attributes as 'label:value|…'.
    attributes = data.get('attributes')
    values = data.get('attributeValues')
    if attributes and values:
        for i, item in enumerate(attributes):
            if item['dataType'] != 'attachment' and item[
                    'dataType'] != 'image' and item['visible']:
                attr += item['label'] + ':' + values[i] + '|'
        attr = attr[:-1]  # drop the trailing '|'
    item = UniversalItem()
    item['ids'] = catalog_ids[row]
    item['catalog_number'] = row
    item['description'] = catalog_descriptions[row]
    item['additional_description'] = add_descr
    item['main_image'] = image
    item['attributes'] = attr
    return item
def parse_product(self, response): hxs = HtmlXPathSelector(response) # sku = response.url.rsplit('/', 1)[-1] catalog_number = response.meta['sku'] if catalog_number not in self.catalog: print catalog_number return if not response.xpath('//div[@class="productsWrapper"]'): print 'without docs' return names = response.xpath( '//div[@class="productsWrapper"]//td/a/text()').extract() links = response.xpath( '//div[@class="productsWrapper"]//td/a/@href').extract() names_links = dict(zip(names, links)) for key, value in names_links.iteritems(): item = UniversalItem() item['ids'] = self.catalog_ids[catalog_number] item['catalog_number'] = catalog_number item['name'] = key item['document'] = 'https://buy.tecowestinghouse.com' + value yield item
def parse_item(self, response, meta_row, row, url):
    """Download the product CAD zip, retrying through a proxy when blocked.

    On a 'restricted access' page: retry through the hard-coded proxy, at
    most 5 retries tracked by ``self.index``.  Otherwise scrape the product
    URL, download the zip to disk and return the item.
    """
    if 'restricted access' in response.xpath('//*').extract_first():
        print meta_row
        print 'restricted access'
        if self.index > 5:
            print 'the end'
            return
        proxy = 'http://108.59.14.203:13040/'
        self.index += 1
        # Re-issue the same request through the proxy.
        return self.request(url, meta_row, row, proxy)
    else:
        item = UniversalItem()
        item['ids'] = catalog_ids[meta_row]
        item['catalog_number'] = str(meta_row).strip()
        try:
            # Product file URL embedded in the page's JSON blob.
            url = response.xpath('//*').re(r'"url":"(.+)","productID"')[0]
            print '--------'
            print url
            req = urllib2.Request('http:' + url)
            resp = urllib2.urlopen(req)
            file_name = '%s.zip' % urllib.quote_plus(str(meta_row).strip())
            # NOTE(review): 'gates_downoad' looks like a typo for
            # 'gates_download' -- confirm which directory actually exists.
            with open('gates_downoad/' + file_name, 'wb') as file:
                shutil.copyfileobj(resp.fp, file)
        except Exception:
            # Any scrape/download failure drops the item silently.
            return
        else:
            item['cad'] = file_name
            return item
def create_item(self, row, url):
    """Build a UniversalItem for *row* carrying its brand and *url*."""
    result = UniversalItem()
    for field, value in (('ids', catalog_ids[row]),
                         ('brand', catalog_brand[row]),
                         ('catalog_number', row),
                         ('url', url)):
        result[field] = value
    return result
def parse(self, response):
    """Scrape one company-profile page into an item, then follow pagination.

    Field keys include spaces ('by date', 'market cap') and <dl> labels
    taken verbatim from the page.
    """
    item = UniversalItem()
    item['name'] = response.xpath(
        '//*[@id="left_rail"]/div[1]/div[1]/h1/text()').re(r'\d+ (.+)')[0]
    item['by date'] = response.xpath(
        '//*[@id="left_rail"]/div[1]/div[1]/ul/li[1]/span/text()'
    ).extract_first()
    # NOTE(review): '+' on extract_first() results raises TypeError when
    # either selector matches nothing (None) -- confirm pages always have
    # both parts.
    item['market cap'] = response.xpath(
        '//*[@id="left_rail"]/div[1]/div[1]/ul/li[2]/text()'
    ).extract_first() + response.xpath(
        '//*[@id="left_rail"]/div[1]/div[1]/ul/li[2]//span/text()'
    ).extract_first()
    # Each <dl> becomes a dynamic field: <dt> text is the key, <dd> text
    # (or the first <dd> link) is the value.
    for dl in response.xpath('//*[@id="left_rail"]/div[1]/div[1]/dl'):
        item[dl.xpath('./dt/text()').extract_first(
        )] = dl.xpath('./dd/text()').extract_first() or dl.xpath(
            './dd/a/@href').extract_first()
    item['description'] = response.xpath(
        '//div[@class="profile"]/text()').extract_first() + response.xpath(
            '//*[@id="fulldesc"]/text()').extract_first()
    item['url'] = response.url
    yield item
    # Follow the 'next' button until it disappears.
    url = response.xpath(
        '//div[@class="next-button"]/a/@href').extract_first()
    if url:
        yield scrapy.Request(url=response.urljoin(url), callback=self.parse)
def parse_item(self, response):
    """Collect drawing/instruction document links for one catalog number.

    Each document field gets the first matching href, or '' when the link
    is absent from the page.
    """
    hxs = HtmlXPathSelector(response)
    catalog_number = response.request.meta['catalog_number']
    item = UniversalItem()
    item['ids'] = catalog_ids[catalog_number]
    item['catalog_number'] = catalog_number
    item['url_page'] = catalog_url[catalog_number]
    # Identical lookup for each of the three document types.
    lookups = (
        ('Product_Drawing_PDF', '//a[@title="Product Drawing PDF"]/@href'),
        ('Product_Drawing_DWG', '//a[@title="Product Drawing DWG"]/@href'),
        ('Installation_Instructions',
         '//a[@title="Installation Instructions"]/@href'),
    )
    for field, xpath_expr in lookups:
        matches = response.xpath(xpath_expr)
        item[field] = matches.extract()[0] if matches else ''
    yield item
def parse_product(self, response): hxs = HtmlXPathSelector(response) # sku = response.url.rsplit('/', 1)[-1] catalog_number = response.meta['sku'] if catalog_number not in self.catalog: print '--------' print catalog_number return item = UniversalItem() item['ids'] = self.catalog_ids[catalog_number] item['catalog_number'] = catalog_number attr_names = response.xpath('//table[@class="rgMasterTable MotorsDetailPage"]/tbody/tr/td[1]/text()').extract() attr_values = response.xpath('//table[@class="rgMasterTable MotorsDetailPage"]/tbody/tr/td[2]/text()').extract() attr = dict(zip(attr_names, attr_values)) for key in attr.keys(): if str(key) == 'Motor Type' or str(key) == 'Drives Type' or str(key) == 'Approx. Weight' or str( key) == 'List Price' or attr[key] == u'\xa0': del attr[key] attrs = "" for key, value in attr.iteritems(): attrs += str(key) + ":" + str(value) + "|" attrs = attrs[:-1] if attrs == "": return item['attributes'] = attrs return item
def create_item(self, row, name, url):
    """Build a UniversalItem for *row* with a display *name* and *url*."""
    result = UniversalItem()
    for field, value in (('ids', catalog_ids[row]),
                         ('catalog_number', row),
                         ('name', name),
                         ('url', url)):
        result[field] = value
    return result
def create_item(self, row, table):
    """Build a UniversalItem for *row* whose description is *table*."""
    result = UniversalItem()
    for field, value in (('ids', catalog_ids[row]),
                         ('brand', catalog_brand[row]),
                         ('catalog_number', row),
                         ('descr', table)):
        result[field] = value
    return result
def parse_item_final(self, response): hxs = HtmlXPathSelector(response) catalog = list(self.data.catalog_number) prev_num = str(response.meta['meta_row']).strip() price = re.findall('<b>Retail:</b> \$ (.+) ea', response.body) desc = response.xpath( '//b[contains(text(), "Product Description")]/..').extract_first() print prev_num if prev_num and desc: if prev_num not in desc: return else: return if not price: return item = UniversalItem() item['catalog_number'] = str(response.meta['meta_row']).strip() item['retail'] = re.findall('<b>Retail:</b> \$ (.+) ea', response.body)[0] item['your_price'] = re.findall( '<b>Your Price:</b> \$ (.+) ea', response.body)[0] return item
def parse_item(self, response):
    """Scrape an '/item/' detail page: image plus each configured attribute
    section, joined as 'label:value|label:value'.
    """
    if '/item/' in response.url:
        catalog = response.meta['row']
        E = UniversalItem()
        img = None
        # Only rescrape the image when the stored one is a placeholder
        # ('defaul...' string, or just the id baked into the URL).
        if 'defaul' in self.catalog_main_image[catalog] or self.catalog_id[
                catalog] in self.catalog_main_image[catalog]:
            img = response.xpath(
                '//img[@itemprop="image"]/@src').extract_first()
        E['main_image'] = response.urljoin(img) if img else ''
        # One table per attribute section, keyed by its data-id anchor.
        for attr in self.attributes:
            print attr
            path = response.xpath(
                '//h3[@data-id="#{}"]/../div/table/tbody/tr'.format(attr))
            temp = ''
            if path:
                for i in path:
                    # Label lives either in td[1]/h2/strong or td[1]/strong.
                    first_part = i.xpath(
                        './td[1]/h2/strong/text()').extract_first(
                        ) or i.xpath(
                            './td[1]/strong/text()').extract_first()
                    temp += first_part + ':' + ' '.join(
                        i.xpath('./td[2]/span/span[2]/span/text()').
                        extract()) + '|'
            E[attr] = temp[:-1] if temp else ''  # drop trailing '|'
        E['id'] = self.catalog_id[catalog]
        E['catalog_number'] = catalog
        return E
def parse_product(self, response):
    """Pair manufacturer codes with MI item numbers, yielding unseen codes."""
    hxs = HtmlXPathSelector(response)
    heading = response.xpath(
        '//div[@class="result-item-detail"]//h1[@class="manufacturer-name"]/text()'
    )
    if not heading:
        return
    codes = heading.re('Dodge \(Baldor\) (.+)')
    mi_numbers = response.xpath(
        '//div[@class="result-item-detail"]//div[@class="item-property"]/a/text()'
    ).re('# (.+)')
    for code, mi_number in zip(codes, mi_numbers):
        if code in self.items:
            continue
        record = UniversalItem()
        record['code'] = code
        record['mi_item'] = mi_number.replace('\r', '').replace('\n', '')
        self.items.append(code)
        yield record
def parse(self, response):
    """Emit the first not-yet-seen catalog number found in the spec block.

    NOTE(review): the ``return`` inside the loop stops at the first match,
    so at most one sku per page is emitted -- confirm that is intended
    (``yield`` would collect every matching sku on the page).
    """
    hxs = HtmlXPathSelector(response)
    data = pandas.read_csv("spiders/csv_data/Dixon/dixon_images.csv",
                           sep=',')
    catalog = list(data.catalog_number)
    ids = list(data.id)
    description = list(data.description)
    catalog_ids = dict(zip(catalog, ids))
    catalog_description = dict(zip(catalog, description))
    for sku in catalog:
        # Substring match of the sku inside the product-specs markup.
        if str(sku) in response.xpath(
                '//div[@id="product-specs"]').extract_first():
            if sku not in self.items:
                item = UniversalItem()
                item['ids'] = catalog_ids[sku]
                item['catalog_number'] = sku
                item['description'] = catalog_description[sku]
                item['main_image'] = response.xpath(
                    '//img[@class="product-image"]/@src').extract_first()
                item['response_url'] = response.url
                self.items.append(sku)
                return item
def get_cad_direct_url(self, response):
    """Download the CAD file named in the JSON reply and return an item.

    When the body carries 'CadDownloadUrl', the file is fetched with
    urllib2 and saved under bear_download/; otherwise the response is
    dumped to stdout for debugging and nothing is emitted.
    """
    hxs = HtmlXPathSelector(response)
    if 'CadDownloadUrl' in response.body:
        data = json.loads(response.body)
        print '\n'
        print 'CAD DOWNLOAD URL'
        print data['CadDownloadUrl']
        print '\n'
        item = UniversalItem()
        catalog_number = response.request.meta['catalog_number']
        item['catalog_number'] = catalog_number
        item['ids'] = self.catalog_ids[catalog_number]
        try:
            url = data['CadDownloadUrl']
            req = urllib2.Request(url)
            resp = urllib2.urlopen(req)
            file_name = '%s.pdf' % urllib.quote_plus(str(catalog_number))
            with open('bear_download/' + file_name, 'wb') as file:
                shutil.copyfileobj(resp.fp, file)
        except Exception:
            # Download failed -- emit nothing for this catalog number.
            print 'EXCEPTION'
            return
        else:
            item['cad'] = file_name
            return item
    else:
        # Debug dump: request meta and body of the unexpected response.
        print '\n'
        print response.request.meta
        print response.body
        print '\n'
        print 'THERE IS NOT CAD DOWNLOAD URL'
        print '\n'
        return
def parse_item3(self, response):
    """Capture the page's top-level <div> markup as the item description."""
    meta_row = response.meta['meta_row']
    item = UniversalItem()
    item['ids'] = catalog_ids[meta_row]
    item['catalog_number'] = str(meta_row).strip()
    item['descr'] = response.xpath('/html/body/div').extract_first()
    return item
def parse_product(self, response):
    """Scrape an MCR Safety product page: image, spec sheet, descriptions.

    NOTE(review): every ``.extract()[0]`` here raises IndexError when the
    element is missing -- presumably all product pages carry these blocks;
    confirm before relying on this callback for other page types.
    """
    hxs = HtmlXPathSelector(response)
    print 'parse'
    data = pandas.read_csv("spiders/csv_data/Mcrsafety/MCR_Safety.csv",
                           sep=',')
    catalog = list(data.catalog_number)
    ids = list(data.id)
    catalog_id = dict(zip(catalog, ids))
    number = response.meta['number']
    item = UniversalItem()
    item['ids'] = catalog_id[number]
    item['catalog_number'] = number
    item['image'] = response.xpath(
        '//div[@class="material-image-container"]//img/@src').extract(
        )[0].replace("Download ", "")
    # Spec-sheet link text is 'Download <name>'; strip the prefix.
    item['document_name'] = response.xpath(
        '//div[@class="material-specsheet"]//a/span/text()').extract(
        )[0].replace("Download ", "")
    item['document_url'] = 'http://www.mcrsafety.com' + \
        response.xpath('//div[@class="material-specsheet"]//a/@href').extract()[0]
    # Long description plus the attribute grouping block, concatenated.
    item['additional_description'] = response.xpath('//div[@class="black material-long-description"]').extract()[
        0] + \
        response.xpath('//div[@class="row material-attr-grouping"]').extract()[0]
    item['features'] = response.xpath('//div[@id="features"]').extract()[0]
    item['specs'] = response.xpath('//div[@id="specs"]').extract()[0]
    item['industry_application'] = response.xpath(
        '//div[@id="industries"]').extract()[0]
    return item
def parse_item(self, response):
    """Build a Carlisle belts item, scraping fallbacks for CSV gaps.

    Fixes over the original: the cleanup regex used ``\p``, an invalid
    escape under Python 3 (in Python 2 it matched a literal 'p', which only
    worked by accident on '</p>'); it is now ``/p>`` in a raw string with
    the same effect on closing tags.  The brochure/product-specs links used
    unconditional ``.extract()[0]``, which raised IndexError when the link
    was absent and made the '' defaults dead code; they now fall back to ''.
    """
    catalog_number = response.meta['catalog_number']
    item = UniversalItem()
    item['ids'] = catalog_ids[catalog_number]
    item['catalog_number'] = catalog_number
    item['image'] = catalog_main_images[catalog_number]
    item['description'] = catalog_descriptions[catalog_number]
    if str(catalog_additional_descriptions[catalog_number]) != 'nan':
        item['additional_description'] = catalog_additional_descriptions[catalog_number]
    else:
        # Strip anchors, the 'Backed by our...' boilerplate paragraph,
        # image paragraphs and newlines from the CMS markup.
        made_in = re.compile(r'<a.*?>|<p>Backed by our.*?/p>|<p><img.*?/p>|\n')
        additional_description = response.xpath(
            '//div[@class="cms-edit-size"]/cmsitemreset/div').extract()[0]
        item['additional_description'] = made_in.sub('', additional_description)
    # Placeholder image: replace with the selected product thumbnail.
    if 'default' in catalog_main_images[catalog_number]:
        if response.xpath('//span[@class="prod-thumb selected"]/img/@src'):
            item['image'] = 'http://www.carlislebelts.com' + response.xpath(
                '//span[@class="prod-thumb selected"]/img/@src').extract()[0]
    brochure = response.xpath(
        '//div[@class="product-download"]/p[contains(text(), "Brochure")]/../a/@href'
    ).extract_first()
    item['brochure'] = 'http://www.carlislebelts.com' + brochure if brochure else ''
    product_specs = response.xpath(
        '//div[@class="product-download"]/p[contains(text(), "Product Specs")]/../a/@href'
    ).extract_first()
    item['product_specs'] = 'http://www.carlislebelts.com' + product_specs if product_specs else ''
    return item
def parse_item3(self, response):
    """Append the page's <form> markup to the description carried in meta."""
    row = response.meta['row']
    form_html = str(response.xpath('//*/form').extract_first())
    item = UniversalItem()
    item['ids'] = catalog_ids[row]
    item['catalog_number'] = row
    item['additional_descr'] = str(response.meta['res']) + form_html
    return item
def error(self, row):
    """Return a placeholder item with blank fields for a failed *row*."""
    placeholder = UniversalItem()
    placeholder['ids'] = catalog_ids[row]
    placeholder['catalog_number'] = str(row)
    for blank_field in ('descr', 'add_descr', 'img_url'):
        placeholder[blank_field] = ''
    return placeholder
def parse_item(self, response, catalog=None, links=None):
    """Walk search results toward the matching '/item/' detail page.

    Re-entrant callback: *catalog* and *links* are threaded through the
    lambda callbacks.  On a listing page it collects '/item/' links or
    follows the filter-table row for *catalog*; on a detail page whose
    title matches, it scrapes the attribute sections.  Leftover links are
    popped one at a time until a match is found.
    """
    if catalog is None:
        catalog = response.meta['row']
    # First visit to a listing page: gather candidate '/item/' links.
    if links is None and '/item/' not in response.url:
        links = [
            i for i in response.xpath(
                '//*[@id="plp-search-results-list"]/div[4]/div/span[1]/a[1]/@href'
            ).extract() if '/item/' in i
        ]
    if not links:
        # No candidates: try the filter table (exact or lowercased match).
        expression = '//*[@id="plp-table-filter"]/tbody/tr/td/span/a[text()="{}" or text()="{}"]/@href'.format(
            catalog, catalog.lower())
        url = response.xpath(expression).extract_first()
        if url:
            return scrapy.Request(url=response.urljoin(url),
                                  callback=lambda response: self.
                                  parse_item(response, catalog, links))
        else:
            # Already on a detail page: confirm its title's product number.
            pages_catalog = response.xpath(
                '//*[@id="plp-product-title"]/h1/text()').extract_first(
                ).split(',')[0].split('Number ')[1].lower()
            if pages_catalog == catalog.lower():
                #img = response.xpath('//*[@id="largegallery"]/div[@class="ad-nav"]/div[@class="ad-thumbs"]/ul/li[1]/a/@href').extract_first()
                E = UniversalItem()
                # One table per attribute section, keyed by data-id anchor.
                for attr in self.attributes:
                    print attr
                    path = response.xpath(
                        '//h3[@data-id="#{}"]/../div/table/tbody/tr'.format(
                            attr))
                    temp = ''
                    if path:
                        for i in path:
                            first_part = i.xpath(
                                './td[1]/h2/strong/text()').extract_first(
                                ) or i.xpath(
                                    './td[1]/strong/text()').extract_first()
                            temp += first_part + ':' + ' '.join(
                                i.xpath('./td[2]/span/span[2]/span/text()').
                                extract()) + '|'
                    E[attr] = temp[:-1] if temp else ''
                E['id'] = self.catalog_id[catalog]
                E['catalog_number'] = catalog
                return E
                '''
                if img:
                    return {
                        'id': self.catalog_id[catalog],
                        'img' : response.urljoin(img),
                        'catalog_number': catalog,
                    }
                '''
    if links:
        # Try the next candidate link, carrying the rest along.
        return scrapy.Request(url=response.urljoin(links.pop(0)),
                              callback=lambda response: self.parse_item(
                                  response, catalog, links))
def parse_item1(self, response, row, overview, descr):
    """Emit an item whose specs fall back to *overview* on an empty page."""
    page_html = str(
        response.xpath('//*').extract_first().encode('ascii', 'ignore'))
    if page_html != '<html></html>':
        specs_or_overview = page_html
    else:
        specs_or_overview = overview
    item = UniversalItem()
    item['ids'] = catalog_ids[row]
    item['catalog_number'] = row
    item['specs_or_overview'] = specs_or_overview
    item['description'] = descr
    return item
def create_item(self, row, img, doc_name, doc_url, specs):
    """Build a UniversalItem for *row* with its image, document and specs."""
    result = UniversalItem()
    for field, value in (('ids', catalog_ids[row]),
                         ('catalog_number', row),
                         ('description', catalog_descr[row]),
                         ('img', img),
                         ('doc_name', doc_name),
                         ('doc_url', doc_url),
                         ('specs', specs)):
        result[field] = value
    return result
def parse_item2(self, response, meta_row):
    """Build a file-download item from the <ORDERNO>/<ZIPFILE> reply body."""
    whole_page = response.xpath('//*')
    order_number = whole_page.re(r'<ORDERNO>(.+)</ORDERNO>')[0] + '/'
    archive_name = whole_page.re(r'<ZIPFILE>(.+)</ZIPFILE>')[0]
    item = UniversalItem()
    item['ids'] = catalog_ids[meta_row]
    item['catalog_number'] = str(meta_row).strip()
    item['file_urls'] = [
        'http://www.skf.com/cadDownload/' + order_number + archive_name
    ]
    return item
def parse_item2(self, response):
    """Fill an item for *code* with a table built from the specs markup."""
    code = response.meta['code']
    specs_html = response.xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_dlSpecs"]').extract_first()
    result = UniversalItem()
    result['ids'] = item_codes_ids[code]
    result['description'] = item_codes_description[code]
    result['item_code'] = code
    result['add_descr'] = self.construct_table(specs_html)
    result['ordering_number'] = item_code_ordering_number[code]
    return result
def parse_item1(self, response, row):
    """Yield one item per drawing header, paired positionally with its link.

    Improvements over the original: ``enumerate`` replaces the manual
    ``index`` counter, and the loop-invariant link lookup is hoisted out
    of the loop instead of re-running the xpath per header.

    NOTE(review): ``.replace('&', '&')`` is a no-op -- it was probably
    meant to unescape '&amp;' to '&'; confirm against the live page
    before changing the string.
    """
    drawing_links = response.xpath('//*[@class="drawing"]/a/@href')
    for index, head in enumerate(
            response.xpath('//*[@class="drawingHeader"]')):
        item = UniversalItem()
        item['ids'] = catalog_ids[row]
        item['catalog_number'] = row
        item['name'] = head.xpath('text()').extract_first()
        item['url'] = 'https://www.baldorvip.com' + drawing_links[
            index].extract().replace('&', '&')
        yield item
def parse_item(self, response):
    """Extract the CAD id embedded in the catalog-page script, if non-empty."""
    row = response.meta['row']
    matches = response.xpath(
        '//script[contains(text(), "insite.catalog.catalogPageGlobal")]'
    ).re(r'"cadid":(.+?),')
    if not matches:
        return
    if matches[0] == '""':
        return
    item = UniversalItem()
    item['ids'] = catalog_ids[row]
    item['catalog_number'] = row
    item['cadid'] = matches[0]
    return item
def parse_item(self, response):
    """Emit an image-URL item when the stored image is a 'default' placeholder.

    Fixes over the original: the method was missing ``self`` even though it
    reads ``self.images``/``self.ids`` (so ``response`` would have bound as
    the instance), and ``catalog_number`` was referenced without ever being
    defined (NameError).

    NOTE(review): ``catalog_number`` is taken from ``response.meta`` here,
    matching the pattern of the sibling callbacks -- confirm the scheduling
    request actually sets that meta key.
    """
    catalog_number = response.meta['catalog_number']
    item_image = response.xpath(
        '//td[@class="item_img"]/img/@src').extract_first()
    if not item_image:
        return
    stored_image = self.images.get(catalog_number)
    if stored_image and 'default' in stored_image:
        item = UniversalItem()
        item['ids'] = self.ids[catalog_number]
        item['catalog_number'] = catalog_number
        item['img_url'] = item_image
        return item
def download(self, response):
    """Save the fetched CAD file to disk and emit an item recording it."""
    cad = response.meta['cad']
    row = response.meta['row']
    file_name = response.url.split('/')[-1]
    target_path = 'results/Reelcraft/' + cad + '_cad/' + file_name
    with open(target_path, 'wb') as out:
        out.write(response.body)
    item = UniversalItem()
    item['ids'] = catalog_ids[row]
    item['catalog_number'] = row
    item['pdf'] = file_name
    return item