def parse_subproducts(self, response):
    """Parse a sub-products page and yield one condition-lookup Request
    per product whose name fuzzy-matches ``self.products`` at >= 98.

    The site's HTML is malformed, so the raw markup is patched with a
    chain of string replacements that wraps each product in a
    ``<div class="item">`` before re-parsing it.
    """
    hxs = HtmlXPathSelector(response)
    # Fix for the HTML code: rewrite broken tags so every product (and
    # every radio-button variant) becomes an addressable <div>.
    html = hxs.extract().replace('<br></h3>', '').\
        replace('<h3', '<div class="item"').\
        replace('</p>\n            <div', '</p></div>\n            <div').\
        replace('<input type="radio"', '<div class="hd" ').\
        replace('checked>', '>').\
        replace('</p></div>', '</div></p></div>').\
        replace('</p>\n', '</div></p>\n')
    # Loop-invariant URL template (was duplicated in both branches).
    url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s'
    products_hxs = HtmlXPathSelector(text=html)
    products = products_hxs.select('//div[@class="item"]')
    for product in products:
        sub_products = product.select('div[@class="hd"]')
        if sub_products:
            for sub_product in sub_products:
                value = sub_product.select('./@value').extract()[0]
                hd = sub_product.select('./text()').extract()[0]
                name = ' '.join(
                    (product.select('p/text()').extract()[0], hd))
                extracted = process.extractOne(name, self.products)
                # BUG FIX: extractOne returns None below its cutoff; the
                # old ``except TypeError: return`` aborted the whole
                # generator on the first non-match, dropping every
                # remaining product. Skip just this one instead.
                if extracted is None or extracted[1] < 98:
                    continue
                yield Request(url % value.split(':')[0],
                              callback=self.parse_options,
                              meta={'id': response.meta['id'],
                                    'name': name,
                                    'memoryR': value,
                                    'memory': value})
        else:
            name = product.select('p/text()').extract()[0]
            extracted = process.extractOne(name, self.products)
            # Same fix as above: a non-match skips this product only.
            if extracted is None or extracted[1] < 98:
                continue
            value = product.select('p/input/@value').extract()[0]
            yield Request(url % value.split(':')[0],
                          callback=self.parse_options,
                          meta={'id': response.meta['id'],
                                'name': name,
                                'memoryR': value,
                                'memory': value})
def parse_subproducts(self, response):
    """Parse a sub-products page and yield one condition-lookup Request
    per product whose name fuzzy-matches ``self.products`` at >= 98.

    The site's HTML is malformed, so the raw markup is patched with a
    chain of string replacements that wraps each product in a
    ``<div class="item">`` before re-parsing it.
    """
    hxs = HtmlXPathSelector(response)
    # Fix for the HTML code: rewrite broken tags so every product (and
    # every radio-button variant) becomes an addressable <div>.
    html = (
        hxs.extract()
        .replace("<br></h3>", "")
        .replace("<h3", '<div class="item"')
        .replace("</p>\n            <div", "</p></div>\n            <div")
        .replace('<input type="radio"', '<div class="hd" ')
        .replace("checked>", ">")
        .replace("</p></div>", "</div></p></div>")
        .replace("</p>\n", "</div></p>\n")
    )
    # Loop-invariant URL template (was duplicated in both branches).
    url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s"
    products_hxs = HtmlXPathSelector(text=html)
    products = products_hxs.select('//div[@class="item"]')
    for product in products:
        sub_products = product.select('div[@class="hd"]')
        if sub_products:
            for sub_product in sub_products:
                value = sub_product.select("./@value").extract()[0]
                hd = sub_product.select("./text()").extract()[0]
                name = " ".join((product.select("p/text()").extract()[0], hd))
                extracted = process.extractOne(name, self.products)
                # BUG FIX: extractOne returns None below its cutoff; the
                # old ``except TypeError: return`` aborted the whole
                # generator on the first non-match, dropping every
                # remaining product. Skip just this one instead.
                if extracted is None or extracted[1] < 98:
                    continue
                yield Request(
                    url % value.split(":")[0],
                    callback=self.parse_options,
                    meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value},
                )
        else:
            name = product.select("p/text()").extract()[0]
            extracted = process.extractOne(name, self.products)
            # Same fix as above: a non-match skips this product only.
            if extracted is None or extracted[1] < 98:
                continue
            value = product.select("p/input/@value").extract()[0]
            yield Request(
                url % value.split(":")[0],
                callback=self.parse_options,
                meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value},
            )
def parse(self, response):
    """Parse a search-results page: collect every listed product, then
    fuzzy-match against the target name in ``response.meta['name']`` and
    load at most one Product item when the score clears 92.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="box_product"]')
    dict_products = {}
    # Obtains all the products of the first page of the search.
    for product in products:
        name = product.select('a/h3/text()').extract()[0]
        # BUG FIX: original had a duplicated ``url = url =`` assignment.
        url = urljoin_rfc(get_base_url(response),
                          product.select('a/@href').extract()[0])
        price = product.select(
            'div/div/div[@class="price"]/text()').extract()[0]
        dict_products[name] = [url, price]
    # Just loads one product using fuzzy matching.
    extracted = process.extractOne(response.meta['name'],
                                   dict_products.keys(),
                                   scorer=fuzz.token_set_ratio)
    # extractOne returns None when nothing clears its internal cutoff;
    # the old code relied on the resulting TypeError to bail out
    # silently. Test for it explicitly instead.
    if extracted is None or extracted[1] < 92:
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('name', extracted[0])
    loader.add_value('url', dict_products[extracted[0]][0])
    loader.add_value('price', dict_products[extracted[0]][1])
    yield loader.load_item()
def testWithProcessor(self):
    """extractOne must run each choice through the processor before scoring."""
    events = [
        ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
        ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
        ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
    ]
    query = "new york mets vs chicago cubs"

    # Score only the event title, not the whole record.
    def title_of(event):
        return event[0]

    winner = process.extractOne(query, events, processor=title_of)
    self.assertEqual(winner[0], events[0])
def testWithScorer(self):
    """Passing a custom scorer (QRatio) changes which choice wins."""
    choices = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox"
    ]
    # In this hypothetical example we care about ordering, so we use
    # quick ratio rather than the default scorer.
    query = "new york mets at chicago cubs"

    # First, as an example: the default scorer picks the "more
    # 'complete'" match, choices[1].
    default_winner = process.extractOne(query, choices)
    self.assertEqual(default_winner[0], choices[1])

    # With the custom quick-ratio scorer, ordering matters and
    # choices[0] wins instead.
    custom_winner = process.extractOne(query, choices, scorer=QRatio)
    self.assertEqual(custom_winner[0], choices[0])
def testNullStrings(self):
    """None entries in the choice list are skipped rather than crashing."""
    choices = [
        None,
        "new york mets vs chicago cubs",
        "new york yankees vs boston red sox",
        None,
        None
    ]
    query = "new york mets at chicago cubs"
    winner = process.extractOne(query, choices)
    self.assertEqual(winner[0], choices[1])
def testWithCutoff(self):
    """A score_cutoff makes extractOne return None instead of a weak match."""
    choices = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox"
    ]
    # An event that does not exist in the list: we don't want to match
    # to something randomly, so a reasonable cutoff must yield None.
    query = "los angeles dodgers vs san francisco giants"
    winner = process.extractOne(query, choices, score_cutoff=50)
    self.assertTrue(winner is None)
def parse(self, response):
    """Parse a search-results page: collect every listed product, then
    fuzzy-match against the target name in ``response.meta['name']`` and
    load at most one Product item when the score clears 92.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="box_product"]')
    dict_products = {}
    # Obtains all the products of the first page of the search.
    for product in products:
        name = product.select('a/h3/text()').extract()[0]
        # BUG FIX: original had a duplicated ``url = url =`` assignment.
        url = urljoin_rfc(get_base_url(response),
                          product.select('a/@href').extract()[0])
        price = product.select(
            'div/div/div[@class="price"]/text()').extract()[0]
        dict_products[name] = [url, price]
    # Just loads one product using fuzzy matching.
    extracted = process.extractOne(response.meta['name'],
                                   dict_products.keys(),
                                   scorer=fuzz.token_set_ratio)
    # extractOne returns None when nothing clears its internal cutoff;
    # the old code relied on the resulting TypeError to bail out
    # silently. Test for it explicitly instead.
    if extracted is None or extracted[1] < 92:
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('name', extracted[0])
    loader.add_value('url', dict_products[extracted[0]][0])
    loader.add_value('price', dict_products[extracted[0]][1])
    yield loader.load_item()
def testGetBestChoice4(self):
    """A cubs/mets query matches the first baseball string."""
    winner = process.extractOne("chicago cubs vs new york mets",
                                self.baseball_strings)
    self.assertEqual(winner[0], self.baseball_strings[0])
def testGetBestChoice3(self):
    """A braves/phillies query matches the third baseball string."""
    winner = process.extractOne("atlanta braves at philadelphia phillies",
                                self.baseball_strings)
    self.assertEqual(winner[0], self.baseball_strings[2])
def testGetBestChoice1(self):
    """A mets/braves query matches the "braves vs mets" string."""
    winner = process.extractOne("new york mets at atlanta braves",
                                self.baseball_strings)
    self.assertEqual(winner[0], "braves vs mets")