def parse_subproducts(self, response):
    """Parse a sub-products page and yield one condition-lookup Request
    per product whose name fuzzy-matches ``self.products`` at >= 98.

    The site's HTML is malformed, so the raw markup is patched with a
    chain of string replacements that wraps each product in a
    ``<div class="item">`` before re-parsing it.
    """
    hxs = HtmlXPathSelector(response)
    # Fix for the HTML code: rewrite broken tags so every product (and
    # every radio-button variant) becomes an addressable <div>.
    html = hxs.extract().replace('<br></h3>', '').\
        replace('<h3', '<div class="item"').\
        replace('</p>\n            <div', '</p></div>\n            <div').\
        replace('<input type="radio"', '<div class="hd" ').\
        replace('checked>', '>').\
        replace('</p></div>', '</div></p></div>').\
        replace('</p>\n', '</div></p>\n')
    # Loop-invariant URL template (was duplicated in both branches).
    url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s'
    products_hxs = HtmlXPathSelector(text=html)
    products = products_hxs.select('//div[@class="item"]')
    for product in products:
        sub_products = product.select('div[@class="hd"]')
        if sub_products:
            for sub_product in sub_products:
                value = sub_product.select('./@value').extract()[0]
                hd = sub_product.select('./text()').extract()[0]
                name = ' '.join(
                    (product.select('p/text()').extract()[0], hd))
                extracted = process.extractOne(name, self.products)
                # BUG FIX: extractOne returns None below its cutoff; the
                # old ``except TypeError: return`` aborted the whole
                # generator on the first non-match, dropping every
                # remaining product. Skip just this one instead.
                if extracted is None or extracted[1] < 98:
                    continue
                yield Request(url % value.split(':')[0],
                              callback=self.parse_options,
                              meta={'id': response.meta['id'],
                                    'name': name,
                                    'memoryR': value,
                                    'memory': value})
        else:
            name = product.select('p/text()').extract()[0]
            extracted = process.extractOne(name, self.products)
            # Same fix as above: a non-match skips this product only.
            if extracted is None or extracted[1] < 98:
                continue
            value = product.select('p/input/@value').extract()[0]
            yield Request(url % value.split(':')[0],
                          callback=self.parse_options,
                          meta={'id': response.meta['id'],
                                'name': name,
                                'memoryR': value,
                                'memory': value})
def parse_subproducts(self, response):
    """Parse a sub-products page and yield one condition-lookup Request
    per product whose name fuzzy-matches ``self.products`` at >= 98.

    The site's HTML is malformed, so the raw markup is patched with a
    chain of string replacements that wraps each product in a
    ``<div class="item">`` before re-parsing it.
    """
    hxs = HtmlXPathSelector(response)
    # Fix for the HTML code: rewrite broken tags so every product (and
    # every radio-button variant) becomes an addressable <div>.
    html = (
        hxs.extract()
        .replace("<br></h3>", "")
        .replace("<h3", '<div class="item"')
        .replace("</p>\n            <div", "</p></div>\n            <div")
        .replace('<input type="radio"', '<div class="hd" ')
        .replace("checked>", ">")
        .replace("</p></div>", "</div></p></div>")
        .replace("</p>\n", "</div></p>\n")
    )
    # Loop-invariant URL template (was duplicated in both branches).
    url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s"
    products_hxs = HtmlXPathSelector(text=html)
    products = products_hxs.select('//div[@class="item"]')
    for product in products:
        sub_products = product.select('div[@class="hd"]')
        if sub_products:
            for sub_product in sub_products:
                value = sub_product.select("./@value").extract()[0]
                hd = sub_product.select("./text()").extract()[0]
                name = " ".join((product.select("p/text()").extract()[0], hd))
                extracted = process.extractOne(name, self.products)
                # BUG FIX: extractOne returns None below its cutoff; the
                # old ``except TypeError: return`` aborted the whole
                # generator on the first non-match, dropping every
                # remaining product. Skip just this one instead.
                if extracted is None or extracted[1] < 98:
                    continue
                yield Request(
                    url % value.split(":")[0],
                    callback=self.parse_options,
                    meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value},
                )
        else:
            name = product.select("p/text()").extract()[0]
            extracted = process.extractOne(name, self.products)
            # Same fix as above: a non-match skips this product only.
            if extracted is None or extracted[1] < 98:
                continue
            value = product.select("p/input/@value").extract()[0]
            yield Request(
                url % value.split(":")[0],
                callback=self.parse_options,
                meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value},
            )
def parse(self, response):
    """Parse a search-results page: collect every listed product, then
    fuzzy-match against the target name in ``response.meta['name']`` and
    load at most one Product item when the score clears 92.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="box_product"]')
    dict_products = {}
    # Obtains all the products of the first page of the search.
    for product in products:
        name = product.select('a/h3/text()').extract()[0]
        # BUG FIX: original had a duplicated ``url = url =`` assignment.
        url = urljoin_rfc(get_base_url(response),
                          product.select('a/@href').extract()[0])
        price = product.select(
            'div/div/div[@class="price"]/text()').extract()[0]
        dict_products[name] = [url, price]
    # Just loads one product using fuzzy matching.
    extracted = process.extractOne(response.meta['name'],
                                   dict_products.keys(),
                                   scorer=fuzz.token_set_ratio)
    # extractOne returns None when nothing clears its internal cutoff;
    # the old code relied on the resulting TypeError to bail out
    # silently. Test for it explicitly instead.
    if extracted is None or extracted[1] < 92:
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('name', extracted[0])
    loader.add_value('url', dict_products[extracted[0]][0])
    loader.add_value('price', dict_products[extracted[0]][1])
    yield loader.load_item()
def testWithProcessor(self):
    """extractOne must run each choice through the processor before scoring."""
    events = [
        ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
        ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
        ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
    ]
    query = "new york mets vs chicago cubs"

    # Score only the event title, not the whole record.
    def title_of(event):
        return event[0]

    winner = process.extractOne(query, events, processor=title_of)
    self.assertEqual(winner[0], events[0])
def testWithScorer(self):
    """Passing a custom scorer (QRatio) changes which choice wins."""
    choices = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox"
    ]
    # In this hypothetical example we care about ordering, so we use
    # quick ratio rather than the default scorer.
    query = "new york mets at chicago cubs"

    # First, as an example: the default scorer picks the "more
    # 'complete'" match, choices[1].
    default_winner = process.extractOne(query, choices)
    self.assertEqual(default_winner[0], choices[1])

    # With the custom quick-ratio scorer, ordering matters and
    # choices[0] wins instead.
    custom_winner = process.extractOne(query, choices, scorer=QRatio)
    self.assertEqual(custom_winner[0], choices[0])
def testNullStrings(self):
    """None entries in the choice list are skipped rather than crashing."""
    choices = [
        None,
        "new york mets vs chicago cubs",
        "new york yankees vs boston red sox",
        None,
        None
    ]
    query = "new york mets at chicago cubs"
    winner = process.extractOne(query, choices)
    self.assertEqual(winner[0], choices[1])
def testWithCutoff(self):
    """A score_cutoff makes extractOne return None instead of a weak match."""
    choices = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox"
    ]
    # An event that does not exist in the list: we don't want to match
    # to something randomly, so a reasonable cutoff must yield None.
    query = "los angeles dodgers vs san francisco giants"
    winner = process.extractOne(query, choices, score_cutoff=50)
    self.assertTrue(winner is None)
def parse(self, response):
    """Parse a search-results page: collect every listed product, then
    fuzzy-match against the target name in ``response.meta['name']`` and
    load at most one Product item when the score clears 92.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="box_product"]')
    dict_products = {}
    # Obtains all the products of the first page of the search.
    for product in products:
        name = product.select('a/h3/text()').extract()[0]
        # BUG FIX: original had a duplicated ``url = url =`` assignment.
        url = urljoin_rfc(get_base_url(response),
                          product.select('a/@href').extract()[0])
        price = product.select(
            'div/div/div[@class="price"]/text()').extract()[0]
        dict_products[name] = [url, price]
    # Just loads one product using fuzzy matching.
    extracted = process.extractOne(response.meta['name'],
                                   dict_products.keys(),
                                   scorer=fuzz.token_set_ratio)
    # extractOne returns None when nothing clears its internal cutoff;
    # the old code relied on the resulting TypeError to bail out
    # silently. Test for it explicitly instead.
    if extracted is None or extracted[1] < 92:
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('name', extracted[0])
    loader.add_value('url', dict_products[extracted[0]][0])
    loader.add_value('price', dict_products[extracted[0]][1])
    yield loader.load_item()
def testGetBestChoice4(self):
    """A cubs/mets query matches the first baseball string."""
    winner = process.extractOne("chicago cubs vs new york mets",
                                self.baseball_strings)
    self.assertEqual(winner[0], self.baseball_strings[0])
def testGetBestChoice3(self):
    """A braves/phillies query matches the third baseball string."""
    winner = process.extractOne("atlanta braves at philadelphia phillies",
                                self.baseball_strings)
    self.assertEqual(winner[0], self.baseball_strings[2])
def testGetBestChoice1(self):
    """A mets/braves query matches the "braves vs mets" string."""
    winner = process.extractOne("new york mets at atlanta braves",
                                self.baseball_strings)
    self.assertEqual(winner[0], "braves vs mets")