Exemplo n.º 1
0
    def getVolumetricWeight(self, dimension, ut):
        """Compute a volumetric weight from a ``"A x B x C"`` dimension string.

        Each of the three parts is cleaned with every pattern in
        ``self.dimensionPatternsToRemove`` and parsed as a float.  When ``ut``
        is ``UnitTypes.MM`` or ``UnitTypes.CM`` the value is multiplied by the
        millimetre/centimetre-to-inch factor and rounded up; for any other
        unit type the parsed value is used unchanged.  The product of the
        three values is divided by 130 and rounded up.

        Args:
            dimension: Dimension text whose parts are separated by ``" x "``.
            ut: Unit type of the values in ``dimension``.

        Returns:
            The rounded-up volumetric weight, or 0 when ``dimension`` is
            ``None`` or empty, does not consist of exactly three
            ``" x "``-separated parts, or an error occurs while parsing.
        """
        # ``not dimension`` also covers None, which used to raise TypeError
        # in ``len(dimension)``.
        if not dimension or " x " not in dimension:
            return 0

        try:
            parts = dimension.split(' x ')
            if len(parts) != 3:
                return 0

            sizes = []
            for part in parts:
                for pat in self.dimensionPatternsToRemove:
                    part = StringUtil.str_cleaner(part, pat, "").strip()

                if StringUtil.try_parse('float', part) is True:
                    size = float(part)
                    if ut == UnitTypes.MM:
                        size = math.ceil(size * 0.0393701)
                    elif ut == UnitTypes.CM:
                        size = math.ceil(size * 0.393701)
                    sizes.append(float(size))
                else:
                    # Temporary workaround: a part that cannot be parsed is
                    # treated as 1 so the product is still defined.
                    sizes.append(1)

            first, second, third = sizes
            # NOTE(review): 130 is presumably the carrier's volumetric
            # divisor -- confirm before changing it.
            return math.ceil(first * second * third / 130)
        except Exception:
            # Best effort: any parsing problem yields 0.  Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            return 0
Exemplo n.º 2
0
    def getProductDimension(self, instance_of, instance_val):
        """Find a dimension string in an element's content or in raw text.

        Args:
            instance_of: ``"DOCUMENT"`` to resolve ``instance_val`` through
                ``self.getElementValue``, or ``"STRING"`` to search
                ``instance_val`` itself.  Any other value yields ``''``.
            instance_val: The value to resolve or search, per ``instance_of``.

        Returns:
            The result of searching the cleaned content with the first
            pattern of ``self.dimensionPatterns`` that is found in it.  When
            no pattern yields a result, or the result is longer than 35
            characters, the result of ``self.getProductWeight`` on the
            cleaned content is returned instead.  ``''`` is returned when
            ``self.htmlObject`` or ``instance_val`` is ``None``, when there
            is no content to search, or when the fallback yields nothing.
        """
        if self.htmlObject is None or instance_val is None:
            return ''

        content = None
        if len(instance_val) > 0:
            if instance_of == "DOCUMENT":
                content = self.getElementValue(instance_val)
            elif instance_of == "STRING":
                content = instance_val

        # Nothing to search: return early instead of feeding None or an
        # empty value into the StringUtil cleaners below.
        if not content:
            return ''

        content = StringUtil.remove_html_tags(content)
        content = StringUtil.str_cleaner(content, r'\\([a-z0-9]{3})', '')
        content = StringUtil.str_cleaner(content,
                                         r'[^0-9a-zA-Z\s\-\(\).,"\'&]+', '')

        dimension = ''
        if content:
            text = str(content)
            for sptf in self.dimensionPatterns:
                if StringUtil.str_find_str(text, sptf):
                    dimension = StringUtil.str_search_str(text, sptf)
                    break

        # The previous test, ``dimension and len(dimension) == 0 or
        # len(dimension) > 35``, could never detect an empty result because
        # ``and`` binds tighter than ``or``; it also called ``len`` on a
        # possible None.  Fall back for both "nothing found" and "too long".
        if not dimension or len(dimension) > 35:
            dimension = self.getProductWeight(content)

        # Keep the return value usable with len() even if the fallback
        # produced None.
        return dimension or ''
Exemplo n.º 3
0
    def parse_item_page(self, response):
        """Fill ``item['keywords']`` with the text of the matched page nodes.

        The item is read from ``response.meta['item']``.  The direct text of
        every node matched by the XPath below is stripped, UTF-8 encoded and
        passed through ``StringUtil.str_utf_encode``; the pieces are joined
        with single spaces and cleaned with ``StringUtil.remove_html_tags``.

        Returns:
            The same item, with its ``'keywords'`` entry set.
        """
        selector = HtmlXPathSelector(response)
        item = response.meta['item']

        # Disabled alternative for 6PM pages:
        #   //h1[@class='title']/a|//div[@class='description']/ul/li/span|
        #   //div[@class='description']/ul/li/a|//div[@class='description']/ul/li

        # Walmart
        xpath = (
            "//h1[@itemprop='name']/div|//div["
            "@class='product-description-disclaimer']|//div[@class='about-desc']"
        )
        nodes = selector.select(xpath)

        texts = []
        if nodes is not None and len(nodes) > 0:
            raw_texts = (''.join(node.xpath("text()").extract())
                         for node in nodes)
            # Emptiness is checked before stripping, so whitespace-only text
            # still contributes an (empty) entry.
            texts = [
                StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
                for raw in raw_texts
                if len(raw) > 0
            ]

        item['keywords'] = StringUtil.remove_html_tags(str(' '.join(texts)))
        return item
Exemplo n.º 4
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     The direct text of every node matched by the XPath below is stripped,
     UTF-8 encoded and passed through ``StringUtil.str_utf_encode``; the
     pieces are joined with single spaces and cleaned with
     ``StringUtil.remove_html_tags``.
     """
     xpath = "//h1[@class='product-name']|//div[@itemprop='description']/*"
     nodes = self.getElementValues(xpath)

     texts = []
     if nodes is not None and len(nodes) > 0:
         raw_texts = (''.join(node.xpath("text()").extract())
                      for node in nodes)
         # Emptiness is checked before stripping, so whitespace-only text
         # still contributes an (empty) entry.
         texts = [
             StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
             for raw in raw_texts
             if len(raw) > 0
         ]

     return StringUtil.remove_html_tags(str(' '.join(texts)))
Exemplo n.º 5
0
 def getProductWeight(self, instance_val):
     """Search ``instance_val`` for an item/shipping weight phrase.

     Args:
         instance_val: Text to search; may be ``None``.

     Returns:
         ``''`` when ``instance_val`` is ``None``.  Otherwise the result of
         ``StringUtil.str_search_str`` for the weight pattern, applied to
         the text after it has been passed through ``StringUtil.str_cleaner``
         with the ``<...>`` tag pattern and then the double-whitespace
         pattern.
     """
     if instance_val is None:
         return ''

     without_tags = StringUtil.str_cleaner(instance_val, r'<[^>]*>', '')
     compacted = StringUtil.str_cleaner(without_tags, r'\s\s', '')
     weight_pattern = r"(item weight|Shipping Weight)(:|:\s|\s:)(\d+(\.\d{1,2})?)(\s|\S)(ounce|pound|lb\s|lbs)"
     return StringUtil.str_search_str(compacted, weight_pattern)
Exemplo n.º 6
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     Nodes are selected with the XPath below.  Each node's direct text is
     stripped, UTF-8 encoded and passed through
     ``StringUtil.str_utf_encode``; the collected pieces are joined with
     single spaces and cleaned with ``StringUtil.remove_html_tags``.
     """
     xpath = ("//span[@id='ctl00_ContentPlaceHolder1_ucTemplate_aBrand']|//span["
              "@class='productname']|//dl[@id='overview']/dd/p")
     matched = self.getElementValues(xpath)
     has_nodes = matched is not None and len(matched) > 0

     pieces = []
     for node in (matched if has_nodes else ()):
         text = ''.join(node.xpath("text()").extract())
         # Skip nodes without text; the check happens before stripping.
         if not text:
             continue
         encoded = str(text.strip().encode('utf-8'))
         pieces.append(StringUtil.str_utf_encode(encoded))

     joined = str(' '.join(pieces))
     return StringUtil.remove_html_tags(joined)
Exemplo n.º 7
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     The direct text of every node matched by the XPath below is stripped,
     UTF-8 encoded and passed through ``StringUtil.str_utf_encode``; the
     pieces are joined with single spaces and cleaned with
     ``StringUtil.remove_html_tags``.
     """
     xpath = ("//div[@id='names']/span/a|//div[@id='names']/h1|//div["
              "@id='info']/div/p")
     nodes = self.getElementValues(xpath)

     texts = []
     if nodes is not None and len(nodes) > 0:
         raw_texts = (''.join(node.xpath("text()").extract())
                      for node in nodes)
         # Emptiness is checked before stripping, so whitespace-only text
         # still contributes an (empty) entry.
         texts = [
             StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
             for raw in raw_texts
             if len(raw) > 0
         ]

     return StringUtil.remove_html_tags(str(' '.join(texts)))
Exemplo n.º 8
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     Nodes are selected with the XPath below.  Each node's direct text is
     stripped, UTF-8 encoded and passed through
     ``StringUtil.str_utf_encode``; the collected pieces are joined with
     single spaces and cleaned with ``StringUtil.remove_html_tags``.
     """
     xpath = ("//span[@id='productTitle']|//div["
              "@id='fbExpandableSectionContent']/ul/li/span|//div["
              "@id='feature-bullets']/ul/li/span")
     matched = self.getElementValues(xpath)
     has_nodes = matched is not None and len(matched) > 0

     pieces = []
     for node in (matched if has_nodes else ()):
         text = ''.join(node.xpath("text()").extract())
         # Skip nodes without text; the check happens before stripping.
         if not text:
             continue
         encoded = str(text.strip().encode('utf-8'))
         pieces.append(StringUtil.str_utf_encode(encoded))

     joined = str(' '.join(pieces))
     return StringUtil.remove_html_tags(joined)
Exemplo n.º 9
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     The direct text of every node matched by the XPath below is stripped,
     UTF-8 encoded and passed through ``StringUtil.str_utf_encode``; the
     pieces are joined with single spaces and cleaned with
     ``StringUtil.remove_html_tags``.
     """
     xpath = (
         "//div[@id='buy-block']/div/h1|//div["
         "@class='product-details-description clearfix']/div|//div["
         "@class='product-details-description clearfix']/ul/li")
     nodes = self.getElementValues(xpath)

     texts = []
     if nodes is not None and len(nodes) > 0:
         raw_texts = (''.join(node.xpath("text()").extract())
                      for node in nodes)
         # Emptiness is checked before stripping, so whitespace-only text
         # still contributes an (empty) entry.
         texts = [
             StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
             for raw in raw_texts
             if len(raw) > 0
         ]

     return StringUtil.remove_html_tags(str(' '.join(texts)))
Exemplo n.º 10
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     Nodes are selected with the XPath below.  Each node's direct text is
     stripped, UTF-8 encoded and passed through
     ``StringUtil.str_utf_encode``; the collected pieces are joined with
     single spaces and cleaned with ``StringUtil.remove_html_tags``.
     """
     # NOTE(review): the ".../div/div" alternative for
     # 'extended-product-details hide-when-immersive' appears twice in this
     # union; kept verbatim.
     xpath = (
         "//h1[@itemprop='name']|//div[@class='extended-product-details "
         "hide-when-immersive']/div/div|//div[@class='extended-product-details hide-when-immersive']/div/div/span|//div[@class='extended-product-details hide-when-immersive']/div/div|//div[@itemprop='description']/p|//div[@class='product-details-and-care module-details']/ul/li"
     )
     matched = self.getElementValues(xpath)
     has_nodes = matched is not None and len(matched) > 0

     pieces = []
     for node in (matched if has_nodes else ()):
         text = ''.join(node.xpath("text()").extract())
         # Skip nodes without text; the check happens before stripping.
         if not text:
             continue
         encoded = str(text.strip().encode('utf-8'))
         pieces.append(StringUtil.str_utf_encode(encoded))

     joined = str(' '.join(pieces))
     return StringUtil.remove_html_tags(joined)
Exemplo n.º 11
0
 def __getDescription(self):
     """Return the cleaned, space-joined text of the description nodes.

     The direct text of every node matched by the XPath below is stripped,
     UTF-8 encoded and passed through ``StringUtil.str_utf_encode``; the
     pieces are joined with single spaces and cleaned with
     ``StringUtil.remove_html_tags``.
     """
     xpath = ("//div[@class='exp-product-header']/h1|//div["
              "@class='exp-product-header']/h2|//div["
              "@class='pi-pdpmainbody']/p/b|//div[@class='pi-pdpmainbody']/p|//div["
              "@class='pi-pdpmainbody']/li")
     nodes = self.getElementValues(xpath)

     texts = []
     if nodes is not None and len(nodes) > 0:
         raw_texts = (''.join(node.xpath("text()").extract())
                      for node in nodes)
         # Emptiness is checked before stripping, so whitespace-only text
         # still contributes an (empty) entry.
         texts = [
             StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
             for raw in raw_texts
             if len(raw) > 0
         ]

     return StringUtil.remove_html_tags(str(' '.join(texts)))
Exemplo n.º 12
0
    def parse_item_page(self, response):
        """Fill ``item['keywords']`` and append a training row to a CSV file.

        The item is read from ``response.meta['item']``.  The direct text of
        every node matched by the XPath below is stripped, UTF-8 encoded and
        passed through ``StringUtil.str_utf_encode``; the pieces are joined
        with single spaces and cleaned with ``StringUtil.remove_html_tags``.
        A row holding the item's category and the lower-cased keywords is
        then appended to ``train_data.csv``.

        Returns:
            The same item, with its ``'keywords'`` entry set.
        """
        selector = Selector(response)
        item = response.meta['item']

        # 6PM
        xpath = (
            "//h1[@class='title']/a|//div["
            "@class='description']/ul/li/span|//div["
            "@class='description']/ul/li/a|//div[@class='description']/ul/li")
        nodes = selector.xpath(xpath)

        # Disabled alternative for Walmart pages:
        #   //h1[@itemprop='name']/div|
        #   //div[@class='product-description-disclaimer']|
        #   //div[@class='about-desc']
        #
        # Disabled alternative for Amazon pages:
        #   //span[@id='productTitle']|
        #   //div[@id='fbExpandableSectionContent']/ul/li/span|
        #   //div[@id='feature-bullets']/ul/li/span

        texts = []
        if nodes is not None and len(nodes) > 0:
            raw_texts = (''.join(node.xpath("text()").extract())
                         for node in nodes)
            # Emptiness is checked before stripping, so whitespace-only text
            # still contributes an (empty) entry.
            texts = [
                StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
                for raw in raw_texts
                if len(raw) > 0
            ]
        item['keywords'] = StringUtil.remove_html_tags(str(' '.join(texts)))

        # Append one fully quoted row per item to the training-data CSV.
        with open('train_data.csv', 'ab') as train_file:
            writer = csv.writer(train_file,
                                delimiter=',',
                                quoting=csv.QUOTE_ALL)
            row = [item['category'], str(item['keywords']).lower()]
            writer.writerow(row)

        return item
Exemplo n.º 13
0
    def __getTitleInCategoryLevel(self):
        """Return one ``listToJson`` entry per non-empty title node.

        The direct text of each node matched by the XPath below is stripped,
        UTF-8 encoded and passed through ``StringUtil.str_utf_encode``.  Each
        entry is built by ``self.listToJson`` from the keys ``'title'`` and
        ``'index'``, where the index counts the non-empty titles from 1.

        Returns:
            A list of the ``listToJson`` results; empty when nothing matched.
        """
        nodes = self.getElementValues("//div[@class='product-v2-name']/h1")
        if nodes is None or len(nodes) == 0:
            return []

        raw_titles = (''.join(node.xpath("text()").extract())
                      for node in nodes)
        # Only nodes with text are numbered; the check happens before
        # stripping.
        non_empty = (raw for raw in raw_titles if len(raw) > 0)

        titles = []
        for position, raw in enumerate(non_empty, 1):
            title = StringUtil.str_utf_encode(
                str(raw.strip().encode('utf-8')))
            titles.append(
                self.listToJson(['title', 'index'], [title, position]))
        return titles
Exemplo n.º 14
0
    def __getTitleInCategoryLevel(self):
        """Return one ``listToJson`` entry per node with a title attribute.

        For each node matched by the XPath below, the ``data-attribute``
        attribute value is stripped, UTF-8 encoded and passed through
        ``StringUtil.str_utf_encode``.  Each entry is built by
        ``self.listToJson`` from the keys ``'title'`` and ``'index'``, where
        the index counts the non-empty values from 1.

        Returns:
            A list of the ``listToJson`` results; empty when nothing matched.
        """
        # NOTE(review): the class value 'class=a-carousel' in the last
        # alternative looks like a copy-paste slip -- confirm against the
        # page markup before changing it.
        xpath = ("//div[@class='a-row a-spacing-micro']/a/h2|//div[@class='a-row "
                 "a-spacing-top-mini']/a/h2|//div[@class='a-row "
                 "a-spacing-mini']/a/h2|//div[@class='a-row "
                 "a-spacing-none']/a/h2|//span[@id='productTitle']|//ol["
                 "@class='class=a-carousel']/li/div/a/span")
        nodes = self.getElementValues(xpath)
        if nodes is None or len(nodes) == 0:
            return []

        raw_titles = (''.join(node.xpath("@data-attribute").extract())
                      for node in nodes)
        # Only nodes with a value are numbered; the check happens before
        # stripping.
        non_empty = (raw for raw in raw_titles if len(raw) > 0)

        titles = []
        for position, raw in enumerate(non_empty, 1):
            title = StringUtil.str_utf_encode(
                str(raw.strip().encode('utf-8')))
            titles.append(
                self.listToJson(['title', 'index'], [title, position]))
        return titles