def parse(self, sync, *args, **kwargs): file = self.config['URL'] if '_local_' in kwargs: file = 'feeds/data/%s.%s' % (self.config['ACCOUNT'], self.config['TYPE'].lower()) else: # get the file and save it path = '%s/%s-%d.%s' % (settings.FEEDS_ROOT, self.config['ACCOUNT'], sync.id, self.config['TYPE'].lower()) urllib.urlretrieve(self.config['URL'], path) file = path # clean the file import fileinput for line in fileinput.input(file, inplace=1): keys_with_spaces = ['Product Name', 'List price', 'Sell price', 'All terms', 'Stock Level', 'Publisher Reference', 'Post date', 'Updated date'] for key in keys_with_spaces: line = line.replace(key, key.replace(' ','_')) print line, ack_brand = Brand.objects.get(id=19) tinkle_brand = Brand.objects.get(id=28) karadi_brand = Brand.objects.get(id=29) books_and_comics = Category.objects.get(name='Books & Comics') xmldoc = etree.parse(file) products = [] for product in xmldoc.xpath("/xml/node"): data = dict(cleaned_data=self.get_default_cleaned_data()) no_html_fields = ['List_price','Sell_price', 'Product_Name','Description','Path'] # single value fields single_value_fields = ['ISBN','Product_Name','Description','List_price','Sell_price', 'international','Path','Nid','Published'] for field in single_value_fields: data[field] = '' node = product.xpath(field.split(' ')[0]) if node and len(node) > 0 and node[0].text: data[field] = node[0].text if field in no_html_fields: data[field] = htmlutils.to_text(node[0].text) if data.get('Published','').lower() == 'no': continue if 'subscription' in data.get('Product_Name','').lower() and 'tinkle' in data.get('Product_Name','').lower(): print 'adding tinkle subscription' else: print 'skipping other ack products' continue # ignore blaclists if self.is_blacklisted_sku(data['Nid']): continue # create cleaned data data['cleaned_data']['brand'] = tinkle_brand data['cleaned_data']['category'] = books_and_comics data['cleaned_data']['sku'] = data['Nid'] data['cleaned_data']['model'] = data['ISBN'] or 
'' data['cleaned_data']['title'] = data['Product_Name'] data['cleaned_data']['image_url'] = [self.get_image_url(data['Path'])] data['cleaned_data']['shipping_duration'] = '4-6 Weeks' data['cleaned_data']['offer_price'] = Decimal(self.get_text(data['Sell_price']).replace(',','').replace('Rs','')) data['cleaned_data']['list_price'] = Decimal(self.get_text(data['List_price']).replace('Rs','').replace(',','')) data['cleaned_data']['description'] = data['Description'] data['cleaned_data']['availability'] = AvailabilityMap.objects.get( applies_to = 'account', account = self.config['ACCOUNT']).availability products.append(data) print len(products) return products
def parse(self, sync, *args, **kwargs):
    """Parse the account's product XML feed into a list of product dicts.

    Downloads the feed to FEEDS_ROOT (or reads a local copy under
    ``feeds/data/`` when ``_local_`` is in kwargs), extracts one dict per
    ``/root/products`` node, drops blacklisted brands/SKUs/categories and
    fills in the ``cleaned_data`` sub-dict for import.

    :param sync: sync record; ``sync.id`` names the downloaded feed file
    :returns: list of product dicts
    """
    # renamed from `file` to avoid shadowing the builtin
    feed_file = self.config['URL']
    if '_local_' in kwargs:
        feed_file = 'feeds/data/%s.%s' % (self.config['ACCOUNT'],
                                          self.config['TYPE'].lower())
    else:
        # get the file and save it
        path = '%s/%s-%d.%s' % (settings.FEEDS_ROOT, self.config['ACCOUNT'],
                                sync.id, self.config['TYPE'].lower())
        urllib.urlretrieve(self.config['URL'], path)
        feed_file = path

    xmldoc = etree.parse(feed_file)
    products = []
    for product in xmldoc.xpath("/root/products"):
        data = dict(cleaned_data=self.get_default_cleaned_data())
        no_html_fields = ['Brand_Name', 'categories_name']
        # single value fields
        single_value_fields = ['SKU', 'Title', 'Features', 'Specification',
                               'Overview', 'Brand_Name', 'Image_URL',
                               'categories_name', 'Warranty_Period',
                               'Offer_Price', 'MRP', 'Shipping_Duration']
        for field in single_value_fields:
            data[field] = ''
            node = product.xpath(field)
            # BUGFIX: an empty element (<MRP/>) has text None, which used
            # to be stored and later crashed str ops like MRP.replace().
            # Guard on node[0].text so missing values stay ''.
            if node and len(node) > 0 and node[0].text:
                data[field] = node[0].text
                if field in no_html_fields:
                    data[field] = htmlutils.to_text(node[0].text)

        # ignore blacklists
        if self.is_blacklisted_brand(data['Brand_Name']):
            continue
        if self.is_blacklisted_sku(data['SKU']):
            continue
        if self.is_blacklisted_category(data['categories_name']):
            continue

        # create cleaned data
        # mappings fetched once and reused (previously looked up twice)
        brand_mapping = self.get_brand_mapping(data['Brand_Name'])
        category_mapping = self.get_category_mapping(data['categories_name'])
        data['cleaned_data']['brand_mapping'] = brand_mapping
        data['cleaned_data']['category_mapping'] = category_mapping
        data['cleaned_data']['sku'] = data['SKU']
        data['cleaned_data']['brand'] = brand_mapping.mapped_to
        data['cleaned_data']['category'] = category_mapping.mapped_to
        data['cleaned_data']['model'] = self.get_model_name(data['SKU'])
        data['cleaned_data']['title'] = data['Title']
        data['cleaned_data']['image_url'] = [data['Image_URL']]
        data['cleaned_data']['shipping_duration'] = (
            data['Shipping_Duration'] or '8-10 Working Days')
        data['cleaned_data']['offer_price'] = Decimal(data['Offer_Price'])
        # MRP like "0", "0.00" or "" is treated as absent -> fall back to
        # the offer price (stripping '.' and '0' leaves an empty string)
        if data['MRP'].replace('.', '').replace('0', ''):
            data['cleaned_data']['list_price'] = Decimal(data['MRP'])
        else:
            data['cleaned_data']['list_price'] = Decimal(data['Offer_Price'])
        data['cleaned_data']['description'] = (
            'Overview\n\n%sFeatures\n\n%sSpecs\n\n%s' % (
                striptags(data['Overview']),
                striptags(data['Features']),
                striptags(data['Specification'])))
        data['cleaned_data']['availability'] = AvailabilityMap.objects.get(
            applies_to='account',
            account=self.config['ACCOUNT']).availability
        products.append(data)
    return products
def parse(self, sync, *args, **kwargs):
    """Parse the account's product XML feed into a list of product dicts.

    Fetches the feed (or a local copy under ``feeds/data/`` when
    ``_local_`` is in kwargs), walks every ``/root/products`` node,
    filters blacklisted brands/SKUs/categories, and builds the
    ``cleaned_data`` sub-dict each importer expects.

    :param sync: sync record; ``sync.id`` names the downloaded feed file
    :returns: list of product dicts
    """
    # local renamed from `file` so the builtin is not shadowed
    feed_path = self.config['URL']
    if '_local_' in kwargs:
        feed_path = 'feeds/data/%s.%s' % (self.config['ACCOUNT'],
                                          self.config['TYPE'].lower())
    else:
        # get the file and save it
        path = '%s/%s-%d.%s' % (settings.FEEDS_ROOT, self.config['ACCOUNT'],
                                sync.id, self.config['TYPE'].lower())
        urllib.urlretrieve(self.config['URL'], path)
        feed_path = path

    xmldoc = etree.parse(feed_path)
    products = []
    for product in xmldoc.xpath("/root/products"):
        data = dict(cleaned_data=self.get_default_cleaned_data())
        no_html_fields = ['Brand_Name', 'categories_name']
        # single value fields
        single_value_fields = [
            'SKU', 'Title', 'Features', 'Specification', 'Overview',
            'Brand_Name', 'Image_URL', 'categories_name', 'Warranty_Period',
            'Offer_Price', 'MRP', 'Shipping_Duration'
        ]
        for field in single_value_fields:
            data[field] = ''
            node = product.xpath(field)
            # BUGFIX: empty elements carry text=None; storing None used to
            # crash later string ops (e.g. data['MRP'].replace). Require
            # non-empty text so absent values remain ''.
            if node and len(node) > 0 and node[0].text:
                data[field] = node[0].text
                if field in no_html_fields:
                    data[field] = htmlutils.to_text(node[0].text)

        # ignore blacklists
        if self.is_blacklisted_brand(data['Brand_Name']):
            continue
        if self.is_blacklisted_sku(data['SKU']):
            continue
        if self.is_blacklisted_category(data['categories_name']):
            continue

        # create cleaned data; look each mapping up once instead of twice
        brand_map = self.get_brand_mapping(data['Brand_Name'])
        category_map = self.get_category_mapping(data['categories_name'])
        data['cleaned_data']['brand_mapping'] = brand_map
        data['cleaned_data']['category_mapping'] = category_map
        data['cleaned_data']['sku'] = data['SKU']
        data['cleaned_data']['brand'] = brand_map.mapped_to
        data['cleaned_data']['category'] = category_map.mapped_to
        data['cleaned_data']['model'] = self.get_model_name(data['SKU'])
        data['cleaned_data']['title'] = data['Title']
        data['cleaned_data']['image_url'] = [data['Image_URL']]
        data['cleaned_data']['shipping_duration'] = data[
            'Shipping_Duration'] or '8-10 Working Days'
        data['cleaned_data']['offer_price'] = Decimal(data['Offer_Price'])
        # treat "0", "0.00" or "" MRP as missing and fall back to the
        # offer price (stripping '.' and '0' leaves an empty string)
        if data['MRP'].replace('.', '').replace('0', ''):
            data['cleaned_data']['list_price'] = Decimal(data['MRP'])
        else:
            data['cleaned_data']['list_price'] = Decimal(
                data['Offer_Price'])
        data['cleaned_data'][
            'description'] = 'Overview\n\n%sFeatures\n\n%sSpecs\n\n%s' % (
                striptags(data['Overview']), striptags(
                    data['Features']), striptags(data['Specification']))
        data['cleaned_data']['availability'] = AvailabilityMap.objects.get(
            applies_to='account',
            account=self.config['ACCOUNT']).availability
        products.append(data)
    return products
def parse(self, sync, *args, **kwargs): file = self.config["URL"] if "_local_" in kwargs: file = "feeds/data/%s.%s" % (self.config["ACCOUNT"], self.config["TYPE"].lower()) else: # get the file and save it path = "%s/%s-%d.%s" % (settings.FEEDS_ROOT, self.config["ACCOUNT"], sync.id, self.config["TYPE"].lower()) urllib.urlretrieve(self.config["URL"], path) file = path # clean the file import fileinput for line in fileinput.input(file, inplace=1): keys_with_spaces = [ "Product Name", "List price", "Sell price", "All terms", "Stock Level", "Publisher Reference", "Post date", "Updated date", ] for key in keys_with_spaces: line = line.replace(key, key.replace(" ", "_")) print line, ack_brand = Brand.objects.get(id=19) tinkle_brand = Brand.objects.get(id=28) karadi_brand = Brand.objects.get(id=29) books_and_comics = Category.objects.get(name="Books & Comics") xmldoc = etree.parse(file) products = [] for product in xmldoc.xpath("/xml/node"): data = dict(cleaned_data=self.get_default_cleaned_data()) data["cleaned_data"]["status"] = "active" no_html_fields = ["List_price", "Sell_price", "Product_Name", "Description", "Path"] # single value fields single_value_fields = [ "ISBN", "Product_Name", "Description", "List_price", "Sell_price", "international", "Path", "Nid", "Published", ] for field in single_value_fields: data[field] = "" node = product.xpath(field.split(" ")[0]) if node and len(node) > 0 and node[0].text: data[field] = node[0].text if field in no_html_fields: data[field] = htmlutils.to_text(node[0].text) if data.get("Published", "").lower() == "no": continue if ( "subscription" in data.get("Product_Name", "").lower() and "tinkle" in data.get("Product_Name", "").lower() ): print "skipping tinkle subscription" continue if "robinage" in data.get("Product_Name", "").lower(): data["cleaned_data"]["shipping_duration"] = "15-20 Working Days" elif "brainwave" in data.get("Product_Name", "").lower(): data["cleaned_data"]["shipping_duration"] = "4-6 Weeks" else: 
data["cleaned_data"]["shipping_duration"] = "7-10 Working Days" # ignore blaclists if self.is_blacklisted_sku(data["Nid"]): continue # create cleaned data data["cleaned_data"]["brand"] = ack_brand if "karadi" in data["Product_Name"].lower(): data["cleaned_data"]["brand"] = karadi_brand if "tinkle" in data["Product_Name"].lower(): data["cleaned_data"]["brand"] = tinkle_brand data["cleaned_data"]["category"] = books_and_comics data["cleaned_data"]["sku"] = data["Nid"] data["cleaned_data"]["model"] = data["ISBN"] or "" data["cleaned_data"]["title"] = data["Product_Name"] data["cleaned_data"]["image_url"] = [self.get_image_url(data["Path"])] # data['cleaned_data']['shipping_duration'] = '7-10 Working Days' data["cleaned_data"]["list_price"] = Decimal( self.get_text(data["List_price"]).replace(",", "").replace("Rs", "") ) data["cleaned_data"]["offer_price"] = Decimal( self.get_text(data["Sell_price"]).replace("Rs", "").replace(",", "") ) data["cleaned_data"]["description"] = data["Description"] data["cleaned_data"]["availability"] = AvailabilityMap.objects.get( applies_to="account", account=self.config["ACCOUNT"] ).availability products.append(data) print len(products) return products