Example #1
    def render(self, info=None, format='html', encoding='unicode'):
        '''
        Render string from template.

        :keyword info: data to substitute into a document
        :keyword str format: HTML doctype to use for string
        :param str encoding: encoding type for return string
        '''
        # substitute any info into the document
        if info is not None:
            self.__imod__(info)
        tree = _copytree(self._tree)
        # use HTML5 doctype
        if format == 'html':
            # strip namespace prefix from HTML
            xhtml_to_html(tree)
            doc = htostring(tree, encoding=encoding, doctype=_html5)
        # use XHTML 1.0 doctype
        elif format == 'xhtml10':
            doc = htostring(
                tree, method='xml', encoding=encoding, doctype=_xhtml10,
            )
        # use XHTML 1.1 doctype
        elif format == 'xhtml11':
            doc = htostring(
                tree, method='xml', encoding=encoding, doctype=_xhtml11,
            )
        else:
            # any other format would leave doc unbound below
            raise ValueError('unknown format: %r' % format)
        return doc
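
A minimal usage sketch for the method above. The Template class name and the
shape of the substitution payload are assumptions for illustration; only
render()'s signature comes from the snippet:

# Hypothetical caller -- Template and the info payload shape are assumed.
tmpl = Template('<html><body><h1></h1></body></html>')
doc = tmpl.render(info={'h1': 'Hello'}, format='xhtml11', encoding='unicode')
print(doc)  # serialized as XML with the XHTML 1.1 doctype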
Example #2
def scrape_category(url, title):
    category_slug = slugify(title)

    #if testing and category_slug != 'storage-servers-nas':
    #    return

    try:
        f = urlopen(url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace(' ', '%20')
        if trace: print 'As:', url
        f = urlopen(url)

    doc = html5lib.parse(
        f, treebuilder='lxml', namespaceHTMLElements=False
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    prods = jQuery('#products a').filter(
        lambda not_used: 'config?sku=' in PyQuery(this).attr('href'))

    for a in prods:
        scrape_product(PyQuery(a).attr('href'), category_slug)
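
The filter() call above relies on pyquery injecting the current node into the
callback's namespace as "this" (mirroring the jQuery convention), which is why
"this" appears without being defined anywhere. A standalone sketch:

from pyquery import PyQuery

d = PyQuery('<div><a href="/config?sku=A">A</a><a href="/other">B</a></div>')
prods = d('a').filter(
    lambda i: 'config?sku=' in (PyQuery(this).attr('href') or ''))
print([PyQuery(a).attr('href') for a in prods])  # ['/config?sku=A']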
Example #3
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Example #4
def scrape_category (url, title):
    category_slug = slugify (title)

    try:
        f = urlopen (url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace (' ','%20')
        if trace: print 'As:', url
        f = urlopen (url)

    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])

    page_title =  jQuery ('title').text()

    if page_title.startswith ("eRacks Open Source Systems: "):
        page_title = page_title.partition ("eRacks Open Source Systems: ") [-1]

    if page_title.startswith ("eRacks "):
        page_title = page_title.partition ("eRacks ") [-1]

    content = jQuery ('td#content')
    links = content ('a')
    images = content ('img')

    for link in links:
        a = PyQuery (link)
        href = a.attr('href')
        skus = find_sku.findall (href)

        if skus:
            sku = skus [0]
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))
        elif href.startswith ('/Legacy'):
            sku = slugify (href.split ('/') [-1])
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))

        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery (image)
        src = img.attr('src')
        newsrc = getimage (src, 'categories/' + category_slug)
        img.attr ('src', newsrc)
        print 'image:', newsrc

    description = content.html()
    if trace: print description

    if dbteeth:
        cat = Categories.objects.get (name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        cat.description = description
        cat.title = page_title
        cat.save()
        print '..saved.'
Example #5
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            'output-html': 1,
            #'indent':1,
            'clean': 1,
            'drop-font-tags': 1,
        })
    if errs:
        #raise Exception, errs
        print errs

    doc = html5lib.parse(
        doc, treebuilder='lxml'
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery('td#content')
    assert len(td) == 1

    for img in td('img'):
        #print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr('src')
        #alt = img.attr('alt')

        #if src.startswith ('/image'):
        rslt = getimage(src, slug.split('/')[0])
        img.attr('src', rslt)
        if trace: print rslt

    #td =
    #no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    #content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  #.html()  # [:60]

    if dbteeth:
        #q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                #defaults = dict (sortorder = sortorder),
            ))
Example #6
def clean_html(tree):
    mytree = deepcopy(tree)
    for elem in mytree.iter():
        for attr, val in elem.attrib.iteritems():
            if attr.startswith('{'):
                del elem.attrib[attr]

    xhtml_to_html(mytree)
    return etree.tostring(normalize_ns(mytree), method="html",
                          encoding=unicode)
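
clean_html() above assumes a normalize_ns() helper defined elsewhere in the
same module; with that caveat, a sketch of feeding it a namespaced tree
(html5lib's lxml treebuilder puts elements in the XHTML namespace, and
xml:lang becomes a '{...}'-prefixed attribute that the loop strips):

import html5lib

doc = html5lib.parse('<p xml:lang="en">hi</p>', treebuilder='lxml')
print(clean_html(doc.getroot()))  # plain HTML, namespaced attributes removed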
Example #7
def parse_table():
    source_file = 'citylist.html'
    if os.path.isfile(source_file):
        log("Reading %s" % source_file)
        tree = etree.parse(source_file)
    else:
        log("Reading %s" % WIKI_URL)
        tree = etree.parse(WIKI_URL)
        tree.write(source_file, encoding='utf-8')
    html.xhtml_to_html(tree)

    tables = tree.findall('//table')
    tables.sort(key=len)
    table = tables[-1] # longest table
    return table
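
parse_table() depends on xhtml_to_html() renaming elements out of the XHTML
namespace so that the un-prefixed '//table' path can match. A standalone
sketch of that effect:

from lxml import etree
from lxml.html import xhtml_to_html

XHTML = ('<html xmlns="http://www.w3.org/1999/xhtml">'
         '<body><table><tr><td>x</td></tr></table></body></html>')
tree = etree.fromstring(XHTML).getroottree()

print(len(tree.findall('//table')))  # 0 -- tag names carry the namespace
xhtml_to_html(tree)
print(len(tree.findall('//table')))  # 1 -- namespace stripped, path matches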
Example #8
def scrape_product (url, title):
    f = urlopen (url)
    doc = html5lib.parse(f, treebuilder='lxml')  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])

    cat = Categories.objects.get (name=title)
    #name = title
    #slug = slugify (title)
    description = jQuery ('td#content').html()
    #print description [:50]

    cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())

    cat.description = description
    cat.save()

    print '..saved.'
Example #9
def scrape_product(url, title):
    f = urlopen(url)
    doc = html5lib.parse(
        f, treebuilder='lxml'
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    cat = Categories.objects.get(name=title)
    #name = title
    #slug = slugify (title)
    description = jQuery('td#content').html()
    #print description [:50]

    cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(
        datetime.date.today())

    cat.description = description
    cat.save()

    print '..saved.'
Example #10
def scrape_category (url, title):
    category_slug = slugify (title)

    #if testing and category_slug != 'storage-servers-nas':
    #    return

    try:
        f = urlopen (url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace (' ','%20')
        if trace: print 'As:', url
        f = urlopen (url)

    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])

    prods = jQuery ('#products a').filter (lambda not_used: 'config?sku=' in PyQuery(this).attr('href'))

    for a in prods:
        scrape_product (PyQuery(a).attr ('href'), category_slug)
Example #11
    def __call__(self, doc):
        """
        Cleans the document.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only
                    and self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet',
                                                                'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet', ))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags"
                )
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)
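
This __call__ (and the variants in the examples that follow) is lxml's
Cleaner; callers normally go through its documented clean_html() wrapper
rather than invoking __call__ directly:

from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, javascript=True, style=True, add_nofollow=True)
print(cleaner.clean_html(
    '<div onclick="evil()"><script>evil()</script>'
    '<a href="http://example.com/">x</a></div>'))
# script dropped, onclick stripped, rel="nofollow" added to the external link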
Example #12
    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, "getroot"):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter("image"):
            el.tag = "img"
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add("script")
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith("on"):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link, resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get("style")
                    new = _css_javascript_re.sub("", old)
                    new = _css_import_re.sub("", new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib["style"]
                    elif new != old:
                        el.set("style", new)
            if not self.style:
                for el in list(doc.iter("style")):
                    if el.get("type", "").lower().strip() == "text/javascript":
                        el.drop_tree()
                        continue
                    old = el.text or ""
                    new = _css_javascript_re.sub("", old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub("", old)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = "/* deleted */"
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add("style")
        if self.inline_style:
            etree.strip_attributes(doc, "style")
        if self.links:
            kill_tags.add("link")
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter("link")):
                if "stylesheet" in el.get("rel", "").lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add("meta")
        if self.page_structure:
            remove_tags.update(("head", "html", "title"))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter("param")):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ("applet", "object"):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(("applet",))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(("iframe", "embed", "layer", "object", "param"))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add("form")
            kill_tags.update(("button", "input", "select", "textarea"))
        if self.annoying_tags:
            remove_tags.update(("blink", "marquee"))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = "div"
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != "html":
                el.tag = "div"
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags"
                )
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = "div"
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get("rel")
                    if rel:
                        if "nofollow" in rel and " nofollow " in (" %s " % rel):
                            continue
                        rel = "%s nofollow" % rel
                    else:
                        rel = "nofollow"
                    el.set("rel", rel)
Example #13
print d('p').filter(lambda i: i == 1)
print d('p').filter(lambda i: i == 2)
print d('p').filter(lambda not_used: PyQuery(this).text() == 'Hi')
'''

import re
find_sku = re.compile ('config\?sku=(.*)\&.*')

#print sku.search ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578')
#print sku.findall ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578')
#sys.exit()


f = urlopen (url)
doc = html5lib.parse(f, treebuilder='lxml')  # this didn't work, but above three lines did: encoding='utf-8',
html.xhtml_to_html (doc)
jQuery = PyQuery([doc])


for a in jQuery ('#products a'):  #  [-1:]:  # skip 'Legacy' at the end
    a = PyQuery (a)

    title = a.text()
    href = a.attr ('href')

    #assert title == unquote (urlparse (href).path).split ('/') [-1]
    ## link = '/products/
    print  'Working on:', slugify (title), title #, href
    scrape_category (href, title)
    print 'Done:', title
    print
Example #14
print d('p').filter(lambda i: i == 2)
print d('p').filter(lambda not_used: PyQuery(this).text() == 'Hi')
'''

import re
find_sku = re.compile('config\?sku=(.*)\&.*')

#print sku.search ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578')
#print sku.findall ('/products/Enterprise%20Servers/config?sku=ENTERPRISE&session=11706171452858578')
#sys.exit()

f = urlopen(url)
doc = html5lib.parse(
    f, treebuilder='lxml'
)  # this didn't work, but above three lines did: encoding='utf-8',
html.xhtml_to_html(doc)
jQuery = PyQuery([doc])

for a in jQuery('#products a'):  #  [-1:]:  # skip 'Legacy' at the end
    a = PyQuery(a)

    title = a.text()
    href = a.attr('href')

    #assert title == unquote (urlparse (href).path).split ('/') [-1]
    ## link = '/products/
    print 'Working on:', slugify(title), title  #, href
    scrape_category(href, title)
    print 'Done:', title
    print
Example #15

## globals

url = "http://eracks.com/customers"

teeth = 0  # whether to write scraped images


## main

f = urlopen(url)

doc = html5lib.parse(f, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',

html.xhtml_to_html(doc)  # modifies the tree in place; returns None

jQuery = PyQuery([doc])


def getimage(src):
    f = urlopen(src)
    info = f.info()
    fname = src.split("/")[-1] + "." + info.getsubtype()
    path = "/home/joe/Projects/django_eracks/static/images/customers/" + fname
    if teeth:
        open(path, "wb").write(f.read())
    return "/images/customers/" + fname  # url for retrieval


href = src = caption = loc = title = ""
Example #16
def scrape_product(url, category_slug):
    f = urlopen(url)
    doc = html5lib.parse(
        f, treebuilder='lxml', namespaceHTMLElements=False
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    #content = jQuery ('td#content table').eq(0)
    content = jQuery('td#content')
    content('form').remove()

    # used to do this, but some models (eg blades) don't have tables:
    #content = jQuery ('td#content table td').eq (0)

    #if content.is_('table'):
    #    content = content ('table td').eq (0)

    # nope, this was too simplistic - let's take apart the tables - see below in final save
    # nope, this doesn't work either. I give up.

    skus = find_sku.findall(url)
    sku = skus[0]
    slug = slugify(sku)

    print sku
    '''
    if sku in ['ESERVE',
     'NAS6X',
     'NAS16X',
     'PREMIUM',
     'TWINSERVE',
     'PREMIUM2',
     'SANDYCORE',
     'i7CORE',
     'i7SHORT',]:
        print 'Skipping..'
        return
    #elif testing and sku != 'NAS12':
    #    print 'Skipping due to testing..'
    #    return
    '''

    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Per single unit, this configuration's price")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The base price with this configuration is")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "All eRacks systems come with a Standard")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The price differences between the default")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Contact eRacks to inquire about leasing")).remove()

    content('form').remove()

    content('#pricetext').remove()
    content('#warrantynote').remove()
    content('#closenote').remove()

    xbig = content('.xbig')
    if xbig:
        xbig('a').remove()
        inner = xbig.html().replace(':', '').strip()
        xbig.replaceWith('<h5 class=xbig>%s</h5>' % inner)
        print 'xbig replaced:', inner

    font = content('font[size=4], font[size=5]')
    if font:
        font('a').remove()
        inner = font.text().replace(':', '').strip()
        font.replaceWith('<h5 class="product">%s</h5>' % inner)
        print 'font replaced:', inner

    if testing:
        print
        print sku, 'content:'
        print content.html()

    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')

        if href:
            if '?' in href:
                href = href.split('?')[0]  # doesn't this get rid of all get parms?
                a.attr('href', href)

            linkskus = find_sku.findall(href)  # That this is looking for?!!
        else:
            print "Empty Link:", a.html()
            linkskus = []
            print content.html()

        if linkskus:
            linksku = linkskus[0]
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif href and href.startswith('/Legacy'):
            linksku = slugify(href.split('/')[-1])
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif 'ore photos' in a.text():
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            #print 'Removing link (scraped):', href
            #a.remove()
            print 'Updating "more photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')
        elif href.endswith('_photos'):
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            print 'Updating "<prod>_photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'products/' + slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    if dbteeth:
        #prod, created = Product.objects.get_or_create (sku=sku)  # prods are already in the db, silly!
        prod = Product.objects.get(sku=sku)
        prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str(
            datetime.date.today())
        #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')])  # content.html()
        prod.description = content.html()
        # save image(s):
        # prod.image =
        # prod.images.add (name, title, src, etc)
        prod.save()
        print '..saved.'
Example #17
def scrape_product (url, category_slug):
    f = urlopen (url)
    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])
    #content = jQuery ('td#content table').eq(0)
    content = jQuery ('td#content')
    content ('form').remove()

    # used to do this, but some models (eg blades) don't have tables:
    #content = jQuery ('td#content table td').eq (0)

    #if content.is_('table'):
    #    content = content ('table td').eq (0)

    # nope, this was too simplistic - let's take apart the tables - see below in final save
    # nope, this doesn't work either. I give up.

    skus = find_sku.findall (url)
    sku = skus [0]
    slug = slugify (sku)

    print sku

    '''
    if sku in ['ESERVE',
     'NAS6X',
     'NAS16X',
     'PREMIUM',
     'TWINSERVE',
     'PREMIUM2',
     'SANDYCORE',
     'i7CORE',
     'i7SHORT',]:
        print 'Skipping..'
        return
    #elif testing and sku != 'NAS12':
    #    print 'Skipping due to testing..'
    #    return
    '''

    content ('.small').filter (lambda notused:
         PyQuery (this).text().startswith ("Per single unit, this configuration's price")).remove()
    content ('.small').filter (lambda notused:
         PyQuery (this).text().startswith ("The base price with this configuration is")).remove()
    content ('.small').filter (lambda notused:
         PyQuery (this).text().startswith ("All eRacks systems come with a Standard")).remove()
    content ('.small').filter (lambda notused:
         PyQuery (this).text().startswith ("The price differences between the default")).remove()
    content ('.small').filter (lambda notused:
         PyQuery (this).text().startswith ("Contact eRacks to inquire about leasing")).remove()

    content ('form').remove()

    content ('#pricetext').remove()
    content ('#warrantynote').remove()
    content ('#closenote').remove()

    xbig = content ('.xbig')
    if xbig:
        xbig ('a').remove()
        inner = xbig.html().replace (':','').strip()
        xbig.replaceWith ('<h5 class=xbig>%s</h5>' % inner)
        print 'xbig replaced:', inner

    font = content('font[size=4], font[size=5]')
    if font:
        font ('a').remove()
        inner = font.text().replace (':','').strip()
        font.replaceWith ('<h5 class="product">%s</h5>' % inner)
        print 'font replaced:', inner

    if testing:
        print
        print sku, 'content:'
        print content.html()

    links = content ('a')
    images = content ('img')

    for link in links:
        a = PyQuery (link)
        href = a.attr('href')

        if href:
            if '?' in href:
                href = href.split('?')[0] # doesn't this get rid of all get parms?
                a.attr ('href', href)

            linkskus = find_sku.findall (href)  # That this is looking for?!!
        else:
            print "Empty Link:", a.html()
            linkskus=[]
            print content.html()

        if linkskus:
            linksku = linkskus [0]
            a.attr ('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif href and href.startswith ('/Legacy'):
            linksku = slugify (href.split ('/') [-1])
            a.attr ('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif 'ore photos' in a.text():
            print 'Scraping:', href
            scrape_photos (url, href, slug)
            #print 'Removing link (scraped):', href
            #a.remove()
            print 'Updating "more photos" link:', href
            a.attr ('href', '#photos')
            a.attr ('onclick', '$("#photos-tab").click();')
        elif href.endswith ('_photos'):
            print 'Scraping:', href
            scrape_photos (url, href, slug)
            print 'Updating "<prod>_photos" link:', href
            a.attr ('href', '#photos')
            a.attr ('onclick', '$("#photos-tab").click();')

    for image in images:
        img = PyQuery (image)
        src = img.attr('src')
        newsrc = getimage (src, 'products/' + slug)
        img.attr ('src', newsrc)
        print 'image:', newsrc

    if dbteeth:
        #prod, created = Product.objects.get_or_create (sku=sku)  # prods are already in the db, silly!
        prod = Product.objects.get (sku=sku)
        prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')])  # content.html()
        prod.description = content.html()
        # save image(s):
        # prod.image =
        # prod.images.add (name, title, src, etc)
        prod.save()
        print '..saved.'
Example #18
    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse() # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')
Example #19
## globals

url = 'http://eracks.com/customers'

teeth = 0  # whether to write scraped images

## main

f = urlopen(url)

doc = html5lib.parse(
    f, treebuilder='lxml'
)  # this didn't work, but above three lines did: encoding='utf-8',

html.xhtml_to_html(doc)  # modifies the tree in place; returns None

jQuery = PyQuery([doc])


def getimage(src):
    f = urlopen(src)
    info = f.info()
    fname = src.split('/')[-1] + '.' + info.getsubtype()
    path = '/home/joe/Projects/django_eracks/static/images/customers/' + fname
    if teeth: open(path, 'wb').write(f.read())
    return '/images/customers/' + fname  # url for retrieval


href = src = caption = loc = title = ''
sortorder = 100