示例#1
0
def addCategoryToScan(url):
    """Record the sub-categories linked from a category page for scanning.

    Fetches ``url``, reads the breadcrumb heading (``h1#breadCrumb``) and
    saves an ``Amazon_Textbook_Section_NR`` row for the ``#bestRefinement > a``
    links, titled ``"<breadcrumb> <link text>"`` and pointing at the link's
    ``href``. When the page has no such links, the page itself is saved
    instead, with the breadcrumb alone as its title.

    Args:
        url: address of the category page to inspect.

    Raises:
        IndexError: if the page contains no ``h1#breadCrumb`` element.
    """
    import logging

    d = pq(retrievePage(url))
    breadcrumb = toAscii(d('h1#breadCrumb')[0].text_content())
    containers = d('#bestRefinement > a')

    # A PyQuery selection is never None: a selector without matches yields an
    # empty selection. Test emptiness so the fallback below is reachable.
    if not containers:
        ats = Amazon_Textbook_Section_NR(title=breadcrumb, url=url)
        try:
            ats.save()
        except Exception:
            # Saving the page itself stays best-effort (the failure is not
            # propagated), but it is logged instead of silently discarded.
            logging.getLogger(__name__).exception(
                "Could not save section %r (%s)", breadcrumb, url)
        return

    for el in containers:
        # NOTE(review): assuming these are lxml elements, len(el) counts
        # child elements, so anchors that hold only text are skipped here.
        # Kept as-is to preserve which rows get saved -- confirm intent.
        if len(el):
            ats = Amazon_Textbook_Section_NR(
                title=breadcrumb + " " + el.text_content(),
                url=el.get('href'))
            ats.save()
def addCategoryToScan(url):
    """Record the sub-categories linked from a category page for scanning.

    Fetches ``url``, reads the breadcrumb heading (``h1#breadCrumb``) and
    saves an ``Amazon_Textbook_Section_NR`` row for the ``#bestRefinement > a``
    links, titled ``"<breadcrumb> <link text>"`` and pointing at the link's
    ``href``. When the page has no such links, the page itself is saved
    instead, with the breadcrumb alone as its title.

    Args:
        url: address of the category page to inspect.

    Raises:
        IndexError: if the page contains no ``h1#breadCrumb`` element.
    """
    import logging

    d = pq(retrievePage(url))
    breadcrumb = toAscii(d('h1#breadCrumb')[0].text_content())
    containers = d('#bestRefinement > a')

    # A PyQuery selection is never None: a selector without matches yields an
    # empty selection. Test emptiness so the fallback below is reachable.
    if not containers:
        ats = Amazon_Textbook_Section_NR(title=breadcrumb, url=url)
        try:
            ats.save()
        except Exception:
            # Saving the page itself stays best-effort (the failure is not
            # propagated), but it is logged instead of silently discarded.
            logging.getLogger(__name__).exception(
                "Could not save section %r (%s)", breadcrumb, url)
        return

    for el in containers:
        # NOTE(review): assuming these are lxml elements, len(el) counts
        # child elements, so anchors that hold only text are skipped here.
        # Kept as-is to preserve which rows get saved -- confirm intent.
        if len(el):
            ats = Amazon_Textbook_Section_NR(
                title=breadcrumb + " " + el.text_content(),
                url=el.get('href'))
            ats.save()
示例#3
0
def addCategoryToScan(url):
    """Save the refinement links of a category page as sections to scan.

    The page at ``url`` is fetched and parsed, and its breadcrumb heading
    (``h1`` with id ``breadCrumb``) becomes the title prefix. The elements
    with class ``refinementContainer`` are handed to ``getFormatContainer``
    (defined elsewhere), which selects the container to use. Every
    ``div.refinement`` inside it that holds at least one ``<a>`` produces an
    ``Amazon_Textbook_Section_NR`` row built from the first anchor's text and
    ``href``. If no container is selected, the page itself is saved with the
    breadcrumb as its title.

    Raises:
        IndexError: if the page has no breadcrumb heading.
    """
    page = lhtml.fromstring(retrievePage(url))

    heading = page.xpath("//h1[@id='breadCrumb']")
    breadcrumb = toAscii(heading[0].text_content())

    format_box = getFormatContainer(
        page.xpath(".//*[@class='refinementContainer']"))

    if format_box is not None:
        for refinement in format_box.xpath(".//div[@class='refinement']"):
            anchors = refinement.cssselect("a")
            if not anchors:
                # Refinement entries without a link carry nothing to scan.
                continue
            link = anchors[0]
            section_title = breadcrumb + " " + link.text_content()
            Amazon_Textbook_Section_NR(title=section_title,
                                       url=link.get('href')).save()
        return

    # No usable container: store the page itself. (Original author's note:
    # as of Apr 27 this fallback was always the path taken.)
    Amazon_Textbook_Section_NR(title=breadcrumb, url=url).save()