Example #1
def crawlPage(link):
    # Crawl every URL in the given list and collect the scraped results.
    output = []
    for i in link:
        try:
            pageType, indexUrl = getIndexPage(i)
        except Exception:
            # Skip URLs whose index page cannot be resolved.
            continue
        if pageType == 0:
            # Show page: crawl the show itself plus all of its episodes.
            output.append(crawlShowPage(i, indexUrl))
        elif pageType == 1:
            # Keyword page: fetch its JSON index directly.
            output.append(crawlKeywordPage(indexUrl))
        elif pageType == 2:
            # Plain index page: return the raw JSON.
            output.append(sel.getJSON(indexUrl))
    return output
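The examples on this page rely on two helpers that are not shown here: getIndexPage, which maps a page URL to a (pageType, indexUrl) pair, and sel.getJSON, which fetches and parses a JSON index. The following is a minimal sketch of what such helpers might look like, purely as an assumption so the snippets can run standalone; the real implementations, including the URL patterns, belong to the source project.

import json
import urllib


def getIndexPage(url):
    # Hypothetical stand-in: classify a page URL and derive its JSON index URL.
    # pageType 0 = show page, 1 = keyword page, 2 = plain JSON index.
    if "/show/" in url:
        return 0, url.rstrip("/") + "/index.json"
    if "/keyword/" in url:
        return 1, url.rstrip("/") + "/index.json"
    return 2, url


class _Sel(object):
    # Hypothetical stand-in for the `sel` helper used by the examples.
    def getJSON(self, url):
        # Fetch the URL and decode its body as JSON.
        page = urllib.urlopen(url)
        try:
            return json.loads(page.read().decode("utf-8"))
        finally:
            page.close()


sel = _Sel()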
Example #2
# Module-level imports needed by this example (Python 2):
import urllib

from lxml import etree


def crawlShowPage(pagelink, indexlink):
    # Crawl a show page: fetch its JSON index, then every episode it lists.
    print pagelink
    showIndex = sel.getJSON(indexlink)
    page = urllib.urlopen(pagelink)
    data = page.read().decode("utf-8")
    dom = etree.HTML(data)
    page.close()
    # Locate the episode list in the page's DOM.
    episodes = dom.xpath(
        r"/html/body/div/div/div[2]/div/div[5]/div[3]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div/ul"
    )
    epi = []
    for i in episodes:
        # Resolve each episode link to its index URL and crawl it as a keyword page.
        url = i.xpath(r"li/a")[0].attrib["href"]
        pagetype, indexurl = getIndexPage(url)
        epi.append(crawlKeywordPage(indexurl))
    return [showIndex, epi]
Example #3
def crawlKeywordPage(link):
    # A keyword page is just a JSON index; fetch and return it directly.
    return sel.getJSON(link)
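A short usage sketch tying the three examples together. The URLs are hypothetical placeholders, since the real URL scheme of the crawled site is not shown on this page.

# Hypothetical input URLs for illustration only.
links = [
    "http://example.com/show/1234",      # would be classified as a show page
    "http://example.com/keyword/drama",  # would be classified as a keyword page
]
results = crawlPage(links)
print results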