Example #1
import os
import requests
from bs4 import BeautifulSoup
print "initUrls", initUrls


def getCurFileName():
    # this script's file name with the extension stripped
    filename = os.path.basename(__file__)
    return filename[0:filename.find(".")]


crawler = Crawler(initUrls, initDir, headers, getCurFileName())
print "crawler初始化成功"


def fn1(url):
    # read the pagination bar; its second-to-last link holds the highest
    # page number, then build one URL per page: url/1 .. url/maxCount
    r = requests.get(url, headers=headers, timeout=100).text
    maxCount = BeautifulSoup(r, 'lxml').find(
        'div', class_="page").find_all('a')[-2].text
    page_urls = [url + "/" + str(i) for i in range(1, int(maxCount) + 1)]
    return page_urls
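
fn1's selection can be checked in isolation. The fragment below is a hand-written stand-in for the listing page's pagination bar (the real site's markup is only assumed to look like this), and example.com is a placeholder URL:

from bs4 import BeautifulSoup

# hand-written stand-in for the real pagination markup (an assumption)
sample_html = '<div class="page"><a>1</a><a>2</a><a>12</a><a>next</a></div>'

maxCount = BeautifulSoup(sample_html, 'lxml').find(
    'div', class_="page").find_all('a')[-2].text
print maxCount  # 12
print ["http://example.com/album/" + str(i)
       for i in range(1, int(maxCount) + 1)][:3]
# ['http://example.com/album/1', 'http://example.com/album/2',
#  'http://example.com/album/3']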


def fn2(url):
    # fetch one detail page and return the image URL inside its content block
    r = requests.get(url, headers=headers, timeout=100).text
    return BeautifulSoup(r, 'lxml').find(
        'div', class_="content").find('a').img['src']


crawler.then(fn1).then(fn2)
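
Crawler is this project's own class; its definition is not part of these examples. A minimal sketch of the interface the calls above assume might look like the following. It is inferred from the constructor arguments and the chained then() calls, not taken from the actual implementation, which presumably also downloads the final URLs into initDir:

class Crawler(object):
    # Sketch only: inferred from the calls above, not the project's real code.
    def __init__(self, initUrls, initDir, headers, name):
        self.urls = initUrls      # seed URLs for the first stage
        self.dir = initDir        # directory the results are saved under
        self.headers = headers    # HTTP headers shared by all requests
        self.name = name          # subfolder name, taken from the script name

    def then(self, fn):
        # run fn over the current stage's URLs; fn may return one value
        # (fn2) or a list (fn1), so lists are flattened into the next stage
        nextUrls = []
        for url in self.urls:
            result = fn(url)
            if isinstance(result, list):
                nextUrls.extend(result)
            else:
                nextUrls.append(result)
        self.urls = nextUrls
        return self               # return self so then() calls chain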
Example #2
import os
import requests
from bs4 import BeautifulSoup
# the page number goes up to 208 at most; range(207, 208) fetches page 207 only
initUrls = ["http://jandan.net/ooxx/page-{num}#comments".format(num=num)
            for num in range(207, 208)]
print "initUrls", initUrls
def getCurFileName():
    # this script's file name with the extension stripped
    filename = os.path.basename(__file__)
    return filename[0:filename.find(".")]

crawler = Crawler(initUrls, initDir, headers, getCurFileName())
print "crawler initialized successfully"
def fn1(url):
    # collect the image link from every row of one listing page
    arr = []
    r = requests.get(url, headers=headers, timeout=100).text
    for father in BeautifulSoup(r, 'lxml').find_all('div', class_="row"):
        link = father.find("div", class_="text").img['src']
        arr.append('http:' + link)  # the src is scheme-relative, so prepend "http:"
    return arr
# jandan's listing pages already carry the full-size image links,
# so no second stage (fn2) is needed here
crawler.then(fn1)
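
As a standalone check of fn1's extraction, the same BeautifulSoup calls can be run against a hand-written fragment that mimics what the jandan.net listing markup is assumed to look like (the live page may differ):

from bs4 import BeautifulSoup

# hand-written stand-in for one jandan.net listing page (an assumption)
sample_html = """
<div class="row">
  <div class="text"><img src="//ww1.sinaimg.cn/large/a.jpg"></div>
</div>
<div class="row">
  <div class="text"><img src="//ww2.sinaimg.cn/large/b.jpg"></div>
</div>
"""

arr = []
for father in BeautifulSoup(sample_html, 'lxml').find_all('div', class_="row"):
    arr.append('http:' + father.find("div", class_="text").img['src'])
print arr
# ['http://ww1.sinaimg.cn/large/a.jpg', 'http://ww2.sinaimg.cn/large/b.jpg']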