def get_tasklist(url, pages=100):
    """Build the list of paginated category URLs to crawl.

    Fetches *url*, extracts category links (hrefs containing '/cp'),
    and expands each category into its numbered page URLs.

    :param url: index page to scrape category links from
    :param pages: pages generated per category (default 100, matching
        the original hard-coded range(1, 101))
    :returns: list of '{domain}/pg{n}-{category}' URL strings
    """
    res = lt.send_request(url)
    # links whose href contains '/cp' are the category pages
    categories = lt.links(res, search='/cp')
    # c[1:] drops the leading '/' from each category path
    return [
        f'{domain}/pg{n}-{c[1:]}'
        for c in categories
        for n in range(1, pages + 1)
    ]
def test_links():
    """Index-page links are clean, deduplicated, and filterable by substring."""
    res = lt.send_request(domain)
    all_links = lt.links(res)
    wiki_links = lt.links(res, search='wiki')
    assert isinstance(all_links, list)
    assert '#' not in all_links
    assert '' not in all_links
    # no duplicate hrefs in the extracted list
    assert len(all_links) == len(set(all_links))
    # search='wiki' must only return links containing 'wiki'
    assert all('wiki' in link for link in wiki_links)
def mycrawl(url):
    """Crawl one coin page and record its proof/algorithm/supply info.

    Appends the regex-extracted fields (or the string 'error' for every
    field on failure) to the shared ``coininfo`` dict, so all columns
    stay the same length.

    :param url: coin detail page to fetch and parse
    """
    try:
        res = lt.send_request(url)
        res.encoding = 'utf-8'
        webcode = res.text
        # each findall yields a (possibly empty) list of matches
        coinproof = re.findall(proofpattern, webcode)
        coinalgorithm = re.findall(algopattern, webcode)
        coinmax = re.findall(maxpattern, webcode)
        coininfo['name'].append(url)
        coininfo['algo'].append(coinalgorithm)
        coininfo['proof'].append(coinproof)
        coininfo['total'].append(coinmax)
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception while keeping the
        # best-effort 'error' placeholders so the columns stay aligned
        logging.exception('error')
        coininfo['name'].append(url)
        coininfo['algo'].append('error')
        coininfo['proof'].append('error')
        coininfo['total'].append('error')
def crawl(url):
    """Scrape video metadata from one listing page and insert each item into MongoDB.

    For every '.wrap' item on the page: reads views and rating from the
    listing itself, then fetches the video's embed page to extract the
    inline flashvars JSON (title, duration, image, link, 480p URL)
    before inserting the record into ``col``.
    """
    # fetch the listing with cookies; brief sleep throttles request rate
    tree = lt.fetch(url, use_cookies=True, headers=headers)
    time.sleep(0.5)
    items = tree.css('.wrap')
    for item in items:
        data = dict()
        # lt.expand_num presumably normalizes abbreviated counts like '1.2K' — TODO confirm
        data['views'] = lt.expand_num(
            item.css('span.views var::text').extract_first())
        # rating text ends with '%'; [:-1] strips it before int()
        data['rating'] = int(item.css('.value::text').extract_first()[:-1])
        # video key is the last '='-separated value of the item's href
        viewKey = item.css('a::attr(href)').extract_first().split('=')[-1]
        video = lt.send_request(f'{domain}/embed/{viewKey}', cookies=cookies, headers=headers).text
        # flashvars is an inline JS object literal ending at ',\n'
        # NOTE(review): findall(...)[0] raises IndexError if the embed page layout changes
        flashvars = re.findall('var flashvars =(.*?),\n', video)[0]
        info = json.loads(flashvars)
        data['title'] = info.get('video_title')
        data['duration'] = info.get('video_duration')
        data['image'] = info.get('image_url')
        data['link'] = info.get('link_url')
        data['quality_480p'] = info.get('quality_480p')
        pprint(data)
        col.insert_one(data)
def test_send_request():
    """send_request returns a successful requests Response."""
    res = lt.send_request(domain)
    # isinstance instead of `type(...) ==` — idiomatic, and consistent
    # with the other send_request test in this suite
    assert isinstance(res, requests.models.Response)
    assert res.status_code == 200
def test_re_links():
    """re_links finds more than five konachan.net wiki URLs on /post."""
    res = lt.send_request(f'{domain}/post')
    hrefs = lt.re_links(res, r'https://konachan.net/wiki/.*?')
    # isinstance instead of `type(...) ==` — consistent with the
    # konachan.com variant of this test
    assert isinstance(hrefs, list) and len(hrefs) > 5
def test_links():
    """links() returns a list with no bare '#' anchors."""
    res = lt.send_request(domain)
    r = lt.links(res)
    # isinstance instead of `type(...) ==` — idiomatic type check
    assert isinstance(r, list) and '#' not in r
def test_get():
    """A plain GET against the domain succeeds."""
    response = lt.send_request(domain)
    assert response.status_code == 200
def test_proxies():
    """A request routed through proxies still returns HTTP 200."""
    response = lt.send_request('http://httpbin.org/get', use_proxies=True)
    assert response.status_code == 200
def test_absolute_links():
    """Links returned with absolute=True are fully qualified (scheme + host, no stray '//')."""
    res = lt.send_request('http://www.spbeen.com')
    # strip any scheme — the original only replaced 'http://', so an
    # https:// link would keep its '//' and falsely fail the check below
    hrefs = [href.split('://', 1)[-1] for href in lt.links(res, absolute=True)]
    # once the scheme is removed, a well-formed absolute URL has no '//'
    assert len([href for href in hrefs if "//" in href]) == 0
def test_re_links():
    """re_links returns a list for a site-specific post-URL pattern."""
    res = lt.send_request('http://www.spbeen.com')
    # the original pattern had a doubled slash ('//p/') that can never
    # match a real post URL; the bug was masked because only the return
    # type is asserted
    hrefs = lt.re_links(res, r'https?://www.spbeen.com/p/.*?')
    # isinstance instead of `type(...) ==` — idiomatic type check
    assert isinstance(hrefs, list)
def test_re_links():
    """The /post page exposes more than five konachan wiki links."""
    response = lt.send_request(f'{domain}/post')
    matches = lt.re_links(response, r'https://konachan.com/wiki/.*?')
    assert isinstance(matches, list)
    assert len(matches) > 5
def test_read_cookies():
    """Cookies loaded from the cookie file are echoed back by httpbin."""
    url = 'http://httpbin.org/cookies'
    jar = lt.read_cookies(filename='./looter/examples/cookies.txt')
    response = lt.send_request(url, cookies=jar)
    echoed = response.json()['cookies']
    assert dict(jar.items()) == echoed
def test_send_request():
    """send_request yields a requests Response with status 200."""
    response = lt.send_request(domain)
    assert isinstance(response, requests.models.Response)
    assert response.status_code == 200