def crawl(url):
    """Collect and save every image of each gallery listed at *url*."""
    listing = lt.fetch(url)
    # The last <dd> is a pagination cell, not a gallery entry.
    for entry in listing.css('dl.list-left dd')[:-1]:
        gallery_url = entry.css('a::attr(href)').extract_first()
        gallery_id = gallery_url.split('/')[-1][:-5]  # ".../1234.html" -> "1234"
        detail = lt.fetch(gallery_url, headers=headers)
        page_count = detail.css('.content-page .page-ch::text').re_first(r'\d+')
        urls = []
        for page in range(1, int(page_count) + 1):
            urls.append(f'http://img1.mm131.me/pic/{gallery_id}/{page}.jpg')
        lt.async_save_imgs(urls, headers=headers, random_name=True)
def crawl(url):
    """Download all gallery images found on the listing page at *url*."""
    page = lt.fetch(url)
    entries = page.cssselect('dl.list-left dd')[:-1]  # drop the trailing pager cell
    for entry in entries:
        href = entry.cssselect('a')[0].get('href')
        gallery_id = href.split('/')[-1][:-5]  # ".../1234.html" -> "1234"
        detail = lt.fetch(href, headers=headers)
        label = detail.cssselect('.content-page .page-ch')[0].text
        last_page = int(re.findall(r'\d+', label)[0])
        lt.async_save_imgs(
            [f'http://img1.mm131.me/pic/{gallery_id}/{i}.jpg'
             for i in range(1, last_page + 1)],
            headers=headers,
            random_name=True,
        )
def crawl(url):
    """Scrape one article-listing page and insert each article row into MySQL.

    Relies on module-level ``lt``, ``domain``, ``cursor`` and ``connection``.
    """
    tree = lt.fetch(url)
    items = tree.css('ul.note-list li')
    for item in items:
        title = item.css('.content a.title::text').extract_first()
        author = item.css('a.nickname::text').extract_first()
        source = f"{domain}{item.css('.content a.title::attr(href)').extract_first()}"
        # The meta block contains several counters; the largest is taken as the vote count.
        vote = max(map(int, (item.css('.meta span').re(r'\d+'))))
        site = 'jianshu'
        date = datetime.utcnow()  # NOTE(review): naive UTC; utcnow() is deprecated on 3.12+
        view = 0
        comment = 0
        try:
            comment = int(item.css('.meta a::text').re_first(r'\d+'))
        except TypeError:
            # re_first returned None -> this item has no comment counter; keep 0.
            pass
        collect = 0
        row = (title, author, source, vote, site, date, view, comment, collect)
        print(row)
        try:
            cursor.execute(
                'INSERT INTO `article` (`title`, `author`, `source`, `vote`, `site`, `date`, `view`, `comment`, `collect`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
                row)
            connection.commit()
        except Exception as e:
            # Best-effort insert: log the failure and keep crawling.
            print(e)
def test_get_img_name():
    """get_img_name should URL-decode the name and vary when random_name is set."""
    page = lt.fetch(f'{domain}/post')
    href = page.css('a.directlink::attr(href)').extract_first()
    plain = lt.get_img_name(href)
    randomized = lt.get_img_name(href, random_name=True)
    assert '%' not in plain
    assert randomized != plain
def crawl(url):
    """Parse a proxy-list table, appending 'scheme://ip:port' strings to `proxies`."""
    page = lt.fetch(url)
    rows = page.cssselect('table tr')[1:]  # skip the header row
    for row in rows:
        cells = row.cssselect('td')
        scheme = cells[-5].text.lower()
        addr = cells[1].text
        port = cells[2].text
        entry = f'{scheme}://{addr}:{port}'
        print(entry)
        proxies.append(entry)
def test_save_img():
    """save_img should write real image bytes to disk and return falsy on failure."""
    tree = lt.fetch(f'{domain}/post')
    img = tree.css('a.directlink::attr(href)').extract_first()
    name = lt.get_img_name(img)
    lt.save_img(img)
    try:
        with open(name, 'rb') as f:
            img_data = f.read()
        # A real image is binary and larger than a trivial error-page stub.
        assert isinstance(img_data, bytes) and len(img_data) > 100
        assert not lt.save_img(broken_domain)
    finally:
        # Fix: remove the downloaded file even when an assertion above fails,
        # so a failing run does not leave artifacts in the working directory.
        os.remove(name)
def crawl(url):
    """Yield one metadata dict per item in the #waterfall grid."""
    page = lt.fetch(url)
    for cell in page.css('#waterfall .item'):
        date_texts = cell.css('date::text').extract()
        yield {
            'name': cell.css('img::attr(title)').extract_first(),
            'cover': cell.css('img::attr(src)').extract_first(),
            'link': cell.css('.movie-box::attr(href)').extract_first(),
            # The two <date> text nodes carry an id-like code and a date.
            'bango': cell.css('date::text').extract_first(),
            'date': date_texts[1],
        }
def crawl(url):
    """Yield {title, url, date, intro} dicts for every entry in the list view."""
    page = lt.fetch(url)
    for node in page.css('.list-view .item'):
        summary = node.css('span.intro::text').extract_first()
        record = {
            'title': node.css('a::text').extract_first().strip(),
            'url': node.css('a::attr(href)').extract_first().strip(),
            'date': summary[:10],  # the intro text starts with the date stamp
            'intro': summary,
        }
        yield record
def crawl(url):
    """Yield one metadata dict per shot thumbnail on the page."""
    page = lt.fetch(url)
    for shot in page.css('li.shot-thumbnail'):
        record = {}
        record['title'] = shot.css('a strong::text').extract_first()
        record['url'] = f"{domain}{shot.css('a::attr(href)').extract_first()}"
        record['author'] = shot.css('.display-name::text').extract_first()
        fav_text = shot.css('span.toggle-fav::text').extract_first()
        record['fav'] = int(fav_text.strip())
        comment_text = shot.css('li.cmnt span::text').extract_first()
        record['comment'] = int(comment_text.strip())
        yield record
def crawl(url):
    """Yield {title, date, url} dicts for every item in the browser list."""
    page = lt.fetch(url)
    for node in page.css('ul#browserItemList li.item'):
        info = node.css('p.info::text').extract_first().strip()
        # The info line may read "extra / date"; keep only the date part.
        if r'/' in info:
            raw_date = info.split(r'/')[1].strip()
        else:
            raw_date = info
        yield {
            'title': node.css('h3 a.l::text').extract_first(),
            'date': format_date(raw_date),
            'url': f"{domain}{node.css('h3 a.l::attr(href)').extract_first()}",
        }
def crawl(url):
    """Scrape one topic-listing page, appending topic dicts to module-level `total`."""
    time.sleep(1)  # be polite between page fetches
    page = lt.fetch(url)
    for cell in page.css('#TopicsNode .cell'):
        record = {}
        record['title'] = cell.css('span.item_title a::text').extract_first()
        record['author'] = cell.css('span.small.fade strong a::text').extract_first()
        record['source'] = f"{domain}{cell.css('span.item_title a::attr(href)').extract_first()}"
        reply_text = cell.css('a.count_livid::text').extract_first()
        # Topics with no replies have no counter element at all.
        record['reply'] = int(reply_text) if reply_text else 0
        total.append(record)
def crawl(url):
    """Yield {title, url, date, view} dicts for each blog post on the page.

    CJK date markers (年/月/日) are replaced with dashes and the trailing
    dash left by the final marker is dropped.
    """
    tree = lt.fetch(url)
    items = tree.css('.type-post')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first()
        data['url'] = item.css('h2 a::attr(href)').extract_first()
        data['date'] = re.sub(
            r'年|月|日', '-',
            item.css('small span.date::text').extract_first())[:-1]
        # Fix: use a raw string for the regex — '\d' in a plain string literal
        # is an invalid escape sequence (SyntaxWarning on modern Python).
        data['view'] = int(item.css('small::text').re(r'\d+')[0])
        yield data
def crawl(url):
    """Return a list of {name, url, comments} dicts for every list entry."""
    page = lt.fetch(url)
    results = []
    for entry in page.css('ul.car-monthlisting li'):
        record = {
            'name': entry.css('a::text').extract_first(),
            'url': entry.css('a::attr(href)').extract_first(),
            'comments': int(entry.css('span::text').re_first(r'(\d+)')),
        }
        pprint(record)
        results.append(record)
    return results
def crawl(url):
    """Insert one MongoDB document per book on the subject-list page."""
    page = lt.fetch(url)
    for book in page.css('ul.subject-list li.subject-item'):
        record = {}
        record['title'] = book.css('h2 a::text').extract_first().strip()
        record['link'] = book.css('h2 a::attr(href)').extract_first()
        record['pub'] = book.css('.pub::text').extract_first().strip()
        rating_text = book.css('span.rating_nums::text').extract_first()
        record['rating'] = float(rating_text)
        record['comments'] = int(book.css('span.pl').re_first(r'\d+'))
        pprint(record)
        col.insert_one(record)
def crawl(url):
    """Scrape English-language book entries from one listing page into MongoDB.

    The slice offsets below strip fixed-width label prefixes from the site's
    text nodes — presumably stable for this markup; verify if the site changes.
    """
    tree = lt.fetch(url)
    items = tree.css('#inner_mid_col article')
    for item in items:
        # The last note line holds the language after a 10-char prefix.
        if item.css('p.note::text').extract()[-1][10:] == 'English':
            data = {}
            data['name'] = item.css('p.title a::text').extract_first().strip()
            data['link'] = item.css('p.title a::attr(href)').extract_first().strip()
            data['author'] = item.css('p.note::text').extract_first()[3:]
            data['publisher'] = item.css('p.publisher::text').extract_first()[11:]
            # Collapse whitespace, then keep the trailing 4 digits as the year.
            data['year'] = int(''.join(item.css('p.date2::text').extract_first().split())[-4:])
            pprint(data)
            col.insert_one(data)
def crawl(url):
    """Append a {title, date, url} dict for each listed item to `total`."""
    page = lt.fetch(url)
    for node in page.css('ul#browserItemList li.item'):
        info = node.css('p.info::text').extract_first().strip()
        record = {
            # The info line reads "extra / date"; take the part after the slash.
            'title': node.css('h3 a.l::text').extract_first(),
            'date': format_date(info.split(r'/')[1].strip()),
            'url': f"{domain}{node.css('h3 a.l::attr(href)').extract_first()}",
        }
        pprint(record)
        total.append(record)
def crawl(url):
    """Scrape one job-detail page into a dict and append it to `total`.

    Field values come from fixed positions in the `ul.terminal-ul li` list,
    so this breaks if the site reorders those columns.
    """
    tree = lt.fetch(url)
    data = dict()
    cols = ['salary', 'place', 'date', 'nature', 'experience', 'degree', 'amount', 'category']
    # First pass: grab the <strong> text of each positional field.
    for i, col in enumerate(cols):
        data[col] = tree.cssselect('ul.terminal-ul li')[i].cssselect('strong')[0].text
    # Collapse internal whitespace in the salary string.
    data['salary'] = "".join(data['salary'].split())
    del data['place']  # place is deliberately excluded from the record
    # 'date' and 'category' need deeper selectors than the generic pass above,
    # so they are re-extracted and overwritten here.
    data['date'] = tree.cssselect('ul.terminal-ul li')[2].cssselect('strong #span4freshdate')[0].text
    data['category'] = tree.cssselect('ul.terminal-ul li')[7].cssselect('strong a')[0].text
    detail = tree.cssselect('.tab-inner-cont p')
    # Join the non-empty description paragraphs into a single text blob.
    data['detail'] = ''.join([p.text for p in detail if p.text]).strip()
    pprint(data)
    total.append(data)
def crawl(url):
    """Append article dicts from one listing page to `total`, logging failures."""
    try:
        page = lt.fetch(url)
        for node in page.css('ul.note-list li'):
            record = {}
            record['title'] = node.css('.content a.title::text').extract_first()
            record['author'] = node.css('a.nickname::text').extract_first()
            record['source'] = f"{domain}{node.css('.content a.title::attr(href)').extract_first()}"
            # The meta block has several counters; the largest is the vote count.
            record['vote'] = max(map(int, (node.css('.meta span').re(r'\d+'))))
            pprint(record)
            total.append(record)
    except Exception as e:
        # Best effort: report the error and move on to the next page.
        print(e)
def crawl(url):
    """Store one MongoDB document per question summary on the page."""
    page = lt.fetch(url)
    for summary in page.cssselect('.question-summary'):
        link = summary.cssselect('a.question-hyperlink')[0]
        record = dict()
        record['question'] = link.text
        record['link'] = domain + link.get('href')
        record['votes'] = int(summary.cssselect('.vote-count-post strong')[0].text)
        record['answers'] = int(summary.cssselect('.status strong')[0].text)
        # Views tooltip: drop the 6-char trailing suffix, then the commas.
        views_title = summary.cssselect('.views')[0].get('title')
        record['views'] = int(''.join(views_title[:-6].split(',')))
        record['timestamp'] = summary.cssselect('.relativetime')[0].get('title')
        pprint(record)
        col.insert_one(record)
def crawl(url):
    """Print a metadata dict for every entry on the listing page."""
    page = lt.fetch(url)
    for entry in page.cssselect('.cg'):
        record = dict()
        heading = entry.cssselect('h1 a')[0]
        record['name'] = heading.text
        record['url'] = domain + heading.get('href')
        record['artist'] = entry.cssselect('.artist-list')[0].text
        desc = entry.cssselect('.dj-content .dj-desc')[0]
        cells = desc.cssselect('tr td')
        # Description table cells alternate label/value; values sit at odd indexes.
        record['series'] = cells[1].text.strip() or 'N/A'
        record['type'] = cells[3].cssselect('a')[0].text.strip()
        record['language'] = cells[5].text.strip()
        tag_texts = [tag.text for tag in cells[7].cssselect('.relatedtags ul li a')]
        record['tags'] = ', '.join(tag_texts) or 'N/A'
        record['date'] = entry.cssselect('.dj-content p.cg-date')[0].text
        pprint(record)
def crawl(url):
    """Print one metadata dict per book in the search-result list."""
    page = lt.fetch(url)
    for book in page.cssselect('ul.bigimg li'):
        anchor = book.cssselect('a')[0]
        record = dict()
        record['title'] = anchor.get('title').strip()
        record['detail'] = anchor.get('href')
        # The price text starts with a one-char currency symbol; drop it.
        record['price'] = float(book.cssselect('p.price .search_now_price')[0].text[1:])
        authors = book.cssselect('p.search_book_author a')
        record['author'] = authors[0].get('title')
        # The date text carries a one-char separator prefix; drop it.
        record['date'] = book.cssselect('p.search_book_author span')[1].text.strip()[1:]
        record['press'] = authors[-1].text
        # The comment text ends with a fixed 3-char suffix; strip it.
        record['comments'] = int(book.cssselect('p.search_star_line a')[0].text[:-3])
        pprint(record)
def crawl(url):
    """Yield book dicts; missing rating/comment counts fall back to 0."""
    page = lt.fetch(url)
    for book in page.css('ul.subject-list li.subject-item'):
        record = {}
        record['title'] = book.css('h2 a::text').extract_first().strip()
        record['link'] = book.css('h2 a::attr(href)').extract_first()
        record['pub'] = book.css('.pub::text').extract_first().strip()
        try:
            record['rating'] = float(book.css('span.rating_nums::text').extract_first())
        except Exception:
            record['rating'] = 0.0  # no rating shown for this title
        try:
            record['comments'] = int(book.css('span.pl').re_first(r'\d+'))
        except Exception:
            record['comments'] = 0  # no comment counter found
        yield record
def get_total(categories):
    """Collect {title, url, comments, date} records from every category page.

    Returns the accumulated list across all categories. Relies on the
    module-level `categories_texts` to filter navigation links out of the
    title list.
    """
    total = []
    for category in categories:
        tree = lt.fetch(category)
        # Fix: raw string for the regex — '\d' in a plain string literal is an
        # invalid escape sequence (SyntaxWarning on modern Python).
        links = tree.css('a::attr(href)').re(r'.*?/blog/\d+/\d+/.*')
        hints = tree.css('span.hint::text').extract()
        # Hint text is parenthesised "comments@date": strip parens, split.
        hints = [hint[1:][:-1].split('@') for hint in hints]
        titles = [
            title for title in tree.css('li a::text').extract()
            if title not in categories_texts
        ]
        data = [{
            'title': title,
            'url': link,
            'comments': int(hint[0]),
            'date': hint[1]
        } for title, link, hint in zip(titles, links, hints)]
        pprint(data)
        total.extend(data)
    return total
def crawl(url):
    """Yield one work dict per row of the work-list table.

    Rows with no name (header/placeholder rows) are skipped; price, rating
    and review counts fall back to 0 when any of them cannot be parsed.
    """
    tree = lt.fetch(url)
    items = tree.css('table.n_worklist tr')
    for item in items:
        data = {}
        data['name'] = item.css('.work_name a::text').extract_first()
        data['link'] = item.css('.work_name a::attr(href)').extract_first()
        data['maker'] = item.css('dd.maker_name a::text').extract_first()
        try:
            # Price text uses thousands separators (e.g. "1,234"); remove them.
            data['price'] = int(''.join(item.css('span.work_price::text').extract_first().split(',')))
            # Fix: raw strings for the regexes — '\d' in a plain string literal
            # is an invalid escape sequence (SyntaxWarning on modern Python).
            data['rate'] = int(item.css('.star_rating::text').re_first(r'\d+'))
            data['review'] = int(item.css('.work_review a::text').re_first(r'\d+'))
        except Exception as e:
            print(e)
            data['price'] = 0
            data['rate'] = 0
            data['review'] = 0
        if not data['name']:
            continue
        yield data
def crawl(url):
    """Scrape video metadata from one listing page into MongoDB.

    Fetches each item's embed page to read its inline `flashvars` JSON blob,
    which carries the title, duration, thumbnail and stream URLs. Relies on
    module-level `lt`, `domain`, `cookies`, `headers` and `col`.
    """
    tree = lt.fetch(url, use_cookies=True, headers=headers)
    time.sleep(0.5)  # small delay between fetches
    items = tree.css('.wrap')
    for item in items:
        data = {}
        data['views'] = item.css('span.views var::text').extract_first()
        # The rating text ends with '%'; drop it before converting.
        data['rating'] = int(item.css('.value::text').extract_first()[:-1])
        # The video key is the last query-string value in the item link.
        viewKey = item.css('a::attr(href)').extract_first().split('=')[-1]
        video = requests.get(f'https://{domain}/embed/{viewKey}', cookies=cookies, headers=headers).text
        # Pull the `var flashvars = {...},` assignment out of the page's JS.
        flashvars = re.findall('var flashvars =(.*?),\n', video)[0]
        info = json.loads(flashvars)
        data['title'] = info.get('video_title')
        data['duration'] = info.get('video_duration')
        data['image'] = info.get('image_url')
        data['link'] = info.get('link_url')
        data['quality_480p'] = info.get('quality_480p')
        pprint(data)
        col.insert_one(data)
def test_get_img_info():
    """get_img_info should return the anchor's href plus a URL-decoded name."""
    page = lt.fetch(f'{domain}/post')
    anchor = page.cssselect('a.directlink')[0]
    url, name = lt.get_img_info(anchor)
    assert url == anchor.get('href')
    assert '%' not in name
def test_fetch():
    """fetch should return a parsed tree that contains direct image links."""
    tree = lt.fetch(f'{domain}/post')
    direct_links = tree.cssselect('a.directlink')
    assert direct_links
def test_fetch():
    """fetch should yield string hrefs and return falsy for an unreachable host."""
    tree = lt.fetch(f'{domain}/post')
    hrefs = tree.css('a.directlink::attr(href)').extract()
    assert hrefs and isinstance(hrefs[0], str)
    assert not lt.fetch(broken_domain)
def crawl(url):
    """Asynchronously save every directly-linked image found on the page."""
    direct_links = lt.fetch(url).cssselect('a.directlink')
    lt.async_save_imgs(direct_links)
def crawl(url):
    """Save all lazily-loaded images using their real 'data-original' URLs."""
    page = lt.fetch(url)
    sources = [node.get('data-original') for node in page.cssselect('img.lazy')]
    lt.save_imgs(sources)