def test_save():
    # sort_by
    unordered_data = [{'r': 2}, {'r': 3}, {'r': 1}]
    lt.save(unordered_data, name='ordered.json', sort_by='r')
    ordered_data = read_json('ordered.json')
    assert ordered_data[0]['r'] == 1
    os.remove('ordered.json')
    # no_duplicate
    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save(dup_data, name='unique.json', no_duplicate=True)
    unique_data = read_json('unique.json')
    assert len(dup_data) > len(unique_data)
    os.remove('unique.json')
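# The test above relies on a read_json helper that is not defined in this
# snippet; a minimal sketch of what it presumably does (an assumption, not
# the original implementation):
import json

def read_json(name: str):
    """Load a JSON file written by lt.save and return the parsed data."""
    with open(name, encoding='utf-8') as f:
        return json.load(f)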
def test_save():
    data = [{
        'rank': 2,
        'name': 'python'
    }, {
        'rank': 1,
        'name': 'js'
    }, {
        'rank': 3,
        'name': 'java'
    }]
    # sort_by: the saved file should be ordered by rank
    lt.save(data, sort_by='rank')
    with open('data.json', 'r') as f:
        ordered_data = json.loads(f.read())
    assert ordered_data[0]['rank'] == 1
    os.remove('data.json')
    # no_duplicate: duplicate rows should be dropped
    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save(dup_data, no_duplicate=True)
    with open('data.json', 'r') as f:
        unique_data = json.loads(f.read())
    assert len(dup_data) > len(unique_data)
    os.remove('data.json')
    # this save is expected to abort with SystemExit
    with pytest.raises(SystemExit):
        lt.save(data, name='data.csv')
""" 简书上“程序员”专题下的文章 """ from pprint import pprint import looter as lt domain = 'https://www.jianshu.com' total = [] def crawl(url): try: tree = lt.fetch(url) items = tree.css('ul.note-list li') for item in items: data = {} data['title'] = item.css('.content a.title::text').extract_first() data['author'] = item.css('a.nickname::text').extract_first() data['source'] = f"{domain}{item.css('.content a.title::attr(href)').extract_first()}" data['vote'] = max(map(int, (item.css('.meta span').re(r'\d+')))) pprint(data) total.append(data) except Exception as e: print(e) if __name__ == '__main__': tasklist = [f'{domain}/c/NEt52a?order_by=top&page={n}' for n in range(1, 201)] [crawl(task) for task in tasklist] lt.save(total, name='jianshu.csv', sort_by='vote', order='desc')
""" import requests import arrow import looter as lt domain = 'https://sspai.com' def crawl(url): items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['list'] for item in items: data = {} data['title'] = item['title'] data['released_at'] = arrow.get(item['released_at']).naive data['summary'] = item['summary'] data['words_count'] = item['words_count'] data['likes_count'] = item['likes_count'] data['favorites_count'] = item['favorites_count'] data['comments_count'] = item['comments_count'] data['url'] = f"{domain}/post/{item['id']}" yield data if __name__ == '__main__': tasklist = [ f'{domain}/api/v1/articles?offset={n * 10}&limit=10&type=recommend_to_home&sort=recommend_to_home_at&include_total=false' for n in range(1170) ] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='sspai.csv', sort_by='likes_count', order='desc')
import json
from pathlib import Path

import looter as lt

encoding = 'utf-8'
total = []


def parse_json(path: Path):
    data = json.loads(path.read_text(encoding=encoding))
    pens = data[0]['data']['pens']['pens']
    results = []
    for pen in pens:
        result = {
            'title': pen['title'],
            'user': pen['owner']['username'],
            'url': pen['url'],
            'updatedAt': pen['updatedAt'],
            'comments': pen['counts']['comments'],
            'loves': pen['counts']['loves'],
            'views': pen['counts']['views']
        }
        results.append(result)
    return results


if __name__ == "__main__":
    for path in Path('.').glob('*.json'):
        total.extend(parse_json(path))
    lt.save(total, name='codepen_loved.csv', sort_by='loves', order='desc', no_duplicate=True)
        data['番名'] = item['title']
        data['链接'] = item['link']
        order = item['order']
        score = order.get('score')
        data['评分'] = float(score[:-1]) if score else 0.0
        data['放送日期'] = arrow.get(order['pub_date']).naive
        season_id = item['season_id']
        data['id'] = season_id
        season = requests.get(
            f'{domain}/ext/web_api/season_count?season_id={season_id}&season_type=1'
        ).json()['result']
        data['追番人数'] = season['favorites']
        data['播放量'] = season['views']
        data['硬币数'] = season['coins']
        data['弹幕数'] = season['danmakus']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/media/web_api/search/result?order=3&sort=0&page={n}&season_type=1&pagesize=30'
        for n in range(1, 106)
    ]
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save(total, name='bilibili_top_bangumi.csv', sort_by='追番人数', order='desc')
import re

import looter as lt

domain = 'https://bangumi.tv'
user_id = '399538'
page_limit = 7

# normalize dates like "2019年5月1日" to ISO form "2019-05-01"
format_date = lambda date: '-'.join(
    f'0{d}' if len(d) == 1 else d
    for d in re.sub(r'年|月|日', '-', date)[:-1].split('-'))


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul#browserItemList li.item')
    for item in items:
        data = {}
        data['title'] = item.css('h3 a.l::text').extract_first()
        info = item.css('p.info::text').extract_first().strip()
        date = info.split('/')[1].strip() if '/' in info else info
        data['date'] = format_date(date)
        data['url'] = f"{domain}{item.css('h3 a.l::attr(href)').extract_first()}"
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/anime/list/{user_id}/collect?orderby=date&page={n}'
        for n in range(1, page_limit + 1)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='bangumi.csv', sort_by='date', order='desc')
        item_data_list.append(item_data)
    ids = ','.join(item['post_id'] for item in items)
    stat_data_list = []
    stats = requests.get(
        f'https://sso.ifanr.com/api/v5/wp/article/stats/?limit=50&post_id__in={ids}',
        headers=lt.DEFAULT_HEADERS).json()['objects']
    for stat in stats:
        stat_data = {}
        stat_data['favorite_count'] = stat['favorite_count']
        stat_data['like_count'] = stat['like_count']
        stat_data['share_count'] = stat['share_count']
        stat_data_list.append(stat_data)
    data_list = [{
        **data[0],
        **data[1]
    } for data in zip(item_data_list, stat_data_list)]
    # yield the merged records one by one so crawl_all collects flat rows
    yield from data_list


if __name__ == '__main__':
    tasklist = [
        f'https://sso.ifanr.com//api/v5/wp/web-feed/?published_at__lte=2019-05-25+07%3A00%3A11&limit=20&offset={n * 20}'
        for n in range(1900)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='ifanr.csv', no_duplicate=True, sort_by='like_count', order='desc')
""" 我看过的电影在豆瓣上的归档 """ import looter as lt domain = 'https://movie.douban.com' MAX_PAGE = 4 def crawl(url): tree = lt.fetch(url) items = tree.css('.list-view .item') for item in items: data = {} data['title'] = item.css('a::text').extract_first().strip() data['url'] = item.css('a::attr(href)').extract_first().strip() intro = item.css('span.intro::text').extract_first() data['date'] = intro[:10] data['intro'] = intro yield data if __name__ == '__main__': tasklist = [ f'{domain}/people/158535797/collect?start={n * 30}&sort=time&rating=all&filter=all&mode=list' for n in range(0, MAX_PAGE) ] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='douban_movie_archive.csv')
""" salttiger上的免费国外编程电子书 """ import os from pprint import pprint import looter as lt domain = 'https://salttiger.com' def crawl(url): tree = lt.fetch(url) items = tree.css('ul.car-monthlisting li') total = [] for item in items: data = {} data['name'] = item.css('a::text').extract_first() data['url'] = item.css('a::attr(href)').extract_first() data['comments'] = int(item.css('span::text').re_first(r'(\d+)')) pprint(data) total.append(data) return total if __name__ == '__main__': task = f'{domain}/archives/' result = crawl(task) lt.save(result, name='salttiger.csv', sort_by='comments', order='desc')
import asyncio

import looter as lt

domain = 'https://uimovement.com'  # assumed from the output name uimovement.csv
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.css('.resources-wrapper')
    for item in items:
        data = {}
        if item.css('a::text').extract()[0] == 'Sponsor UI Movement':
            continue
        data['title'] = item.css('a::text').extract()[-2]
        data['url'] = f"{domain}{item.css('a::attr(href)').extract_first()}"
        if (vote := item.css('small.vote-count-wrapper::text').extract_first()):
            data['vote'] = int(vote)
        else:
            data['vote'] = 0
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/all-designs/?page={n}' for n in range(1, 501)]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save(total, name='uimovement.csv', no_duplicate=True, sort_by='vote', order='desc')
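# Aside: passing bare coroutines to asyncio.wait is deprecated on newer
# Pythons. The same fan-out can be written with asyncio.gather and
# asyncio.run; a minimal sketch, assuming the crawl coroutine and tasklist
# defined above:
async def main():
    await asyncio.gather(*(crawl(task) for task in tasklist))

asyncio.run(main())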
import looter as lt

domain = 'https://book.douban.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.subject-list li.subject-item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first().strip()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['pub'] = item.css('.pub::text').extract_first().strip()
        try:
            data['rating'] = float(item.css('span.rating_nums::text').extract_first())
        except Exception:
            data['rating'] = 0.0
        try:
            data['comments'] = int(item.css('span.pl').re_first(r'\d+'))
        except Exception:
            data['comments'] = 0
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/tag/%E8%AE%A1%E7%AE%97%E6%9C%BA?start={20 * n}&type=T'
        for n in range(0, 50)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='douban_books.csv', sort_by='comments', order='desc')
"""
The shortcut leaderboard on Sharecuts (捷径社区)
"""
from pprint import pprint

import requests

import looter as lt

domain = 'https://sharecuts.cn'
total = []


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()
    for item in items:
        data = {}
        data['name'] = item['name']
        data['category'] = item['Category']['name']
        data['note'] = item['note']
        data['author'] = item['User']['nickname']
        data['url'] = item['url']
        data['downloads'] = item['downloads_count']
        data['votes'] = item['votes_count']
        data['comments'] = item['comments_count']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    task = f'{domain}/api/shortcuts/hot?offset=0&limit=1025'
    crawl(task)
    lt.save(total, name='sharecuts.csv', sort_by='votes', order='desc')
""" codemyui网站归档 """ import requests from parsel import Selector import looter as lt domain = 'https://codemyui.com' total = [] def crawl(url): tree = Selector( text=requests.get(url, headers=lt.DEFAULT_HEADERS).json()['html']) items = tree.css('.alm-layout .details') for item in items: data = {} data['title'] = item.css('h3 a::text').extract_first() data['url'] = item.css('h3 a::attr(href)').extract_first() data['description'] = item.css('p::text').extract_first() total.append(data) if __name__ == '__main__': tasklist = [ f'{domain}/wp-admin/admin-ajax.php?id=LoadArticleToo&post_id=0&slug=home&canonical_url=https%3A%2F%2Fcodemyui.com%2F&posts_per_page=120&page={n}&offset=0&post_type=post&repeater=default&seo_start_page=1&preloaded=false&preloaded_amount=0&cta[cta]=true&cta[cta_position]=after:4&cta[cta_repeater]=template_1&cta[cta_theme_repeater]=null&taxonomy_terms=home&order=DESC&orderby=post__in&post__in=28560,28549,28555,28541,27445,28536,28530,28511,28501,27445,28473,28462,28455,17739,27445,28426,28419,28400,25214,27445,28375,19762,25308,197,27445,24874,217,4011,248,27445,24628,23147,24834,242,27445,76,5016,23177,22950,27445,25409,24739,25519,24684,27445,201,330,22918,24952,27445,5230,20146,17596,36,27445,270,230,206,27558,27445,27415,27409,27404,27362,27445,27357,27352,27347,27342,27445,27330,27324,27319,27312,27445,27307,27286,27270,27265,27445,27260,27238,27234,27230,27445,27226,27222,27218,27213,27445,27208,27203,27190,27179,27445,27142,27062,27058,27051,27445,27046,27041,27038,27034,27445,27030,27027,27023,27020,27445,26988,26978,60,26972,27445,26968,26942,26938,26934,27445,26914,26910,26907,26903,27445,26900,26884,26896,26893,27445,26881,26878,26875,26869,27445,26872,26866,26854,26845,27445,26839,26835,26823,26831,27445,26827,26818,26814,26810,27445,26806,26791,26795,26786,27445,26777,26782,26770,26766,27445,26761,26756,26751,26747,27445,26733,26740,26722,26716,27445,26706,26702,26695,26691,27445,26687,26683,26677,26673,27445,26669,26665,26661,26656,27445,26652,26606,26602,26598,27445,26594,26573,26566,26562,27445,26556,26552,26468,26505,27445,26501,26472,26461,26449,27445,26445,26437,26434,26441,27445,26412,26416,26408,26404,27445,26400,26393,26389,26382,27445,26373,25651,25646,25637,27445,25634,25630,25626,25623,27445,25620,25617,25614,25611,27445,25540,25543,25534,25524,27445,25513,25507,25502,25490,27445,25485,25481,25477,25474,27445,25470,25462,25466,25458,27445,25454,25448,25444,25438,27445,25434,25430,25423,25419,27445,25415,25401,25405,25397,27445,25393,25388,25384,25378,27445,25374,25370,25317,25365,27445,25360,25355,25351,25342,27445,25322,25327,25337,25333,27445,25312,25009,25244,25299,27445,25135,25290,25286,25281,27445,25238,25208,25229,25232,27445,25247,25262,25226,25235,27445,25203,25170,25078,25558,27445,25177,25139,25173,25122,27445,25142,25075,24983,25116,27445,25119,25094,25099,25103,27445,25085,25072,24971,24996,27445,24989,25061,25041,25044,27445,25037,25034,25030,24986,27445,25012,25018,24978,24977,27445,24974,24958,14552,24817,27445,24853,24919,24922,24910,27445,24879,24905,24900,24896,27445,24892,24888,24884,24870,27445,25347,24845,24848,24842,27445,24839,24823,24820,24810,27445,24806,24782,24801,24797,27445,24791,24787,24749,24753,27445,24756,24746,24743,24729,27445,24719,24726,24723,24716,27445,24701,24704,24698,24694,27445,24690,24668,24678,24673,27445,24663,24659,23623,24612,27445,23592,23253,23235,23219,27445,23180,23112,23091,23070,27445,23050,23034,22974,22869,27445,20397,20369,20127,22838,27445,20111,20095,20077,20062,27445,20034,19981,19960,19941
,27445,19925,19902,19880,19861,27445,19828,19812,19796,19778,27445,19747,17941,17925,17888,27445,17904,17860,17578,17670,27445,15715,17636,17771,17755,27445,17723,17250,15834,17652,27445,17562,17526,17510,17494,27445,17478,17446,17427,17373,27445,17391,17407,17234,17355,27445,17340,17322,17303,17290,27445,17218,17186,17170,15919,27445,15895,15877,15591,15850,27445,15818,15802,15786,15757,27445,15737,15699,15418,15366,27445,15666,15643,15625,15400,27445,15573,15549,15520,15503,27445,15486,15470,15452,15434,27445,15383,14362,14330,14813,27445,15180,15268,15228,15093,27445,15059,14347,14841,14830,27445,14864,14878,14315,14391,27445,14379,14535,14565,14596,27445,14410,14447,14428,13889,27445,13936,13953,13968,14007,27445,13314,13903,13983,14024,27445,13632,12369,13647,13662,27445,13617,13835,13602,13458,27445,13058,13473,13406,13428,27445,13443,12924,13043,13028,27445,12910,13075,13091,12895,27445,13108,12947,11817,10965,27445,12608,12625,12645,12667,27445,12568,12581,12411,12395,27445,12429,12446,12340,11802,27445,12286,12094,12076,12119,27445,12148,12052,12006,7597,27445,6161,6533,11837,10931,27445,10912,10889,10865,6389,27445,6805,6100,7582,6194,27445,6434,6741,6491,6662,27445,6628,6579,6349,6307,27445,6129,6078,6204,5972,27445,4607,5931,5150,5848,27445,5814,5790,5752,5728,27445,5682,5137,5122,5109,27445,5487,5444,5405,5367,27445,5334,5301,5158,5085,27445,5066,5000,4983,4943,27445,4925,4878,4842,3131,27445,4827,4753,4724,4689,27445,4583,4550,4564,4416,27445,4511,4379,4446,4463,27445,4399,4382,4359,4337,27445,4207,4083,4107,4097,27445,4073,4244,4224,4139,27445,3981,4038,3963,3942,27445,3919,3902,3830,3844,27445,3403,3761,3742,3708,27445,3310,3377,3238,3659,27445,3547,3573,3590,3290,27445,3300,3280,3325,3357,27445,3331,3221,3204,3166,27445,3112,3092,2945,3053,27445,3024,3007,1407,2602,27445,2965,2523,2535,2571,27445,2529,2580,2586,2594,27445,2755,2365,1813,2382,27445,2499,2481,2464,2445,27445,2427,2256,2297,2275,27445,2349,2321,2240,2191,27445,2167,2137,2124,2083,27445,1895,1515,1925,1755,27445,1870,1848,1773,1742,27445,1549,1531,1688,1669,27445,1603,1639,1622,1586,27445,1436,1567,1479,1463,27445,1409,1422,1432,1244,27445,1428,1419,1414,1400,27445,1395,1387,1384,1379,27445,1317,1355,1351,1336,27445,1229,1251,1332,1233,27445,1323,1313,1296,1292,27445,1288,1255,1238,1226,27445,1223,1219,1211,929,27445,926,923,917,912,27445,908,905,291,292,27445,16,17,18,19,27445,20,21,22,23,27445,24,25,26,27,27445,28,29,30,31,27445,32,33,34,35,27445,37,38,39,40,27445,41,42,43,44,27445,45,46,47,48,27445,1345,49,50,51,27445,52,53,54,55,27445,56,57,58,59,27445,61,62,63,64,27445,65,66,67,68,27445,69,70,71,72,27445,73,74,75,77,27445,78,79,80,81,27445,82,83,84,85,27445,86,87,88,89,27445,90,91,92,93,27445,94,95,96,97,27445,98,99,100,101,27445,102,103,104,105,27445,106,107,108,109,27445,110,111,112,113,27445,114,115,116,117,27445,118,119,120,121,27445,122,123,124,125,27445,126,127,128,129,27445,130,131,132,133,27445,134,135,136,137,27445,138,139,140,141,27445,142,143,144,145,27445,146,147,148,149,27445,150,151,152,153,27445,154,155,156,157,27445,158,159,160,161,27445,162,163,164,165,27445,166,167,168,169,27445,170,171,172,173,27445,174,175,176,177,27445,178,179,180,181,27445,182,183,184,185,27445,186,187,188,189,27445,190,191,192,193,27445,194,195,196,198,27445,199,200,202,203,27445,204,205,207,208,27445,209,210,211,212,27445,213,214,215,216,27445,218,219,220,221,27445,222,223,224,225,27445,226,227,228,229,27445,231,232,233,234,27445,235,236,237,238,27445,239,240,241,243,27445,244,245,246,247,27445,249,250,251,252,27445
,253,254,255,256,27445,257,258,259,260,27445,261,262,263,264,27445,265,266,267,268,27445,269,271,272,273,27445,274,275,276,277,27445,278,279,280,281,27445,282,283,284,285,27445,286,287,288,289,27445,290&action=alm_get_posts&query_type=standard'''.replace('\n', '')
        for n in range(0, 9)
    ]
    [crawl(task) for task in tasklist]
    lt.save(total, name='codemyui.csv')
""" 我看过的书籍在豆瓣上的归档 """ import looter as lt domain = 'https://book.douban.com' MAX_PAGE = 4 cookie = 'll="108169"; bid=uRkL12JXzhg; gr_user_id=59ffc171-7ba8-45d4-9848-d04cae3b99cc; _vwo_uuid_v2=DB7C868DC8A7D64AC4F5CA5FABCD30D38|4602ea9fbee2cc9464c993b2583c33b2; __yadk_uid=F3jytnOVCtD1iSsRYhBQoJCWnYu4LlvH; __gads=ID=15a1f5c9d2c8e1c1:T=1565230512:S=ALNI_MbiSb0-9zzDosCeLic43kt9Wajqxw; douban-profile-remind=1; __utmv=30149280.15853; viewed="26349497_25909351_25920727_30170670_25872086_30386804_30239781_4279678_2280547_1139336"; push_noty_num=0; push_doumail_num=0; __utmz=81379588.1575535591.31.25.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/158535797/; __utmz=30149280.1575727536.50.8.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1575772677%2C%22https%3A%2F%2Fwww.douban.com%2Fpeople%2F158535797%2F%22%5D; _pk_ses.100001.3ac3=*; __utma=30149280.535054538.1565077784.1575727536.1575772677.51; __utmc=30149280; __utmt_douban=1; __utmc=81379588; __utma=81379588.2002100436.1565077801.1575535591.1575772677.32; __utmt=1; dbcl2="158535797:Ox8dyd/5hW4"; ck=SDvR; __utmt=1; _pk_id.100001.3ac3=6a5ac498d3bf471e.1565077801.32.1575772866.1575536822.; __utmb=30149280.9.10.1575772677; __utmb=81379588.7.10.1575772677' headers = lt.DEFAULT_HEADERS headers['Cookie'] = cookie def crawl(url): tree = lt.fetch(url, headers=headers) items = tree.css('.list-view .item') for item in items: data = {} data['title'] = item.css('a::text').extract_first().strip() data['url'] = item.css('a::attr(href)').extract_first().strip() intro = item.css('span.intro::text').extract_first() data['date'] = intro.split('/')[-2].strip() data['intro'] = intro yield data if __name__ == '__main__': tasklist = [f'{domain}/people/158535797/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count'] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='douban_book_archive.csv')
        data = requests.get(url, params=params, headers=headers).json()
        uniqid = data['data']['uniqid']
        user_indexes = data['data']['userIndexes'][0]
        key = requests.get(f'{domain}/Interface/api/ptbk?uniqid={uniqid}',
                           headers=headers).json()['data']
        encrypted_data = {kind: user_indexes[kind]['data'] for kind in kinds}
        decrypted_data = {
            kind: decrypt(key, d).split(',')
            for kind, d in encrypted_data.items()
        }
        date_range = pd.date_range(start_date, end_date).to_native_types()
        result = []
        for kind, indexes in decrypted_data.items():
            rows = [{
                'kind': kind,
                'date': date,
                'index': index,
                'keyword': word
            } for date, index in zip(date_range, indexes)]
            result.extend(rows)
            logger.info((rows[0], rows[-1]))
        total.extend(result)
        time.sleep(5)
    except Exception as e:
        logger.error(f'failed to crawl {word}: {e}')


if __name__ == '__main__':
    [crawl(word) for word in words]
    lt.save(total, name=f'{name}.csv')
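# decrypt is not defined in this fragment. The Baidu index "ptbk" scheme is
# commonly decoded by splitting the key in half and mapping cipher characters
# to plaintext ones; a sketch under that assumption, not the original code:
def decrypt(key: str, data: str) -> str:
    """Map each character of `data` through the half/half key table."""
    half = len(key) // 2
    table = dict(zip(key[:half], key[half:]))
    return ''.join(table[ch] for ch in data)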
""" 8说了你懂的 """ from pprint import pprint from concurrent import futures import looter as lt domain = 'https://www.javbus.pw' total = [] def crawl(url): tree = lt.fetch(url) items = tree.css('#waterfall .item') for item in items: data = {} data['name'] = item.css('img::attr(title)').extract_first() data['cover'] = item.css('img::attr(src)').extract_first() data['link'] = item.css('.movie-box::attr(href)').extract_first() data['bango'] = item.css('date::text').extract_first() data['date'] = item.css('date::text').extract()[1] pprint(data) total.append(data) if __name__ == '__main__': tasklist = [f'{domain}/page/{i}' for i in range(1, 90)] with futures.ThreadPoolExecutor(50) as executor: executor.map(crawl, tasklist) lt.save(total, name='jav.json')
            continue
        data = {}
        data['title'] = question['title']
        data['source'] = f"{domain}/question/{question['id']}/answer/{target['id']}"
        data['vote'] = target['voteup_count']
        pprint(data)
        total.append(data)


def generate():
    with open('data.json', encoding=encoding) as fin, \
            open('zhihu_top_video.md', 'w', encoding=encoding) as fout:
        data = json.loads(fin.read())
        fout.writelines(f"{i+1}. [{d['title']}]({d['source']})赞数:{d['vote']}\n"
                        for i, d in enumerate(data))


if __name__ == '__main__':
    tasklist = [
        f'{domain}/api/v4/topics/19776749/feeds/essence?&offset={10 * n}&limit=10'
        for n in range(100)
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    save(total, sort_by='vote', order='desc')
    generate()
    os.remove('data.json')
""" dribbble最火的shot """ import looter as lt domain = 'https://dribbble.com' def crawl(url): tree = lt.fetch(url) items = tree.css('li.shot-thumbnail') for item in items: data = {} data['title'] = item.css('a strong::text').extract_first() data['url'] = f"{domain}{item.css('a::attr(href)').extract_first()}" data['author'] = item.css('.display-name::text').extract_first() data['fav'] = int(item.css('span.toggle-fav::text').extract_first().strip()) data['comment'] = int(item.css('li.cmnt span::text').extract_first().strip()) yield data if __name__ == '__main__': tasklist = [f'{domain}/shots/popular?timeframe=ever&page={n}&per_page=24' for n in range(1, 51)] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='dribbble.csv', sort_by='fav', no_duplicate=True, order='desc')
"""
Ranking of free wallpapers on Unsplash
"""
from pprint import pprint
from concurrent import futures

import requests

import looter as lt

domain = 'https://unsplash.com'
total = []


def crawl(url):
    imgs = requests.get(url, headers=lt.DEFAULT_HEADERS).json()
    for img in imgs:
        data = {}
        data['created'] = img['created_at']
        data['url'] = img['urls']['full']
        data['likes'] = img['likes']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/napi/collections/1065976/photos?page={n}&per_page=10&order_by=latest&share_key=a4a197fc196734b74c9d87e48cc86838'
        for n in range(1, 136)
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    lt.save(total, name='unsplash.csv', sort_by='likes', order='desc')
""" import requests import looter as lt domain = 'https://www.zhihu.com' encoding = 'utf-8' def crawl(url): items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['data'] for item in items: target = item['target'] question = target.get('question') if not question: # 只抓视频: or not target.get('topic_thumbnails_extra_info'): continue data = {} data['title'] = question['title'] data[ 'source'] = f"{domain}/question/{question['id']}/answer/{target['id']}" data['vote'] = target['voteup_count'] yield data if __name__ == '__main__': tasklist = [ f'{domain}/api/v4/topics/19776749/feeds/essence?&offset={10 * n}&limit=10' for n in range(100) ] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='zhihu_top.csv', sort_by='vote', order='desc')
def crawl(task: str):
    try:
        # "payload" avoids shadowing the json module with the response body
        payload = requests.get(url.format(q=task), headers=headers).json()
        pprint(payload)
        if payload.get('message'):
            # the search API answers with a "message" field when rate limited
            time.sleep(10)
        item = payload.get('items', [None])[0]
        data = {}
        data['task'] = task
        if item:
            data['repo_name'] = item.get('full_name')
            data['repo_url'] = f"https://github.com/{item.get('full_name')}"
            data['stars'] = item.get('stargazers_count')
            data['forks'] = item.get('forks_count')
            data['watchers'] = item.get('watchers_count')
        pprint(data)
        total.append(data)
    except Exception as e:
        print(f'[Err] {e}')


if __name__ == "__main__":
    tasklist = Path(r'tldr_github.txt').read_text().split(', ')
    [crawl(task) for task in tasklist]
    save(total, name='tldr_github.csv', sort_by='stars', no_duplicate=True, order='desc')
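# A hedged sketch of a retry wrapper for the rate-limit case above; the
# get_with_retry name and the retry counts are illustrative, not from the
# original script, and url/headers are the module-level ones it assumes:
def get_with_retry(task: str, retries: int = 3, wait: int = 10):
    """GET the search API, sleeping and retrying while it reports a message."""
    payload = requests.get(url.format(q=task), headers=headers).json()
    for _ in range(retries):
        if not payload.get('message'):
            break
        time.sleep(wait)
        payload = requests.get(url.format(q=task), headers=headers).json()
    return payload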
"""
Appinn (小众软件), sorted by comment count
"""
import asyncio
from pprint import pprint

import looter as lt

domain = 'https://www.appinn.com'
categories = ['windows', 'chrome']
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.css('section#latest-posts article.post-box')
    for item in items:
        data = {}
        data['title'] = item.css('a::attr(title)').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['date'] = item.css('span.thetime span::text').extract_first()
        data['comments'] = int(item.css("a[itemprop='interactionCount']::text").extract_first())
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/category/{category}/page/{n}/'
        for n in range(1, 21) for category in categories
    ]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save(total, name='appinn.csv', sort_by='comments', order='desc')
""" 8说了你懂的 """ import looter as lt domain = 'https://www.javbus.pw' def crawl(url): tree = lt.fetch(url) items = tree.css('#waterfall .item') for item in items: data = {} data['name'] = item.css('img::attr(title)').extract_first() data['cover'] = item.css('img::attr(src)').extract_first() data['link'] = item.css('.movie-box::attr(href)').extract_first() data['bango'] = item.css('date::text').extract_first() data['date'] = item.css('date::text').extract()[1] yield data if __name__ == '__main__': tasklist = [f'{domain}/page/{i}' for i in range(1, 90)] total = lt.crawl_all(crawl, tasklist) lt.save(total, name='jav.csv', sort_by='date', order='desc')
def crawl(url):
    text = requests.get(url, headers=lt.DEFAULT_HEADERS).text
    # the thread list is wrapped in HTML comments; strip the markers before parsing
    text = text.replace('<!--', '').replace('-->', '')
    tree = Selector(text=text)
    items = tree.css('ul#thread_list li.j_thread_list')
    for item in items:
        data = {}
        data['title'] = item.css('a.j_th_tit::text').extract_first()
        data['abstract'] = item.css('.threadlist_abs::text').extract_first().strip()
        data['url'] = f"{domain}{item.css('a.j_th_tit::attr(href)').extract_first()}"
        data['author'] = item.css('a.frs-author-name::text').extract_first()
        data['reply'] = int(item.css('span.threadlist_rep_num::text').extract_first())
        data['date'] = item.css('.threadlist_reply_date::text').extract_first().strip()
        yield data


if __name__ == '__main__':
    tasklist = [f'{domain}/f?kw={keyword}&ie=utf-8&pn={n}' for n in range(501)]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name=f'tieba_{keyword}.csv', sort_by='reply', order='desc', no_duplicate=True)
"""
The Python node on V2EX
"""
import time
from pprint import pprint

import looter as lt

domain = 'https://www.v2ex.com'
total = []


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#TopicsNode .cell')
    for item in items:
        data = {}
        data['title'] = item.css('span.item_title a::text').extract_first()
        data['author'] = item.css('span.small.fade strong a::text').extract_first()
        data['source'] = f"{domain}{item.css('span.item_title a::attr(href)').extract_first()}"
        reply = item.css('a.count_livid::text').extract_first()
        data['reply'] = int(reply) if reply else 0
        pprint(data)
        total.append(data)
    time.sleep(1)


if __name__ == '__main__':
    tasklist = [f'{domain}/go/python?p={n}' for n in range(1, 572)]
    [crawl(task) for task in tasklist]
    lt.save(total, name='v2ex.csv', sort_by='reply', order='desc')
from pprint import pprint

import requests

import looter as lt

domain = 'https://juejin.im'
total = []


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['d']
    for item in items:
        data = {}
        data['title'] = item['title']
        data['desc'] = item['desc']
        data['author'] = item['userData']['username']
        data['profile'] = item['profile']
        data['buyCount'] = item['buyCount']
        data['price'] = item['price']
        data['publishDate'] = item['finishedAt']
        data['url'] = f"{domain}/book/{item['_id']}"
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'https://xiaoce-timeline-api-ms.juejin.im/v1/getListByLastTime?uid=5901b4faac502e0063cf9e02&client_id=1555503959385&token=eyJhY2Nlc3NfdG9rZW4iOiJuM0g1REUzUUZ0RjczNnJwIiwicmVmcmVzaF90b2tlbiI6InVJck0zcURsbjlkU2dJRm8iLCJ0b2tlbl90eXBlIjoibWFjIiwiZXhwaXJlX2luIjoyNTkyMDAwfQ%3D%3D&src=web&alias=&pageNum={n}'
        for n in range(1, 4)
    ]
    [crawl(task) for task in tasklist]
    lt.save(total, name='juejin_books.csv', sort_by='buyCount', order='desc')
        url = f'{domain}/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=0&startTime={start_time}'
        start_time = requests.get(url).json()['cmts'][-1]['startTime']
        print(start_time)
        total_timestamps.append(start_time)
    Path('maoyan_comment_timestamps.txt').write_text('\n'.join(total_timestamps))


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['cmts']
    for item in items:
        data = {}
        data['nick_name'] = item['nickName']
        data['score'] = item['score']
        data['content'] = item['content']
        data['city_name'] = item['cityName']
        pprint(data)
        total_items.append(data)


if __name__ == '__main__':
    get_timestamps()
    start_times = Path('maoyan_comment_timestamps.txt').read_text().split('\n')
    tasklist = [
        f'{domain}/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=0&startTime={t}'
        for t in start_times
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    lt.save(total_items, name='maoyan_comments.csv', no_duplicate=True)
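# The opening of get_timestamps() is missing from this fragment. A purely
# hypothetical sketch of the shape the surviving lines imply: the loop walks
# a comment cursor backwards, using each page's oldest startTime as the next
# request's cursor. The page count and initial cursor value are guesses.
def get_timestamps(pages: int = 100):
    start_time = ''  # an empty cursor is assumed to return the newest comments
    for _ in range(pages):
        url = f'{domain}/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=0&startTime={start_time}'
        start_time = requests.get(url).json()['cmts'][-1]['startTime']
        total_timestamps.append(start_time)
    Path('maoyan_comment_timestamps.txt').write_text('\n'.join(total_timestamps))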
from pprint import pprint

import requests
from parsel import Selector

import looter as lt

domain = 'https://hacpai.com'
total = []

# expand counts like '1.2K' to 1200.0; plain digit strings become ints
expand_num = lambda num: float(num[:-1]) * 1000 if 'K' in num else int(num)


def crawl(url):
    html = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['contentHTML']
    tree = Selector(text=html)
    items = tree.css('li.article-list__item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['views'] = expand_num(
            item.css('a.article-list__cnt span::text').extract_first().strip())
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/domain/play?ajax=true&p={n}' for n in range(1, 13)]
    [crawl(task) for task in tasklist]
    lt.save(total, name='hacpai.csv', no_duplicate=True, sort_by='views', order='desc')
import looter as lt


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('table.n_worklist tr')
    for item in items:
        data = {}
        data['name'] = item.css('.work_name a::text').extract_first()
        data['link'] = item.css('.work_name a::attr(href)').extract_first()
        data['maker'] = item.css('dd.maker_name a::text').extract_first()
        try:
            data['price'] = int(''.join(item.css('span.work_price::text').extract_first().split(',')))
            data['rate'] = int(item.css('.star_rating::text').re_first(r'\d+'))
            data['review'] = int(item.css('.work_review a::text').re_first(r'\d+'))
        except Exception as e:
            print(e)
            data['price'] = 0
            data['rate'] = 0
            data['review'] = 0
        if not data['name']:
            continue
        yield data


if __name__ == '__main__':
    tasklist = [
        f'https://www.dlsite.com/pro/fsr/=/language/jp/order%5B0%5D/trend/per_page/30/page/{n}'
        for n in range(1, 11252)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='dlsite.csv', sort_by='rate', order='desc')