Example #1
def test_save():
    # sort_by
    unordered_data = [{'r': 2}, {'r': 3}, {'r': 1}]
    lt.save(unordered_data, name='ordered.json', sort_by='r')
    ordered_data = read_json('ordered.json')
    assert ordered_data[0]['r'] == 1
    os.remove('ordered.json')

    # no_duplicate
    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save(dup_data, name='unique.json', no_duplicate=True)
    unique_data = read_json('unique.json')
    assert len(dup_data) > len(unique_data)
    os.remove('unique.json')
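# The test above assumes os, looter (imported as lt) and a small read_json
# helper are available in the test module; a minimal sketch of that helper
# could look like this:
import json

def read_json(name):
    # Load the file lt.save() just wrote so the assertions can inspect it.
    with open(name, encoding='utf-8') as f:
        return json.load(f)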
Example #2
def test_save():
    data = [{
        'rank': 2,
        'name': 'python'
    }, {
        'rank': 1,
        'name': 'js'
    }, {
        'rank': 3,
        'name': 'java'
    }]
    lt.save(data, sort_by='rank')
    with open('data.json', 'r') as f:
        ordered_data = json.loads(f.read())
    assert ordered_data[0]['rank'] == 1
    os.remove('data.json')

    dup_data = [{'a': 1}, {'a': 1}, {'b': 2}]
    lt.save(dup_data, no_duplicate=True)
    with open('data.json', 'r') as f:
        unique_data = json.loads(f.read())
    assert len(dup_data) > len(unique_data)
    os.remove('data.json')

    with pytest.raises(SystemExit):
        lt.save(data, name='data.csv')
Example #3
"""
Articles in the "程序员" (Programmer) collection on Jianshu
"""
from pprint import pprint
import looter as lt

domain = 'https://www.jianshu.com'
total = []


def crawl(url):
    try:
        tree = lt.fetch(url)
        items = tree.css('ul.note-list li')
        for item in items:
            data = {}
            data['title'] = item.css('.content a.title::text').extract_first()
            data['author'] = item.css('a.nickname::text').extract_first()
            data['source'] = f"{domain}{item.css('.content a.title::attr(href)').extract_first()}"
            data['vote'] = max(map(int, (item.css('.meta span').re(r'\d+'))))
            pprint(data)
            total.append(data)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    tasklist = [f'{domain}/c/NEt52a?order_by=top&page={n}' for n in range(1, 201)]
    [crawl(task) for task in tasklist]
    lt.save(total, name='jianshu.csv', sort_by='vote', order='desc')
Example #4
import requests
import arrow
import looter as lt

domain = 'https://sspai.com'


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['list']
    for item in items:
        data = {}
        data['title'] = item['title']
        data['released_at'] = arrow.get(item['released_at']).naive
        data['summary'] = item['summary']
        data['words_count'] = item['words_count']
        data['likes_count'] = item['likes_count']
        data['favorites_count'] = item['favorites_count']
        data['comments_count'] = item['comments_count']
        data['url'] = f"{domain}/post/{item['id']}"
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/api/v1/articles?offset={n * 10}&limit=10&type=recommend_to_home&sort=recommend_to_home_at&include_total=false'
        for n in range(1170)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='sspai.csv', sort_by='likes_count', order='desc')
Example #5
import json
from pathlib import Path
import looter as lt

encoding = 'utf-8'
total = []


def parse_json(path: Path):
    data = json.loads(path.read_text(encoding=encoding))
    pens = data[0]['data']['pens']['pens']
    results = []
    for pen in pens:
        result = {
            'title': pen['title'],
            'user': pen['owner']['username'],
            'url': pen['url'],
            'updatedAt': pen['updatedAt'],
            'comments': pen['counts']['comments'],
            'loves': pen['counts']['loves'],
            'views': pen['counts']['views']
        }
        results.append(result)
    return results


if __name__ == "__main__":
    for path in Path('.').glob('*.json'):
        total.extend(parse_json(path))
    lt.save(total, name='codepen_loved.csv',
            sort_by='loves', order='desc', no_duplicate=True)
Example #6
        data['番名'] = item['title']
        data['链接'] = item['link']
        order = item['order']
        score = order.get('score')
        data['评分'] = float(score[:-1]) if score else 0.0
        data['放送日期'] = arrow.get(order['pub_date']).naive
        season_id = item['season_id']
        data['id'] = season_id
        season = requests.get(
            f'{domain}/ext/web_api/season_count?season_id={season_id}&season_type=1'
        ).json()['result']
        data['追番人数'] = season['favorites']
        data['播放量'] = season['views']
        data['硬币数'] = season['coins']
        data['弹幕数'] = season['danmakus']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/media/web_api/search/result?order=3&sort=0&page={n}&season_type=1&pagesize=30'
        for n in range(1, 106)
    ]
    with futures.ThreadPoolExecutor(20) as executor:
        executor.map(crawl, tasklist)
    lt.save(total,
            name='bilibili_top_bangumi.csv',
            sort_by='追番人数',
            order='desc')
Example #7
import re

import looter as lt

domain = 'https://bangumi.tv'
user_id = '399538'
page_limit = 7

format_date = lambda date: '-'.join(
    f'0{d}' if len(d) == 1 else d
    for d in re.sub(r'年|月|日', '-', date)[:-1].split('-'))
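# A quick check of the helper above: the 年/月/日 markers become '-', single
# digits are zero-padded, so e.g.
assert format_date('2019年5月3日') == '2019-05-03'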


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul#browserItemList li.item')
    for item in items:
        data = {}
        data['title'] = item.css('h3 a.l::text').extract_first()
        info = item.css('p.info::text').extract_first().strip()
        date = info.split(r'/')[1].strip() if r'/' in info else info
        data['date'] = format_date(date)
        data['url'] = f"{domain}{item.css('h3 a.l::attr(href)').extract_first()}"
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/anime/list/{user_id}/collect?orderby=date&page={n}'
        for n in range(1, page_limit + 1)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='bangumi.csv', sort_by='date', order='desc')
Example #8
        item_data_list.append(item_data)
    ids = ','.join(item['post_id'] for item in items)
    stat_data_list = []
    stats = requests.get(
        f'https://sso.ifanr.com/api/v5/wp/article/stats/?limit=50&post_id__in={ids}',
        headers=lt.DEFAULT_HEADERS).json()['objects']
    for stat in stats:
        stat_data = {}
        stat_data['favorite_count'] = stat['favorite_count']
        stat_data['like_count'] = stat['like_count']
        stat_data['share_count'] = stat['share_count']
        stat_data_list.append(stat_data)
    data_list = [{**item_data, **stat_data}
                 for item_data, stat_data in zip(item_data_list, stat_data_list)]
    yield data_list


if __name__ == '__main__':
    tasklist = [
        f'https://sso.ifanr.com//api/v5/wp/web-feed/?published_at__lte=2019-05-25+07%3A00%3A11&limit=20&offset={n * 20}'
        for n in range(1900)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total,
            name='ifanr.csv',
            no_duplicate=True,
            sort_by='like_count',
            order='desc')
Example #9
"""
An archive of the movies I've watched on Douban
"""
import looter as lt

domain = 'https://movie.douban.com'
MAX_PAGE = 4


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('.list-view .item')
    for item in items:
        data = {}
        data['title'] = item.css('a::text').extract_first().strip()
        data['url'] = item.css('a::attr(href)').extract_first().strip()
        intro = item.css('span.intro::text').extract_first()
        data['date'] = intro[:10]
        data['intro'] = intro
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/people/158535797/collect?start={n * 30}&sort=time&rating=all&filter=all&mode=list'
        for n in range(0, MAX_PAGE)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='douban_movie_archive.csv')
Example #10
"""
Free foreign programming e-books on salttiger
"""
import os
from pprint import pprint
import looter as lt

domain = 'https://salttiger.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.car-monthlisting li')
    total = []
    for item in items:
        data = {}
        data['name'] = item.css('a::text').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['comments'] = int(item.css('span::text').re_first(r'(\d+)'))
        pprint(data)
        total.append(data)
    return total


if __name__ == '__main__':
    task = f'{domain}/archives/'
    result = crawl(task)
    lt.save(result, name='salttiger.csv', sort_by='comments', order='desc')
Example #11
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.css('.resources-wrapper')
    for item in items:
        data = {}
        if item.css('a::text').extract()[0] == 'Sponsor UI Movement':
            continue
        data['title'] = item.css('a::text').extract()[-2]
        data['url'] = f"{domain}{item.css('a::attr(href)').extract_first()}"
        if (vote := item.css('small.vote-count-wrapper::text').extract_first()):
            data['vote'] = int(vote)
        else:
            data['vote'] = 0
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/all-designs/?page={n}' for n in range(1, 501)]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save(total,
            name='uimovement.csv',
            no_duplicate=True,
            sort_by='vote',
            order='desc')
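# Note: manually driving the event loop as above works, but a sketch of the
# more modern equivalent (assuming Python 3.7+) would be:
#
#     async def main():
#         await asyncio.gather(*(crawl(task) for task in tasklist))
#
#     asyncio.run(main())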
Example #12
import looter as lt

domain = 'https://book.douban.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.subject-list li.subject-item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first().strip()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['pub'] = item.css('.pub::text').extract_first().strip()
        try:
            data['rating'] = float(
                item.css('span.rating_nums::text').extract_first())
        except Exception:
            data['rating'] = 0.0
        try:
            data['comments'] = int(item.css('span.pl').re_first(r'\d+'))
        except Exception:
            data['comments'] = 0
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/tag/%E8%AE%A1%E7%AE%97%E6%9C%BA?start={20 * n}&type=T'
        for n in range(0, 50)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='douban_books.csv', sort_by='comments', order='desc')
Example #13
"""
Shortcut leaderboard of the Shortcuts community (捷径社区, sharecuts.cn)
"""
from pprint import pprint
import requests
import looter as lt

domain = 'https://sharecuts.cn'
total = []


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()
    for item in items:
        data = {}
        data['name'] = item['name']
        data['category'] = item['Category']['name']
        data['note'] = item['note']
        data['author'] = item['User']['nickname']
        data['url'] = item['url']
        data['downloads'] = item['downloads_count']
        data['votes'] = item['votes_count']
        data['comments'] = item['comments_count']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    task = f'{domain}/api/shortcuts/hot?offset=0&limit=1025'
    crawl(task)
    lt.save(total, name='sharecuts.csv', sort_by='votes', order='desc')
Example #14
"""
An archive of the codemyui website
"""
import requests
from parsel import Selector
import looter as lt

domain = 'https://codemyui.com'
total = []


def crawl(url):
    tree = Selector(
        text=requests.get(url, headers=lt.DEFAULT_HEADERS).json()['html'])
    items = tree.css('.alm-layout .details')
    for item in items:
        data = {}
        data['title'] = item.css('h3 a::text').extract_first()
        data['url'] = item.css('h3 a::attr(href)').extract_first()
        data['description'] = item.css('p::text').extract_first()
        total.append(data)


if __name__ == '__main__':
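    # The URL below hits WordPress's admin-ajax.php endpoint as used by the
    # Ajax Load More plugin (action=alm_get_posts); the long post__in list is
    # presumably copied verbatim from the requests the site itself issues.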
    tasklist = [
        f'{domain}/wp-admin/admin-ajax.php?id=LoadArticleToo&post_id=0&slug=home&canonical_url=https%3A%2F%2Fcodemyui.com%2F&posts_per_page=120&page={n}&offset=0&post_type=post&repeater=default&seo_start_page=1&preloaded=false&preloaded_amount=0&cta[cta]=true&cta[cta_position]=after:4&cta[cta_repeater]=template_1&cta[cta_theme_repeater]=null&taxonomy_terms=home&order=DESC&orderby=post__in&post__in=28560,28549,28555,28541,27445,28536,28530,28511,28501,27445,28473,28462,28455,17739,27445,28426,28419,28400,25214,27445,28375,19762,25308,197,27445,24874,217,4011,248,27445,24628,23147,24834,242,27445,76,5016,23177,22950,27445,25409,24739,25519,24684,27445,201,330,22918,24952,27445,5230,20146,17596,36,27445,270,230,206,27558,27445,27415,27409,27404,27362,27445,27357,27352,27347,27342,27445,27330,27324,27319,27312,27445,27307,27286,27270,27265,27445,27260,27238,27234,27230,27445,27226,27222,27218,27213,27445,27208,27203,27190,27179,27445,27142,27062,27058,27051,27445,27046,27041,27038,27034,27445,27030,27027,27023,27020,27445,26988,26978,60,26972,27445,26968,26942,26938,26934,27445,26914,26910,26907,26903,27445,26900,26884,26896,26893,27445,26881,26878,26875,26869,27445,26872,26866,26854,26845,27445,26839,26835,26823,26831,27445,26827,26818,26814,26810,27445,26806,26791,26795,26786,27445,26777,26782,26770,26766,27445,26761,26756,26751,26747,27445,26733,26740,26722,26716,27445,26706,26702,26695,26691,27445,26687,26683,26677,26673,27445,26669,26665,26661,26656,27445,26652,26606,26602,26598,27445,26594,26573,26566,26562,27445,26556,26552,26468,26505,27445,26501,26472,26461,26449,27445,26445,26437,26434,26441,27445,26412,26416,26408,26404,27445,26400,26393,26389,26382,27445,26373,25651,25646,25637,27445,25634,25630,25626,25623,27445,25620,25617,25614,25611,27445,25540,25543,25534,25524,27445,25513,25507,25502,25490,27445,25485,25481,25477,25474,27445,25470,25462,25466,25458,27445,25454,25448,25444,25438,27445,25434,25430,25423,25419,27445,25415,25401,25405,25397,27445,25393,25388,25384,25378,27445,25374,25370,25317,25365,27445,25360,25355,25351,25342,27445,25322,25327,25337,25333,27445,25312,25009,25244,25299,27445,25135,25290,25286,25281,27445,25238,25208,25229,25232,27445,25247,25262,25226,25235,27445,25203,25170,25078,25558,27445,25177,25139,25173,25122,27445,25142,25075,24983,25116,27445,25119,25094,25099,25103,27445,25085,25072,24971,24996,27445,24989,25061,25041,25044,27445,25037,25034,25030,24986,27445,25012,25018,24978,24977,27445,24974,24958,14552,24817,27445,24853,24919,24922,24910,27445,24879,24905,24900,24896,27445,24892,24888,24884,24870,27445,25347,24845,24848,24842,27445,24839,24823,24820,24810,27445,24806,24782,24801,24797,27445,24791,24787,24749,24753,27445,24756,24746,24743,24729,27445,24719,24726,24723,24716,27445,24701,24704,24698,24694,27445,24690,24668,24678,24673,27445,24663,24659,23623,24612,27445,23592,23253,23235,23219,27445,23180,23112,23091,23070,27445,23050,23034,22974,22869,27445,20397,20369,20127,22838,27445,20111,20095,20077,20062,27445,20034,19981,19960,19941,27445,19925,19902,19880,19861,27445,19828,19812,19796,19778,27445,19747,17941,17925,17888,27445,17904,17860,17578,17670,27445,15715,17636,17771,17755,27445,17723,17250,15834,17652,27445,17562,17526,17510,17494,27445,17478,17446,17427,17373,27445,17391,17407,17234,17355,27445,17340,17322,17303,17290,27445,17218,17186,17170,15919,27445,15895,15877,15591,15850,27445,15818,15802,15786,15757,27445,15737,15699,15418,15366,27445,15666,15643,15625,15400,27445,15573,15549,15520,15503,27445,15486,15470,15452,15434,27445,1538
3,14362,14330,14813,27445,15180,15268,15228,15093,27445,15059,14347,14841,14830,27445,14864,14878,14315,14391,27445,14379,14535,14565,14596,27445,14410,14447,14428,13889,27445,13936,13953,13968,14007,27445,13314,13903,13983,14024,27445,13632,12369,13647,13662,27445,13617,13835,13602,13458,27445,13058,13473,13406,13428,27445,13443,12924,13043,13028,27445,12910,13075,13091,12895,27445,13108,12947,11817,10965,27445,12608,12625,12645,12667,27445,12568,12581,12411,12395,27445,12429,12446,12340,11802,27445,12286,12094,12076,12119,27445,12148,12052,12006,7597,27445,6161,6533,11837,10931,27445,10912,10889,10865,6389,27445,6805,6100,7582,6194,27445,6434,6741,6491,6662,27445,6628,6579,6349,6307,27445,6129,6078,6204,5972,27445,4607,5931,5150,5848,27445,5814,5790,5752,5728,27445,5682,5137,5122,5109,27445,5487,5444,5405,5367,27445,5334,5301,5158,5085,27445,5066,5000,4983,4943,27445,4925,4878,4842,3131,27445,4827,4753,4724,4689,27445,4583,4550,4564,4416,27445,4511,4379,4446,4463,27445,4399,4382,4359,4337,27445,4207,4083,4107,4097,27445,4073,4244,4224,4139,27445,3981,4038,3963,3942,27445,3919,3902,3830,3844,27445,3403,3761,3742,3708,27445,3310,3377,3238,3659,27445,3547,3573,3590,3290,27445,3300,3280,3325,3357,27445,3331,3221,3204,3166,27445,3112,3092,2945,3053,27445,3024,3007,1407,2602,27445,2965,2523,2535,2571,27445,2529,2580,2586,2594,27445,2755,2365,1813,2382,27445,2499,2481,2464,2445,27445,2427,2256,2297,2275,27445,2349,2321,2240,2191,27445,2167,2137,2124,2083,27445,1895,1515,1925,1755,27445,1870,1848,1773,1742,27445,1549,1531,1688,1669,27445,1603,1639,1622,1586,27445,1436,1567,1479,1463,27445,1409,1422,1432,1244,27445,1428,1419,1414,1400,27445,1395,1387,1384,1379,27445,1317,1355,1351,1336,27445,1229,1251,1332,1233,27445,1323,1313,1296,1292,27445,1288,1255,1238,1226,27445,1223,1219,1211,929,27445,926,923,917,912,27445,908,905,291,292,27445,16,17,18,19,27445,20,21,22,23,27445,24,25,26,27,27445,28,29,30,31,27445,32,33,34,35,27445,37,38,39,40,27445,41,42,43,44,27445,45,46,47,48,27445,1345,49,50,51,27445,52,53,54,55,27445,56,57,58,59,27445,61,62,63,64,27445,65,66,67,68,27445,69,70,71,72,27445,73,74,75,77,27445,78,79,80,81,27445,82,83,84,85,27445,86,87,88,89,27445,90,91,92,93,27445,94,95,96,97,27445,98,99,100,101,27445,102,103,104,105,27445,106,107,108,109,27445,110,111,112,113,27445,114,115,116,117,27445,118,119,120,121,27445,122,123,124,125,27445,126,127,128,129,27445,130,131,132,133,27445,134,135,136,137,27445,138,139,140,141,27445,142,143,144,145,27445,146,147,148,149,27445,150,151,152,153,27445,154,155,156,157,27445,158,159,160,161,27445,162,163,164,165,27445,166,167,168,169,27445,170,171,172,173,27445,174,175,176,177,27445,178,179,180,181,27445,182,183,184,185,27445,186,187,188,189,27445,190,191,192,193,27445,194,195,196,198,27445,199,200,202,203,27445,204,205,207,208,27445,209,210,211,212,27445,213,214,215,216,27445,218,219,220,221,27445,222,223,224,225,27445,226,227,228,229,27445,231,232,233,234,27445,235,236,237,238,27445,239,240,241,243,27445,244,245,246,247,27445,249,250,251,252,27445,253,254,255,256,27445,257,258,259,260,27445,261,262,263,264,27445,265,266,267,268,27445,269,271,272,273,27445,274,275,276,277,27445,278,279,280,281,27445,282,283,284,285,27445,286,287,288,289,27445,290&action=alm_get_posts&query_type=standard'
        for n in range(0, 9)
    ]
    [crawl(task) for task in tasklist]
    lt.save(total, name='codemyui.csv')
Example #15
"""
An archive of the books I've read on Douban
"""
import looter as lt

domain = 'https://book.douban.com'
MAX_PAGE = 4
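# Presumably the full personal collection is only visible to a logged-in user,
# so the session cookie below is copied straight from the browser.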
cookie = 'll="108169"; bid=uRkL12JXzhg; gr_user_id=59ffc171-7ba8-45d4-9848-d04cae3b99cc; _vwo_uuid_v2=DB7C868DC8A7D64AC4F5CA5FABCD30D38|4602ea9fbee2cc9464c993b2583c33b2; __yadk_uid=F3jytnOVCtD1iSsRYhBQoJCWnYu4LlvH; __gads=ID=15a1f5c9d2c8e1c1:T=1565230512:S=ALNI_MbiSb0-9zzDosCeLic43kt9Wajqxw; douban-profile-remind=1; __utmv=30149280.15853; viewed="26349497_25909351_25920727_30170670_25872086_30386804_30239781_4279678_2280547_1139336"; push_noty_num=0; push_doumail_num=0; __utmz=81379588.1575535591.31.25.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/158535797/; __utmz=30149280.1575727536.50.8.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1575772677%2C%22https%3A%2F%2Fwww.douban.com%2Fpeople%2F158535797%2F%22%5D; _pk_ses.100001.3ac3=*; __utma=30149280.535054538.1565077784.1575727536.1575772677.51; __utmc=30149280; __utmt_douban=1; __utmc=81379588; __utma=81379588.2002100436.1565077801.1575535591.1575772677.32; __utmt=1; dbcl2="158535797:Ox8dyd/5hW4"; ck=SDvR; __utmt=1; _pk_id.100001.3ac3=6a5ac498d3bf471e.1565077801.32.1575772866.1575536822.; __utmb=30149280.9.10.1575772677; __utmb=81379588.7.10.1575772677'
headers = lt.DEFAULT_HEADERS
headers['Cookie'] = cookie


def crawl(url):
    tree = lt.fetch(url, headers=headers)
    items = tree.css('.list-view .item')
    for item in items:
        data = {}
        data['title'] = item.css('a::text').extract_first().strip()
        data['url'] = item.css('a::attr(href)').extract_first().strip()
        intro = item.css('span.intro::text').extract_first()
        data['date'] = intro.split('/')[-2].strip()
        data['intro'] = intro
        yield data


if __name__ == '__main__':
    tasklist = [f'{domain}/people/158535797/collect?sort=time&start=0&filter=all&mode=list&tags_sort=count']
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='douban_book_archive.csv')
Example #16
        data = requests.get(url, params=params, headers=headers).json()
        uniqid = data['data']['uniqid']
        user_indexes = data['data']['userIndexes'][0]
        key = requests.get(f'{domain}/Interface/api/ptbk?uniqid={uniqid}',
                           headers=headers).json()['data']
        encrypted_data = {kind: user_indexes[kind]['data'] for kind in kinds}
        decrypted_data = {
            kind: decrypt(key, d).split(',')
            for kind, d in encrypted_data.items()
        }
        date_range = pd.date_range(start_date, end_date).to_native_types()
        result = []
        for kind, indexes in decrypted_data.items():
            rows = [{
                'kind': kind,
                'date': date,
                'index': index,
                'keyword': word
            } for date, index in zip(date_range, indexes)]
            result.extend(rows)
            logger.info((rows[0], rows[-1]))
        total.extend(result)
        time.sleep(5)
    except Exception as e:
        logger.error(f'failed to crawl {word}: {e}')


if __name__ == '__main__':
    [crawl(word) for word in words]
    lt.save(total, name=f'{name}.csv')
Example #17
"""
Not saying more, you know what this is
"""
from pprint import pprint
from concurrent import futures
import looter as lt

domain = 'https://www.javbus.pw'
total = []


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#waterfall .item')
    for item in items:
        data = {}
        data['name'] = item.css('img::attr(title)').extract_first()
        data['cover'] = item.css('img::attr(src)').extract_first()
        data['link'] = item.css('.movie-box::attr(href)').extract_first()
        data['bango'] = item.css('date::text').extract_first()
        data['date'] = item.css('date::text').extract()[1]
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/page/{i}' for i in range(1, 90)]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    lt.save(total, name='jav.json')
Example #18
            continue
        data = {}
        data['title'] = question['title']
        data['source'] = f"{domain}/question/{question['id']}/answer/{target['id']}"
        data['vote'] = target['voteup_count']
        pprint(data)
        total.append(data)


def generate():
    with open('data.json', encoding=encoding) as i, \
            open('zhihu_top_video.md', 'w', encoding=encoding) as o:
        data = json.loads(i.read())
        o.writelines(f"{i+1}. [{d['title']}]({d['source']})赞数:{d['vote']}\n"
                     for i, d in enumerate(data))


if __name__ == '__main__':
    tasklist = [
        f'{domain}/api/v4/topics/19776749/feeds/essence?&offset={10 * n}&limit=10'
        for n in range(100)
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    save(total, sort_by='vote', order='desc')
    generate()
    os.remove('data.json')
Example #19
"""
The most popular shots on Dribbble
"""
import looter as lt

domain = 'https://dribbble.com'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('li.shot-thumbnail')
    for item in items:
        data = {}
        data['title'] = item.css('a strong::text').extract_first()
        data['url'] = f"{domain}{item.css('a::attr(href)').extract_first()}"
        data['author'] = item.css('.display-name::text').extract_first()
        data['fav'] = int(item.css('span.toggle-fav::text').extract_first().strip())
        data['comment'] = int(item.css('li.cmnt span::text').extract_first().strip())
        yield data


if __name__ == '__main__':
    tasklist = [f'{domain}/shots/popular?timeframe=ever&page={n}&per_page=24' for n in range(1, 51)]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='dribbble.csv', sort_by='fav', no_duplicate=True, order='desc')
Example #20
"""
Ranking of free wallpapers on Unsplash
"""
from pprint import pprint
from concurrent import futures
import requests
import looter as lt

domain = 'https://unsplash.com'
total = []


def crawl(url):
    imgs = requests.get(url, headers=lt.DEFAULT_HEADERS).json()
    for img in imgs:
        data = {}
        data['created'] = img['created_at']
        data['url'] = img['urls']['full']
        data['likes'] = img['likes']
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'{domain}/napi/collections/1065976/photos?page={n}&per_page=10&order_by=latest&share_key=a4a197fc196734b74c9d87e48cc86838'
        for n in range(1, 136)
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    lt.save(total, name='unsplash.csv', sort_by='likes', order='desc')
Example #21
import requests
import looter as lt

domain = 'https://www.zhihu.com'
encoding = 'utf-8'


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['data']
    for item in items:
        target = item['target']
        question = target.get('question')
        if not question:  # to keep only videos, also add: or not target.get('topic_thumbnails_extra_info')
            continue
        data = {}
        data['title'] = question['title']
        data['source'] = f"{domain}/question/{question['id']}/answer/{target['id']}"
        data['vote'] = target['voteup_count']
        yield data


if __name__ == '__main__':
    tasklist = [
        f'{domain}/api/v4/topics/19776749/feeds/essence?&offset={10 * n}&limit=10'
        for n in range(100)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='zhihu_top.csv', sort_by='vote', order='desc')
Example #22
def crawl(task: str):
    try:
        json = requests.get(url.format(q=task), headers=headers).json()
        pprint(json)
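        # GitHub's search API returns a 'message' field when rate limited;
        # back off briefly (the current response is still parsed below, so a
        # rate-limited request simply yields no repo data).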
        if json.get('message'):
            time.sleep(10)
        item = json.get('items', [None])[0]
        data = {}
        data['task'] = task
        if item:
            data['repo_name'] = item.get('full_name')
            data['repo_url'] = f"https://github.com/{item.get('full_name')}"
            data['stars'] = item.get('stargazers_count')
            data['forks'] = item.get('forks_count')
            data['watchers'] = item.get('watchers_count')
            pprint(data)
            total.append(data)
    except Exception as e:
        print(f'[Err] {e}')


if __name__ == "__main__":
    tasklist = Path(r'tldr_github.txt').read_text().split(', ')
    [crawl(task) for task in tasklist]
    save(total,
         name='tldr_github.csv',
         sort_by='stars',
         no_duplicate=True,
         order='desc')
Example #23
"""
Appinn (小众软件), sorted by comment count
"""
import asyncio
import looter as lt
from pprint import pprint

domain = 'https://www.appinn.com'
categories = ['windows', 'chrome']
total = []


async def crawl(url):
    tree = await lt.async_fetch(url)
    items = tree.css('section#latest-posts article.post-box')
    for item in items:
        data = {}
        data['title'] = item.css('a::attr(title)').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['date'] = item.css('span.thetime span::text').extract_first()
        data['comments'] = int(item.css("a[itemprop='interactionCount']::text").extract_first())
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/category/{category}/page/{n}/' for n in range(1, 21) for category in categories]
    loop = asyncio.get_event_loop()
    result = [crawl(task) for task in tasklist]
    loop.run_until_complete(asyncio.wait(result))
    lt.save(total, name='appinn.csv', sort_by='comments', order='desc')
Example #24
"""
Not saying more, you know what this is
"""
import looter as lt

domain = 'https://www.javbus.pw'


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#waterfall .item')
    for item in items:
        data = {}
        data['name'] = item.css('img::attr(title)').extract_first()
        data['cover'] = item.css('img::attr(src)').extract_first()
        data['link'] = item.css('.movie-box::attr(href)').extract_first()
        data['bango'] = item.css('date::text').extract_first()
        data['date'] = item.css('date::text').extract()[1]
        yield data


if __name__ == '__main__':
    tasklist = [f'{domain}/page/{i}' for i in range(1, 90)]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='jav.csv', sort_by='date', order='desc')
Example #25

def crawl(url):
    text = requests.get(url, headers=lt.DEFAULT_HEADERS).text
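    # Tieba ships the thread list wrapped in HTML comments (presumably for
    # lazy rendering), so the comment markers are stripped before parsing.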
    text = text.replace('<!--', '').replace('-->', '')
    tree = Selector(text=text)
    items = tree.css('ul#thread_list li.j_thread_list')
    for item in items:
        data = {}
        data['title'] = item.css('a.j_th_tit::text').extract_first()
        data['abstract'] = item.css('.threadlist_abs::text').extract_first().strip()
        data['url'] = f"{domain}{item.css('a.j_th_tit::attr(href)').extract_first()}"
        data['author'] = item.css('a.frs-author-name::text').extract_first()
        data['reply'] = int(item.css('span.threadlist_rep_num::text').extract_first())
        data['date'] = item.css('.threadlist_reply_date::text').extract_first().strip()
        yield data


if __name__ == '__main__':
    tasklist = [f'{domain}/f?kw={keyword}&ie=utf-8&pn={n}' for n in range(501)]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total,
            name=f'tieba_{keyword}.csv',
            sort_by='reply',
            order='desc',
            no_duplicate=True)
Example #26
"""
The Python node on V2EX
"""
import time
import looter as lt
from pprint import pprint
from concurrent import futures

domain = 'https://www.v2ex.com'
total = []


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#TopicsNode .cell')
    for item in items:
        data = {}
        data['title'] = item.css('span.item_title a::text').extract_first()
        data['author'] = item.css('span.small.fade strong a::text').extract_first()
        data['source'] = f"{domain}{item.css('span.item_title a::attr(href)').extract_first()}"
        reply = item.css('a.count_livid::text').extract_first()
        data['reply'] = int(reply) if reply else 0
        pprint(data)
        total.append(data)
    time.sleep(1)


if __name__ == '__main__':
    tasklist = [f'{domain}/go/python?p={n}' for n in range(1, 572)]
    [crawl(task) for task in tasklist]
    lt.save(total, name='v2ex.csv', sort_by='reply', order='desc')
Example #27
import requests
import looter as lt
from pprint import pprint

domain = 'https://juejin.im'
total = []


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['d']
    for item in items:
        data = {}
        data['title'] = item['title']
        data['desc'] = item['desc']
        data['author'] = item['userData']['username']
        data['profile'] = item['profile']
        data['buyCount'] = item['buyCount']
        data['price'] = item['price']
        data['publishDate'] = item['finishedAt']
        data['url'] = f"{domain}/book/{item['_id']}"
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [
        f'https://xiaoce-timeline-api-ms.juejin.im/v1/getListByLastTime?uid=5901b4faac502e0063cf9e02&client_id=1555503959385&token=eyJhY2Nlc3NfdG9rZW4iOiJuM0g1REUzUUZ0RjczNnJwIiwicmVmcmVzaF90b2tlbiI6InVJck0zcURsbjlkU2dJRm8iLCJ0b2tlbl90eXBlIjoibWFjIiwiZXhwaXJlX2luIjoyNTkyMDAwfQ%3D%3D&src=web&alias=&pageNum={n}'
        for n in range(1, 4)
    ]
    [crawl(task) for task in tasklist]
    lt.save(total, name='juejin_books.csv', sort_by='buyCount', order='desc')
Example #28
        url = f'{domain}/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=0&startTime={start_time}'
        start_time = requests.get(url).json()['cmts'][-1]['startTime']
        print(start_time)
        total_timestamps.append(start_time)
    Path('maoyan_comment_timestamps.txt').write_text(
        '\n'.join(total_timestamps))


def crawl(url):
    items = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['cmts']
    for item in items:
        data = {}
        data['nick_name'] = item['nickName']
        data['score'] = item['score']
        data['content'] = item['content']
        data['city_name'] = item['cityName']
        pprint(data)
        total_items.append(data)


if __name__ == '__main__':
    get_timestamps()
    start_times = Path('maoyan_comment_timestamps.txt').read_text().split('\n')
    tasklist = [
        f'{domain}/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=0&startTime={t}'
        for t in start_times
    ]
    with futures.ThreadPoolExecutor(50) as executor:
        executor.map(crawl, tasklist)
    lt.save(total_items, name='maoyan_comments.csv', no_duplicate=True)
Example #29
from pprint import pprint

import requests
from parsel import Selector
import looter as lt

domain = 'https://hacpai.com'
total = []
expand_num = lambda num: float(num[:-1]) * 1000 if 'K' in num else int(num)
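# Quick sanity checks of expand_num: 'K'-suffixed counts expand to thousands,
# plain counts parse as ints.
assert expand_num('1.2K') == 1200.0
assert expand_num('345') == 345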


def crawl(url):
    html = requests.get(url, headers=lt.DEFAULT_HEADERS).json()['contentHTML']
    tree = Selector(text=html)
    items = tree.css('li.article-list__item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['views'] = expand_num(
            item.css('a.article-list__cnt span::text').extract_first().strip())
        pprint(data)
        total.append(data)


if __name__ == '__main__':
    tasklist = [f'{domain}/domain/play?ajax=true&p={n}' for n in range(1, 13)]
    [crawl(task) for task in tasklist]
    lt.save(total,
            name='hacpai.csv',
            no_duplicate=True,
            sort_by='views',
            order='desc')
Example #30
import looter as lt


def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('table.n_worklist tr')
    for item in items:
        data = {}
        data['name'] = item.css('.work_name a::text').extract_first()
        data['link'] = item.css('.work_name a::attr(href)').extract_first()
        data['maker'] = item.css('dd.maker_name a::text').extract_first()
        try:
            data['price'] = int(item.css('span.work_price::text').extract_first().replace(',', ''))
            data['rate'] = int(item.css('.star_rating::text').re_first(r'\d+'))
            data['review'] = int(item.css('.work_review a::text').re_first(r'\d+'))
        except Exception as e:
            print(e)
            data['price'] = 0
            data['rate'] = 0
            data['review'] = 0
        if not data['name']:
            continue
        yield data


if __name__ == '__main__':
    tasklist = [
        f'https://www.dlsite.com/pro/fsr/=/language/jp/order%5B0%5D/trend/per_page/30/page/{n}'
        for n in range(1, 11252)
    ]
    total = lt.crawl_all(crawl, tasklist)
    lt.save(total, name='dlsite.csv', sort_by='rate', order='desc')