Exemplo n.º 1
0
def main():
    """Scrape the 9GAG 'comic / hot' feed page by page.

    Each page's posts are appended to an in-memory list, persisted to a
    timestamped JSON file after every page, and their images downloaded
    asynchronously. Stops on a non-2xx status, an empty page, a missing
    nextCursor, or a request error.
    """
    output = os.path.abspath(OUTPUT)
    if not os.path.exists(output):
        os.mkdir(output)
    # One data file per run, keyed by the start timestamp.
    data_file = os.path.join(output, 'data_%s.json' % int(time.time()))
    root_url = 'https://9gag.com/v1/group-posts/group/comic/type/hot'
    items = []
    url = '%s?%s' % (root_url, '')
    r = get(url)
    print('Page: %s' % url)
    while r.status_code < 300:
        posts = r.json()['data']['posts']
        if not posts:
            break
        items.extend(posts)
        # Persist the cumulative result after every page so a crash
        # loses at most the current page.
        utils.write_dict(data_file, items)
        download_images_async(posts)
        query_next = r.json()['data']['nextCursor']
        if not query_next:
            # BUGFIX: without this break, a missing nextCursor left the
            # loop spinning on the stale response, re-appending the same
            # posts forever.
            break
        url = '%s?%s' % (root_url, query_next)
        print('Page: %s' % url)
        try:
            r = get(url)
        except Exception:
            traceback.print_exc()
            # BUGFIX: on a request failure the original kept looping with
            # the previous response, duplicating the last page endlessly.
            break
Exemplo n.º 2
0
def main():
    """Scrape the 9GAG 'comic / hot' feed page by page.

    Each page's posts are appended to an in-memory list, persisted to a
    timestamped JSON file after every page, and their images downloaded
    asynchronously. Stops on a non-2xx status, an empty page, a missing
    nextCursor, or a request error.
    """
    output = os.path.abspath(OUTPUT)
    if not os.path.exists(output):
        os.mkdir(output)
    # One data file per run, keyed by the start timestamp.
    data_file = os.path.join(output, 'data_%s.json' % int(time.time()))
    root_url = 'https://9gag.com/v1/group-posts/group/comic/type/hot'
    items = []
    url = '%s?%s' % (root_url, '')
    r = get(url)
    print('Page: %s' % url)
    while r.status_code < 300:
        posts = r.json()['data']['posts']
        if not posts:
            break
        items.extend(posts)
        # Persist the cumulative result after every page so a crash
        # loses at most the current page.
        utils.write_dict(data_file, items)
        download_images_async(posts)
        query_next = r.json()['data']['nextCursor']
        if not query_next:
            # BUGFIX: without this break, a missing nextCursor left the
            # loop spinning on the stale response, re-appending the same
            # posts forever.
            break
        url = '%s?%s' % (root_url, query_next)
        print('Page: %s' % url)
        try:
            r = get(url)
        except Exception:
            traceback.print_exc()
            # BUGFIX: on a request failure the original kept looping with
            # the previous response, duplicating the last page endlessly.
            break
Exemplo n.º 3
0
def main():
    """Fetch a public proxy list, probe each proxy against
    https://m.douban.com, and print only the proxies that responded
    with a 2xx status."""
    text = commons.get('https://pastebin.mozilla.org/?dl=9076632').text
    proxy_list = [p.strip() for p in text.split('\n')]
    # BUGFIX: the original called proxy_list.remove(p) while iterating
    # proxy_list, which skips the element following every removal.
    # Collect the working proxies into a new list instead.
    working = []
    for p in proxy_list:
        proxies = {
            "http": "http://%s" % p,
            "https": "http://%s" % p,
        }
        r = commons.get('https://m.douban.com', proxies=proxies)
        print(r.status_code)
        if r.status_code < 300:
            working.append(p)
    for p in working:
        print(p)
Exemplo n.º 4
0
def main():
    """Fetch a public proxy list, probe each proxy against
    https://m.douban.com, and print only the proxies that responded
    with a 2xx status."""
    text = commons.get('https://pastebin.mozilla.org/?dl=9076632').text
    proxy_list = [p.strip() for p in text.split('\n')]
    # BUGFIX: the original called proxy_list.remove(p) while iterating
    # proxy_list, which skips the element following every removal.
    # Collect the working proxies into a new list instead.
    working = []
    for p in proxy_list:
        proxies = {
            "http": "http://%s" % p,
            "https": "http://%s" % p,
        }
        r = commons.get('https://m.douban.com', proxies=proxies)
        print(r.status_code)
        if r.status_code < 300:
            working.append(p)
    for p in working:
        print(p)
Exemplo n.º 5
0
def main2():
    """Inspect how requests decodes a Baidu Baike page: print the
    header-declared encoding, the chardet-detected (apparent) encoding,
    any encodings declared inside the HTML itself, and a preview of the
    decoded text."""
    r = commons.get('https://baike.baidu.com/item/%E7%8C%AB%E5%92%AA')
    print(r.encoding)
    print(r.apparent_encoding)
    # BUGFIX: get_encodings_from_content applies str regexes, so passing
    # the bytes of r.content raises TypeError on Python 3 — pass the
    # decoded r.text instead.
    print(requests.utils.get_encodings_from_content(r.text))
    print(type(r.text))
    print(r.text[:300])
Exemplo n.º 6
0
def ip3366():
    """Print host:port pairs scraped from page 1 of the ip3366.net
    free-proxy listing."""
    listing_url = 'http://www.ip3366.net/free/?stype=1&page=1'
    response = commons.get(listing_url)
    document = etree.HTML(to_text(response.text))
    rows = document.xpath('//*[@id="list"]/table/tbody/tr')
    for row in rows:
        # First two cells of each table row are the host and the port.
        cells = row.xpath('./td/text()')
        print(':'.join(cells[0:2]))
Exemplo n.º 7
0
def main2():
    """Inspect how requests decodes a Baidu Baike page: print the
    header-declared encoding, the chardet-detected (apparent) encoding,
    any encodings declared inside the HTML itself, and a preview of the
    decoded text."""
    r = commons.get('https://baike.baidu.com/item/%E7%8C%AB%E5%92%AA')
    print(r.encoding)
    print(r.apparent_encoding)
    # BUGFIX: get_encodings_from_content applies str regexes, so passing
    # the bytes of r.content raises TypeError on Python 3 — pass the
    # decoded r.text instead.
    print(requests.utils.get_encodings_from_content(r.text))
    print(type(r.text))
    print(r.text[:300])
Exemplo n.º 8
0
def ip3366():
    """Print host:port pairs scraped from page 1 of the ip3366.net
    free-proxy listing."""
    page = commons.get('http://www.ip3366.net/free/?stype=1&page=1')
    tree = etree.HTML(to_text(page.text))
    for entry in tree.xpath('//*[@id="list"]/table/tbody/tr'):
        # First two cells of each table row are the host and the port.
        host_port = entry.xpath('./td/text()')[0:2]
        print(':'.join(host_port))
Exemplo n.º 9
0
def parse_doulist():
    """Return the distinct photo-album links found on a fixed Douban
    doulist page."""
    url = 'https://www.douban.com/doulist/39822487/?sort=time&sub_type=12'
    page = commons.get(url)
    tree = etree.HTML(page.text)
    anchors = tree.xpath('//a[contains(@href,"/photos/album/")]')
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.attrib['href'])
    return distinct_list(hrefs)
Exemplo n.º 10
0
def parse_doulist():
    """Return the distinct photo-album links found on a fixed Douban
    doulist page."""
    url = 'https://www.douban.com/doulist/39822487/?sort=time&sub_type=12'
    document = etree.HTML(commons.get(url).text)
    album_links = [
        anchor.attrib['href']
        for anchor in document.xpath('//a[contains(@href,"/photos/album/")]')
    ]
    return distinct_list(album_links)