Example No. 1
import requests
import threading
import time

# cpn is the project's helper module (see the sketch below); toppost100_url,
# toppost100_headers and pic_urls_file are module-level globals defined elsewhere.


def get_toppost100(params):
    # Fetch the Top 100 list, retrying with a fresh proxy until a parse succeeds.
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.get(toppost100_url,
                                params=params,
                                headers=toppost100_headers,
                                proxies=proxy_ip,
                                timeout=5)
            if resp is not None:
                print("Fetching: " + resp.request.url)
                soup = cpn.get_bs(resp.text)
                ul = soup.find(
                    'ul',
                    attrs={
                        'class':
                        'l-clearfix gridList workImageCards js-workTopList'
                    })
                lis = ul.findAll('li')
                for li in lis:
                    # Drop the trailing 4-character suffix (presumably a size
                    # marker) from the image URL.
                    img = li.find('img', attrs={'class':
                                                'cardImage'})['src'][:-4]
                    if img:
                        name = li.find('p', attrs={
                            'class': 'fz14 text cut'
                        }).get_text().strip()
                        if name == '':
                            # Fall back to a timestamp when the card has no title.
                            name = str(int(time.time()))
                        # 'Θ' separates the name and URL fields in the record.
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            print(threading.current_thread().name + "~" + str(e))
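Every example on this page leans on a project helper module imported as cpn, whose source is not shown. A minimal sketch of what those helpers could look like, assuming get_proxy_ip returns a requests-style proxies dict drawn from a local pool, get_bs wraps BeautifulSoup, and write_str_data appends one record per line (all of this is an assumption, not the project's actual code):

# Hypothetical stand-in for the cpn helper module used by these examples.
import random
from bs4 import BeautifulSoup

PROXY_POOL = ['127.0.0.1:8888']  # placeholder; the real pool is filled elsewhere

def get_proxy_ip():
    # Return a proxies dict in the shape requests expects.
    ip = random.choice(PROXY_POOL)
    return {'http': 'http://' + ip, 'https': 'https://' + ip}

def get_bs(markup):
    # Wrap raw HTML in a BeautifulSoup parse tree.
    return BeautifulSoup(markup, 'html.parser')

def write_str_data(line, file_name):
    # Append one record per line to the given file.
    with open(file_name, 'a+', encoding='utf-8') as f:
        f.write(line + '\n')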
Example No. 2
def get_ajax_data(data):
    # POST to the AJAX endpoint, retrying with a fresh proxy until a parse succeeds.
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.post(ajax_url,
                                 data=data,
                                 headers=ajax_headers,
                                 proxies=proxy_ip,
                                 timeout=5)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                lis = soup.findAll('li')
                for li in lis:
                    img = li.find('img', attrs={'class':
                                                'cardImage'})['src'][:-4]
                    if img:
                        name = li.find('p', attrs={
                            'class': 'fz14 text cut'
                        }).get_text().strip()
                        if name == '':
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            print(threading.current_thread().name + "~" + str(e))
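The thread name in the error log suggests these fetchers run on worker threads. A minimal driver sketch, assuming the AJAX endpoint is paged through a payload built per page (build_data and the page range are illustrative placeholders, not taken from the original project):

import threading

def build_data(page):
    # Hypothetical payload builder; the real field names are site-specific.
    return {'p': page}

threads = [threading.Thread(target=get_ajax_data,
                            args=(build_data(p),),
                            name='crawler-%d' % p) for p in range(1, 5)]
for t in threads:
    t.start()
for t in threads:
    t.join()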
Example No. 3
def get_page_count():
    # Read the total page count from the pager's "last page" link.
    try:
        resp = requests.get(tiezi_url, headers=headers, timeout=5)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            a_s = soup.find("ul", attrs={'class': 'l_posts_num'}).findAll("a")
            for a in a_s:
                if a.get_text() == '尾页':  # '尾页' = the "last page" link
                    return a['href'].split('=')[1]
    except Exception as e:
        print(str(e))
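The value returned here is the pn query parameter taken from the "last page" link, still as a string. A plausible way to consume it together with Example No. 11's get_pics (the 1-based page-numbering scheme is an assumption; verify it against the real href):

page_count = get_page_count()
if page_count is not None:
    for pn in range(1, int(page_count) + 1):
        get_pics(pn)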
Example No. 5
def get_page_count():
    # Read the total page count from the last link in the #papelist pager.
    try:
        resp = requests.get(list_url, headers=headers, timeout=5)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            div = soup.find('div', attrs={'id': 'papelist'})
            page_count = (div.findAll('a')[-1]['href']).split('/')[-1]
            print("Parsed article page count: " + page_count)
            return page_count
    except Exception as e:
        print(str(e))
Example No. 6
def catch_page_count():
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(base_url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            # Get the last page number from the pager
            last_page_count = soup.find('div',
                                        attrs={'class': 'pagination'
                                               }).findAll('a')[-2].get_text()
            return last_page_count
    except Exception as e:
        print(str(e))
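The findAll('a')[-2] indexing assumes the pager's last anchor is a "next page" link, so the highest page number sits one before it. A self-contained check against illustrative markup (the real page's structure is assumed, not captured):

from bs4 import BeautifulSoup

html = ('<div class="pagination">'
        '<a>1</a><a>2</a><a>25</a><a>Next</a></div>')
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('div', attrs={'class': 'pagination'}).findAll('a')[-2].get_text())
# prints: 25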
Example No. 7
def get_article_url(url):
    # Collect every article link on one list page and persist it.
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp is not None:
            print("Parsing: " + resp.request.url)
            soup = cpn.get_bs(resp.text)
            div = soup.find('div', attrs={'id': 'article_list'})
            spans = div.findAll('span', attrs={'class': 'link_title'})
            for span in spans:
                cpn.write_str_data(base_url + span.find('a')['href'], articles_file)
            return None
    except Exception as e:
        print(str(e))
Example No. 8
def catch_page_count():
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        try:
            resp = requests.get(base_url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print(proxy_ip)
                soup = cpn.get_bs(resp.text)
                # Get the last page number from the pager
                last_page_count = soup.find('div', attrs={'class': 'pagination'}).findAll('a')[-2].get_text()
                return last_page_count
        except Exception:
            pass  # swallow the error and retry with a fresh proxy
Example No. 9
def catch_ip(url):
    proxy_ip = {'https': 'https://' + cpn.get_dx_proxy_ip()}
    try:
        resp = requests.get(url, headers=headers, proxies=proxy_ip)
        if resp is not None:
            soup = cpn.get_bs(resp.text)
            trs = soup.find('table').findAll('tr')[1:]
            for tr in trs:
                # The bar's title holds a number plus a one-character unit;
                # strip the unit and keep rows whose value exceeds 1.
                if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                    tds = tr.findAll('td')
                    cpn.write_xc_ip_file(tds[1].get_text() + ":" +
                                         tds[2].get_text())
    except Exception as e:
        print(str(e))
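cpn.write_xc_ip_file persists ip:port pairs for later use. A hypothetical stand-in, plus an optional liveness probe before trusting a harvested proxy (both are sketches, not the project's code):

import requests

def write_xc_ip_file(ip_port, file_name='xc_ip.txt'):
    # Hypothetical stand-in: append one ip:port per line.
    with open(file_name, 'a+', encoding='utf-8') as f:
        f.write(ip_port + '\n')

def is_alive(ip_port, test_url='https://httpbin.org/ip'):
    # Probe the proxy with a short timeout; any network error counts as dead.
    try:
        resp = requests.get(test_url,
                            proxies={'https': 'https://' + ip_port},
                            timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False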
Example No. 10
def catch_ip(url):
    while True:
        proxy_ip = {'http': 'http://' + cpn.get_dx_proxy_ip()}
        print(proxy_ip)
        try:
            resp = requests.get(url, headers=headers, proxies=proxy_ip, timeout=10)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                trs = soup.find('table').findAll('tr')[1:]
                for tr in trs:
                    # The bar's title holds a number plus a one-character unit;
                    # strip the unit and keep rows whose value exceeds 1.
                    if float(tr.find('div', attrs={'class': 'bar'})['title'][:-1]) > 1:
                        tds = tr.findAll('td')
                        cpn.write_xc_ip_file(tds[1].get_text() + ":" + tds[2].get_text())
                return None
        except Exception:
            pass  # retry with a fresh proxy
Example No. 11
def get_pics(count):
    # Fetch one AJAX page of the thread and record every image URL,
    # retrying until the request succeeds.
    while True:
        # 't' is a timestamp cache-buster for the AJAX endpoint.
        params = {'pn': count, 'ajax': '1', 't': int(time.time())}
        try:
            resp = requests.get(tiezi_url,
                                headers=headers,
                                timeout=5,
                                params=params)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                imgs = soup.findAll('img', attrs={'class': 'BDE_Image'})
                for img in imgs:
                    cpn.write_str_data(img['src'], pic_urls_file)
                return None
        except Exception:
            pass  # retry the same page
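These functions only record image URLs into pic_urls_file; the actual download step is not shown on this page. A minimal downloader sketch over that file (the one-URL-per-line layout and the save directory are assumptions):

import os
import requests

def download_pics(urls_file, save_dir='pics'):
    os.makedirs(save_dir, exist_ok=True)
    with open(urls_file, encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if not url:
                continue
            try:
                resp = requests.get(url, timeout=10)
                if resp.status_code == 200:
                    # Name the file after the last path segment of the URL.
                    fname = url.split('/')[-1] or 'unnamed'
                    with open(os.path.join(save_dir, fname), 'wb') as out:
                        out.write(resp.content)
            except requests.RequestException:
                pass  # skip URLs that fail; the rest continue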