from bs4 import BeautifulSoup

# WebParser, ProxyRecord and the logger bound to self.logger are assumed to be
# provided by the surrounding project and are not shown in this excerpt.


class ProxyPoolScraper:
    def __init__(self, url, bs_parser="lxml"):
        self.parser = WebParser(url)
        self.bs_parser = bs_parser

    def get_proxy_stream(self, limit):
        raw_records = self.extract_table_raw_records()
        clean_records = list(
            map(self._clear_up_record, raw_records)
        )
        for record in clean_records[:limit]:
            self.logger.info(f"Proxy record: {record}")
            if record:
                yield ProxyRecord(*record)

    def extract_table_raw_records(self):
        content = self.parser.get_content()
        soup_object = BeautifulSoup(content, self.bs_parser)
        return (
            soup_object
            .find(id="list")
            .find_all("tr")
        )

    def _clear_up_record(self, raw_record):
        return [
            val.text for val
            in raw_record.find_all("td")
        ]
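
# Minimal usage sketch for the scraper above; the proxy-list URL is a placeholder.
scraper = ProxyPoolScraper("https://example.com/proxy-list")
for record in scraper.get_proxy_stream(limit=20):
    print(record)
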
class ProxyPoolValidator:
    def __init__(self, url, timeout=10):
        self.timeout = timeout
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        content = self.parser.get_content(timeout=self.timeout,
                                          proxies=proxy_record.proxy)
        proxy_status = ProxyStatus(proxy_record.proxy, content is not None)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
Example #3
import atoma


class NewsProducer:
    def __init__(self, rss_feed):
        self.parser = WebParser(rss_feed, rotate_header=True)
        self.formatter = NewsFormatter()

    def _extract_news_feed_items(self, proxies):
        content = self.parser.get_content(proxies=proxies)
        news_feed = atoma.parse_rss_bytes(content)
        return news_feed.items

    def get_news_stream(self, proxies):
        news_feed_items = self._extract_news_feed_items(proxies)
        for entry in news_feed_items:
            formatted_entry = self.formatter.format_entry(entry)
            yield formatted_entry
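
# Minimal usage sketch for NewsProducer; the RSS URL and the proxies dict are
# placeholders in the usual requests format.
producer = NewsProducer("https://example.com/feed.rss")
proxies = {"http": "http://10.10.1.10:3128", "https": "http://10.10.1.10:1080"}
for entry in producer.get_news_stream(proxies):
    print(entry)
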
import time


class ProxyPoolValidator:
    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        consecutive_checks = []
        for _ in range(self.checks):
            content = self.parser.get_content(timeout=self.timeout,
                                              proxies=proxy_record.proxy)
            time.sleep(self.sleep_interval)
            consecutive_checks.append(int(content is not None))

        health = sum(consecutive_checks) / self.checks
        proxy_status = ProxyStatus(proxy=proxy_record.proxy,
                                   health=health,
                                   is_valid=health > 0.66)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
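
# Sketch of wiring the scraper and the health-checking validator together
# (placeholder URLs; ProxyStatus, ProxyRecord and WebParser come from the
# surrounding project).
scraper = ProxyPoolScraper("https://example.com/proxy-list")
validator = ProxyPoolValidator("https://example.com/health-check", checks=3)
valid_proxies = [
    status for status in map(validator.validate_proxy,
                             scraper.get_proxy_stream(limit=20))
    if status.is_valid
]
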
Example #5
    def __init__(self, url, bs_parser="lxml"):
        self.parser = WebParser(url)
        self.bs_parser = bs_parser

    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        self.parser = WebParser(url, rotate_header=True)
Example #7
import json
import sys

from bot import Bot
from message import PhoneMessage
from parser import WebParser

def get_amazon_links():
    with open('data/links.json') as links_file:
        json_links = json.load(links_file)
        wipe_products_links = json_links['Wipes']
        spray_products_links = json_links['Spray']
        return wipe_products_links, spray_products_links
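
# get_amazon_links above expects data/links.json to look roughly like this
# (hypothetical URLs; only the 'Wipes' and 'Spray' keys are read):
# {
#     "Wipes": ["https://www.amazon.com/dp/XXXXXXXXXX"],
#     "Spray": ["https://www.amazon.com/dp/YYYYYYYYYY"]
# }
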

if __name__ == "__main__":
    wipe_products_links, spray_products_links = get_amazon_links()
    web_parser = WebParser()

    bot = Bot(web_parser)
    bot.start_scrapying_process(wipe_products_links)
    bot.start_scrapying_process(spray_products_links)

    stocked_products_urls = bot.webparser.stocked_product_links
    # Couldn't find any products that were in stock
    if not stocked_products_urls:
        sys.exit(1)
    else:
        phone_message = PhoneMessage(stocked_products_urls)
        phone_message.send_message()
    def __init__(self, url, timeout=10):
        self.timeout = timeout
        self.parser = WebParser(url, rotate_header=True)
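
# Note: url and expected below are presumably supplied by a pytest parametrize
# decorator that is not shown in this excerpt.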
def test__str__representation(url, expected):
    web_parser = WebParser(url)

    result = str(web_parser)

    assert result == expected
Example #10
def web_parser():
    yield WebParser(TEST_URL)
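
# A minimal, hypothetical test using the web_parser fixture above (assumptions:
# the function is decorated with @pytest.fixture in the original module, and
# get_content returns bytes or None).
def test_get_content_returns_bytes_or_none(web_parser):
    content = web_parser.get_content()
    assert content is None or isinstance(content, bytes)
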
Example #11
    def __init__(self, rss_feed):
        self.parser = WebParser(rss_feed, rotate_header=True)
        self.formatter = NewsFormatter()

    def __init__(self, keywords_dict, redis_key):
        # self.bloom_filter = BloomFilter(redis.StrictRedis(host='localhost', port=6379), 'job_url')
        self.parser = WebParser(redis_key)
        self.keywords = keywords_dict

# Note: this example also relies on project-level names not shown here
# (headers, get_proxy, MongoDB, WebParser).
import time

import requests


class JobSpider(object):
    def __init__(self, keywords_dict, redis_key):
        # self.bloom_filter = BloomFilter(redis.StrictRedis(host='localhost', port=6379), 'job_url')
        self.parser = WebParser(redis_key)
        self.keywords = keywords_dict

    def crawl_zhilian(self, city, keyword):
        # url_list = []  # TODO: turn url_list into a stack
        begin_url = 'https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90&cityId={city}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={keyword}&kt=3'
        database = MongoDB('zhilian', self.keywords[keyword])

        url_list = self._get_list(begin_url,
                                  city,
                                  keyword,
                                  page_weight=90,
                                  web_name='zhilian')

        print(keyword, city, 'list parser done!')
        print(len(url_list))

        self._get_content(database, url_list, web_name='zhilian')

    def crawl_qiancheng(self, city, keyword):
        begin_url = 'https://search.51job.com/list/{city},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        database = MongoDB('qiancheng', self.keywords[keyword])

        url_list = self._get_list(begin_url,
                                  city,
                                  keyword,
                                  page_begin=1,
                                  web_name='qiancheng')

        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))

        self._get_content(database, url_list, web_name='qiancheng')

    def crawl_liepin(self, city, keyword):
        begin_url = "https://www.liepin.com/city-{city}/zhaopin/pn{page}/?d_pageSize=40&jobKind=2&key={keyword}"
        database = MongoDB('liepin', self.keywords[keyword])

        url_list = self._get_list(begin_url,
                                  city,
                                  keyword,
                                  page_begin=0,
                                  web_name='liepin')

        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))

        self._get_content(database, url_list, web_name='liepin')

    def crawl_boss(self):
        pass

    def crawl_shixi(self):
        pass

    def crawl_lagou(self):
        pass

    def _anti_progrosse(self):
        # TODO: anti-crawler proxy helper. This is an unfinished stub that
        # mirrors the commented-out block in _get_list; begin_url, page, city,
        # keyword and headers are not defined in this scope yet.
        proxy = get_proxy.get_proxy()
        if proxy:
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy,
            }
            response = requests.get(begin_url.format(page * 90, city, keyword),
                                    headers=headers,
                                    proxies=proxies)
            if response.status_code != 200:
                print('proxy mode failed; please wait a while and try again')
                return
            urls = self.parser.list_zhilian(response.text)
        else:
            print("Couldn't find a usable proxy!")
            return

    def _get_content(self, database, url_list, web_name=None):
        # TODO: switch this to multithreading
        if url_list:
            for url in url_list:
                try:
                    response = requests.get(url, headers=headers)
                    if response.status_code != 200:
                        print('anti-spider in content: ', response.status_code)
                        print('error url:', url)
                        # TODO: anti-crawler proxy helper
                        time.sleep(3)
                        response = requests.get(url, headers=headers)
                        if response.status_code != 200:
                            print('give up:', url)
                        else:
                            if web_name == 'zhilian':
                                self.parser.content_zhilian(
                                    response, database, url)
                            if web_name == 'qiancheng':
                                self.parser.content_qiancheng(
                                    response, database, url)
                            if web_name == 'liepin':
                                self.parser.content_liepin(
                                    response, database, url)
                        continue
                    if web_name == 'zhilian':
                        self.parser.content_zhilian(response, database, url)
                    if web_name == 'qiancheng':
                        self.parser.content_qiancheng(response, database, url)
                    if web_name == 'liepin':
                        self.parser.content_liepin(response, database, url)
                except Exception as e:
                    print('request_job_contain error : {}'.format(e))

    def _get_list(self,
                  begin_url,
                  city,
                  keyword,
                  page_weight=1,
                  page_begin=0,
                  web_name=None):
        url_list = []
        for page in range(1000):
            urls = []
            try:
                url = begin_url.format(page=page * page_weight + page_begin,
                                       city=city,
                                       keyword=keyword)
                response = requests.get(url, headers=headers)
                if response.status_code != 200:
                    print('anti-spider in list')
                    continue  # if you enable the commented block below, remove this continue
                    """
                    # # TODO: anti-crawler proxy helper
                    # proxy = get_proxy.get_proxy()
                    # if not proxy:
                    #     proxies = {
                    #         'http': 'http://' + proxy,
                    #         'https': 'https://' + proxy,
                    #     }
                    #     response = requests.get(begin_url.format(page*90, city, keyword), headers=headers, proxies=proxies)
                    #     if response.status_code != 200:
                    #         print('proxy mode fail!!! please wait a few time, and try again')
                    #         return
                    #     urls = self.parser.list_zhilian(response.text)
                    # else:
                    #     print("Can't seek useful proxy!")
                    #     return
                    """
                else:
                    if web_name == 'zhilian':
                        urls = self.parser.list_zhilian(response)
                    if web_name == 'qiancheng':
                        urls = self.parser.list_qiancheng(response)
                    if web_name == 'liepin':
                        urls = self.parser.list_liepin(response)

                if not urls:  # parser returned nothing (None or empty); stop paging
                    break
                url_list.extend(urls)

            except Exception as e:
                print('request_job_list error : {}'.format(e))
        return url_list
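
# Minimal usage sketch for JobSpider (assumptions: the module-level headers
# dict plus the project's MongoDB, WebParser and get_proxy helpers are defined
# elsewhere; the keyword dict maps a search keyword to a collection name, and
# the city id and redis_key values are placeholders).
keywords = {"python": "python_jobs"}
spider = JobSpider(keywords, redis_key="job_url")
spider.crawl_zhilian(city=538, keyword="python")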