class ProxyCheck(object):
    # Pages used to verify that a proxy can actually fetch content.
    check_urls = ["http://news.cnstock.com/news/sns_yw/index.html"]

    def __init__(self):
        self.redis_action = RedisAction()

    def check_rules(self, proxy_url):
        # A proxy passes only if every check URL returns HTTP 200 through it.
        for curl in self.check_urls:
            try:
                proxy_res = requests.get(curl, proxies={"http": proxy_url}, timeout=20)
                if proxy_res.status_code not in [200]:
                    return False
            except Exception as e:
                traceback.print_exc()
                return False
        else:
            return True

    def period_check(self):
        # Re-validate every proxy currently in the pool and evict the dead ones.
        for proxy_url in self.redis_action.members_set("proxy_set"):
            if not self.check_rules(proxy_url):
                self.redis_action.pop_set("proxy_set", proxy_url)
                print "proxy {} is disabled".format(proxy_url)
            else:
                print "proxy {} is enabled".format(proxy_url)

    def add_new(self):
        # Fetch a fresh batch of proxies from the ip3366 API and keep only
        # the ones that pass the check URLs.
        url_list = [
            "http://dec.ip3366.net/api/?key=20171207221341061&getnum=30&anonymoustype=3&filter=1&area=1&sarea=1&formats=2&proxytype=0",
            "http://dec.ip3366.net/api/?key=20171207221341061&getnum=30&anonymoustype=4&filter=1&area=1&sarea=1&formats=2&proxytype=0"
        ]
        res = requests.get(choice(url_list))
        proxy_json = json.loads(res.text)
        for proxy in proxy_json:
            proxy_url = "http://{}:{}".format(proxy["Ip"], proxy["Port"])
            for curl in self.check_urls:
                try:
                    proxy_res = requests.get(curl, proxies={"http": proxy_url}, timeout=10)
                    if proxy_res.status_code not in [200]:
                        print "proxy {} is disabled".format(proxy_url)
                        break
                except Exception as e:
                    traceback.print_exc()
                    print "proxy {} is disabled".format(proxy_url)
                    break
            else:
                # Only reached when no break fired, i.e. every check URL succeeded.
                self.redis_action.add_set("proxy_set", proxy_url)
                print "proxy {} is enabled".format(proxy_url)
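A minimal sketch of how the two maintenance entry points might be driven; the `__main__` guard, the 60-second interval, and the extra `time` import are assumptions for illustration, not part of the project:

# Hypothetical driver loop for the proxy pool.
import time

if __name__ == "__main__":
    checker = ProxyCheck()
    while True:
        checker.period_check()  # evict proxies that no longer pass the check URLs
        checker.add_new()       # top the pool up from the ip3366 API
        time.sleep(60)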
class CrawlProducer(object):
    """
    Periodically scan the HBase table for URLs that are due to be crawled,
    then push them into crawl_task_queue.

    crontab -e
    * * * * * python /home/szliu/fintech_crawler/period/crawl_task_producer.py >> /niub/crontab_log/crontab.log 2>&1

    HBase "info" column family:
        url: the URL
        next_time: time of the next crawl
        last_time: time of the most recent crawl
        channel: channel the URL belongs to
        priority: crawl priority
        parse_func: parse function for the URL
        once_every_minutes: crawl interval in minutes
    """

    def __init__(self):
        self.redis_action = RedisAction()
        self.hbase_action = HBASEAction()

    def run(self, queue_name, hbase_table_name):
        # Scan the schedule table
        count = 0
        for hbase_dict in self.hbase_action.scan_table(hbase_table_name, [
                "info:url", "info:priority", "info:parse_func",
                "info:next_time", "info:once_every_minutes"]):
            base_time = datetime.now()
            rate = int(hbase_dict["info:once_every_minutes"])
            # Check whether the URL is due to be crawled
            if hbase_dict["info:next_time"] <= base_time.strftime("%Y-%m-%d %H:%M:%S"):
                crawl_dict = {
                    "info:url": hbase_dict["info:url"],
                    "info:priority": hbase_dict["info:priority"],
                    "info:parse_func": hbase_dict["info:parse_func"],
                }
                count += 1
                self.redis_action.priority_queue_push(
                    queue_name, json.dumps(crawl_dict),
                    int(crawl_dict["info:priority"]))
                # Update the next crawl time in the table
                self.hbase_action.insert_data(
                    hbase_table_name, {
                        "info:last_time": base_time.strftime("%Y-%m-%d %H:%M:%S"),
                        "info:url": hbase_dict["info:url"],
                        "info:next_time": (base_time + timedelta(minutes=rate)).strftime("%Y-%m-%d %H:%M:%S")
                    })
        print "[%s] write %s finished" % (datetime.now(), count)
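For reference, a hedged sketch of seeding one schedule row and invoking the producer by hand. The column names follow the docstring and the insert_data/run calls match the code above; the concrete values and the parse_func name are illustrative only:

# Illustrative seed row for the url_schedule table; values are made up for the example.
seed_row = {
    "info:url": "http://news.cnstock.com/news/sns_yw/index.html",
    "info:priority": "3",
    "info:parse_func": "cnstock",      # hypothetical parse function name
    "info:channel": "yw",
    "info:next_time": "2018-01-01 00:00:00",
    "info:once_every_minutes": "30",
}
HBASEAction().insert_data("url_schedule", seed_row)

# The cron job above then effectively runs this once a minute:
CrawlProducer().run("crawl_task_queue", "url_schedule")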
class Handler(BaseHandler):
    file_store = FileStore()
    redis_action = RedisAction()
    crawl_config = {}
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0",
    }

    @every(minutes=1)
    def on_start(self):
        # Pop pending tasks from the Redis queue and schedule them, routing
        # through a random proxy when one is available.
        data_list = self.redis_action.priority_queue_pop("crawl_task_queue", 0)
        proxy_list = self.redis_action.get_random_set("crawler_set", 3)
        for data in data_list:
            if proxy_list:
                choice_proxy = choice(proxy_list)
                self.crawl(data["info:url"],
                           callback=self.index_page,
                           save=data,
                           proxy="{}:{}".format(choice_proxy["Ip"], choice_proxy["Port"]),
                           headers=self.headers)
            else:
                self.crawl(data["info:url"],
                           callback=self.index_page,
                           save=data,
                           headers=self.headers)

    @catch_status_code_error
    def index_page(self, response):
        if response.status_code in [404, 403, 302, 312, 500]:
            # On an error status the save dict may not contain "html" yet,
            # so fall back to an empty string.
            return {'result': response.url,
                    'html': response.save.get("html", ""),
                    'status_code': response.status_code,
                    'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        if isinstance(response.text, unicode):
            response.save.update({"html": response.text})
        else:
            response.save.update({"html": response.text.decode(response.encoding)})
        self.file_store.save(json.dumps(response.save))
        return {'result': response.url,
                'html': response.save["html"],
                'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
"info:status": "pending", "info:channel": "", "info:url": data["info:url"]}) if hb_action.get_raw(table_name, data["info:url"], "info:is_hub"): redis_action.priority_queue_push(crawl_name, json.dumps(data), int(data["info:priority"])) return "update" if __name__ == "__main__": """ 运行参数 nohup /usr/local/spark2/bin/spark-submit --master spark://abc-cloudera001:7077 --py-files /home/szliu/fintech_crawler/spark_submit_library/db_actions.zip,/home/szliu/fintech_crawler/spark_submit_library/hbase.zip --conf spark.pyspark.virtualenv.enabled=true --conf spark.pyspark.virtualenv.type=native --conf spark.pyspark.virtualenv.requirements=/home/szliu/venv/crawler/requirements.txt --conf spark.pyspark.virtualenv.bin.path=/usr/local/python2.7/bin/virtualenv dupfilter_task_worker.py --executor-memory 2G --total-executor-cores 2 >> /niub/szliu/dupfilter_task_worker.log 2>&1 & nohup /usr/local/spark2/bin/spark-submit --py-files /home/szliu/fintech_crawler/spark_submit_library/db_actions.zip,/home/szliu/fintech_crawler/spark_submit_library/hbase.zip --conf spark.pyspark.virtualenv.enabled=true --conf spark.pyspark.virtualenv.type=native --conf spark.pyspark.virtualenv.requirements=/home/szliu/venv/crawler/requirements.txt --conf spark.pyspark.virtualenv.bin.path=/usr/local/python2.7/bin/virtualenv dupfilter_task_worker.py --executor-memory 2G --total-executor-cores 2 >> /niub/szliu/dupfilter_task_worker.log 2>&1 & """ sc = SparkContext(appName="dupfilter_task_work") # sc.addPyFile("{base_path}/redis.zip".format(base_path=project_path)) # sc.addPyFile("{base_path}/pyhdfs.zip".format(base_path=project_path)) from db_actions.hbase_action import HBASEAction from db_actions.redis_action import RedisAction redis_action = RedisAction() while True: data_lines = redis_action.priority_queue_pop("dupfilter_task_queue", 50) if any(data_lines): news = sc.parallelize(data_lines).filter(lambda x: x).map(lambda x: dupfilter_task_work(x, HBASEAction(), RedisAction(), "crawl_task_queue", "url_schedule")) # news.count() print news.collect() else: time.sleep(10)
def __init__(self):
    self.redis_action = RedisAction()
    self.hbase_action = HBASEAction()
def __init__(self):
    self.link_analysis = LinkExtractor()
    self.redis_action = RedisAction()
class BaseStock(object):
    def __init__(self):
        self.link_analysis = LinkExtractor()
        self.redis_action = RedisAction()

    def is_detail_url(self, dom):
        """
        Decide whether the page is a detail (article) page.
        :param dom:
        :return: True or False
        """
        return True

    def get_channel(self, dom):
        return ""

    def parse_content(self, response_text, params):
        """
        Decide whether the page is a detail page; if so, parse the page
        content, otherwise extract every URL on the page.
        :param response_text: page HTML
        :param params: {'info:url': ..., 'info:priority': ..., 'info:parse_func': ...}
        :return:
        """
        _params = {k: v for k, v in params.items() if k.startswith("info")}
        dom = PyQuery(response_text.strip())
        if self.is_detail_url(dom):
            return self.parse_detail_url(dom=dom, params=_params)
        else:
            return self.parse_other_url(dom=dom, params=_params)

    def parse_other_url(self, dom, params):
        """
        Extract every URL on the page and push them into dupfilter_task_queue.
        :param dom:
        :param params:
        :return:
        """
        result_list = []
        channel = self.get_channel(dom)
        for e in dom.find('a'):
            sub_url = PyQuery(e).attr('href')
            if sub_url and sub_url.startswith("."):
                sub_url = self.link_analysis.url_join(params["info:url"], sub_url)
            if self.link_analysis.url_legal(sub_url, self.allow_domains):
                if not self.link_filter(sub_url):
                    # Push into the Redis queue
                    _params = dict(
                        params.copy(), **{
                            "info:url": sub_url,
                            "info:channel": channel
                        })
                    result_list.extend(
                        [json.dumps(_params), int(_params["info:priority"])])
        self.redis_action.priority_queue_push("dupfilter_task_queue", *result_list)
        return "parse urls"

    @upload_hbase
    def parse_detail_url(self, dom, params):
        pass

    @html_to_dom
    def detect_anti(self):
        pass

    def link_filter(self, url):
        return False
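A concrete site parser only needs to override the hooks BaseStock leaves open. A minimal sketch of such a subclass follows; the class name, selectors, domain, and channel value are placeholders rather than values from the real project:

class ExampleStock(BaseStock):
    # Hypothetical subclass: the selectors and domain below are placeholders.
    allow_domains = ["news.cnstock.com"]

    def is_detail_url(self, dom):
        # Treat any page that carries an article body as a detail page.
        return bool(dom("div.article-content").text())

    def get_channel(self, dom):
        return "yw"

    @upload_hbase
    def parse_detail_url(self, dom, params):
        # The dict returned here is what the upload_hbase decorator writes to HBase.
        return dict(params, **{
            "info:title": dom("h1.title").text(),
            "info:content": dom("div.article-content").text(),
        })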
# encoding: utf-8
import os
import sys
import urlparse
import traceback
import requests
import urllib2
import urllib
from pyquery import PyQuery
from datetime import datetime

sys.path.append("..")
from db_actions.hbase_action import HBASEAction
from db_actions.redis_action import RedisAction
from utilty.link_analysis import LinkExtractor

hbase_client = HBASEAction()
redis_client = RedisAction()


def parse_item(html):
    dom = PyQuery(html)
    store_json = {
        "info:title": dom("div#img-content > h2#activity-name").text(),
        "info:publish_time": dom("div.rich_media_meta_list > em#post-date").text(),
        "info:source": dom("a#post-user").text(),
        "info:author": dom("div.rich_media_meta_list > em").eq(1).text(),
        "info:content": LinkExtractor().strip_html5_whitespace(dom("div#page-content").html()),
def __init__(self):
    self.redis_action = RedisAction()
# encoding: utf-8
import sys
import json
sys.path.append("..")
import traceback
from pyquery import PyQuery
from db_actions.hbase_action import HBASEAction
from db_actions.redis_action import RedisAction

hbase_action = HBASEAction()
redis_action = RedisAction()


def upload_hbase(func):
    # Decorator: run the wrapped parser and write its result dict into the
    # news_data table; on failure, report the offending URL.
    def wrap(*args, **kwargs):
        try:
            data = func(*args, **kwargs)
            hbase_action.insert_data("news_data", data)
            return "data save in hbase"
        except Exception as e:
            traceback.print_exc()
            print kwargs["params"]["info:url"], "css selector error"
            return kwargs["params"]["info:url"], "css selector error"
    return wrap


def html_to_dom(is_detect_anti):