__author__ = 'PatrickYeh'
import receiver
import pickle
from pttcrawler import Logger
from kafka import KafkaProducer

log = Logger.getLogger("kafka_producer")


class kafka_producer(receiver.receiver):
    """Receiver that forwards crawled data to a Kafka topic."""

    def __init__(self):
        self.topic = "Message"

    def set_kafka_client(self, host, port):
        # Connect to the Kafka broker at host:port.
        self.producer = KafkaProducer(bootstrap_servers="{host}:{port}".format(host=host, port=port))

    def set_topic(self, topic):
        self.topic = topic

    def send(self, obj_data):
        # Publish one message to the configured topic.
        log.debug("Broadcast Data")
        self.producer.send(self.topic, obj_data)


class article_producer(kafka_producer):
    def __init__(self):
        self.set_topic("ptt_article")


class reply_producer(kafka_producer):
    def __init__(self):
        self.set_topic("ptt_reply")
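# Usage sketch only, not part of the original module: it assumes a Kafka broker is
# reachable on localhost:9092, and the payload dictionary below is purely illustrative.
if __name__ == '__main__':
    producer = article_producer()
    producer.set_kafka_client("localhost", 9092)
    # Without a value_serializer, KafkaProducer expects bytes, so the payload is
    # pickled before sending (matching the pickle import above).
    producer.send(pickle.dumps({"board": "Gossiping", "title": "example post"}))
    producer.producer.flush()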
# coding=utf-8
__author__ = 'PatrickYeh'
import re
from pttcrawler import Logger
from pttcrawler.WebRetriever import WebRetriever
from pttcrawler.Page import Page
from pttcrawler.Article import Article
from BeautifulSoup import BeautifulSoup

log = Logger.getLogger("PttBoard")

# Raw strings so the backslashes reach the regex engine untouched.
PAGE_REG = r".*index(?P<page_num>\d*).html"
ARTICLE_URL_REG = r".*/(?P<article_id>.*).html"
article_url_pattern = re.compile(ARTICLE_URL_REG)


class Board(Page):
    def __init__(self, board_id="Gossiping"):
        self.base_url = 'https://www.ptt.cc/bbs/{board_id}/index{page_idx}.html'
        self.board_id = board_id
        self.refresh()

    def refresh(self):
        # self.url and _fetch_data() are expected to be provided by the Page base class.
        self.html_raw_soup = self._fetch_data(self.url)

    def _article_list_iter(self, lst_article_idx):
        # Lazily wrap each article id in an Article object.
        for article_idx in lst_article_idx:
            yield Article(board_id=self.board_id, article_id=article_idx)

    def get_articles(self, lst_article_list):
        return self._article_list_iter(lst_article_list)
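# Usage sketch only, not part of the original module: it assumes the Page base class
# supplies self.url and _fetch_data() and that www.ptt.cc is reachable; the article id
# below is made up purely for illustration.
if __name__ == '__main__':
    board = Board(board_id="Gossiping")
    for article in board.get_articles(["M.1234567890.A.ABC"]):
        print article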
__author__ = 'PatrickYeh'
import threading, time, pickle, json
from pttcrawler import Logger
from pttcrawler.Board import Board
from pttcrawler.Article import Article
import kafka_producer

log = Logger.getLogger("Monitor")


class monitor(threading.Thread):
    """Dispatches crawled data to a set of named receivers."""

    def __init__(self):
        # Initialise the Thread machinery before adding our own state.
        threading.Thread.__init__(self)
        self.dict_receiver = {}

    def set_receiver(self, name, obj_receiver):
        self.dict_receiver[name] = obj_receiver

    def get_receiver(self, name):
        return self.dict_receiver[name]

    def send(self, name, obj_data):
        # Deliver to a single named receiver.
        self.dict_receiver[name].send(obj_data)

    def broadcast(self, obj_data):
        # Deliver to every registered receiver.
        for key in self.dict_receiver.keys():
            self.dict_receiver[key].send(obj_data)


class article_monitor(monitor):
    def __init__(self, board_id, article_id):
        # monitor.__init__ already initialises threading.Thread.
        monitor.__init__(self)
        self.board_id = board_id
        self.article_id = article_id
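# Usage sketch only, not part of the original module: it wires the monitor base class
# to the Kafka producer defined in kafka_producer.py, assuming a broker on
# localhost:9092; the payload is illustrative.
if __name__ == '__main__':
    producer = kafka_producer.article_producer()
    producer.set_kafka_client("localhost", 9092)
    mon = monitor()
    mon.set_receiver("kafka", producer)
    mon.broadcast(pickle.dumps({"board": "Gossiping", "article_id": "example"}))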
# coding=utf-8
__author__ = 'Vetom'
import requests, time
from BeautifulSoup import BeautifulSoup
from pttcrawler import Logger

# SSL verification is disabled below, so urllib3's insecure-request warnings are silenced.
requests.packages.urllib3.disable_warnings()
log = Logger.getLogger("WebRetriever")


class WebRetriever():
    def make_request(self, str_url):
        log.debug("Make Query: {url}".format(url=str_url))
        bool_pass = False
        while not bool_pass:
            try:
                # The over18 cookie skips PTT's age-confirmation page on restricted boards.
                html_raw = requests.get(str_url, verify=False, cookies={'over18': '1'})
                html_raw.encoding = 'utf-8'
                bool_pass = True
            except Exception as e:
                # Log the failure and retry after a short pause.
                log.error("URL:{url} - ERROR: {err}".format(url=str_url, err=e))
                time.sleep(2)
        return BeautifulSoup(html_raw.text)


if __name__ == '__main__':
    data = WebRetriever().make_request('https://www.ptt.cc/bbs/joke/index.html')
    print data