class spider(object):
    '''
    A base Spider for v2ex.
    '''

    def __init__(self, url, sleep_time):
        '''
        >>> from v2ex_spider import base_spider
        >>> base_spider.start(url, sleep_time)
        '''
        self.url = url
        self.sleep_time = sleep_time
        time.sleep(int(self.sleep_time))
        self.SQ = SQL()
        self.SQ.open_datebase()
        # run
        self.load_config()
        self.spider()
        # end
        self.SQ.close_datebase()

    def spider(self):
        resp = self.s.get(self.url)
        if resp.status_code != 200:
            self.SQ.close_datebase()
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            raise APIError(error_info)
        topics = resp.json()
        for topic in topics:
            t_id = topic["id"]
            title = topic["title"]
            author = topic["member"]["username"]
            author_id = topic["member"]["id"]
            content = topic["content"]
            content_rendered = topic["content_rendered"]
            replies = topic["replies"]
            node = topic["node"]["id"]
            created = topic["created"]
            n_time = int(time.time())
            self.SQ.write_to_db_base(t_id, title, author, author_id, content,
                                     content_rendered, replies, node, created,
                                     n_time)
        self.SQ.conn.commit()
        return

    def load_config(self):
        self.proxy_enable = settings.proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.proxies
        return
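# The docstring above refers to a module-level start() helper, and the tasker
# below enqueues node_spider.start / topic_spider.start jobs, but that helper is
# not shown here. A minimal sketch, assuming it only wraps the spider class so
# RQ can enqueue a plain function; the example URL is illustrative only.

def start(url, sleep_time):
    '''
    >>> from v2ex_spider import base_spider
    >>> base_spider.start('https://www.v2ex.com/api/topics/show.json?id=1', 10)
    '''
    return spider(url, sleep_time)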
def __init__(self):
    '''
    $ python run.py
    or
    $ ./Run.sh
    '''
    self.SQ = SQL()
    self.SQ.open_datebase()
    self.redis_conn = Redis()
    self.load_config()
    # start
    self.load_json()
    self.update_cookies()
    try:
        self.update_nodes()
    except APIError as e:
        print(e)
    self.get_rss()
    self.tasker()
    self.tester_tasker()
    # end
    self.end()
class Start(object):
    '''
    Start the project.
    '''

    def __init__(self):
        '''
        $ python run.py
        or
        $ ./Run.sh
        '''
        logging.info('start')
        logging.debug('open sql database.')
        self.SQ = SQL()
        self.SQ.open_datebase()
        self.redis_conn = Redis()
        self.load_config()
        # base
        self.load_json()
        # self.update_cookies()
        try:
            self.update_nodes()
        except APIError as e:
            pass

    def Mode1(self):
        logging.info('start mode1')
        # start
        self.get_rss()
        self.tasker()
        self.topic_ids_enqueue()
        self.tester_tasker()
        # end
        self.end()

    def Mode2(self):
        logging.info('start mode2')
        # start
        self.get_rss()
        self.topic_ids_enqueue()
        self.tester_tasker()
        # end
        self.end()

    def end(self):
        self.SQ.close_datebase()
        self.dump_json()
        logging.info('end')

    def load_json(self):
        logging.debug('load json')
        # load .time_log.json
        if os.path.exists('.time_log.json'):
            with open('.time_log.json', 'r') as f:
                self.time_log = json.load(f)
        else:
            self.time_log = {
                'cookies_time': '0',
                'nodes_time': '0',
                '8000_node': '0',
                '4000_node': '0',
                '1000_node': '0',
                '500_node': '0',
                '0_node': '0',
                'rss_time': '0',
                'tester': '0',
                'topic_id_reenqueue': '0'
            }
        # load .node_number.json
        if os.path.exists('.node_number.json'):
            with open('.node_number.json', 'r') as f:
                self.node_number = json.load(f)
        else:
            self.node_number = list()
        return

    def dump_json(self):
        # dump .time_log.json
        with open('.time_log.json', 'w') as f1:
            json.dump(self.time_log, f1)
        # dump .node_number.json
        with open('.node_number.json', 'w') as f2:
            self.node_number = list(set(self.node_number))
            json.dump(self.node_number, f2)
        return

    def topic_ids_enqueue(self):
        if int(time.time()) - int(self.time_log['topic_id_reenqueue']) >= 1800:
            logging.info('start topic id reenqueue')
            max_id = topic_id_reenqueue.max_id
            topic_id_reenqueue.reenqueue_m(max_id - 2000, max_id - 29)
            self.time_log['topic_id_reenqueue'] = str(int(time.time()))
        return

    def update_cookies(self):
        if int(time.time()) - int(self.time_log["cookies_time"]) >= 86400 * 4:
            cookies_time_status = False
        else:
            cookies_time_status = True
        if not os.path.exists('cookies.txt') or cookies_time_status is False:
            logging.debug('update cookies')
            try:
                log_s = log_in.v2ex_log_in()
                log_s.log_in(3)
                log_s.save_cookies()
            except log_in.LogError as e:
                return
            self.time_log["cookies_time"] = str(int(time.time()))
        return

    def update_nodes(self):
        if int(time.time()) - int(self.time_log["nodes_time"]) >= 10800:
            nodes_time_status = False
        else:
            nodes_time_status = True
        if not nodes_time_status:
            logging.info('update nodes')
            try:
                resp = self.s.get('https://www.v2ex.com/api/nodes/all.json',
                                  timeout=10)
            except requests.exceptions.RequestException as e:
                logging.error('update_node failed.')
                logging.error('proxy_status: %s' % settings.i_proxy_enable)
                if settings.i_proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(e)
                self.node_number = list(set(self.node_number))
                return
            if resp.status_code != 200:
                logging.error('update_node failed.')
                logging.error('proxy_status: %s' % settings.i_proxy_enable)
                if settings.i_proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(APIError('update_node'))
                self.node_number = list(set(self.node_number))
                raise APIError('update_node')
            nodes = resp.json()
            for node in nodes:
                n_id = node["id"]
                name = node["name"]
                url = node["url"]
                title = node["title"]
                title_alternative = node["title_alternative"]
                topics = node["topics"]
                header = node["header"]
                footer = node["footer"]
                created = node["created"]
                n_time = int(time.time())
                if self.SQ.node_test(n_id, topics) is True:
                    self.node_number.append(int(n_id))
                    self.SQ.write_to_db_node(n_id, name, url, title,
                                             title_alternative, topics,
                                             header, footer, created, n_time)
            self.time_log["nodes_time"] = str(int(time.time()))
            self.node_number = list(set(self.node_number))
        return

    def tasker(self):
        # faster schedule, used between 08:00 and 02:00 local time
        node_configs_1 = [{
            'sql': 'SELECT ID FROM NODES WHERE topics >= 8000;',
            'sleep_time': 5,
            'between_time': 900,
            'time_log': '8000_node',
            'queue_name': 'node1'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;',
            'sleep_time': 10,
            'between_time': 1800,
            'time_log': '4000_node',
            'queue_name': 'node2'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;',
            'sleep_time': 20,
            'between_time': 7200,
            'time_log': '1000_node',
            'queue_name': 'node3'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;',
            'sleep_time': 90,
            'between_time': 86400,
            'time_log': '500_node',
            'queue_name': 'node4'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;',
            'sleep_time': 90,
            'between_time': 86400,
            'time_log': '0_node',
            'queue_name': 'node5'
        }]
        # slower schedule, used in the early morning hours
        node_configs_2 = [{
            'sql': 'SELECT ID FROM NODES WHERE topics >= 8000;',
            'sleep_time': 5,
            'between_time': 1800,
            'time_log': '8000_node',
            'queue_name': 'node1'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;',
            'sleep_time': 10,
            'between_time': 3600,
            'time_log': '4000_node',
            'queue_name': 'node2'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;',
            'sleep_time': 20,
            'between_time': 14400,
            'time_log': '1000_node',
            'queue_name': 'node3'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;',
            'sleep_time': 90,
            'between_time': 86400,
            'time_log': '500_node',
            'queue_name': 'node4'
        }, {
            'sql': 'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;',
            'sleep_time': 90,
            'between_time': 86400,
            'time_log': '0_node',
            'queue_name': 'node5'
        }]
        time.tzname = ('CST', 'CST')
        if int(time.strftime('%H')) >= 8 or int(time.strftime('%H')) < 2:
            node_configs = node_configs_1
        else:
            node_configs = node_configs_2
        for node_config in node_configs:
            sql = node_config['sql']
            sleep_time = node_config['sleep_time']
            between_time = node_config['between_time']
            time_log_name = node_config['time_log']
            queue_name = node_config['queue_name']
            q_node = Queue(queue_name, connection=self.redis_conn)
            if int(time.time()) - int(
                    self.time_log[time_log_name]) >= between_time:
                logging.info('start enqueue, queue name: %s' % queue_name)
                self.SQ.cursor.execute(sql)
                node_ids = self.SQ.cursor.fetchall()
                for node_id in node_ids:
                    node_id = node_id[0]
                    # low-traffic nodes (node4/node5) are only re-enqueued when
                    # update_nodes() saw their topic count change
                    if queue_name not in [
                            'node4', 'node5'
                    ] or (queue_name in ['node4', 'node5']
                          and node_id in self.node_number):
                        if queue_name in ['node4', 'node5']:
                            self.node_number.remove(int(node_id))
                        q_node.enqueue(node_spider.start, node_id, sleep_time)
                self.time_log[time_log_name] = str(int(time.time()))
        return

    def get_rss(self):
        if int(time.time()) - int(self.time_log["rss_time"]) >= 600:
            logging.debug('start get_rss')
            try:
                rss_spider.Rss_spider()
            except requests.exceptions.RequestException as e:
                self.time_log["rss_time"] = str(int(time.time()))
                return
            self.time_log["rss_time"] = str(int(time.time()))
        return

    def load_config(self):
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.i_proxies()
        return

    def tester_tasker(self):
        if int(time.time()) - int(self.time_log["tester"]) >= 1800:
            logging.info('start enqueue tester')
            # load json
            if os.path.exists('.topics_tester.json'):
                with open('.topics_tester.json', 'r') as f:
                    tmp_topics = json.load(f)
            else:
                tmp_topics = list()
            # main
            sql = "SELECT ID FROM TOPIC WHERE (time - created) < 345600 AND ID NOT IN (SELECT T_ID FROM STATUS) AND (STRFTIME('%s','now') - created) > 1209600;"
            sleep_time = 20
            self.SQ.cursor.execute(sql)
            topic_ids = [x[0] for x in self.SQ.cursor.fetchall()]
            q = Queue('tester', connection=self.redis_conn)
            for topic_id in topic_ids:
                if topic_id not in tmp_topics:
                    q.enqueue(topic_tester.start, topic_id, sleep_time)
                    tmp_topics.append(topic_id)
            # end
            tmp_topics = list(set(tmp_topics))
            with open('.topics_tester.json', 'w') as f:
                json.dump(tmp_topics, f)
            self.time_log["tester"] = str(int(time.time()))
        return
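# A minimal run.py sketch, assuming the project is launched as in the docstring
# ('$ python run.py'); picking Mode1 here is an assumption, the real entry point
# may choose between the modes differently.

if __name__ == '__main__':
    starter = Start()
    starter.Mode1()  # Mode2() skips the per-node tasker and only re-enqueues topics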
class tester(object):
    '''
    The tester for v2ex topics.
    '''

    def __init__(self):
        '''
        >>> from v2ex_tester import topic_tester
        >>> topic_tester(topic_id, sleep_time)
        '''
        logging.debug('init class tester')
        self.s = requests.session()
        if settings.proxy_enable is True:
            self.s.proxies = settings.proxies()
        self.s.headers = settings.WEB_headers
        self.log_status = False

    def init_database(self):
        logging.debug('init database')
        self.SQ = SQL()
        self.SQ.open_datebase()

    def log_in(self):
        logging.debug('log in account')
        with open('.cookies.json', 'r') as f:
            cookies = requests.utils.cookiejar_from_dict(json.load(f))
        self.s.cookies = cookies
        self.s.headers = settings.WEB_headers
        self.log_status = True
        return

    def web_test(self, t_id, status):
        logging.debug('Start web_test')
        url = 'https://www.v2ex.com/t/%s' % str(t_id)
        n_time = int(time.time())
        try:
            resp = self.s.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            logging.error('web_test failed.')
            logging.error('proxy_status: %s' % settings.proxy_enable)
            if settings.proxy_enable is True:
                logging.error('proxy: %s' % self.s.proxies)
            logging.error(e)
            raise e
        if resp.status_code == 403:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            logging.error('API Error: proxy status: %s, proxy: %s' %
                          (str(settings.proxy_enable), str(self.s.proxies)))
            raise APIError(error_info)
        if resp.status_code == 404 and '404 Topic Not Found' in resp.text:
            return {
                'T_ID': int(t_id),
                'NODE': None,
                'STATUS': 3,
                'TIME': n_time
            }
        if resp.url == 'https://www.v2ex.com/':
            return self.api_test(t_id, status=2)
        if 'signin' in resp.url and self.log_status is False:
            # self.log_in()
            # return self.web_test(t_id, status=1)
            return self.api_test(t_id, status=1)
        tree = etree.HTML(resp.text)
        node_name = re.findall(
            r'\/go\/(\w+)',
            tree.xpath('//div[@class="header"]/a[2]/@href')[0])[0]
        self.SQ.cursor.execute("SELECT ID FROM NODES WHERE name == '%s';" %
                               node_name)
        node_id = self.SQ.cursor.fetchone()[0]
        return {
            'T_ID': int(t_id),
            'NODE': node_id,
            'STATUS': status,
            'TIME': n_time
        }

    def api_test(self, t_id, status):
        logging.debug('Start api_test')
        self.s_a = requests.session()
        if settings.proxy_enable is True:
            self.s_a.proxies = settings.proxies()
        self.s_a.headers = settings.API_headers
        url = 'https://www.v2ex.com/api/topics/show.json?id=%s' % str(t_id)
        n_time = int(time.time())
        try:
            resp = self.s_a.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            logging.error('api_test failed.')
            logging.error('proxy_status: %s' % settings.proxy_enable)
            if settings.proxy_enable is True:
                logging.error('proxy: %s' % self.s.proxies)
            logging.error(e)
            raise e
        if resp.status_code != 200:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            logging.error('API Error: proxy status: %s, proxy: %s' %
                          (str(settings.proxy_enable), str(self.s.proxies)))
            raise APIError(error_info)
        if len(resp.json()) == 0:
            return {
                'T_ID': int(t_id),
                'NODE': None,
                'STATUS': 3,
                'TIME': n_time
            }
        topic = resp.json()[0]
        node_id = topic["node"]["id"]
        return {
            'T_ID': int(t_id),
            'NODE': node_id,
            'STATUS': status,
            'TIME': n_time
        }

    def write_to_sql(self, T_ID, NODE, STATUS, TIME):
        self.SQ.write_to_db_status(T_ID, NODE, STATUS, TIME)
        return
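# Start.tester_tasker() enqueues topic_tester.start(topic_id, sleep_time) jobs,
# but that module-level entry point is not shown here. A minimal sketch, assuming
# it drives the tester class above; the initial status=1 and the explicit commit
# are assumptions (write_to_db_status may commit on its own).

def start(topic_id, sleep_time):
    '''
    >>> from v2ex_tester import topic_tester
    >>> topic_tester.start(topic_id, sleep_time)
    '''
    time.sleep(int(sleep_time))
    t = tester()
    t.init_database()
    result = t.web_test(topic_id, status=1)  # falls back to api_test internally
    t.write_to_sql(result['T_ID'], result['NODE'], result['STATUS'],
                   result['TIME'])
    t.SQ.conn.commit()
    t.SQ.close_datebase()
    return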
class Rss_spider(object):
    '''
    A Spider for v2ex's RSS feeds.
    Gets the latest and hot topics from the index,
    and uses the RSS feeds to generate the list of topics that need to be spidered.
    '''

    def __init__(self):
        '''
        >>> from v2ex_spider import rss_spider
        >>> rss_spider.Rss_spider()
        '''
        logging.info('start Rss spider')
        self.v2ex_rss_url_list = [
            'https://www.v2ex.com/index.xml',
            'https://www.v2ex.com/feed/tab/qna.xml',
            'https://www.v2ex.com/feed/tab/jobs.xml',
            'https://www.v2ex.com/feed/tab/deals.xml',
            'https://www.v2ex.com/feed/tab/city.xml',
            'https://www.v2ex.com/feed/tab/play.xml',
            'https://www.v2ex.com/feed/tab/apple.xml',
            'https://www.v2ex.com/feed/tab/creative.xml',
            'https://www.v2ex.com/feed/tab/tech.xml'
        ]
        self.latest_hot_api = [
            'https://www.v2ex.com/api/topics/latest.json',
            'https://www.v2ex.com/api/topics/hot.json'
        ]
        self.topic_sleep_time = 10
        logging.debug('open sql database')
        self.SQ = SQL()
        self.SQ.open_datebase()
        self.redis_conn = Redis()
        self.load_config()
        # run
        try:
            self.latest_and_hot()
        except APIError as e:
            pass
        self.gen_topic_queue()
        # end
        self.SQ.close_datebase()
        logging.info('end the Rss spider')

    def topics_id_rss(self):
        logging.debug('fetch rss feeds')
        topic_ids = list()
        for v2ex_rss_url in self.v2ex_rss_url_list:
            feed = feedparser.parse(v2ex_rss_url)
            logging.debug('fetch rss feed: %s' % v2ex_rss_url)
            items = feed["items"]
            for item in items:
                author = item["author"]
                title = item["title"]
                link = item["link"]
                published = item["date"]
                summary = item["summary"]
                topic_id = int(re.findall(r't\/(\d+)#?', link)[0])
                topic_ids.append(topic_id)
        topic_ids = set(topic_ids)
        return topic_ids

    def topics_id_sqlite(self):
        logging.debug('SELECT ID FROM TOPIC')
        sql = 'SELECT ID FROM TOPIC;'
        self.SQ.cursor.execute(sql)
        topics_ids = [x[0] for x in self.SQ.cursor.fetchall()]
        return topics_ids

    def latest_and_hot(self):
        logging.debug('start latest_and_hot')
        for url in self.latest_hot_api:
            try:
                resp = self.s.get(url, timeout=10)
            except requests.exceptions.RequestException as e:
                logging.error('latest_and_hot error')
                logging.error('proxy_status: %s' % self.proxy_enable)
                if self.proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(e)
                raise e
            if resp.status_code != 200:
                logging.error('latest_and_hot error')
                logging.error('proxy_status: %s' % self.proxy_enable)
                if self.proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(APIError('latest_and_hot'))
                raise APIError('latest_and_hot')
            topics = resp.json()
            for topic in topics:
                t_id = topic["id"]
                title = topic["title"]
                author = topic["member"]["username"]
                author_id = topic["member"]["id"]
                content = topic["content"]
                content_rendered = topic["content_rendered"]
                replies = topic["replies"]
                node = topic["node"]["id"]
                created = topic["created"]
                n_time = int(time.time())
                self.SQ.write_to_db_base(t_id, title, author, author_id,
                                         content, content_rendered, replies,
                                         node, created, n_time)
        self.SQ.conn.commit()
        return

    def gen_topic_queue(self):
        logging.debug('start topic enqueue')
        topics_sql = self.topics_id_sqlite()
        if len(topics_sql) <= 2000:
            return
        topics_rss = self.topics_id_rss()
        # load topics
        if os.path.exists('.topics_all.json'):
            with open('.topics_all.json', 'r') as f:
                tmp_topics = json.load(f)
        else:
            tmp_topics = list()
        t_queue = Queue('topic', connection=self.redis_conn)
        # gen queue
        for topic in topics_rss:
            if topic not in topics_sql and topic not in tmp_topics:
                topic_id = int(topic)
                t_queue.enqueue(topic_spider.start, topic_id,
                                self.topic_sleep_time)
        # save topics
        topics_all = list()
        topics_all.extend(tmp_topics)
        topics_all.extend(topics_rss)
        topics_all.extend(topics_sql)
        topics_all = list(set(topics_all))
        with open('.topics_all.json', 'w') as f:
            json.dump(topics_all, f)
        return

    def load_config(self):
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.i_proxies()
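# The classes above only *enqueue* jobs; the actual crawling happens in RQ
# workers started separately. A minimal worker sketch, assuming python-rq and a
# local Redis, listening on the queue names used in this project
# ('topic', 'tester', 'node1'..'node5'); the CLI equivalent would be
# `rq worker topic tester node1 node2 node3 node4 node5`.

from redis import Redis
from rq import Queue, Worker

if __name__ == '__main__':
    conn = Redis()
    names = ['topic', 'tester', 'node1', 'node2', 'node3', 'node4', 'node5']
    worker = Worker([Queue(name, connection=conn) for name in names],
                    connection=conn)
    worker.work()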
def init_database(self):
    self.SQ = SQL()
    self.SQ.open_datebase()
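# The SQL() helper used throughout is not shown in this listing. A skeleton
# sketch of the interface the callers above rely on (open_datebase/close_datebase
# plus the raw conn/cursor attributes), assuming a local sqlite3 file; the file
# name is an assumption, and the write_to_db_* / node_test methods are omitted
# because their table schemas are not shown here.

import sqlite3


class SQL(object):

    def open_datebase(self, db_file='v2ex.db'):
        # keep the (misspelled) method name expected by the callers above
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()

    def close_datebase(self):
        self.conn.commit()
        self.conn.close()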