def get_last_run(self):
    """Return the datetime of the previous crawl as a timezone-aware value.

    Reads the 'last_run' config entry; when it is absent the default is
    one week before now (UTC). A string entry is parsed from the
    "dd/mm/YYYY HH:MM" format and localized to this site's timezone.
    A non-string entry (the datetime default path) is persisted back via
    set_last_run() and returned unchanged.
    """
    stored = self.get_config('last_run',
                             get_utc_now_date() - timedelta(days=7))
    if not isinstance(stored, str):
        # Default datetime was returned: record it so the next read
        # finds a stored value, then hand it straight back.
        self.set_last_run(stored)
        return stored
    # Config stores timestamps as naive local strings; attach the
    # site's timezone before returning.
    naive = datetime.strptime(stored, "%d/%m/%Y %H:%M")
    return self.get_timezone().localize(naive)
def multiprocess_crawl(self):
    """Run one full crawl cycle across all configured newspapers.

    Spawns up to max_crawler worker processes that consume a shared job
    queue of newspaper configs, supervises them with a timeout/terminate
    state machine, then merges the crawled articles into the data
    manager, rebuilds keyword statistics, exports JSON files, and pushes
    results to RabbitMQ / Wordpress / Elasticsearch depending on the
    instance's export flags. Finally writes a run-summary log file.
    """
    # Create Manager Proxy to host shared data for multiprocessed crawling.
    with multiprocessing.Manager() as manager:
        data_manager = self._data_manager
        config_manager = self._config_manager
        keyword_manager = self._keyword_manager
        time.sleep(1)
        print("Create Manager Proxy")
        time.sleep(1)
        print("Create shared object")
        # Manager-proxied objects are shared safely across worker processes.
        crawl_queue = manager.Queue()        # jobs: one webconfig per newspaper
        crawled_articles = manager.Queue()   # results produced by workers
        new_blacklists = manager.Queue()     # (href, count) pairs found by workers
        browser_list = manager.Queue()       # keep all firefox browsers to release on timeout
        lock = manager.Lock()
        timeout_flag = manager.Value('i', 0)  # shared int flag: 1 tells workers a timeout happened

        # Init crawl queue: only enqueue newspapers whose minimum delay
        # between crawls has elapsed.
        time.sleep(1)
        print("Init crawl queue")
        config_list = config_manager.get_newspaper_list()
        number_of_job = 0
        for webconfig in config_list:
            # check delay time between crawls (minutes since last run)
            last_run = webconfig.get_last_run()
            min_duration = webconfig.get_minimum_duration_between_crawls()
            time_pass = int(
                (get_utc_now_date() - last_run).total_seconds() / 60)
            if time_pass > min_duration:
                # print("Print crawl_queue:")
                # print(webconfig.get_crawl_url())  # for debug
                crawl_queue.put(webconfig)
                number_of_job += 1
                webconfig.set_last_run()  # set last_run to now
            else:
                web_name = webconfig.get_webname()
                print("Ignore crawling %s. \nNeed more %d minutes more to crawl"
                      % (web_name, min_duration - time_pass))

        # Start crawl processes. Clamp the configured crawler count to
        # what the machine can support and to the number of jobs.
        max_crawler = config_manager.get_max_crawler()
        time.sleep(1)
        print("%s crawlers are set to be run in parallel" % str(max_crawler))
        supported_max_crawler = get_max_crawler_can_be_run()
        if supported_max_crawler == 0:
            supported_max_crawler = 1
        if max_crawler > supported_max_crawler:
            time.sleep(1)
            print("Current system can support only %s crawlers to be run in parallel"
                  % str(supported_max_crawler))
            time.sleep(1)
            print("You should reduce max_crawler in config.yaml")
            time.sleep(1)
            print("max_crawler will be set to %s in this run"
                  % str(supported_max_crawler))
            max_crawler = supported_max_crawler
        elif max_crawler < supported_max_crawler:
            time.sleep(1)
            print("Current system can support up to %s crawlers to be run in parallel"
                  % str(supported_max_crawler))
            time.sleep(1)
            print("You should increase max_crawler in config.yaml")
        # NOTE(review): heuristic — never run more crawlers than half the
        # job count (+1); presumably to keep each worker busy with >1 job.
        if max_crawler > int(number_of_job / 2):
            time.sleep(1)
            print("There are only %s newspaper to crawl" % str(number_of_job))
            time.sleep(1)
            print("max_crawler will be set to %s for efficience"
                  % str(int(number_of_job / 2) + 1))
            max_crawler = int(number_of_job / 2) + 1

        crawler_processes = []
        time.sleep(1)
        print("Can run max to %s crawlers" % str(max_crawler))
        timeout = config_manager.get_timeout()
        start = time.time()
        alive_crawler = 0
        running = True
        start_timeout = 0
        is_timeout = False
        terminate_time = 120  # 2 min grace period after timeout before hard kill
        crawler_iterator = 0

        # Supervisor loop: monitor workers, spawn new ones while jobs
        # remain, and enforce the timeout/terminate sequence.
        while running:
            # count alive crawlers
            running_crawler = ''
            alive_crawler = 0
            running = False
            for process in crawler_processes:
                if process.is_alive():
                    alive_crawler += 1
                    running_crawler = running_crawler + " %s " % str(
                        alive_crawler)
                    running = True
            if running:
                print("Running crawler:")
                print(running_crawler)
            else:
                # no running process: stop only when the job queue is
                # also empty, otherwise keep looping to spawn workers
                lock.acquire()
                if crawl_queue.empty():
                    lock.release()
                    break
                running = True
                lock.release()
            # create new crawler if needed
            lock.acquire()
            if alive_crawler < max_crawler and not crawl_queue.empty(
            ) and not is_timeout:
                # have more jobs that current browsers can't crawl. Maybe need another browser_profiles
                #epdb.set_trace()
                lock.release()
                print("Can create more crawler")
                crawler_iterator += 1
                crawler = multiprocessing.Process(
                    target=self.crawler_process,
                    args=(str(crawler_iterator), lock, timeout_flag,
                          browser_list, crawl_queue, data_manager,
                          crawled_articles, new_blacklists,
                          self._export_to_queue))
                crawler_processes.append(crawler)
                crawler.start()
                time.sleep(1)
                print("Start crawler number %s (pid: %s)" %
                      (str(crawler_iterator), crawler.pid))
            else:
                lock.release()
            # kill all processes after timeout
            if not is_timeout:
                print("Remaining seconds to timeout %s" %
                      str(int(timeout - time.time() + start)))
            else:
                print("Remaining seconds to terminate %s" %
                      str(int(terminate_time - time.time() + start_timeout)))
            if (time.time() - start > timeout) and (not is_timeout):
                # Phase 1: raise the shared flag so workers can quit
                # gracefully; hard termination follows after the grace period.
                start_timeout = time.time()
                print("Timeout")
                print(
                    "Inform all processes about timeout. Terminate all after 2 min"
                )
                lock.acquire()
                timeout_flag.value = 1
                lock.release()
                is_timeout = True
            if (timeout_flag.value == 1) and (time.time() - start_timeout >=
                                              terminate_time):
                # Phase 2: grace period elapsed — close any browsers the
                # workers registered, then terminate the processes.
                print("Kill unquited browser")
                while not browser_list.empty():
                    lock.acquire()
                    browser = browser_list.get()
                    print("Found a running browser")
                    print(browser)
                    print("Close browser")
                    browser.quit()
                    lock.release()
                print("Kill all processes")
                for crawler in crawler_processes:
                    crawler.terminate()
                    # some processes may not terminate. Don't know why
                    #crawler.join()
                running = False
            time.sleep(10)

        # join process to wait for all crawlers to finish
        #for crawler in crawler_processes:
        #    crawler.join()
        time.sleep(1)
        print("Finish crawling")
        time.sleep(1)

        # Save all new crawled articles and push to ElasticSearch + RabbitMQ
        print("New crawled articles")
        rb = RabbitMQ_Client()
        rb_articles = []
        # NOTE(review): drains manager queues and writes directly into the
        # data manager's private _data / _blacklist dicts — confirm this
        # bypass of the manager's public API is intended.
        while not crawled_articles.empty():
            article = crawled_articles.get()
            if article.get_id() not in data_manager._data:
                data_manager._data[article.get_id(
                )] = article  # merge new articles to data
                rb_articles.append(article)
                print("%s: %s" % (article.get_newspaper(), article.get_topic()))
        while not new_blacklists.empty():
            href, count = new_blacklists.get()
            data_manager._blacklist[href] = count

        # analyze keywords
        print("Analyze keywords")
        keyword_manager.build_keyword_list()

        print("Export data to json files")
        # export data
        trending_keywords = keyword_manager.write_trending_keyword_by_growing_speed_to_json_file(
        )
        if self._export_to_json:
            data_manager.export_to_json()
            keyword_manager.write_fast_growing_keyword_to_json_file()
            keyword_manager.write_uncategorized_keyword_to_text_file()
            keyword_manager.write_trending_article_to_json_file()
            keyword_manager.write_hot_growing_article_to_json_file()
            keyword_manager.write_keyword_dicts_to_json_file()
            keyword_manager.write_keyword_freq_series_to_json_file()
            keyword_manager.write_new_keyword_to_json_file()

        # Build the trends payload: top 40 trending keywords, each with
        # up to 6 related article ids.
        trends = []
        for topic in trending_keywords[:min(40, len(trending_keywords))]:
            print('trending topic: %s' % topic)
            articles = data_manager.get_latest_article_contain_keyword(
                topic, number=6)
            first_article = articles[0]
            posts = []
            print('relate article: ')
            for article in articles:
                posts.append(str(article.get_id()))
                print(article.get_topic())
            print(posts)
            #trends.append({'topic': topic, 'posts': posts})
            trends.append({
                'topic': first_article.get_topic(),
                'posts': posts
            })

        # push data
        print("Push data to database and other services")
        # NOTE(review): each push below uses a bare `except:` as a
        # best-effort guard — errors are printed and the run continues.
        if self._export_to_queue:
            try:
                # push to RabbitMQ # for Bangtin project only
                rb.connect()
                rb.push_trends_to_queue(trends)
                rb.disconnect()
            except:
                print_exception()
        if self._export_to_wordpress:
            try:
                # push to wordpress
                wp = Wordpress()
                for article in rb_articles:
                    if article.get_post_type() == 0:  # newspaper post
                        topic = article.get_topic().lower()
                        # trending = False
                        # if trending_keywords:
                        #     for keyword in trending_keywords:
                        #         if keyword in topic:
                        #             trending = True
                        #             break
                        post_id = wp.add_new_article(article, [])
                        if post_id:
                            article.set_wordpress_id(post_id)
                        #sleep(15)  # avoid being banned by wordpress host
            except:
                print_exception()
        if self._export_to_elasticsearch:
            try:
                # push to Elasticsearch
                es = ElasticSearch_Client()
                for article in rb_articles:
                    es.push_article(
                        article)  # put new article to ElasticSearch
            except:
                print_exception()

        # write log data (run summary for the export dashboard)
        try:
            with open_utf8_file_to_write(
                    get_independent_os_path(["export",
                                             "log_data.json"])) as stream:
                log_dict = dict()
                update_time = get_date_string(
                    get_utc_now_date(),
                    date_format="%d/%m/%Y %H:%M",
                    timezone=config_manager.get_display_timezone())
                log_dict['update_time'] = update_time
                log_dict['newspaper_count'] = str(
                    config_manager.get_newspaper_count())
                log_dict['database_count'] = str(
                    data_manager.count_database())
                log_dict['hub_title'] = config_manager.get_hub_title()
                log_dict['hub_href'] = config_manager.get_hub_href()
                stream.write(jsonpickle.encode(log_dict))
                stream.close()
        except:
            print_exception()
        print("FINISH")
def set_last_run(self, date=None):
    """Record a crawl timestamp in the 'last_run' config entry.

    :param date: datetime to record; when None, the current UTC time
        is used. Stored as a "dd/mm/YYYY HH:MM" string in UTC.
    """
    when = get_utc_now_date() if date is None else date
    stamp = get_date_string(when, "%d/%m/%Y %H:%M", pytz.timezone("UTC"))
    self.set_config('last_run', stamp)