    def __init__(self,
                 crawl_newspaper=True,
                 crawl_kols=False,
                 crawl_kols_by_smcc=False,
                 max_kols=100,
                 export_to_json=True,
                 export_to_queue=False,
                 export_to_elasticsearch=False,
                 export_to_wordpress=False,
                 export_to_postgres=False):
        '''
        input
        -----
        max_kols: maximum number of randomly chosen kols to crawl in this run
        '''
        self._crawl_kols = crawl_kols
        self._crawl_newspaper = crawl_newspaper
        self._crawl_kols_by_smcc = crawl_kols_by_smcc
        self._export_to_json = export_to_json
        self._export_to_queue = export_to_queue
        self._export_to_elasticsearch = export_to_elasticsearch
        self._export_to_wordpress = export_to_wordpress
        self._export_to_postgres = export_to_postgres
        print("export_to_wordpress: %s" % export_to_wordpress)

        base_dir = os.environ['DOCBAO_BASE_DIR']

        # Create shared objects
        self._config_manager = ConfigManager(
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'config.yaml']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'kols_list.txt']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'fb_list.txt']))  # config object
        self._data_manager = ArticleManager(
            self._config_manager,
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'data', 'article.dat']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'data', 'blacklist.dat']))  # article database object
        self._keyword_manager = KeywordManager(
            self._data_manager, self._config_manager,
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'data', 'keyword.dat']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'collocation.txt']),
            get_independent_os_path(
                ['input', 'keywords_to_remove.txt']))  # keyword analyzer object (note: relative path, unlike the other inputs)
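# Usage sketch (assumption: this __init__ belongs to the main crawler class,
# called `Docbao` here only for illustration; the real class name is not shown
# in this fragment):
#
#   crawler = Docbao(crawl_newspaper=True, export_to_json=True)
#   crawler.multiprocess_crawl()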
def process_save_webconfig(webconfig):
    menu = ['Save in a new file', 'Update an existing file']
    choice = display_choice_dialog('How do you want to save: ', menu)
    file_base_path = get_independent_os_path(
        ['resources', 'configs', 'newspaper'])
    filepath = None
    if choice == 0:
        print("Working config will be saved in ./resources/configs/newspaper/")
        filename = input("Filename: ")
        filepath = get_independent_os_path([file_base_path, filename])
    elif choice == 1:  # update an existing file
        filepath = display_choose_file_dialog(file_base_path)
    if filepath is None:  # user cancelled
        print("Save cancelled")
        return
    webconfig.export(filepath)
    print("File saved successfully")
def display_choose_file_dialog(file_base_path):
    header = '''
###########################################
#            LOAD SITE CONFIG             #
###########################################
'''
    ok = False
    while not ok:
        clear_screen()
        print(header)
        search = input(
            "Enter keyword to find config file or ENTER to display all: ")
        file_list = []
        for root, directory, files in os.walk(file_base_path):
            for item in files:
                if search in item:
                    file_list.append(item)
        # choose file
        config_file_index = display_menu_dialog(
            'Which config do you want to load ?',
            'Choice (ENTER=Cancel): ', file_list)
        if config_file_index is None:  # cancel
            return None
        config_file_name = file_list[int(config_file_index)]
        # make filepath to load
        filepath = get_independent_os_path([file_base_path, config_file_name])
        answer = input(
            "Are you sure you want to load site config from %s ? (ENTER=ok, anything else=repeat): "
            % config_file_name)
        if answer.strip() == '':
            ok = True
    return filepath
    def push_random_kols_to_queue(self, base_path='..', number=300):
        print(
            "Push %s random kol ids to the smcc service to get their posts in the next crawl"
            % str(number))
        channel = self._connection.channel()
        queue_name = 'facebook_scanning'
        channel.queue_declare(queue_name, durable=True)
        with open_utf8_file_to_read(
                get_independent_os_path(
                    [base_path, 'backend', 'input', 'kols_list.txt'])) as stream:
            kols_list = [
                x.strip() for x in stream.read().split('\n') if x.strip() != ''
            ]
        choosen = set()
        count = 0
        number_of_kols = len(kols_list)
        while count < number and count < number_of_kols:  # stop once every kol has been chosen
            index = randint(0, number_of_kols - 1)
            kol_id = kols_list[index]
            if kol_id not in choosen:
                choosen.add(kol_id)
                count += 1
        for kol_id in choosen:
            print(kol_id)
            channel.basic_publish(exchange='',
                                  routing_key=queue_name,
                                  body=kol_id)
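# Message format note: each message published above is a raw kol id string on
# the durable 'facebook_scanning' queue. A minimal consumer sketch with pika
# (illustrative only; the real smcc service lives in a separate project):
#
#   import pika
#   connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
#   channel = connection.channel()
#   channel.queue_declare('facebook_scanning', durable=True)
#   for method, properties, body in channel.consume('facebook_scanning'):
#       print("kol id to scan:", body.decode('utf-8'))
#       channel.basic_ack(method.delivery_tag)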
    def load_default_config(self, site_type=None, config_base_path=None):
        if config_base_path is None:
            config_base_path = get_independent_os_path(
                ['resources', 'configs', 'newspaper'])
        # map each supported site type to its template file
        template_files = {
            None: 'website_template.md',
            'newspaper': 'newspaper_template.md',
            'wordpress': 'wordpress_template.md',
            'facebook user': 'facebook_template.md',
            'facebook fanpage': 'fanpage_template.md',
        }
        template = template_files.get(site_type)
        if template is not None:
            filepath = get_independent_os_path([config_base_path, template])
            self.load_config_from_file(filepath)
def process_create_web_config_from_existing_one():
    '''
    return
    ------
    webconfig object that contains the config loaded from file
    '''
    load_config_header = '''
###########################################
#            LOAD WEB CONFIG              #
###########################################
'''
    load_config_menu = [
        'Load from local config files', 'Load from online config database',
        'Return to main menu'
    ]
    choice = -1
    webconfig = None
    while choice != 2:
        clear_screen()
        print(load_config_header)
        choice = display_menu_dialog('What do you want to do ?',
                                     "Choice (ENTER=cancel): ",
                                     load_config_menu)
        if choice == 0:
            # display config file list
            file_base_path = get_independent_os_path(
                ['resources', 'configs', 'newspaper'])
            filepath = display_choose_file_dialog(file_base_path)
            if filepath is not None:
                webconfig = WebConfig()
                webconfig.load_config_from_file(filepath)
                input("Successfully loaded site config from %s" % filepath)
        elif choice == 1:
            pass  # loading from an online config database is not implemented yet
    return webconfig
    def multiprocess_crawl(self):
        # Create a Manager proxy to host shared data for multiprocess crawling
        with multiprocessing.Manager() as manager:
            data_manager = self._data_manager
            config_manager = self._config_manager
            keyword_manager = self._keyword_manager

            time.sleep(1)
            print("Create Manager Proxy")
            time.sleep(1)
            print("Create shared objects")
            crawl_queue = manager.Queue()
            crawled_articles = manager.Queue()
            new_blacklists = manager.Queue()
            browser_list = manager.Queue()  # keep all firefox browsers so they can be released on timeout
            lock = manager.Lock()
            timeout_flag = manager.Value('i', 0)  # shared variable to inform processes that a timeout happened

            # Init crawl queue
            time.sleep(1)
            print("Init crawl queue")
            config_list = config_manager.get_newspaper_list()
            number_of_job = 0
            for webconfig in config_list:
                # check delay time between crawls
                last_run = webconfig.get_last_run()
                min_duration = webconfig.get_minimum_duration_between_crawls()
                time_pass = int(
                    (get_utc_now_date() - last_run).total_seconds() / 60)
                if time_pass > min_duration:
                    crawl_queue.put(webconfig)
                    number_of_job += 1
                    webconfig.set_last_run()  # set last_run to now
                else:
                    web_name = webconfig.get_webname()
                    print("Ignore crawling %s. Need %d more minutes before the next crawl"
                          % (web_name, min_duration - time_pass))

            # Start crawl processes
            max_crawler = config_manager.get_max_crawler()
            time.sleep(1)
            print("%s crawlers are set to be run in parallel" % str(max_crawler))
            supported_max_crawler = get_max_crawler_can_be_run()
            if supported_max_crawler == 0:
                supported_max_crawler = 1
            if max_crawler > supported_max_crawler:
                time.sleep(1)
                print("Current system can support only %s crawlers to be run in parallel"
                      % str(supported_max_crawler))
                time.sleep(1)
                print("You should reduce max_crawler in config.yaml")
                time.sleep(1)
                print("max_crawler will be set to %s in this run"
                      % str(supported_max_crawler))
                max_crawler = supported_max_crawler
            elif max_crawler < supported_max_crawler:
                time.sleep(1)
                print("Current system can support up to %s crawlers to be run in parallel"
                      % str(supported_max_crawler))
                time.sleep(1)
                print("You should increase max_crawler in config.yaml")
            if max_crawler > int(number_of_job / 2):
                time.sleep(1)
                print("There are only %s newspapers to crawl" % str(number_of_job))
                time.sleep(1)
                print("max_crawler will be set to %s for efficiency"
                      % str(int(number_of_job / 2) + 1))
                max_crawler = int(number_of_job / 2) + 1

            crawler_processes = []
            time.sleep(1)
            print("Can run up to %s crawlers" % str(max_crawler))
            timeout = config_manager.get_timeout()
            start = time.time()
            alive_crawler = 0
            running = True
            start_timeout = 0
            is_timeout = False
            terminate_time = 120  # 2 min
            crawler_iterator = 0

            while running:
                # count alive crawlers
                running_crawler = ''
                alive_crawler = 0
                running = False
                for process in crawler_processes:
                    if process.is_alive():
                        alive_crawler += 1
                        running_crawler = running_crawler + " %s " % str(alive_crawler)
                        running = True
                if running:
                    print("Running crawlers:")
                    print(running_crawler)
                else:  # no running process
                    lock.acquire()
                    if crawl_queue.empty():
                        lock.release()
                        break
                    running = True
                    lock.release()

                # create a new crawler if needed
                lock.acquire()
                if alive_crawler < max_crawler and not crawl_queue.empty() and not is_timeout:
                    # more jobs remain than the current browsers can handle;
                    # maybe another browser profile is needed
                    lock.release()
                    print("Can create more crawlers")
                    crawler_iterator += 1
                    crawler = multiprocessing.Process(
                        target=self.crawler_process,
                        args=(str(crawler_iterator), lock, timeout_flag,
                              browser_list, crawl_queue, data_manager,
                              crawled_articles, new_blacklists,
                              self._export_to_queue))
                    crawler_processes.append(crawler)
                    crawler.start()
                    time.sleep(1)
                    print("Start crawler number %s (pid: %s)"
                          % (str(crawler_iterator), crawler.pid))
                else:
                    lock.release()

                # kill all processes after timeout
                if not is_timeout:
                    print("Remaining seconds to timeout %s"
                          % str(int(timeout - time.time() + start)))
                else:
                    print("Remaining seconds to terminate %s"
                          % str(int(terminate_time - time.time() + start_timeout)))
                if (time.time() - start > timeout) and (not is_timeout):
                    start_timeout = time.time()
                    print("Timeout")
                    print("Inform all processes about the timeout. Terminate all after 2 min")
                    lock.acquire()
                    timeout_flag.value = 1
                    lock.release()
                    is_timeout = True
                if (timeout_flag.value == 1) and (time.time() - start_timeout >= terminate_time):
                    print("Kill browsers that have not quit")
                    while not browser_list.empty():
                        lock.acquire()
                        browser = browser_list.get()
                        print("Found a running browser")
                        print(browser)
                        print("Close browser")
                        browser.quit()
                        lock.release()
                    print("Kill all processes")
                    for crawler in crawler_processes:
                        crawler.terminate()
                        # some processes may not terminate. Don't know why
                        # crawler.join()
                    running = False
                time.sleep(10)

            # join processes to wait for all crawlers to finish
            # for crawler in crawler_processes:
            #     crawler.join()
            time.sleep(1)
            print("Finish crawling")
            time.sleep(1)

            # Save all newly crawled articles and push them to ElasticSearch + RabbitMQ
            print("New crawled articles")
            rb = RabbitMQ_Client()
            rb_articles = []
            while not crawled_articles.empty():
                article = crawled_articles.get()
                if article.get_id() not in data_manager._data:
                    data_manager._data[article.get_id()] = article  # merge new articles into data
                    rb_articles.append(article)
                    print("%s: %s" % (article.get_newspaper(), article.get_topic()))
            while not new_blacklists.empty():
                href, count = new_blacklists.get()
                data_manager._blacklist[href] = count

            # analyze keywords
            print("Analyze keywords")
            keyword_manager.build_keyword_list()

            # export data
            print("Export data to json files")
            trending_keywords = keyword_manager.write_trending_keyword_by_growing_speed_to_json_file()
            if self._export_to_json:
                data_manager.export_to_json()
                keyword_manager.write_fast_growing_keyword_to_json_file()
                keyword_manager.write_uncategorized_keyword_to_text_file()
                keyword_manager.write_trending_article_to_json_file()
                keyword_manager.write_hot_growing_article_to_json_file()
                keyword_manager.write_keyword_dicts_to_json_file()
                keyword_manager.write_keyword_freq_series_to_json_file()
                keyword_manager.write_new_keyword_to_json_file()

            trends = []
            for topic in trending_keywords[:min(40, len(trending_keywords))]:
                print('trending topic: %s' % topic)
                articles = data_manager.get_latest_article_contain_keyword(topic, number=6)
                first_article = articles[0]
                posts = []
                print('related articles: ')
                for article in articles:
                    posts.append(str(article.get_id()))
                    print(article.get_topic())
                print(posts)
                trends.append({
                    'topic': first_article.get_topic(),
                    'posts': posts
                })

            # push data
            print("Push data to database and other services")
            if self._export_to_queue:
                try:
                    # push to RabbitMQ
                    # for the Bangtin project only
                    rb.connect()
                    rb.push_trends_to_queue(trends)
                    rb.disconnect()
                except Exception:
                    print_exception()
            if self._export_to_wordpress:
                try:
                    # push to wordpress
                    wp = Wordpress()
                    for article in rb_articles:
                        if article.get_post_type() == 0:  # newspaper post
                            topic = article.get_topic().lower()
                            post_id = wp.add_new_article(article, [])
                            if post_id:
                                article.set_wordpress_id(post_id)
                            # sleep(15)  # avoid being banned by the wordpress host
                except Exception:
                    print_exception()
            if self._export_to_elasticsearch:
                try:
                    # push to Elasticsearch
                    es = ElasticSearch_Client()
                    for article in rb_articles:
                        es.push_article(article)  # put new article into ElasticSearch
                except Exception:
                    print_exception()

            # write log data
            try:
                with open_utf8_file_to_write(
                        get_independent_os_path(["export", "log_data.json"])) as stream:
                    log_dict = dict()
                    update_time = get_date_string(
                        get_utc_now_date(),
                        date_format="%d/%m/%Y %H:%M",
                        timezone=config_manager.get_display_timezone())
                    log_dict['update_time'] = update_time
                    log_dict['newspaper_count'] = str(config_manager.get_newspaper_count())
                    log_dict['database_count'] = str(data_manager.count_database())
                    log_dict['hub_title'] = config_manager.get_hub_title()
                    log_dict['hub_href'] = config_manager.get_hub_href()
                    stream.write(jsonpickle.encode(log_dict))
            except Exception:
                print_exception()
            print("FINISH")
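# Note on the payload pushed by rb.push_trends_to_queue(trends) above: `trends`
# is a list of dicts shaped like
#
#   [{'topic': 'some trending topic', 'posts': ['<article_id>', '<article_id>', ...]}, ...]
#
# with at most 40 entries and up to 6 article ids each (the values shown are
# placeholders, not real data).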
def process_manage_crawl_list(config_manager):
    '''
    output
    ======
    - config_manager with modified data
    - None or a webconfig loaded from the current crawl list
    '''
    manage_crawl_list_header = '''
###########################################
#           MANAGE CRAWL LIST             #
###########################################
'''
    # what to do next
    menu = [
        'Add site config file to list',
        'Remove newspaper from list',
        'Edit site config in list',
        'Load site config in list to working config',
        'Edit config of all sites in list',
        'Add working site config to list',
        'Load working site config from list',
        'Return',
    ]
    user_choice = -1
    webconfig = None
    while user_choice != len(menu) - 1:  # finish
        clear_screen()
        print(manage_crawl_list_header)
        # newspaper_list contains all crawl configs. All edits are made on
        # newspaper_list and then merged back into config_manager
        newspaper_list = config_manager.print_crawl_list()
        print()
        user_choice = display_menu_dialog('What do you want to do next ? ',
                                          'Choice: ', menu)
        if user_choice == 0:  # add config file to list
            filepath = display_choose_file_dialog(
                get_independent_os_path(['resources', 'configs', 'newspaper']))
            if filepath is not None:
                new_webconfig = WebConfig()
                new_webconfig.load_config_from_file(filepath)
                newspaper_list.append(new_webconfig)
                input("Successfully added %s to crawl list"
                      % new_webconfig.get_webname())
                config_manager.replace_crawl_list(newspaper_list)  # save all changes to config_manager
                config_manager.save_data()
        elif user_choice == 1:  # remove newspaper from list
            choice = input("Please input LINE NUMBER to remove or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                remove_webconfig = newspaper_list.pop(int(choice) - 1)
                input("Successfully removed %s from crawl list"
                      % remove_webconfig.get_webname())
                config_manager.replace_crawl_list(newspaper_list)  # save all changes to config_manager
                config_manager.save_data()
        elif user_choice == 2:  # edit site config in list
            choice = input("Please input LINE NUMBER to edit or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                choose_webconfig = newspaper_list[int(choice) - 1]
                choose_webconfig = process_edit_config(choose_webconfig)
                config_manager.add_newspaper(choose_webconfig)  # update config
                config_manager.save_data()
        elif user_choice == 3:  # load site config to working config
            choice = input("Please input LINE NUMBER to load or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                choose_webconfig = newspaper_list[int(choice) - 1]
                webconfig = choose_webconfig
                input("Successfully loaded %s config to working config"
                      % choose_webconfig.get_webname())
        elif user_choice == 4:  # edit a single config property of all sites in list
            print()
            newspaper_list = config_manager.get_newspaper_list()
            if len(newspaper_list) > 0:
                print("Sample of a site config:")
                sample_site = newspaper_list[0]
                sample_site.print_config()
                print()
                key = input('Enter config property to edit: (ENTER=cancel, -config_name=remove) ').strip()
                if key != '':
                    if key[0] == '-':  # remove config property
                        key = key[1:]
                        count = 0
                        for newspaper in config_manager.get_newspaper_list():
                            count += 1
                            newspaper.delete_config(key)
                        input("Successfully removed %s from %s sites" % (key, str(count)))
                        config_manager.save_data()
                    else:  # edit all
                        # the entered value is evaluated as a Python literal (e.g. 10, True, 'abc')
                        new_value = eval(input('Enter new value of %s: ' % key))
                        count = 0
                        for newspaper in config_manager.get_newspaper_list():
                            count += 1
                            newspaper.set_config(key, new_value)
                        input("Successfully changed %s of %s sites to the new value"
                              % (key, str(count)))
                        config_manager.save_data()
            else:
                print("There are no site configs in the crawl list yet")
        elif user_choice == 5:  # add working site config to list
            if webconfig is not None:
                config_manager.add_newspaper(webconfig)
                input("Successfully added %s to crawl list" % webconfig.get_webname())
                config_manager.save_data()
            else:
                input("No working site config. Please go to site config manager to create/load one")
        elif user_choice == 6:  # load newspaper to current webconfig
            choice = input("Please input LINE NUMBER to load or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                webconfig = copy.copy(newspaper_list[int(choice) - 1])
                input("Successfully loaded %s from crawl list" % webconfig.get_webname())
    return webconfig
def process_test_crawl_web_config(webconfig):
    '''
    function
    --------
    try a crawl using only this webconfig

    return
    ------
    modified webconfig
    '''
    test_crawl_header = '''
###########################################
#       TEST CRAWLING SITE CONFIG         #
###########################################
'''
    has_change_dir = False
    try:
        os.chdir("backend")
        has_change_dir = True
    except OSError:
        pass  # already in the backend directory
    continue_test = True
    while continue_test:
        clear_screen()
        print(test_crawl_header)
        # prepare webconfig for the test
        minimum_duration_old_value = webconfig.get_minimum_duration_between_crawls()
        webconfig.set_minimum_duration_between_crawls(-5)  # means: always crawl this config
        maximum_url_old_value = webconfig.get_config('maximum_url', 10)
        webconfig.set_config('maximum_url', 50)
        # ask for edit
        choice = display_yes_no_dialog(
            "Is there anything to edit before test crawling (y/n) ?")
        if choice:
            webconfig = process_edit_config(webconfig)
            maximum_url_old_value = webconfig.get_maximum_url()
        # test
        config_manager = ConfigManager(
            get_independent_os_path(['src', 'backend', 'input', 'test.yaml']),
            get_independent_os_path(['input', 'kols_list.txt']),
            get_independent_os_path(['input', 'fb_list.txt']))
        config_manager.load_data()
        config_manager.replace_crawl_list([webconfig])
        data_filename = get_independent_os_path(
            ['src', 'backend', 'data', 'test_article.dat'])
        blacklist_filename = get_independent_os_path(
            ['src', 'backend', 'data', 'test_blacklist.dat'])
        data_manager = ArticleManager(config_manager, data_filename,
                                      blacklist_filename)
        data_manager.reset_data()
        # test crawl
        my_pid = 1
        browser = BrowserWrapper()
        if webconfig.get_crawl_type() == 'newspaper':
            data_manager.add_articles_from_newspaper(my_pid, webconfig, browser)
        elif 'facebook' in webconfig.get_crawl_type():
            data_manager.add_articles_from_facebook(my_pid, webconfig, browser)
        # report
        continue_test = display_yes_no_dialog('Do you want to test again (y/n) ?: ')
        # restore original values
        webconfig.set_config('maximum_url', maximum_url_old_value)
        webconfig.set_minimum_duration_between_crawls(minimum_duration_old_value)
    if has_change_dir:
        os.chdir("..")
    return webconfig
        elif choice == 4:  # test crawling
            process_test_crawl_web_config(webconfig)
        elif choice == 5:  # add/update working site config to crawl list
            config_manager.add_newspaper(webconfig)
            config_manager.save_data()
            print("Successfully added/updated %s in crawl list" % webconfig.get_webname())
        elif choice == 6:  # move to program settings
            process_manage_config(webconfig, config_manager)
    return webconfig


# MAIN PROGRAM

config_manager = ConfigManager(
    get_independent_os_path(['src', 'backend', 'input', 'config.yaml']),
    get_independent_os_path(['src', 'backend', 'input', 'kols_list.txt']),
    get_independent_os_path(['src', 'backend', 'input', 'fb_list.txt']))  # config object
config_manager.load_data(crawl_newspaper=True, crawl_kols=False,
                         crawl_kols_by_smcc=False, max_kols=30, base_path='.')

main_menu_choice = -1
while main_menu_choice != 2:
    clear_screen()
    print(header)
    main_menu = ['Manage site config', 'Manage program settings', 'Quit']
    main_menu_choice = display_menu_dialog('WELCOME TO CONFIG MANAGER',
                                           'Your choice: ', main_menu)
    webconfig = None
    def load_data(self,
                  crawl_newspaper=True,
                  crawl_kols=False,
                  crawl_kols_by_smcc=False,
                  random_kols=True,
                  random_fb_account=True,
                  max_kols=5,
                  base_path='..'):
        '''
        input
        -----
        crawl_newspaper: crawl newspaper configs in /backend/input/config.yaml
        crawl_kols: crawl kol posts from the kol id list in /backend/input/kols_list.txt using the facebook bot
        random_kols: choose (max_kols) random kols from the kols list to crawl
        random_fb_account: use a random fb bot (pre-set up in /backend/input/fb_list.txt) to crawl kol posts
        crawl_kols_by_smcc: crawl kol posts using the smcc service (push some kol ids to a queue and get posts
                            back from the queue). Chooses random kol ids (100 by default) and creates only one
                            webconfig with crawl_type = "kols smcc"
        base_path: '..' or '.', the path to the resources folder relative to the running path
        '''
        stream = open_utf8_file_to_read(self._filename)
        self._config = yaml.full_load(stream)
        stream.close()

        newspaper_list = []
        if not crawl_newspaper:
            self.replace_crawl_list([])
        else:
            # crawl newspapers last so browsers are inited with random profiles first
            newspaper_list = self.get_newspaper_list()
            self.replace_crawl_list([])

        if crawl_kols:
            # get kols_list
            kols_list = []
            with open_utf8_file_to_read(self._kol_filename) as stream:
                kols_list = [
                    x for x in stream.read().split('\n') if x.strip() != ''
                ]
            # get fb account list
            fb_list = []
            if random_fb_account:
                with open_utf8_file_to_read(self._fb_account_filename) as stream:
                    fb_list = [
                        x for x in stream.read().split('\n') if x.strip() != ''
                    ]
            count = 0
            index = -1  # start at -1 so sequential mode includes the first line
            choosen = set()
            while count < max_kols and count < len(kols_list):  # finish when max_kols is reached
                count += 1
                if random_kols:
                    index = random.randint(0, len(kols_list) - 1)
                    while index in choosen:  # don't pick the same kol twice
                        index = random.randint(0, len(kols_list) - 1)
                    choosen.add(index)
                    print(f"Choose random kol: {kols_list[index]}")  # print chosen kol for debugging
                else:
                    index += 1
                    if index == len(kols_list):  # end of kols list
                        break
                if ';' not in kols_list[index]:  # this line contains just an id, not name;url
                    kol_name = 'unknown_id_' + kols_list[index]
                    crawl_url = kols_list[index].strip()  # profile id
                else:
                    kol_name = kols_list[index].split(';')[0]
                    crawl_url = kols_list[index].split(';')[1]
                webconfig = WebConfig()
                webconfig.load_default_config(
                    'facebook user',
                    get_independent_os_path(
                        [base_path, 'resources', 'configs', 'newspaper']))
                webconfig.set_webname(kol_name)
                webconfig.set_config('crawl_url', crawl_url)
                webconfig.set_config('remove_me', True)  # tag for deletion when the program finishes
                # set a random fb account to crawl with
                if random_fb_account:
                    profile_index = random.randint(0, len(fb_list) - 1)
                    profile = fb_list[profile_index]
                    webconfig.set_config('browser_profile', profile)
                self.add_newspaper(webconfig)

        # crawl kols by smcc
        if crawl_kols_by_smcc:
            # create a 'crawl_type: kols smcc' WebConfig
            webconfig = WebConfig()
            webconfig.load_default_config(
                'facebook user',
                get_independent_os_path(
                    [base_path, 'resources', 'configs', 'newspaper']))
            webconfig.set_config('crawl_type', 'kols smcc')
            webconfig.set_config('remove_me', True)
            webconfig.set_config('timezone', 'UTC')
            webconfig.set_webname('kol posts')
            webconfig.set_config('minimum_duration_between_crawls', -5)
            self.add_newspaper(webconfig)

        # append newspaper list
        if crawl_newspaper:
            for newspaper in newspaper_list:
                self.add_newspaper(newspaper, beginning=True)
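# kols_list.txt line formats handled above (values are illustrative only):
#
#   100001234567890                                 -> id only; webname becomes 'unknown_id_100001234567890'
#   Nguyen Van A;https://facebook.com/nguyenvana    -> 'name;crawl_url'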
import os

from src.backend.lib.data import ArticleManager
from src.backend.lib.rabbitmq_client import RabbitMQ_Client
from src.backend.lib.utils import get_independent_os_path

base_dir = os.environ['DOCBAO_BASE_DIR']

rb = RabbitMQ_Client()
rb.connect()

data_manager = ArticleManager(
    None,
    get_independent_os_path(
        [base_dir, 'src', 'backend', 'data', 'article.dat']),
    get_independent_os_path(
        [base_dir, 'src', 'backend', 'data', 'blacklist.dat']))  # article database object
data_manager.load_data()

print("PUSH ALL ARTICLES TO RABBITMQ")
rb.push_to_queue(data_manager._data.values())
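# Usage sketch (the script filename below is an assumption; run from the project
# root with DOCBAO_BASE_DIR pointing at it):
#
#   DOCBAO_BASE_DIR=$(pwd) python push_articles_to_rabbitmq.py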