Example #1
    def __init__(self,
                 crawl_newspaper=True,
                 crawl_kols=False,
                 crawl_kols_by_smcc=False,
                 max_kols=100,
                 export_to_json=True,
                 export_to_queue=False,
                 export_to_elasticsearch=False,
                 export_to_wordpress=False,
                 export_to_postgres=False):
        '''
        input
        -----
        max_kols: maximum number of randomly selected KOLs to crawl in this run
        '''
        self._crawl_kols = crawl_kols
        self._crawl_newspaper = crawl_newspaper
        self._crawl_kols_by_smcc = crawl_kols_by_smcc
        self._export_to_json = export_to_json
        self._export_to_queue = export_to_queue
        self._export_to_elasticsearch = export_to_elasticsearch
        self._export_to_wordpress = export_to_wordpress
        self._export_to_postgres = export_to_postgres
        print(f"""###############################
        ##################################
        #######################
        #########################
        {export_to_wordpress}
        ###############################
        ##################################
        #######################
        #########################""")
        base_dir = os.environ['DOCBAO_BASE_DIR']

        # Create shared object
        self._config_manager = ConfigManager(
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'config.yaml']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input', 'kols_list.txt']),
            get_independent_os_path(
                [base_dir, 'src', 'backend', 'input',
                 'fb_list.txt']))  #config object
        self._data_manager = ArticleManager(self._config_manager,
                                            get_independent_os_path([
                                                base_dir, 'src', 'backend',
                                                'data', 'article.dat'
                                            ]),
                                            get_independent_os_path([
                                                base_dir, 'src', 'backend',
                                                "data", "blacklist.dat"
                                            ]))  #article database object
        self._keyword_manager = KeywordManager(
            self._data_manager, self._config_manager,
            get_independent_os_path(
                [base_dir, 'src', 'backend', "data", "keyword.dat"]),
            get_independent_os_path(
                [base_dir, 'src', 'backend', "input", "collocation.txt"]),
            get_independent_os_path(["input", "keywords_to_remove.txt"
                                     ]))  #keyword analyzer object
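
A minimal usage sketch for this constructor follows; the enclosing class is not shown in this snippet, so `Docbao` is used purely as a hypothetical placeholder name, and the DOCBAO_BASE_DIR value is illustrative:

import os

os.environ['DOCBAO_BASE_DIR'] = '/opt/docbao'  # illustrative install path

# 'Docbao' is a placeholder for whatever class this __init__ belongs to.
crawler = Docbao(crawl_newspaper=True,          # crawl newspapers from config.yaml
                 crawl_kols=False,              # skip KOL crawling in this run
                 export_to_json=True,           # write results to JSON files
                 export_to_elasticsearch=False)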
Example #2
def process_save_webconfig(webconfig):
    menu = ['Save in a new file', 'Update an existing file']
    choice = display_choice_dialog('How do you want to save: ', menu)

    file_base_path = get_independent_os_path(
        ['resources', 'configs', 'newspaper'])

    if choice == 0:
        print("Working config will be saved in ./resources/configs/newspaper/")
        filename = input("Filename: ")
        filepath = get_independent_os_path([file_base_path, filename])
    elif choice == 1:  # update an existing file
        filepath = display_choose_file_dialog(file_base_path)
    else:  # cancelled or unrecognized choice: nothing to save
        return

    if filepath is None:  # user cancelled the file dialog
        return

    webconfig.export(filepath)
    print("File saved OK")
Example #3
def display_choose_file_dialog(file_base_path):
    header = '''
###########################################
#           LOAD SITE CONFIG              #
###########################################
    '''
    ok = False
    while not ok:
        clear_screen()
        print(header)
        search = input(
            "Enter keyword to find config file or ENTER to display all: ")
        file_list = []
        for root, dirs, files in os.walk(file_base_path):
            for item in files:
                if search in item:
                    file_list.append(item)

        # choose file
        config_file_index = display_menu_dialog(
            'Which config do you want to load ?', 'Choice (ENTER=Cancel): ',
            file_list)
        if config_file_index is None:  #cancel
            return None
        config_file_name = file_list[int(config_file_index)]
        # make filepath to load
        filepath = get_independent_os_path([file_base_path, config_file_name])
        answer = input(
            "Are you sure you want to load the site config from %s? (ENTER=ok, anything else=repeat) "
            % config_file_name)
        if answer.strip() == '':
            ok = True
    return filepath
Example #4
    def push_random_kols_to_queue(self, base_path='..', number=300):
        print(
            "Push %s random KOL ids to the smcc service to get their posts in the next crawl"
            % str(number))
        channel = self._connection.channel()
        queue_name = 'facebook_scanning'
        queue_state = channel.queue_declare(queue_name, durable=True)

        with open_utf8_file_to_read(
                get_independent_os_path(
                    [base_path, 'backend', 'input',
                     'kols_list.txt'])) as stream:
            kols_list = [
                x.strip() for x in stream.read().split('\n') if x.strip() != ''
            ]
        choosen = set()
        count = 0
        number_of_kols = len(kols_list)
        # Stop once enough unique ids are chosen or the whole list has been
        # used (otherwise this loop never ends when number > number_of_kols).
        while count < number and count < number_of_kols:
            index = randint(0, number_of_kols - 1)
            kol_id = kols_list[index]
            if kol_id not in choosen:
                choosen.add(kol_id)
                count += 1

        for kol_id in choosen:
            body = kol_id
            print(kol_id)
            channel.basic_publish(exchange='',
                                  routing_key=queue_name,
                                  body=body)
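
The selection loop above could also be written with `random.sample`, which draws unique elements directly; a sketch under the same assumptions (kols_list already loaded, `number` ids requested):

from random import sample

# Draw up to `number` distinct KOL ids in one call instead of re-drawing
# random indices until enough unique ids have been collected.
choosen = set(sample(kols_list, min(number, len(kols_list))))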
Example #5
    def load_default_config(self, site_type=None, config_base_path=None):
        if config_base_path is None:
            config_base_path = get_independent_os_path(
                ['resources', 'configs', 'newspaper'])

        if site_type is None:
            filepath = get_independent_os_path(
                [config_base_path, 'website_template.md'])
            self.load_config_from_file(filepath)
        elif site_type == 'newspaper':
            filepath = get_independent_os_path(
                [config_base_path, 'newspaper_template.md'])
            self.load_config_from_file(filepath)
        elif site_type == 'wordpress':
            filepath = get_independent_os_path(
                [config_base_path, 'wordpress_template.md'])
            self.load_config_from_file(filepath)
        elif site_type == 'facebook user':
            filepath = get_independent_os_path(
                [config_base_path, 'facebook_template.md'])
            self.load_config_from_file(filepath)
        elif site_type == 'facebook fanpage':
            filepath = get_independent_os_path(
                [config_base_path, 'fanpage_template.md'])
            self.load_config_from_file(filepath)
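
An equivalent, more compact formulation would map each site_type to its template file; this is only a sketch of the same dispatch, keeping the behavior above (unknown site types load nothing):

    def load_default_config(self, site_type=None, config_base_path=None):
        if config_base_path is None:
            config_base_path = get_independent_os_path(
                ['resources', 'configs', 'newspaper'])

        # None means the generic website template; unknown types load nothing.
        templates = {
            None: 'website_template.md',
            'newspaper': 'newspaper_template.md',
            'wordpress': 'wordpress_template.md',
            'facebook user': 'facebook_template.md',
            'facebook fanpage': 'fanpage_template.md',
        }
        template = templates.get(site_type)
        if template is not None:
            self.load_config_from_file(
                get_independent_os_path([config_base_path, template]))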
Example #6
def process_create_web_config_from_existing_one():
    '''
    return
    ------
    WebConfig object that contains the config loaded from a file
    '''

    load_config_header = '''
###########################################
#         LOAD EXISTING WEB CONFIG        #
###########################################
    '''

    load_config_menu = [
        'Load from local config files', 'Load from online config database',
        'Return to main menu'
    ]

    choice = -1
    webconfig = None
    while choice != 2:
        clear_screen()
        print(load_config_header)
        choice = display_menu_dialog('What do you want to do ?',
                                     "Choice (ENTER=cancel): ",
                                     load_config_menu)
        if choice is None:  # ENTER pressed: cancel and return
            break
        if choice == 0:
            # display config file list
            file_base_path = get_independent_os_path(
                ['resources', 'configs', 'newspaper'])
            filepath = display_choose_file_dialog(file_base_path)
            if filepath is not None:
                webconfig = WebConfig()
                webconfig.load_config_from_file(filepath)
                input("Successfully load site config from %s" % filepath)

        elif choice == 1:  # load from online config database
            pass  # not implemented yet
    return webconfig
Example #7
    def multiprocess_crawl(self):
        # Create Manager proxy to host shared data for the multiprocess crawl
        with multiprocessing.Manager() as manager:

            data_manager = self._data_manager
            config_manager = self._config_manager
            keyword_manager = self._keyword_manager

            time.sleep(1)
            print("Create Manager Proxy")
            time.sleep(1)
            print("Create shared object")
            crawl_queue = manager.Queue()
            crawled_articles = manager.Queue()
            new_blacklists = manager.Queue()
            browser_list = manager.Queue(
            )  # keep all Firefox browsers so they can be released on timeout
            lock = manager.Lock()
            timeout_flag = manager.Value(
                'i',
                0)  # shared flag to inform processes when a timeout happens

            # Init crawl queue
            time.sleep(1)
            print("Init crawl queue")
            config_list = config_manager.get_newspaper_list()
            number_of_job = 0

            for webconfig in config_list:
                # check delay time between crawl
                last_run = webconfig.get_last_run()
                min_duration = webconfig.get_minimum_duration_between_crawls()
                time_pass = int(
                    (get_utc_now_date() - last_run).total_seconds() / 60)

                if time_pass > min_duration:
                    # print("Print crawl_queue:")
                    # print(webconfig.get_crawl_url()) # for debug
                    crawl_queue.put(webconfig)
                    number_of_job += 1
                    webconfig.set_last_run()  # set last_run to now
                else:
                    web_name = webconfig.get_webname()
                    print(
                        "Skip crawling %s. Need %d more minutes before the next crawl"
                        % (web_name, min_duration - time_pass))

            # Start crawl process
            max_crawler = config_manager.get_max_crawler()
            time.sleep(1)
            print("%s crawlers are set to be run in parallel" %
                  str(max_crawler))
            supported_max_crawler = get_max_crawler_can_be_run()
            if supported_max_crawler == 0:
                supported_max_crawler = 1
            if max_crawler > supported_max_crawler:
                time.sleep(1)
                print(
                    "The current system can support only %s crawlers running in parallel"
                    % str(supported_max_crawler))
                time.sleep(1)
                print("You should reduce max_crawler in config.yaml")
                time.sleep(1)
                print("max_crawler will be set to %s in this run" %
                      str(supported_max_crawler))
                max_crawler = supported_max_crawler
            elif max_crawler < supported_max_crawler:
                time.sleep(1)
                print(
                    "The current system can support up to %s crawlers running in parallel"
                    % str(supported_max_crawler))
                time.sleep(1)
                print("You should increase max_crawler in config.yaml")
            if max_crawler > int(number_of_job / 2):
                time.sleep(1)
                print("There are only %s newspaper to crawl" %
                      str(number_of_job))
                time.sleep(1)
                print("max_crawler will be set to %s for efficience" %
                      str(int(number_of_job / 2) + 1))
                max_crawler = int(number_of_job / 2) + 1

            crawler_processes = []
            time.sleep(1)

            print("Can run max to %s crawlers" % str(max_crawler))

            timeout = config_manager.get_timeout()
            start = time.time()

            alive_crawler = 0

            running = True
            start_timeout = 0
            is_timeout = False
            terminate_time = 120  # 2 min
            crawler_iterator = 0

            while running:
                # count alive crawler
                running_crawler = ''
                alive_crawler = 0
                running = False
                for process in crawler_processes:
                    if process.is_alive():
                        alive_crawler += 1
                        running_crawler = running_crawler + " %s " % str(
                            alive_crawler)
                        running = True
                if running:
                    print("Running crawler:")
                    print(running_crawler)
                else:  # not running process
                    lock.acquire()
                    if crawl_queue.empty():
                        lock.release()
                        break
                    running = True
                    lock.release()

                # create new crawler if needed
                lock.acquire()
                if alive_crawler < max_crawler and not crawl_queue.empty(
                ) and not is_timeout:  # more jobs remain than the running crawlers can handle
                    lock.release()
                    print("Can create more crawlers")
                    crawler_iterator += 1
                    crawler = multiprocessing.Process(
                        target=self.crawler_process,
                        args=(str(crawler_iterator), lock, timeout_flag,
                              browser_list, crawl_queue, data_manager,
                              crawled_articles, new_blacklists,
                              self._export_to_queue))
                    crawler_processes.append(crawler)
                    crawler.start()
                    time.sleep(1)
                    print("Start crawler number %s (pid: %s)" %
                          (str(crawler_iterator), crawler.pid))
                else:
                    lock.release()

                # kill all process after timeout
                if not is_timeout:
                    print("Remaining seconds until timeout: %s" %
                          str(int(timeout - time.time() + start)))
                else:
                    print(
                        "Remaining seconds until termination: %s" %
                        str(int(terminate_time - time.time() + start_timeout)))
                if (time.time() - start > timeout) and (not is_timeout):
                    start_timeout = time.time()

                    print("Timeout")
                    print(
                        "Inform all processes about timeout. Terminate all after 2 min"
                    )
                    lock.acquire()
                    timeout_flag.value = 1
                    lock.release()
                    is_timeout = True

                if (timeout_flag.value == 1) and (time.time() - start_timeout
                                                  >= terminate_time):
                    print("Kill unquited browser")
                    while not browser_list.empty():
                        lock.acquire()
                        browser = browser_list.get()
                        print("Found a running browser")
                        print(browser)
                        print("Close browser")
                        browser.quit()
                        lock.release()
                    print("Kill all processes")
                    for crawler in crawler_processes:
                        crawler.terminate()
                        # some processes may not terminate; the reason is unknown
                        #crawler.join()
                    running = False

                time.sleep(10)

            # join process to wait for all crawler to finish
            #for crawler in crawler_processes:
            #    crawler.join()

            time.sleep(1)
            print("Finish crawling")
            time.sleep(1)

            # Save all new crawled articles and push to ElasticSearch + RabbitMQ
            print("New crawled articles")
            rb = RabbitMQ_Client()
            rb_articles = []

            while not crawled_articles.empty():
                article = crawled_articles.get()
                if article.get_id() not in data_manager._data:
                    data_manager._data[article.get_id(
                    )] = article  # merge new articles to data
                    rb_articles.append(article)
                    print("%s: %s" %
                          (article.get_newspaper(), article.get_topic()))

            while not new_blacklists.empty():
                href, count = new_blacklists.get()
                data_manager._blacklist[href] = count

            # analyze keyword
            print("Analyze keywords")
            keyword_manager.build_keyword_list()

            print("Export data to json files")

            # export data
            trending_keywords = keyword_manager.write_trending_keyword_by_growing_speed_to_json_file(
            )

            if self._export_to_json:
                data_manager.export_to_json()
                keyword_manager.write_fast_growing_keyword_to_json_file()
                keyword_manager.write_uncategorized_keyword_to_text_file()
                keyword_manager.write_trending_article_to_json_file()
                keyword_manager.write_hot_growing_article_to_json_file()
                keyword_manager.write_keyword_dicts_to_json_file()
                keyword_manager.write_keyword_freq_series_to_json_file()
                keyword_manager.write_new_keyword_to_json_file()

            trends = []

            for topic in trending_keywords[:40]:  # slicing already caps at the list length
                print('trending topic: %s' % topic)

                articles = data_manager.get_latest_article_contain_keyword(
                    topic, number=6)
                first_article = articles[0]
                posts = []

                print('relate article: ')
                for article in articles:
                    posts.append(str(article.get_id()))
                    print(article.get_topic())

                print(posts)
                #trends.append({'topic': topic, 'posts': posts})
                trends.append({
                    'topic': first_article.get_topic(),
                    'posts': posts
                })

            # push data
            print("Push data to database and other services")
            if self._export_to_queue:
                try:
                    # push to RabbitMQ # for Bangtin project only
                    rb.connect()
                    rb.push_trends_to_queue(trends)
                    rb.disconnect()
                except:
                    print_exception()

            if self._export_to_wordpress:
                try:
                    # push to wordpress
                    wp = Wordpress()
                    for article in rb_articles:
                        if article.get_post_type() == 0:  # newspaper post
                            topic = article.get_topic().lower()
                            # trending = False
                            # if trending_keywords:
                            #     for keyword in trending_keywords:
                            #         if keyword in topic:
                            #             trending = True
                            #             break

                            post_id = wp.add_new_article(article, [])
                            if post_id:
                                article.set_wordpress_id(post_id)
                            #sleep(15) # avoid being banned by wordpress host
                except:
                    print_exception()

            if self._export_to_elasticsearch:
                try:
                    # push to Elasticsearch
                    es = ElasticSearch_Client()
                    for article in rb_articles:
                        es.push_article(
                            article)  # put new article to ElasticSearch

                except:
                    print_exception()

            # write log data
            try:
                with open_utf8_file_to_write(
                        get_independent_os_path(["export",
                                                 "log_data.json"])) as stream:
                    log_dict = dict()
                    update_time = get_date_string(
                        get_utc_now_date(),
                        date_format="%d/%m/%Y %H:%M",
                        timezone=config_manager.get_display_timezone())
                    log_dict['update_time'] = update_time
                    log_dict['newspaper_count'] = str(
                        config_manager.get_newspaper_count())
                    log_dict['database_count'] = str(
                        data_manager.count_database())
                    log_dict['hub_title'] = config_manager.get_hub_title()
                    log_dict['hub_href'] = config_manager.get_hub_href()
                    stream.write(jsonpickle.encode(log_dict))
            except:
                print_exception()

        print("FINISH")
Example #8
def process_manage_crawl_list(config_manager):
    '''
    output
    ======
        - config_manager with modified data
        - None, or the webconfig loaded from the current crawl list
    '''
    manage_crawl_list_header = '''
###########################################
#           MANAGE CRAWL LIST             #
###########################################
    '''

    # what to do next
    menu = [
        'Add site config file to list',
        'Remove newspaper from list',
        'Edit site config in list',
        'Load site config in list to working config',
        'Edit config of all site in list',
        'Add working site config to list',
        'Load working site config from list',
        'Return',
    ]
    user_choice = -1
    webconfig = None

    while user_choice != len(menu) - 1:  # finish
        clear_screen()
        print(manage_crawl_list_header)
        newspaper_list = config_manager.print_crawl_list(
        )  # newspaper_list contains all crawl configs; edits are made on it and then merged back into config_manager
        print()
        user_choice = display_menu_dialog('What do you want to do next ? ',
                                          'Choice: ', menu)
        if user_choice == 0:  # add config file to list
            filepath = display_choose_file_dialog(
                get_independent_os_path(['resources', 'configs', 'newspaper']))
            if filepath is not None:
                new_webconfig = WebConfig()
                new_webconfig.load_config_from_file(filepath)
                newspaper_list.append(new_webconfig)
                input("Successfully add %s to crawl list" %
                      new_webconfig.get_webname())
                config_manager.replace_crawl_list(
                    newspaper_list)  #save all changes to config_manager
                config_manager.save_data()

        elif user_choice == 1:  # remove newspaper from list
            choice = input(
                "Please input LINE NUMBER to remove or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                remove_webconfig = newspaper_list.pop(int(choice) - 1)
                input("Successfuly remove %s from crawl list" %
                      remove_webconfig.get_webname())
                config_manager.replace_crawl_list(
                    newspaper_list)  #save all changes to config_manager
                config_manager.save_data()

        elif user_choice == 2:  # edit site config in list
            choice = input(
                "Please input LINE NUMBER to edit or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                choose_webconfig = newspaper_list[int(choice) - 1]
                choose_webconfig = process_edit_config(choose_webconfig)
                config_manager.add_newspaper(choose_webconfig)  # update config
                config_manager.save_data()

        elif user_choice == 3:  # load site config to working config
            choice = input(
                "Please input LINE NUMBER to load or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                choose_webconfig = newspaper_list[int(choice) - 1]
                webconfig = choose_webconfig
                input("Successfuly load %s config to working config" %
                      choose_webconfig.get_webname())

        elif user_choice == 4:  # edit single config of all sites in list
            print()
            newspaper_list = config_manager.get_newspaper_list()
            if len(newspaper_list) > 0:
                print("Sample of a site config:")
                sample_site = newspaper_list[0]
                sample_site.print_config()
                print()

                key = input(
                    'Enter config property to edit (ENTER=cancel, -config_name=remove): '
                ).strip()
                if key != '':
                    if key[0] == '-':  # remove config
                        key = key[1:]
                        count = 0
                        for newspaper in config_manager.get_newspaper_list():
                            count += 1
                            newspaper.delete_config(key)
                        input("Successfully remove %s of %s site" %
                              (key, str(count)))
                        config_manager.save_data()

                    else:  # edit all
                        # eval() lets the user type Python literals (numbers,
                        # booleans, strings); do not use it with untrusted input.
                        new_value = eval(input('Enter new value of %s: ' %
                                               key))
                        count = 0
                        for newspaper in config_manager.get_newspaper_list():
                            count += 1
                            newspaper.set_config(key, new_value)
                        input(
                            "Successfully changed %s of %s sites to the new value"
                            % (key, str(count)))
                        config_manager.save_data()
            else:
                print("There haven't been any site config in crawling list")

        elif user_choice == 5:  # add working site config to list
            if webconfig is not None:
                config_manager.add_newspaper(webconfig)
                input("Succesfully add %s to crawl list" %
                      webconfig.get_webname())
                config_manager.save_data()
            else:
                input(
                    "No working site config. Please go to site config manager to create/load one"
                )

        elif user_choice == 6:  # load newspaper to current webconfig
            choice = input(
                "Please input LINE NUMBER to load or ENTER to cancel: ")
            if choice.strip() != '' and choice.isdigit():
                webconfig = copy.copy(newspaper_list[int(choice) - 1])
                input("Successfuly load %s from crawl list" %
                      webconfig.get_webname())
    return webconfig
Example #9
def process_test_crawl_web_config(webconfig):
    '''
    function
    --------
    try a crawl using only this webconfig

    return
    ------
    the modified webconfig
    '''
    test_crawl_header = '''
###########################################
#        TEST CRAWLING SITE CONFIG        #
###########################################
    '''
    has_change_dir = False
    try:
        os.chdir("backend")
        has_change_dir = True
    except:
        pass

    continue_test = True
    while continue_test:
        clear_screen()
        print(test_crawl_header)

        # prepare webconfig for test
        minimum_duration_old_value = webconfig.get_minimum_duration_between_crawls(
        )
        webconfig.set_minimum_duration_between_crawls(
            -5)  # means this config will always be crawled
        maximum_url_old_value = webconfig.get_config('maximum_url', 10)
        webconfig.set_config('maximum_url', 50)

        # ask for edit
        choice = display_yes_no_dialog(
            "Is there anything to edit before test crawling (y/n) ?")
        if choice:
            webconfig = process_edit_config(webconfig)
            maximum_url_old_value = webconfig.get_maximum_url()

        # test
        config_manager = ConfigManager(
            get_independent_os_path(['src', 'backend', 'input', 'test.yaml']),
            get_independent_os_path(['input', 'kols_list.txt']),
            get_independent_os_path(['input', 'fb_list.txt']))
        config_manager.load_data()
        config_manager.replace_crawl_list([webconfig])
        data_filename = get_independent_os_path(
            ['src', 'backend', 'data', 'test_article.dat'])
        blacklist_filename = get_independent_os_path(
            ['src', 'backend', 'data', 'test_blacklist.dat'])

        data_manager = ArticleManager(config_manager, data_filename,
                                      blacklist_filename)
        data_manager.reset_data()

        # test crawl
        my_pid = 1
        browser = BrowserWrapper()
        if webconfig.get_crawl_type() == 'newspaper':
            data_manager.add_articles_from_newspaper(my_pid, webconfig,
                                                     browser)
        elif 'facebook' in webconfig.get_crawl_type():
            data_manager.add_articles_from_facebook(my_pid, webconfig, browser)

        # report

        continue_test = display_yes_no_dialog(
            'Do you want to test again (y/n) ?: ')

        # return back
        webconfig.set_config('maximum_url', maximum_url_old_value)
        webconfig.set_minimum_duration_between_crawls(
            minimum_duration_old_value)

    if has_change_dir:
        os.chdir("..")

    return webconfig
Example #10
        elif choice == 4:  # test crawling
            process_test_crawl_web_config(webconfig)
        elif choice == 5:  # add/update working site config to crawl list
            config_manager.add_newspaper(webconfig)
            config_manager.save_data()
            print("Successfully add/update %s to crawl list" %
                  webconfig.get_webname())

        elif choice == 6:  # move to program setting
            process_manage_config(webconfig, config_manager)
    return webconfig


# MAIN PROGRAM
config_manager = ConfigManager(
    get_independent_os_path(['src', 'backend', 'input', 'config.yaml']),
    get_independent_os_path(['src', 'backend', 'input', 'kols_list.txt']),
    get_independent_os_path(['src', 'backend', 'input',
                             'fb_list.txt']))  # config object
config_manager.load_data(crawl_newspaper=True,
                         crawl_kols=False,
                         crawl_kols_by_smcc=False,
                         max_kols=30,
                         base_path='.')

main_menu_choice = -1
while main_menu_choice != 2:
    clear_screen()
    print(header)

    main_menu = ['Manage site config', 'Manage program settings', 'Quit']

    main_menu_choice = display_menu_dialog('WELCOME TO CONFIG MANAGER',
                                           'Your choice: ', main_menu)
    webconfig = None
Example #11
    def load_data(self,
                  crawl_newspaper=True,
                  crawl_kols=False,
                  crawl_kols_by_smcc=False,
                  random_kols=True,
                  random_fb_account=True,
                  max_kols=5,
                  base_path='..'):
        '''
        input
        -----
        crawl_newspaper: crawl the newspaper configs in /backend/input/config.yaml
        crawl_kols: crawl KOL posts from the KOL id list in /backend/input/kols_list.txt using a Facebook bot
            random_kols: choose up to max_kols random KOLs from the list to crawl
            random_fb_account: use a random Facebook bot account (preset in /backend/input/fb_list.txt) to crawl KOL posts
        crawl_kols_by_smcc: crawl KOL posts using the smcc service (push some KOL ids to a queue and get the posts back from the queue). Chooses random KOL ids (100 by default) and creates a single webconfig with crawl_type = "kols smcc"
        base_path: '..' or '.', the path to the resources folder relative to the running path
        '''
        #print(self._config)
        stream = open_utf8_file_to_read(self._filename)
        self._config = yaml.full_load(stream)
        stream.close()

        newspaper_list = []

        if not crawl_newspaper:
            self.replace_crawl_list([])
        else:
            newspaper_list = self.get_newspaper_list(
            )  # crawl newspaper last to init browser with random profiles first
            self.replace_crawl_list([])

        if crawl_kols:

            # get kols_list
            kols_list = []
            with open_utf8_file_to_read(self._kol_filename) as stream:
                kols_list = [
                    x for x in stream.read().split('\n') if x.strip() != ''
                ]

            # get fb account list
            fb_list = []
            if random_fb_account:
                with open_utf8_file_to_read(
                        self._fb_account_filename) as stream:
                    fb_list = [
                        x for x in stream.read().split('\n') if x.strip() != ''
                    ]

            count = 0
            index = -1  # incremented before use, so the first KOL is included
            choosen = set()

            while count < max_kols and count < len(
                    kols_list):  # finish when get max_kols
                count += 1
                if random_kols:
                    index = random.randint(0, len(kols_list) - 1)
                    while index in choosen:  # no repeat value
                        index = random.randint(0, len(kols_list) - 1)
                    choosen.add(index)
                    print(f"Choose random kols: {kols_list[index]}"
                          )  # print choosen kol for debugging
                else:
                    index += 1
                    if index == len(kols_list):  # end of kols list
                        break

                if ';' not in kols_list[
                        index]:  # this line contains only an id, not name;url
                    kol_name = 'unknown_id_' + kols_list[index]
                    crawl_url = kols_list[index].strip()  # profile id
                else:
                    kol_name = kols_list[index].split(';')[0]
                    crawl_url = kols_list[index].split(';')[1]

                webconfig = WebConfig()
                webconfig.load_default_config(
                    'facebook user',
                    get_independent_os_path(
                        [base_path, 'resources', 'configs', 'newspaper']))
                webconfig.set_webname(kol_name)
                webconfig.set_config('crawl_url', crawl_url)
                webconfig.set_config(
                    'remove_me', True)  # tag for deletion when the program finishes
                # set random fb account to crawl
                if random_fb_account:
                    profile_index = random.randint(0, len(fb_list) - 1)
                    profile = fb_list[profile_index]
                    webconfig.set_config('browser_profile', profile)

                self.add_newspaper(webconfig)
        # print(self._config)
        # crawl kols by smcc
        if crawl_kols_by_smcc:
            # create a 'crawl_type: kols smcc' WebConfig
            webconfig = WebConfig()
            webconfig.load_default_config(
                'facebook user',
                get_independent_os_path(
                    [base_path, 'resources', 'configs', 'newspaper']))
            webconfig.set_config('crawl_type', 'kols smcc')
            webconfig.set_config('remove_me', True)
            webconfig.set_config('timezone', 'UTC')
            webconfig.set_webname('kol posts')
            webconfig.set_config('minimum_duration_between_crawls', -5)

            self.add_newspaper(webconfig)

        # append newspaper list
        if crawl_newspaper:
            for newspaper in newspaper_list:
                self.add_newspaper(newspaper, beginning=True)
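
A hedged usage sketch of load_data, reusing the ConfigManager constructor arguments seen in the other examples (the paths are illustrative):

# Sketch only: the constructor arguments follow the layout used elsewhere above.
config_manager = ConfigManager(
    get_independent_os_path(['src', 'backend', 'input', 'config.yaml']),
    get_independent_os_path(['src', 'backend', 'input', 'kols_list.txt']),
    get_independent_os_path(['src', 'backend', 'input', 'fb_list.txt']))

# Crawl newspapers plus 5 randomly chosen KOLs, each with a random Facebook
# bot account, resolving resources relative to the parent directory.
config_manager.load_data(crawl_newspaper=True,
                         crawl_kols=True,
                         random_kols=True,
                         random_fb_account=True,
                         max_kols=5,
                         base_path='..')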
Example #12
import os
from src.backend.lib.data import ArticleManager
from src.backend.lib.rabbitmq_client import RabbitMQ_Client
from src.backend.lib.utils import get_independent_os_path

base_dir = os.environ['DOCBAO_BASE_DIR']

rb = RabbitMQ_Client()
rb.connect()
data_manager = ArticleManager(
    None,
    get_independent_os_path([base_dir, 'src', 'backend', 'data', 'article.dat']),
    get_independent_os_path([base_dir, 'src', 'backend', 'data',
                             'blacklist.dat']))  # article database object
data_manager.load_data()

print("PUSH ALL ARTICLES TO RABBITMQ")
rb.push_to_queue(data_manager._data.values())