Example #1
    def multiprocess_crawl(self):
        # Create a Manager proxy to host shared data for the multiprocess crawlers
        with multiprocessing.Manager() as manager:

            data_manager = self._data_manager
            config_manager = self._config_manager
            keyword_manager = self._keyword_manager

            time.sleep(1)
            print("Create Manager Proxy")
            time.sleep(1)
            print("Create shared object")
            crawl_queue = manager.Queue()
            crawled_articles = manager.Queue()
            new_blacklists = manager.Queue()
            browser_list = manager.Queue()  # keep all Firefox browsers so they can be released on timeout
            lock = manager.Lock()
            timeout_flag = manager.Value('i', 0)  # shared flag to tell processes that a timeout happened

            # Init crawl queue
            time.sleep(1)
            print("Init crawl queue")
            config_list = config_manager.get_newspaper_list()
            number_of_job = 0

            for webconfig in config_list:
                # check delay time between crawl
                last_run = webconfig.get_last_run()
                min_duration = webconfig.get_minimum_duration_between_crawls()
                time_pass = int(
                    (get_utc_now_date() - last_run).total_seconds() / 60)

                if time_pass > min_duration:
                    # print("Print crawl_queue:")
                    # print(webconfig.get_crawl_url()) # for debug
                    crawl_queue.put(webconfig)
                    number_of_job += 1
                    webconfig.set_last_run()  # set last_run to now
                else:
                    web_name = webconfig.get_webname()
                    print(
                        "Ignore crawling %s. Need %d more minutes before crawling again"
                        % (web_name, min_duration - time_pass))

            # Start crawl process
            max_crawler = config_manager.get_max_crawler()
            time.sleep(1)
            print("%s crawlers are set to be run in parallel" %
                  str(max_crawler))
            supported_max_crawler = get_max_crawler_can_be_run()
            if supported_max_crawler == 0:
                supported_max_crawler = 1
            if max_crawler > supported_max_crawler:
                time.sleep(1)
                print(
                    "The current system can support only %s crawlers running in parallel"
                    % str(supported_max_crawler))
                time.sleep(1)
                print("You should reduce max_crawler in config.yaml")
                time.sleep(1)
                print("max_crawler will be set to %s in this run" %
                      str(supported_max_crawler))
                max_crawler = supported_max_crawler
            elif max_crawler < supported_max_crawler:
                time.sleep(1)
                print(
                    "The current system can support up to %s crawlers running in parallel"
                    % str(supported_max_crawler))
                time.sleep(1)
                print("You should increase max_crawler in config.yaml")
            if max_crawler > int(number_of_job / 2):
                time.sleep(1)
                print("There are only %s newspaper to crawl" %
                      str(number_of_job))
                time.sleep(1)
                print("max_crawler will be set to %s for efficience" %
                      str(int(number_of_job / 2) + 1))
                max_crawler = int(number_of_job / 2) + 1

            crawler_processes = []
            time.sleep(1)

            print("Can run max to %s crawlers" % str(max_crawler))

            timeout = config_manager.get_timeout()
            start = time.time()

            alive_crawler = 0

            running = True
            start_timeout = 0
            is_timeout = False
            terminate_time = 120  # 2 min
            crawler_iterator = 0

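            # Supervision loop: keep up to max_crawler crawler processes alive while jobs
            # remain in crawl_queue, report progress, and enforce the global timeout.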
            while running:
                # count alive crawler
                running_crawler = ''
                alive_crawler = 0
                running = False
                for process in crawler_processes:
                    if process.is_alive():
                        alive_crawler += 1
                        running_crawler = running_crawler + " %s " % str(
                            alive_crawler)
                        running = True
                if running:
                    print("Running crawler:")
                    print(running_crawler)
                else:  # no running processes
                    lock.acquire()
                    if crawl_queue.empty():
                        lock.release()
                        break
                    running = True
                    lock.release()

                # create new crawler if needed
                lock.acquire()
                if alive_crawler < max_crawler and not crawl_queue.empty() and not is_timeout:
                    # more jobs remain than the running crawlers can handle, so another crawler is needed
                    # epdb.set_trace()
                    lock.release()
                    print("Can create more crawler")
                    crawler_iterator += 1
                    crawler = multiprocessing.Process(
                        target=self.crawler_process,
                        args=(str(crawler_iterator), lock, timeout_flag,
                              browser_list, crawl_queue, data_manager,
                              crawled_articles, new_blacklists,
                              self._export_to_queue))
                    crawler_processes.append(crawler)
                    crawler.start()
                    time.sleep(1)
                    print("Start crawler number %s (pid: %s)" %
                          (str(crawler_iterator), crawler.pid))
                else:
                    lock.release()

                # kill all processes after timeout
                if not is_timeout:
                    print("Remaining seconds to timeout: %s" %
                          str(int(timeout - time.time() + start)))
                else:
                    print(
                        "Remaining seconds to terminate: %s" %
                        str(int(terminate_time - time.time() + start_timeout)))
                if (time.time() - start > timeout) and (not is_timeout):
                    start_timeout = time.time()

                    print("Timeout")
                    print(
                        "Inform all processes about timeout. Terminate all after 2 min"
                    )
                    lock.acquire()
                    timeout_flag.value = 1
                    lock.release()
                    is_timeout = True

                if (timeout_flag.value == 1) and (time.time() - start_timeout
                                                  >= terminate_time):
                    print("Kill unquited browser")
                    while not browser_list.empty():
                        lock.acquire()
                        browser = browser_list.get()
                        print("Found a running browser")
                        print(browser)
                        print("Close browser")
                        browser.quit()
                        lock.release()
                    print("Kill all processes")
                    for crawler in crawler_processes:
                        crawler.terminate()
                        # some processes may not terminate; the cause is unknown
                        #crawler.join()
                    running = False

                time.sleep(10)

            # join processes to wait for all crawlers to finish
            #for crawler in crawler_processes:
            #    crawler.join()

            time.sleep(1)
            print("Finish crawling")
            time.sleep(1)

            # Save all newly crawled articles and push them to Elasticsearch + RabbitMQ
            print("Newly crawled articles:")
            rb = RabbitMQ_Client()
            rb_articles = []

            while not crawled_articles.empty():
                article = crawled_articles.get()
                if article.get_id() not in data_manager._data:
                    data_manager._data[article.get_id()] = article  # merge new articles into the data
                    rb_articles.append(article)
                    print("%s: %s" %
                          (article.get_newspaper(), article.get_topic()))

            while not new_blacklists.empty():
                href, count = new_blacklists.get()
                data_manager._blacklist[href] = count

            # analyze keyword
            print("Analyze keywords")
            keyword_manager.build_keyword_list()

            print("Export data to json files")

            # export data
            trending_keywords = keyword_manager.write_trending_keyword_by_growing_speed_to_json_file()

            if self._export_to_json:
                data_manager.export_to_json()
                keyword_manager.write_fast_growing_keyword_to_json_file()
                keyword_manager.write_uncategorized_keyword_to_text_file()
                keyword_manager.write_trending_article_to_json_file()
                keyword_manager.write_hot_growing_article_to_json_file()
                keyword_manager.write_keyword_dicts_to_json_file()
                keyword_manager.write_keyword_freq_series_to_json_file()
                keyword_manager.write_new_keyword_to_json_file()

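            # Build the trending-topics payload (topic title + related post ids) for downstream consumers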
            trends = []

            for topic in trending_keywords[:min(40, len(trending_keywords))]:
                print('trending topic: %s' % topic)

                articles = data_manager.get_latest_article_contain_keyword(
                    topic, number=6)
                first_article = articles[0]
                posts = []

                print('relate article: ')
                for article in articles:
                    posts.append(str(article.get_id()))
                    print(article.get_topic())

                print(posts)
                #trends.append({'topic': topic, 'posts': posts})
                trends.append({
                    'topic': first_article.get_topic(),
                    'posts': posts
                })

            # push data
            print("Push data to database and other services")
            if self._export_to_queue:
                try:
                    # push to RabbitMQ # for Bangtin project only
                    rb.connect()
                    rb.push_trends_to_queue(trends)
                    rb.disconnect()
                except:
                    print_exception()

            if self._export_to_wordpress:
                try:
                    # push to wordpress
                    wp = Wordpress()
                    for article in rb_articles:
                        if article.get_post_type() == 0:  # newspaper post
                            topic = article.get_topic().lower()
                            # trending = False
                            # if trending_keywords:
                            #     for keyword in trending_keywords:
                            #         if keyword in topic:
                            #             trending = True
                            #             break

                            post_id = wp.add_new_article(article, [])
                            if post_id:
                                article.set_wordpress_id(post_id)
                            #sleep(15) # avoid being banned by wordpress host
                except:
                    print_exception()

            if self._export_to_elasticsearch:
                try:
                    # push to Elasticsearch
                    es = ElasticSearch_Client()
                    for article in rb_articles:
                        es.push_article(article)  # push the new article to Elasticsearch

                except:
                    print_exception()

            # write log data
            try:
                with open_utf8_file_to_write(
                        get_independent_os_path(["export",
                                                 "log_data.json"])) as stream:
                    log_dict = dict()
                    update_time = get_date_string(
                        get_utc_now_date(),
                        date_format="%d/%m/%Y %H:%M",
                        timezone=config_manager.get_display_timezone())
                    log_dict['update_time'] = update_time
                    log_dict['newspaper_count'] = str(
                        config_manager.get_newspaper_count())
                    log_dict['database_count'] = str(
                        data_manager.count_database())
                    log_dict['hub_title'] = config_manager.get_hub_title()
                    log_dict['hub_href'] = config_manager.get_hub_href()
                    stream.write(jsonpickle.encode(log_dict))
                    stream.close()
            except:
                print_exception()

        print("FINISH")
Example #2
 def set_last_run(self, date=None):
     if date is None:
         date = get_utc_now_date()
     self.set_config(
         'last_run',
         get_date_string(date, "%d/%m/%Y %H:%M", pytz.timezone("UTC")))
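
set_last_run stores the crawl timestamp as a "%d/%m/%Y %H:%M" UTC string, which multiprocess_crawl in Example #1 later compares against get_minimum_duration_between_crawls(). The sketch below re-creates that delay check with plain datetime/pytz; get_utc_now_date is an assumed equivalent of the project's helper, and parse_last_run / should_crawl are hypothetical helpers written only to show the round trip.

from datetime import datetime

import pytz

DATE_FORMAT = "%d/%m/%Y %H:%M"  # the format set_last_run writes


def get_utc_now_date():
    # assumed equivalent of the project's helper
    return datetime.now(pytz.timezone("UTC"))


def parse_last_run(value):
    # hypothetical helper: turn a stored 'last_run' string back into an aware UTC datetime
    return pytz.timezone("UTC").localize(datetime.strptime(value, DATE_FORMAT))


def should_crawl(last_run_value, min_duration_minutes):
    # mirrors the check in multiprocess_crawl: crawl only if more than
    # min_duration_minutes have passed since last_run
    time_pass = int((get_utc_now_date() - parse_last_run(last_run_value)).total_seconds() / 60)
    return time_pass > min_duration_minutes


print(should_crawl("01/01/2024 00:00", 30))  # True once 30 minutes have passed since that timestamp

Because the stored string has minute resolution, time_pass is also computed in whole minutes, matching the int(... / 60) conversion in Example #1.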