Exemplo n.º 1
0
    def compute(self, item_id):
        """Check one item against the tracked words and tracked word sets.

        Steps performed:

        1. Reload ``self.list_tracked_words`` / ``self.set_tracked_words_list``
           when ``Term.get_tracked_term_last_updated_by_type`` reports a value
           greater than the matching ``self.last_refresh_*`` timestamp.
        2. Build the word-frequency mapping of the item content while a
           ``signal.alarm`` of ``self.max_execution_time`` seconds is armed.
        3. Call ``self.new_term_found`` for every tracked word present in the
           mapping and for every tracked set whose number of distinct matched
           words reaches its threshold.

        :param item_id: identifier handed to ``Item`` to load the item.
        :return: None
        """
        # Refresh tracked words if they changed since the last local refresh.
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type(
                'word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')
            print('Tracked word refreshed')

        # Same refresh logic for tracked word sets.
        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type(
                'set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')
            print('Tracked set refreshed')

        # Cast message as Item
        item = Item(item_id)
        # NOTE(review): item_date is only consumed by the disabled token
        # statistics (Term.create_token_statistics); the call is kept because
        # get_date() is not visible from here -- confirm it can be dropped.
        item_date = item.get_date()
        item_content = item.get_content()

        # Arm the alarm around the (potentially slow) tokenisation.
        # Presumably a SIGALRM handler installed elsewhere raises
        # TimeoutException -- the handler is not visible in this block.
        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning(f"{item.get_id()} processing timeout")
        finally:
            # Always disarm: previously the alarm stayed pending when any
            # exception other than TimeoutException escaped, so it could fire
            # later in unrelated code.
            signal.alarm(0)

        # Nothing to match on timeout or when no word was extracted.
        if not dict_words_freq:
            return

        found_item_id = item.get_id()
        item_source = item.get_source()

        # check solo words
        ####### # TODO: check if source needed #######
        for word in self.list_tracked_words:
            if word in dict_words_freq:
                self.new_term_found(word, 'word', found_item_id, item_source)

        # check words set
        # Each element is indexed as (words, threshold, set identifier).
        for elem in self.set_tracked_words_list:
            list_words = elem[0]
            nb_words_threshold = elem[1]
            word_set = elem[2]

            # Count how many words of the set appear in the item.
            nb_uniq_word = sum(
                1 for word in list_words if word in dict_words_freq)
            if nb_uniq_word >= nb_words_threshold:
                self.new_term_found(word_set, 'set', found_item_id,
                                    item_source)
Exemplo n.º 2
0
                    new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]
                nb_uniq_word = 0

                for word in list_words:
                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    new_term_found(word_set, 'set', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh_word < Term.get_tracked_term_last_updated_by_type(
                'word'):
            list_tracked_words = Term.get_tracked_words_list()
            last_refresh_word = time.time()
            print('Tracked word refreshed')

        if last_refresh_set < Term.get_tracked_term_last_updated_by_type(
                'set'):
            set_tracked_words_list = Term.get_set_tracked_words_list()
            last_refresh_set = time.time()
            print('Tracked set refreshed')
Exemplo n.º 3
0
    # Regex Frequency
    # Endless polling loop: fetch an item id, run every tracked regex against
    # the item content, report matches, then reload the tracked regexes when
    # they were updated.
    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            # Keys are what gets reported to new_term_found; the mapped value
            # is what regex_helper.regex_search receives to search with.
            for regex, tracked_regex in dict_regex_tracked.items():
                matched = regex_helper.regex_search(
                    module_name,
                    redis_cache_key,
                    tracked_regex,
                    item_id,
                    item_content,
                    max_time=max_execution_time)
                if matched:
                    new_term_found(regex, 'regex', item_id, item_date)

        else:
            # No item returned: wait before polling again.
            time.sleep(5)

        # refresh Tracked term
        if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
            dict_regex_tracked = Term.get_regex_tracked_words_dict()
            last_refresh = time.time()
            # Message fixed: this branch reloads the regex terms, not sets.
            print('Tracked regex refreshed')