Python Indexer.GetIDFForTerm примеры использования

Язык программирования: Python

Пространство имен/Пакет: indexer

Класс/Тип: Indexer

Метод/Функция: GetIDFForTerm

Примеров на hotexamples.com: 1

Python Indexer.GetIDFForTerm - 1 пример найден. Это лучшие примеры Python кода для indexer.Indexer.GetIDFForTerm, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

add_new_doc(30)

Indexer(30)

create_index(6)

create_unigram_index(3)

calculate_idf(3)

LoadIndexes(3)

close(3)

dump(3)

coords_to_indices(2)

indices_to_coords(2)

calculationSummerize(2)

add_idf_to_dictionary(2)

add_document(2)

LoadDict(2)

fix_inverted_index(2)

finish(2)

evaluate_input(1)

execute(1)

create_save_indexer_with_relevant_docs(1)

entities_and_small_big(1)

directory(1)

delete_dict_after_saving(1)

create_indexer(1)

create_dirs(1)

create_bulk_index_string(1)

finish_index(1)

CreatInvertedIndex(1)

finish_indexing(1)

get_num_spatial_nodes(1)

tokenize(1)

set_idx_fields(1)

process(1)

keys(1)

isStopword(1)

ignore_extensions(1)

get__lda__(1)

fit(1)

getStemmed(1)

getOr(1)

getAnd(1)

get(1)

generate_local_index(1)

create_block(1)

generate_global_index(1)

compute_tf(1)

createIndex(1)

add_square_Wij(1)

bp_index(1)

batch_get_feat_stacked(1)

after_indexing(1)

Пример #1

Показать файл

Файл: noisefilter.py Проект: vdevos/2ID26

class NoiseFilter:
    """
    NoiseFilter takes a list of Unidentified Terms (UTs), filters out noise (garbage) and returns a (noise)filtered list of UTs

    """
    # Regexes to use as a filter
    ONLYNUM = "^[0-9]*$"
    SPECIAL = "[/_$&+,:;{}\"=?\[\]@#|~'<>^*()%!]"
    NON_ASCII = "[^\x00-\x7F]"
    PUNCT = "[.?\-\",]"
    CONSONANT_4 = "[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"
    VOWEL_4 = "[aAeEiIoOuU]{4}"

    def __init__(self, args):

        self.args = args

        # define noisefilter arguments/parameter input
        self.ACTIONS = ('filter')

        # instance variables
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"

        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"

        # global vars to store output of FilterNoise
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # parse noisefilter action
        if args is not None:
            if not args.action[0] in self.ACTIONS:
                error("Action not recognized, try: %s" %
                      ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            print("No args supplied to NoiseFilter.")

        # Create an instance of the indexer
        self.indexer = Indexer()
        # Index the tweets
        self.indexer.LoadIndexes()

    def PerformAction(self):
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    def FilterNoiseFromFile(self, fname):
        # make sure file exists
        if not os.path.isfile(fname):
            error("Provided file does not exist?")

        unfiltered_terms = set()

        # read file and iterate over the lines
        with open(fname) as fd:
            lines = fd.readlines()
            for line in lines:
                term = line.strip().split('\t')[0]
                unfiltered_terms.add(term)
            self.FilterNoise(unfiltered_terms, self.args.idf_factor)
            with open(self.output_filename_regex, 'w') as outputfile_regex:
                for ut in self.filtered_terms_regex:
                    outputfile_regex.write(ut + '\n')

            with open(self.output_filename_noise_regex,
                      'w') as outputfile_noise_regex:
                for nt in self.noise_terms_regex:
                    outputfile_noise_regex.write(nt + '\n')

            with open(self.output_filename_idf, 'w') as outputfile_idf:
                for ut in self.filtered_terms_idf:
                    outputfile_idf.write(ut + '\n')

            with open(self.output_filename_noise_idf,
                      'w') as outputfile_noise_idf:
                for nt in self.noise_terms_idf:
                    outputfile_noise_idf.write(nt + '\n')

            with open(self.output_filename, 'w') as outputfile:
                for ut in self.combined_filtered_terms:
                    outputfile.write(ut + '\n')

            with open(self.output_filename_noise, 'w') as outputfile_noise:
                for nt in self.combined_noise_terms:
                    outputfile_noise.write(nt + '\n')

    def FilterNoise(self, unfiltered_input, idf_factor):
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        for term in unfiltered_input:

            self.unfiltered_terms.append(term)

            # Applied filters:
            # 1. terms of 1 or 2 characters or terms larger than 10 characters
            # 2. terms containing non-ascii characters
            # 3. terms containing special characters
            # 4. terms consisting only of numbers
            # 5. terms having more punctuation than characters
            # 6. Four or more consecutive vowels, or five or more consecutive consonants.

            if len(term) < 3 or len(term) >= 7 \
                    or re.search(NoiseFilter.NON_ASCII, term) is not None \
                    or re.search(NoiseFilter.SPECIAL, term) is not None \
                    or re.search(NoiseFilter.ONLYNUM, term) is not None \
                    or len(re.findall(NoiseFilter.PUNCT, term)) > (len(term) - len(re.findall(NoiseFilter.PUNCT, term))) \
                    or re.search(NoiseFilter.VOWEL_4, term) is not None \
                    or re.search(NoiseFilter.CONSONANT_4, term) is not None:
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)

            # Get IDF term values. idf_base is the idf factor for terms that only appear once
            # in the whole collection.
            # Values lower than the idf_base can be a valid UTs, otherwise not
            idf = self.indexer.GetIDFForTerm(term)
            doccount = len(self.indexer.index_tweets)
            if doccount > 0:
                idf_base = math.log(float(doccount))
            else:
                idf_base = 100.0
                print("Tried to take the log of a <= 0 doccount! Was: ",
                      doccount)
            threshold_idf = idf_factor * idf_base

            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)

        self.combined_filtered_terms = intersect(self.filtered_terms_regex,
                                                 self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms,
                                         self.combined_filtered_terms)

        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' +
              str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' +
              str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))

        # This is the list we use as a result.
        return self.combined_filtered_terms