Exemplo n.º 1
0
def main():
    """Run the query-log processing pipeline end to end.

    Stages, in the order they are executed:

    1. Create the output directory (base path + output directory name).
    2. Parse the query logs into the frequency file.
    3. De-duplicate the frequency file.
    4. Remove profane queries from the frequency file.
    5. Build the keyword dictionary file from the frequency file.
    6. Run the SymSpell checker for the configured number of iterations.
    7. De-duplicate the dictionary file.

    Every path and limit is obtained from ``ConfigLoader``. Progress
    messages and the total elapsed time (in minutes) are printed to stdout.
    """
    start_time = time()

    print("Running Basic Setup Steps....")
    config_loader = ConfigLoader()
    output_directory_path = (config_loader.get_base_path()
                             + config_loader.get_output_directory_name())
    # exist_ok=True replaces the former exists()-then-makedirs() sequence,
    # which could raise FileExistsError if the directory was created by
    # someone else between the check and the creation.
    os.makedirs(output_directory_path, exist_ok=True)
    parser = Parser(config_loader)
    profanity_filter = ProfanityFilter(config_loader, parser)
    de_duplicator = DeDuplicator(parser)
    keyword_dictionary_builder = KeywordDictionaryBuilder(parser)
    sym_spell_checker = SymSpellChecker(config_loader, parser)

    print("Running Parser....")
    parser.parse(config_loader.get_query_logs_file_path(),
                 config_loader.get_frequency_file_path(),
                 config_loader.get_max_total_queries())

    print("Running De-duplicator....")
    # The frequency file path is passed twice; presumably it is both the
    # input and the output, i.e. the file is rewritten in place -- confirm
    # against DeDuplicator.remove_duplicates.
    de_duplicator.remove_duplicates(
        config_loader.get_frequency_file_path(),
        config_loader.get_frequency_file_path(),
        config_loader.get_de_duplicated_keyword_ordered_1_file_path(),
        config_loader.get_de_duplicated_missing_space_1_file_path(),
        config_loader.get_de_duplicated_synonyms_1_file_path())

    print("Running Profanity Filter....")
    # Same path given twice here as well (presumably in-place filtering);
    # the third path looks like the destination for the removed queries --
    # TODO confirm against ProfanityFilter.remove_profane_queries.
    profanity_filter.remove_profane_queries(
        config_loader.get_frequency_file_path(),
        config_loader.get_frequency_file_path(),
        config_loader.get_filtered_profane_queries_file_path())

    print("Running Keyword Dictionary Builder....")
    keyword_dictionary_builder.build_dictionary_file_from_frequency_file(
        config_loader.get_frequency_file_path(),
        config_loader.get_dictionary_file_path())

    print("Running SymSpell Checker....")
    # The dictionary file path is passed twice (presumably read and then
    # overwritten) -- confirm against SymSpellChecker.run_sym_spell.
    sym_spell_checker.run_sym_spell(config_loader.get_sym_spell_iterations(),
                                    config_loader.get_frequency_file_path(),
                                    config_loader.get_dictionary_file_path(),
                                    config_loader.get_dictionary_file_path())

    print("Running De-duplicator....")
    # Second de-duplication pass, this time over the dictionary file and
    # with the "_2" side-output paths.
    de_duplicator.remove_duplicates(
        config_loader.get_dictionary_file_path(),
        config_loader.get_dictionary_file_path(),
        config_loader.get_de_duplicated_keyword_ordered_2_file_path(),
        config_loader.get_de_duplicated_missing_space_2_file_path(),
        config_loader.get_de_duplicated_synonyms_2_file_path())

    print("Completed!!!")

    # time() returns seconds; divide by 60 to report minutes.
    print("Total time taken: ", (time() - start_time) / 60, " minutes")