""" Process Files """
import sys
import time

from Config.pre_process_config import PreProcessConfig

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = PreProcessConfig(sys.argv[1])

start = time.time()
if config.empty_processed_documents_folder:
    delete_files(config.processed_documents_folder, config.file_mask)

files = find_files(config.documents_folder, config.file_mask, True)
for i, fpath in enumerate(files):
    with open(fpath) as f:
        contents = f.read()
    # Skip files that are too short to be worth processing
    if len(contents) < config.minimum_file_size_chars:
        continue
    if config.parse_html:
        contents = parse_html(contents)
        # Re-check the size once the HTML markup has been stripped
        if len(contents) < config.minimum_file_size_chars:
            continue
    # Write the document out with one sentence per line
    sents = split_into_sentences(contents)
    doc = "\n".join(sents)
    file_name = get_file_name(fpath)
    fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"
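# Helpers such as find_files, parse_html, split_into_sentences and get_file_name are
# defined elsewhere in the project and are not shown in this excerpt. Purely as an
# illustrative sketch (not the project's actual implementation), a minimal
# split_into_sentences could be built on NLTK, assuming nltk and its "punkt"
# tokenizer data are installed:
from nltk.tokenize import sent_tokenize

def split_into_sentences(text):
    """Return one trimmed string per sentence found in the input text."""
    return [s.strip() for s in sent_tokenize(text) if s.strip()]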
""" Extract Keywords """
import sys
import time
from collections import defaultdict

# ExtractKeywordsConfig, load_stop_words and find_files come from the project's own
# modules; their imports are not shown in this excerpt.

if len(sys.argv) != 2:
    raise Exception("Incorrect number of arguments passed - one expected, the config file name")

# sys.argv[0] is this script file, sys.argv[1] should be the config file
config = ExtractKeywordsConfig(sys.argv[1])
script_start = time.time()

if config.stop_words_file:
    stop_words = load_stop_words(config.stop_words_file)
    print("%i stop words loaded" % len(stop_words))
else:
    stop_words = set()

""" Load Documents """
start = time.time()
files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    # The pre-processing step wrote one sentence per line
    documents.append(contents.split("\n"))
end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))

""" Extract Common Terms and Phrases """
start = time.time()
# Or use a counter here.
doc_freq = defaultdict(int)
# remove short docs
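# The counting and filtering code is cut off in this excerpt. As an illustration only
# (not the project's actual code), a typical document-frequency pass over the
# `documents` list loaded above - each document being a list of sentence strings -
# would look like this:
for doc in documents:
    tokens = set()
    for sentence in doc:
        tokens.update(sentence.lower().split())
    for token in tokens:
        doc_freq[token] += 1  # number of documents containing the token, not raw counts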
# - TODO: use functional composition to speed up
is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)
analysis_chain = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  stop_filter,
                  syn_mapper.map_synonyms,
                  remove_empty_tokens_filter]
                  # is_a_synonym_filter] - Un-comment to just train on keywords.

# Test
# rslt = debug_analyze("$150k as400 Sr.\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management", analysis_chain)

""" Load Documents """
start = time.time()
sentences = []

files = find_files(config.processed_documents_folder, config.file_mask, True)
print("%s files found in %s" % (len(files), config.processed_documents_folder))

documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
    sentences.extend(contents.split("\n"))
end = time.time()
print("Loading %i sentences took %s seconds" % (len(sentences), str(end - start)))

""" Analyze - clean, tokenize, extract phrases """
print("%i sentences to process" % len(sentences))

tokenized = []
print("Tokenizing sentences")
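# The TODO above mentions functional composition. One simple way to push a sentence
# through analysis_chain - each step consuming the previous step's output (raw string
# -> cleaned string -> token list -> filtered token list) - is a left fold. This is a
# sketch only, not the project's own analyze/debug_analyze helper:
from functools import reduce

def apply_chain(text, chain):
    # Feed the raw text through every step of the chain in order.
    return reduce(lambda value, step: step(value), chain, text)

# e.g. tokens = apply_chain(sentence, analysis_chain)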
""" Process Files """ import sys from Config.pre_process_config import PreProcessConfig if len(sys.argv) != 2: raise Exception("Incorrect number of arguments passed - one expected, the config file name") #sys.argv[0] is this script file, sys.argv[1] should be the config file config = PreProcessConfig(sys.argv[1]) start = time.time() if config.empty_processed_documents_folder: delete_files(config.processed_documents_folder, config.file_mask) files = find_files(config.documents_folder, config.file_mask, True) for i, fpath in enumerate(files): with open(fpath) as f: contents = f.read() if len(contents) < config.minimum_file_size_chars: continue if config.parse_html: contents = parse_html(contents) if len(contents) < config.minimum_file_size_chars: continue sents = split_into_sentences(contents) doc = "\n".join(sents) file_name = get_file_name(fpath) fout_name = config.processed_documents_folder + "/" + file_name.split(".")[0] + "_proc.txt"