def transform_text_first_time(self, vectorizer, file_list, path='static/vectors'):
    """Transform documents with an already-fitted vectorizer, persist and return the result.

    Args:
        vectorizer: a fitted vectorizer exposing `.transform`.
        file_list: the documents to vectorize.
        path: where the transformed vectors are written via `utilities.write_file`.
    """
    transformed = vectorizer.transform(file_list)
    utilities.write_file(transformed, path)
    return transformed
def dim_reduction_not_first(self, vector, reducer, path='static/reduced_vector'):
    """Apply an already-fitted dimensionality reducer to a sparse vector.

    The vector is densified first (the reducer expects a dense array);
    the reduced result is persisted to `path` and returned.
    """
    dense = vector.toarray()
    reduced = reducer.transform(dense)
    utilities.write_file(reduced, path)
    return reduced
def vectorize(self, outfile, path='static/vectorizer'):
    """Fit a TF-IDF vectorizer on the corpus, persist the fitted model, and return it.

    Args:
        outfile: iterable of documents to fit on.
        path: where the fitted vectorizer is written via `utilities.write_file`.
    """
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=2,
        ngram_range=(1, 1),
        stop_words='english',
        max_features=10000,
    )
    vectorizer.fit(outfile)
    utilities.write_file(vectorizer, path)
    return vectorizer
def dim_reduction_first_time(self, vectors, n_components=1000,
                             path='static/PCA',
                             path_red='static/reduced_vectors'):
    """Fit PCA on the (densified) vectors, persisting both the model and the projection.

    Args:
        vectors: sparse matrix of document vectors.
        n_components: target dimensionality for PCA.
        path: save location for the fitted PCA model.
        path_red: save location for the reduced vectors.

    Returns:
        The PCA-reduced vectors.
    """
    dense = vectors.toarray()
    pca = PCA(n_components=n_components)
    pca.fit(dense)
    utilities.write_file(pca, path)
    reduced = pca.transform(dense)
    utilities.write_file(reduced, path_red)
    return reduced
def load_filenames_first(self, path="static/NSW/*.html", save_path="static/filenames"):
    """Pair each HTML file's basename with its extracted title; persist and return.

    Files whose title cannot be extracted (reported as 'Unknown Title')
    are skipped. The [[file_id, title], ...] list is written to `save_path`.

    Args:
        path: glob pattern selecting the HTML files.
        save_path: where the [id, title] pairs are written via `utilities.write_file`.

    Returns:
        (all_files, all_filenames): the glob matches and the [id, title] pairs.
    """
    all_files = glob.glob(path)
    all_filenames = []
    for filename in all_files:
        # Strip directory components written with either '/' or '\' separators
        # (glob may yield either depending on platform).
        file_id = re.sub(r"^.*/", "", filename)
        file_id = re.sub(r"^.*[\\]", "", file_id)  # raw string; was a non-raw "\\\\" escape
        with open(filename, 'r', encoding="ISO-8859-1") as f:
            title = self.extract_html_title(f)
        if title == 'Unknown Title':
            continue
        all_filenames.append([file_id, title])
    # Dropped the dead `count` local from the original: it was incremented
    # per kept file but never read or returned.
    utilities.write_file(all_filenames, save_path)
    return all_files, all_filenames
def load_all_top_words_first(self, vectorizer, path="static/NSW/*.html", save_path="static/all_file_topwords"):
    """Collect the top unigram features for every HTML file; persist and return the list.

    Files whose title cannot be extracted ('Unknown Title') are skipped.
    The result is a list of [file_id, top_words] pairs written to `save_path`.
    """
    hp = html_parser.html_parser()
    # Feature indices sorted by descending IDF (rarest terms first).
    indices = np.argsort(vectorizer.idf_)[::-1]
    features = vectorizer.get_feature_names()
    all_file_topwords = []
    for filename in glob.glob(path):
        # Drop leading directories whether separated by '/' or '\'.
        file_id = re.sub(r"^.*\/", "", filename)
        file_id = re.sub("^.*[\\\\]", "", file_id)
        top_words = self.get_top_words_one_gram(filename, indices, features, hp)
        with open(filename, 'r', encoding="ISO-8859-1") as f:
            title = hp.extract_html_title(f)
        if title == 'Unknown Title':
            continue
        all_file_topwords.append([file_id, top_words])
    utilities.write_file(all_file_topwords, save_path)
    return all_file_topwords
def load_files_first(self, path="static/NSW/*.html", savefile='static/outfile'):
    """Load, title-check, and clean every HTML file matched by `path`.

    Files whose title cannot be extracted ('Unknown Title') are skipped.
    The cleaned contents are persisted to `savefile`.

    Args:
        path: glob pattern selecting the HTML files.
        savefile: where the cleaned contents list is written via `utilities.write_file`.

    Returns:
        (files, all_files, all_filenames): cleaned contents, glob matches,
        and the matches' basenames.
    """
    all_files = glob.glob(path)
    # Basenames with either '/' or '\' separated directories stripped.
    all_filenames = [
        re.sub(r"^.*[\\]", "", re.sub(r"^.*/", "", i)) for i in all_files
    ]
    files = []
    for filename in all_files:
        # Single open per file: read the title, then rewind and reuse the
        # same handle. (The original closed the handle inside the `with`
        # block and re-opened the file via open(...).read() without ever
        # closing it — a file-handle leak.)
        with open(filename, 'r', encoding="ISO-8859-1") as f:
            title = self.extract_html_title(f)
            if title == 'Unknown Title':
                continue
            f.seek(0)
            contents = f.read()
        files.append(self.clean_file(contents))
    # Dropped the dead `count`, `start`, and `end` locals: they were
    # computed but never read or returned.
    utilities.write_file(files, savefile)
    return files, all_files, all_filenames
def parse_input_text(self, input_text):
    """Clean raw input text, cache the cleaned form to disk, and return it."""
    cleaned = self.clean_file(input_text)
    utilities.write_file(cleaned, "static/input_cleaned_test")
    return cleaned
# Ensure the output directory exists before any export runs.
if not os.path.isdir(config.OUTPUT_DIR):
    print('Creating output directory {0}'.format(config.OUTPUT_DIR))
    try:
        os.makedirs(config.OUTPUT_DIR)
    # Fixed: `except Exception, e:` is Python 2-only syntax and a
    # SyntaxError under Python 3; `as` works on both.
    except Exception as e:
        print("ERROR {0} :: Failed to create output directory {1}".format(
            e, config.OUTPUT_DIR))

if __name__ == '__main__':
    # Build the in-memory user data from the configured test data file.
    user_data = utilities.build_data(utilities.read_data_file(
        config.TEST_DATA))

    ''' Serializations '''
    json_blob = serializer['JsonSerializer'](data=user_data).serialize()
    print("json file saved here : %s " % config.SERIALIZED_JSON)
    utilities.write_file(file_path=config.SERIALIZED_JSON, data=json_blob)

    pickle_blob = serializer['PickleSerializer'](data=user_data).serialize()
    print("pkl file saved here : %s " % config.SERIALIZED_PKL)
    utilities.write_file(file_path=config.SERIALIZED_PKL, data=pickle_blob)

    ''' De-Serializations '''
    print(serializer['JsonSerializer'].deserialize(json_blob))
    # NOTE(review): unpickling untrusted data is unsafe; acceptable here only
    # because this round-trips a blob produced a few lines above.
    print(serializer['PickleSerializer'].deserialize(pickle_blob))

    ''' User readable output files '''
    html_file = exporter['HtmlExporter'](data=user_data).export(
        file_path=config.HTML)
    utilities.open_file(file_path=html_file)

    text_file = exporter['TextExporter'](data=user_data).export(
        file_path=config.TXT)
    utilities.open_file(file_path=text_file)