Example #1
    def transform_text_first_time(self,
                                  vectorizer,
                                  file_list,
                                  path='static/vectors'):
        # Vectorize the documents with an already-fitted vectorizer and cache the result.
        vectors = vectorizer.transform(file_list)
        utilities.write_file(vectors, path)
        return vectors
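
A hedged usage sketch (the corpus below is made up): transform_text_first_time assumes a vectorizer that has already been fitted, as in Example #3, and simply reuses its transform step.

    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["first cleaned html document", "second cleaned html document"]
    vectorizer = TfidfVectorizer()
    vectorizer.fit(docs)                  # normally done once, in vectorize()
    vectors = vectorizer.transform(docs)  # the call this method wraps
    print(vectors.shape)                  # sparse matrix: (n_documents, n_features)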
Example #2
    def dim_reduction_not_first(self,
                                vector,
                                reducer,
                                path='static/reduced_vector'):
        # Densify the sparse tf-idf matrix and project it with the fitted reducer.
        vector = vector.toarray()
        reduced_vector = reducer.transform(vector)
        utilities.write_file(reduced_vector, path)
        return reduced_vector
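
Example #2 is the steady-state half of a fit-once, transform-many pattern: the reducer fitted in Example #4 is reused on new vectors. A self-contained sketch of that split with toy data:

    import numpy as np
    from sklearn.decomposition import PCA

    train = np.random.rand(20, 8)
    new = np.random.rand(5, 8)
    reducer = PCA(n_components=3).fit(train)  # "first time": fit and keep the model
    reduced = reducer.transform(new)          # "not first": reuse the same projection
    print(reduced.shape)                      # (5, 3)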
Example #3
    def vectorize(self, outfile, path='static/vectorizer'):
        # Fit a tf-idf vectorizer on the corpus and cache it for later transforms.
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     ngram_range=(1, 1),
                                     min_df=2,
                                     stop_words='english',
                                     max_features=10000)
        vectorizer.fit(outfile)
        utilities.write_file(vectorizer, path)
        return vectorizer
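
Taken together, Examples #1 and #3 are the usual scikit-learn fit/transform split. A runnable sketch with a toy corpus standing in for outfile (the real input is the cleaned HTML from Example #7):

    from sklearn.feature_extraction.text import TfidfVectorizer

    # Toy stand-in for the cleaned HTML corpus.
    outfile = ["the first document body", "the second document body",
               "the first document again", "the second document again"]
    vectorizer = TfidfVectorizer(max_df=0.5, ngram_range=(1, 1), min_df=2,
                                 stop_words='english', max_features=10000)
    vectorizer.fit(outfile)                  # Example #3: learn vocabulary and idf
    vectors = vectorizer.transform(outfile)  # Example #1: map texts to tf-idf rows
    print(vectors.shape)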
Example #4
    def dim_reduction_first_time(self, vectors, n_components=1000,
                                 path='static/PCA',
                                 path_red='static/reduced_vectors'):
        # Fit PCA on the densified vectors, cache the model, and save the projection.
        vectors = vectors.toarray()
        pca = PCA(n_components=n_components)
        pca.fit(vectors)
        utilities.write_file(pca, path)
        reduced_vectors = pca.transform(vectors)
        utilities.write_file(reduced_vectors, path_red)
        return reduced_vectors
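
The toarray() call is there because scikit-learn's PCA does not accept sparse input, while TfidfVectorizer emits sparse matrices; densifying can be memory-hungry for large corpora, and TruncatedSVD is the usual sparse-friendly alternative. A sketch of both routes on toy data:

    from scipy import sparse
    from sklearn.decomposition import PCA, TruncatedSVD

    X = sparse.random(100, 500, density=0.05, format='csr')  # tf-idf-shaped toy matrix

    pca = PCA(n_components=50)
    dense_reduced = pca.fit_transform(X.toarray())  # requires densifying first

    svd = TruncatedSVD(n_components=50)
    sparse_reduced = svd.fit_transform(X)           # works on the sparse matrix directly
    print(dense_reduced.shape, sparse_reduced.shape)  # (100, 50) (100, 50)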
Example #5
    def load_filenames_first(self,
                             path="static/NSW/*.html",
                             save_path="static/filenames"):
        all_files = glob.glob(path)
        all_filenames = []
        for filename in all_files:
            # Strip everything up to the last '/' or '\' to get the bare file name.
            file_id = re.sub(r"^.*\/", "", filename)
            file_id = re.sub(r"^.*[\\]", "", file_id)

            with open(filename, 'r', encoding="ISO-8859-1") as f:
                title = self.extract_html_title(f)
                if title == 'Unknown Title':
                    continue
                all_filenames.append([file_id, title])

        utilities.write_file(all_filenames, save_path)
        return all_files, all_filenames
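
The pair of re.sub calls strips everything up to the last '/' or '\', i.e. it extracts the bare file name from either a POSIX or a Windows path. ntpath.basename does the same in one step, since it treats both separators as path delimiters on any OS:

    import ntpath  # splits on both '/' and '\' regardless of platform

    for filename in ["static/NSW/page.html", r"static\NSW\page.html"]:
        print(ntpath.basename(filename))  # page.html in both cases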
Example #6
    def load_all_top_words_first(self,
                                 vectorizer,
                                 path="static/NSW/*.html",
                                 save_path="static/all_file_topwords"):
        hp = html_parser.html_parser()
        # Sort vocabulary indices by idf, rarest (most distinctive) terms first.
        indices = np.argsort(vectorizer.idf_)[::-1]
        features = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
        all_files = glob.glob(path)
        all_file_topwords = []
        for filename in all_files:
            file_id = re.sub(r"^.*\/", "", filename)
            file_id = re.sub(r"^.*[\\]", "", file_id)
            top_words = self.get_top_words_one_gram(filename, indices,
                                                    features, hp)
            with open(filename, 'r', encoding="ISO-8859-1") as f:
                title = hp.extract_html_title(f)
                if title == 'Unknown Title':
                    continue
                all_file_topwords.append([file_id, top_words])

        utilities.write_file(all_file_topwords, save_path)
        return all_file_topwords
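
The np.argsort over vectorizer.idf_ orders the vocabulary from rarest (highest idf, most distinctive) to most common, which is presumably what get_top_words_one_gram ranks against. The same idea on a toy corpus:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ["apple banana apple", "banana cherry", "banana banana durian"]
    vec = TfidfVectorizer().fit(corpus)
    indices = np.argsort(vec.idf_)[::-1]       # rarest terms first
    features = vec.get_feature_names_out()     # get_feature_names() on older scikit-learn
    print([features[i] for i in indices[:3]])  # ['durian', 'cherry', 'apple']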
Example #7
    def load_files_first(self,
                         path="static/NSW/*.html",
                         savefile='static/outfile'):
        all_files = glob.glob(path)
        all_filenames = [
            re.sub(r"^.*[\\]", "", re.sub(r"^.*\/", "", i)) for i in all_files
        ]
        files = []
        for filename in all_files:
            # One open per file: check the title, then rewind and read the body.
            with open(filename, 'r', encoding="ISO-8859-1") as f:
                title = self.extract_html_title(f)
                if title == 'Unknown Title':
                    continue
                f.seek(0)
                contents = f.read()
            files.append(self.clean_file(contents))
        utilities.write_file(files, savefile)
        return files, all_files, all_filenames
Example #8
    def parse_input_text(self, input_text):
        # Clean the raw query text and cache it alongside the corpus artifacts.
        cleaned = self.clean_file(input_text)
        utilities.write_file(cleaned, "static/input_cleaned_test")
        return cleaned
Example #9
if not os.path.isdir(config.OUTPUT_DIR):
    print('Creating output directory {0}'.format(config.OUTPUT_DIR))
    try:
        os.makedirs(config.OUTPUT_DIR)
    except Exception as e:
        print("ERROR {0} :: Failed to create output directory {1}".format(
            e, config.OUTPUT_DIR))

if __name__ == '__main__':
    user_data = utilities.build_data(utilities.read_data_file(
        config.TEST_DATA))
    # Serializations
    json_blob = serializer['JsonSerializer'](data=user_data).serialize()
    print("json file saved here :  %s " % config.SERIALIZED_JSON)
    utilities.write_file(file_path=config.SERIALIZED_JSON, data=json_blob)

    pickle_blob = serializer['PickleSerializer'](data=user_data).serialize()
    print("pkl file saved here :  %s " % config.SERIALIZED_PKL)
    utilities.write_file(file_path=config.SERIALIZED_PKL, data=pickle_blob)
    # De-Serializations
    print(serializer['JsonSerializer'].deserialize(json_blob))
    print(serializer['PickleSerializer'].deserialize(pickle_blob))
    # User readable output files
    html_file = exporter['HtmlExporter'](data=user_data).export(
        file_path=config.HTML)
    utilities.open_file(file_path=html_file)

    text_file = exporter['TextExporter'](data=user_data).export(
        file_path=config.TXT)
    utilities.open_file(file_path=text_file)
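
The serializer and exporter registries used above are not part of this excerpt. A minimal sketch of what the serializer side could look like, assuming each entry is a class whose instances serialize their own data and whose deserialize is callable on the class itself, as the driver code does (all names here follow that usage, not a known library):

    import json
    import pickle

    class JsonSerializer:
        def __init__(self, data):
            self.data = data

        def serialize(self):
            return json.dumps(self.data)

        @staticmethod
        def deserialize(blob):
            return json.loads(blob)

    class PickleSerializer:
        def __init__(self, data):
            self.data = data

        def serialize(self):
            return pickle.dumps(self.data)

        @staticmethod
        def deserialize(blob):
            return pickle.loads(blob)

    # Registry keyed the same way the driver code indexes it.
    serializer = {'JsonSerializer': JsonSerializer,
                  'PickleSerializer': PickleSerializer}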