Example #1
    # assumed imports at module level: TruncatedSVD (sklearn.decomposition),
    # OneVsRestClassifier (sklearn.multiclass) and cross_val_score
    # (sklearn.model_selection); get_data_dict, get_vector_space_model,
    # get_classifier, ds and Debug are this project's own helpers
    def relearn(self):

        try:
            print('getting data ...')
            data, k = get_data_dict(self.txt_data_path, print_all=False)

            print('creating the model ...')
            tfidf_matrix, labels, self.vectorizer = \
                get_vector_space_model(data,
                                       min_doc_list_size=0,
                                       make_equal_size=False)

            print('saving vectorizer to disk ...')
            ds.serialize(self.vectorizer, self.clf_data_path + 'vectorizer')

            vector_dim = tfidf_matrix.shape[1]

            # reduction (and everything after it) only runs when the target
            # LSI dimension is actually smaller than the TF-IDF dimension
            if self.svd_dim < vector_dim:

                print('reducing dimension')
                self.svd = TruncatedSVD(n_components=self.svd_dim)
                lsi_matrix = self.svd.fit_transform(tfidf_matrix)

                print('saving svd transformer to disk ...')
                ds.serialize(self.svd, self.clf_data_path + 'svd')

                clf1 = get_classifier(self.clf_name, clf_params=(10, ))
                self.clf = OneVsRestClassifier(clf1)

                print('\nfitting classifier ...')
                self.clf.fit(lsi_matrix, labels)

                print('saving classifier to disk ...')
                ds.serialize(self.clf, self.clf_data_path + 'clf')

                # cross_val_score clones the estimator, so the model fitted
                # above is not disturbed by the 5-fold evaluation
                scores = cross_val_score(self.clf, lsi_matrix, labels, cv=5)
                print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(),
                                                       scores.std() * 2))

                print('relearning done')

        except Exception:
            print('\nrelearning failed\n')
            Debug.print_exception_info()
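
Once relearn has persisted the three artifacts, classifying a new document is a matter of chaining them back together. A minimal sketch (classify_text is a hypothetical companion method, not part of the original listing):

    def classify_text(self, txt):
        """Hypothetical companion to relearn: run one document through
        the fitted vectorizer -> SVD -> classifier chain."""
        tfidf_vec = self.vectorizer.transform([txt])  # 1 x vocab TF-IDF row
        lsi_vec = self.svd.transform(tfidf_vec)       # project into LSI space
        return self.clf.predict(lsi_vec)[0]           # predicted class label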
Example #2
# assumed imports at module level: random,
# TfidfVectorizer (sklearn.feature_extraction.text) and
# train_test_split (sklearn.model_selection)
def get_vector_space_model_train_test(data_dict,
                                      min_doc_list_size=0,
                                      make_equal_size=False,
                                      train_perc=0.66):
    """ returns (matrix_train, matrix_test, Y_train, Y_test, vectorizer) """

    print('\ncreating models ...\n')

    doc_list = []
    labels = []

    print('making doc list')

    for class_name in data_dict:

        try:
            dl = data_dict[class_name]['doc_list']

            # skip classes with too few documents (only when a minimum is set)
            if min_doc_list_size > 0 and len(dl) < min_doc_list_size:
                continue

            if make_equal_size is False:
                doc_list += dl
                labels += len(dl) * [class_name]

            elif min_doc_list_size > 0:
                # balanced classes: sample exactly min_doc_list_size docs each
                doc_list += random.sample(dl, min_doc_list_size)
                labels += min_doc_list_size * [class_name]

        except Exception:
            Debug.print_exception_info()

    print('making matrix')
    doc_list_train, doc_list_test, Y_train, Y_test = train_test_split(
        doc_list,
        labels,
        test_size=1 - train_perc,
        random_state=random.randint(0, 31))

    # stop_words and tokenize are this module's own helpers
    vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize)

    # fit the vocabulary on the training split only, then reuse it for the
    # test split so both matrices live in the same feature space
    matrix_train = vectorizer.fit_transform(doc_list_train)
    matrix_test = vectorizer.transform(doc_list_test)

    print('matrix ready')

    return (matrix_train, matrix_test, Y_train, Y_test, vectorizer)
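
A call site would look roughly like this; the data-dict shape matches what get_data_dict in Example #3 produces, and the class names and documents are placeholders (stop_words and tokenize must already be defined at module level for the vectorizer to work):

data = {
    'sports':   {'doc_list': ['match report ...', 'season recap ...']},
    'politics': {'doc_list': ['election story ...', 'policy brief ...']},
}

X_train, X_test, y_train, y_test, vect = \
    get_vector_space_model_train_test(data, train_perc=0.8)

print(X_train.shape, X_test.shape)  # (n_train_docs, vocab) / (n_test_docs, vocab)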
Example #3
# assumed import at module level: os
def get_data_dict(data_path, print_all=False):
    """ returns dictionary like
    d['class_name'] => list of text documents
    (data_path must end with a path separator) """

    data = dict()

    # every sub-directory of data_path is treated as one class
    class_names = os.listdir(data_path)

    not_readed_files_counter = 0

    for class_name in class_names:

        data[class_name] = {'doc_list': [], 'not_readed': 0}

        files = os.listdir(data_path + class_name)

        i = 0

        for fname in files:

            i += 1

            if print_all:
                print(class_name, i, ' from ', len(files))

            if '.DS_Store' == fname:
                continue

            try:
                # read the whole file as a single document
                with open(data_path + class_name + '/' + fname,
                          'r') as f:  # , encoding='utf-8'
                    txt = f.read()

                data[class_name]['doc_list'].append(txt)

            except Exception:
                print('-------------------------------------------')
                print(class_name)
                print(fname)
                data[class_name]['not_readed'] += 1
                not_readed_files_counter += 1
                Debug.print_exception_info()

    return data, not_readed_files_counter
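
Given one sub-folder per class, usage is straightforward (the paths below are illustrative; note that data_path is concatenated directly with the class name, so the trailing slash matters):

# expected layout:
#   corpus/
#       sports/      doc1.txt, doc2.txt, ...
#       politics/    doc1.txt, doc2.txt, ...
data, failed = get_data_dict('corpus/', print_all=False)

for class_name, entry in data.items():
    print(class_name, len(entry['doc_list']), 'docs,',
          entry['not_readed'], 'unreadable')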
Example #4
data_path = clf_data_path

# GET DATA

if not serialized_data:

    data, not_readed_files_counter = get_data_dict(txt_data_path)

    ds.serialize((data, not_readed_files_counter), data_path + 'data')

else:
    try:
        data, not_readed_files_counter = ds.deserialize(data_path + 'data')
    except Exception:
        Debug.print_exception_info()

# CREATE MODEL

if not serialized_model:

    matrix_train, matrix_test, Y_train, Y_test, vect = \
        get_vector_space_model_train_test(data,
                                          min_doc_list_size,
                                          make_equal_size,
                                          train_perc)

    ds.serialize(vect, data_path + 'vectorizer')

    ds.serialize((matrix_train, matrix_test, Y_train, Y_test),
                 data_path + 'matrices')  # name 'matrices' is assumed; the listing breaks off mid-call
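
The original listing is cut off at this point. A typical continuation (a sketch only, with an arbitrary sklearn classifier standing in for whatever the project actually uses) would fit on the train matrix and score on the held-out test matrix:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))  # placeholder model
clf.fit(matrix_train, Y_train)

print('test accuracy: %0.2f' % clf.score(matrix_test, Y_test))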