示例#1
0
def get_data_(train_limit, AuthorProc, DataProc, labels_enc_with_data,
              encoder_size):
    """Load the corpus and encode its texts and author labels.

    The train and test frames returned by ``import_data()`` are concatenated
    because cross_val_predict (test_sklearnclassifier) does the
    train/test/cv split internally.

    Args:
        train_limit: Number of leading samples to keep, or ``None`` to keep
            every sample.
        AuthorProc: Zero-argument callable that builds the label encoder;
            the result must expose ``fit_transform(authors)``.
        DataProc: Callable accepting ``encoder_size=...`` that builds the
            data encoder; the result must expose ``fit_transform``.
        labels_enc_with_data: When truthy, the data encoder also receives
            the encoded labels and its output is unzipped into ``X`` and
            ``y``; otherwise only the texts are passed to it.
        encoder_size: Forwarded unchanged to ``DataProc``.

    Returns:
        A ``(X, y)`` tuple of encoded samples and encoded labels. In the
        ``labels_enc_with_data`` case both are tuples produced by ``zip``.
    """
    tr, te = import_data()

    text = list(tr.text) + list(te.text)
    author = list(tr.author) + list(te.author)

    label_enc = AuthorProc()
    data_enc = DataProc(encoder_size=encoder_size)

    # Identity check: ``None`` is a singleton, and ``==`` could be overridden
    # (or return an array) for unusual argument types.
    if train_limit is None:
        train_limit = len(text)

    y = label_enc.fit_transform(author[:train_limit])

    if labels_enc_with_data:
        # The encoder yields (sample, label) pairs; split them back into two
        # parallel sequences.
        X, y = zip(*data_enc.fit_transform(text[:train_limit], y))
    else:
        X = data_enc.fit_transform(text[:train_limit])

    return (X, y)
        sens = self.pipeline_factory(x)
        sens = list(sens)

        features = np.array([self.get_features(s) for s in sens])

        features = (features - self.means) / self.devs

        out = self.svm.predict(features)

        return out


myc = []

if __name__ == '__main__':
    tr, te = import_data()

    author_enum = {'HPL': 0, 'EAP': 1, 'MWS': 2}

    classed_auths = [author_enum[a] for a in tr.author]

    myc = ProbabilisticSVMClassifier(beta_method=True,
                                     stem=True,
                                     lemma=True,
                                     n_jobs=3,
                                     kernel='rbf')

    y_train_pred = cross_val_predict(myc,
                                     tr.text,
                                     classed_auths,
                                     cv=3,
import matplotlib.pyplot as plt
import numpy as np
# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import KernelPCA
from sklearn.feature_extraction.text import CountVectorizer

from data.import_data import import_data

# Script: vectorise the training texts as term counts and project them to
# three dimensions with kernel PCA, then open a matplotlib figure.
# NOTE(review): `ts` is not used in the visible part of this script.
train, ts = import_data()

# Storing the entire training text in a list
text = list(train.text.values)
# Author labels as a NumPy array, taken from the same frame as the texts.
authors = np.array(list(train.author.values))

# Bag-of-words counts; max_df/min_df bound the document frequency of the
# terms that are kept (see the CountVectorizer documentation).
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    # stop_words='english',
    decode_error='ignore')

# Learn the vocabulary and build the document-term count matrix in one pass.
tf = tf_vectorizer.fit_transform(text)

# print(tf)

# NOTE(review): despite the name `rbf_pca`, the kernel configured here is
# "sigmoid", not RBF.
rbf_pca = KernelPCA(n_components=3, kernel="sigmoid", gamma=0.04)
# Three-component projection of the count matrix.
x_reduced = rbf_pca.fit_transform(tf)

# print(x_reduced)

# New figure; the plotting code that uses it is outside this view.
fig = plt.figure()
示例#4
0
文件: views.py 项目: NieMinMax/Kitte
def do_import_data(request):
    """Start a data import using the application's settings and a new DB session.

    Reads ``settings`` and ``dbmaker`` from ``request.registry``, calls
    ``dbmaker()`` to obtain a session, and passes both to ``import_data``.

    Args:
        request: Object exposing ``registry.settings`` and
            ``registry.dbmaker``.

    Returns:
        The fixed status string ``u'Importing Data...'``.
    """
    registry = request.registry
    app_settings, make_session = registry.settings, registry.dbmaker
    import_data(app_settings, make_session())
    return u'Importing Data...'