# Imports assumed by this excerpt; FeatureStacker, Windower, WordEmbeddings,
# include_features, model and the *_docs variables are defined earlier in the module.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

experiments += [("embeddings",)]

scores = np.zeros((10, len(experiments)))
sizes = []

for e, experiment in enumerate(experiments):
    sizes = []
    print(experiment)
    for i, train_size in enumerate(np.arange(0.1, 1.1, 0.1)):
        size = int(len(X_train_docs) * train_size)
        sizes.append(size)
        if experiment == ("embeddings",):
            # The embeddings-only run still needs the raw word column so that
            # WordEmbeddings can look up each token.
            features = FeatureStacker(("embeddings", WordEmbeddings(model)))
            feature_set = ("word",) + experiment
        else:
            features = FeatureStacker(("windower", Windower(window_size=3)),
                                      ("embeddings", WordEmbeddings(model)))
            feature_set = experiment
        X_train = include_features(X_train_docs[:size], feature_set)
        X_test = include_features(X_test_docs, feature_set)
        X_train = features.fit_transform(X_train)
        X_test = features.transform(X_test)
        le = LabelEncoder()
        y_train = le.fit_transform(y_train_docs[: X_train.shape[0]])
        y_test = le.transform(y_test_docs)
        clf = LogisticRegression(C=1.0)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        scores[i, e] = f1_score(y_test, preds, average="micro")

df = pd.DataFrame(scores, index=sizes, columns=["_".join(exp) for exp in experiments])
df.to_csv("learning_curve.csv")
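A minimal plotting sketch (not part of the original snippet): it reads the learning_curve.csv written above and draws one curve per experiment column. Only pandas and matplotlib are assumed.

import matplotlib.pyplot as plt
import pandas as pd

# Index column holds the number of training documents, one column per experiment.
curve = pd.read_csv("learning_curve.csv", index_col=0)
ax = curve.plot(marker="o")
ax.set_xlabel("training documents")
ax.set_ylabel("micro-averaged F1")
plt.tight_layout()
plt.savefig("learning_curve.png")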
Example #2
# Imports assumed by this excerpt; Frog (the tagger wrapper), FeatureStacker,
# Windower, WordEmbeddings, model, X and y are defined earlier in the module.
import codecs
import glob
import os
import sys
from collections import Counter

# Full stack: windower + embeddings (the windower term is assumed here);
# backoff stack: windower only, used for out-of-vocabulary words.
full_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)),
                                         ('embeddings', WordEmbeddings(model)))
backoff_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)))

X_full = full_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
y = LabelEncoder().fit_transform([l for labels in y for l in labels])

clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)
frogger = Frog(int(sys.argv[3]))

for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print(filename)
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
        document = frogger.tag(doc)
        document = [[f.decode('utf-8') for f in w[:-1]]
                    for sentence in document for w in sentence]
        words = [word[0] for word in document]
    X_test_full = full_feature_vectorizer.transform([document])
    X_test_backoff = backoff_feature_vectorizer.transform([document])
    # One feature row per token: back off to the windower-only classifier for
    # words that have no entry in the embedding model.
    for i, row in enumerate(X_test_full):
        if words[i].lower() not in model:
            pred = clf_backoff.predict(X_test_backoff[i])[0]
        else:
            pred = clf_full.predict(X_test_full[i])[0]
        # Count tokens predicted as characters, restricted to nouns and proper names.
        if pred == 1 and document[i][2] in ('N', 'SPEC'):
            characters[document[i][0]] += 1
    print(', '.join(sorted(characters, key=characters.__getitem__, reverse=True)))
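A small follow-up sketch (not in the original script): since both test matrices have one row per token, the two classifiers can also be run once over the whole document and the backoff applied per token afterwards; clf_full, clf_backoff, X_test_full, X_test_backoff, words and model are assumed to be the objects built above.

import numpy as np

preds_full = clf_full.predict(X_test_full)            # full feature stack, one label per token
preds_backoff = clf_backoff.predict(X_test_backoff)   # windower-only backoff
in_vocab = np.array([w.lower() in model for w in words])
preds = np.where(in_vocab, preds_full, preds_backoff)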