experiments += [("embeddings",)] scores = np.zeros((10, len(experiments))) sizes = [] for e, experiment in enumerate(experiments): sizes = [] print experiment for i, train_size in enumerate(np.arange(0.1, 1.1, 0.1)): size = int(len(X_train_docs) * train_size) sizes.append(size) if experiment == ("embeddings",): features = FeatureStacker(("embeddings", WordEmbeddings(model))) experiment = ("word",) + experiment else: features = FeatureStacker(("windower", Windower(window_size=3)), ("embeddings", WordEmbeddings(model))) X_train = include_features(X_train_docs[:size], experiment) X_test = include_features(X_test_docs, experiment) X_train = features.fit_transform(X_train) X_test = features.transform(X_test) le = LabelEncoder() y_train = le.fit_transform(y_train_docs[: X_train.shape[0]]) y_test = le.transform(y_test_docs) clf = LogisticRegression(C=1.0) clf.fit(X_train, y_train) preds = clf.predict(X_test) scores[i, e] = f1_score(y_test, preds, average="micro") df = pd.DataFrame(scores, index=sizes, columns=["_".join(exp) for exp in experiments]) df.to_csv("learning_curve.csv")
import codecs
import glob
import os
import sys
from collections import Counter

# Fit two classifiers: a full model on windowed words plus embeddings, and a
# backoff model on windowed words only, for tokens that are missing from the
# embedding vocabulary.
full_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)),
                                         ('embeddings', WordEmbeddings(model)))
backoff_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)))
X_full = full_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
y = LabelEncoder().fit_transform([l for labels in y for l in labels])
clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)

frogger = Frog(int(sys.argv[3]))
for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print filename
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
    # Tag the document with Frog and flatten it into one token list,
    # dropping the last field of each token tuple.
    document = frogger.tag(doc)
    document = [[f.decode('utf-8') for f in w[:-1]]
                for sentence in document for w in sentence]
    words = [word[0] for word in document]
    X_test_full = full_feature_vectorizer.transform([document])
    X_test_backoff = backoff_feature_vectorizer.transform([document])
    for i, word in enumerate(words):
        # Back off to the window-only model for out-of-vocabulary words.
        if word.lower() not in model:
            pred = clf_backoff.predict(X_test_backoff[i])[0]
        else:
            pred = clf_full.predict(X_test_full[i])[0]
        # Count a predicted character only if Frog tagged it as a noun (N)
        # or proper name (SPEC).
        if pred == 1 and document[i][2] in ('N', 'SPEC'):
            characters[document[i][0]] += 1
    print ', '.join(sorted(characters, key=characters.__getitem__, reverse=True))
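The Frog class used above wraps a Frog server started elsewhere; its definition is not shown in this section. As a rough, hypothetical sketch of what such a wrapper could look like on top of pynlpl's FrogClient (the exact tuple layout and sentence-boundary convention depend on the Frog server configuration, so treat this as illustrative only):

from pynlpl.clients.frogclient import FrogClient

class Frog(object):
    """Hypothetical wrapper around a Frog server on localhost:port."""
    def __init__(self, port, host="localhost"):
        self.client = FrogClient(host, port)

    def tag(self, text):
        # FrogClient.process yields one tuple per token; tuples of Nones
        # are assumed to mark sentence boundaries here.
        sentences, sentence = [], []
        for token in self.client.process(text):
            if token[0] is None:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                sentence.append(token)
        if sentence:
            sentences.append(sentence)
        return sentences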