# Example #1
# 0
def main():
    """Cross-validate baseline classifiers on the CWI training data.

    Reads the CoNLL-formatted, labelled training file (``--train``), builds
    baseline feature dicts for every labelled token, then runs 10-fold CV
    for several sklearn classifiers and prints mean/std of accuracy, F1,
    precision and recall per classifier. Exits the process with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Baselines")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    args = parser.parse_args()

    labels = []
    featuredicts = []

    print("Collecting features...")
    count = 0
    for s in readSentences(args.train):
        print("\r"+str(count), end="")  # progress counter on one line
        count += 1
        # "-" marks unlabelled tokens; only labelled ones become instances.
        for l, i in zip(s["label"], s["idx"]):
            if l != "-":
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i], s["ne"][i], l, s["head"], s["deprel"])
                featuredicts.append(w.baselinefeatures())
                labels.append(w.label)
    print()
    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)
    classifiers = [LogisticRegression(penalty='l1'),
                   LogisticRegression(penalty='l2'),
                   SGDClassifier(),
                   tree.DecisionTreeClassifier(),
                   dummy.DummyClassifier(strategy="most_frequent")]

    for classifier in classifiers:
        # BUG FIX: reset the score lists for every classifier. Previously
        # `scores` was created once before this loop, so each classifier's
        # printed metrics also averaged in all earlier classifiers' folds.
        # (The second main() in this file already resets per learner.)
        scores = defaultdict(list)
        for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=1):
            TrainX_i = features[TrainIndices]
            Trainy_i = labels[TrainIndices]

            TestX_i = features[TestIndices]
            Testy_i = labels[TestIndices]

            classifier.fit(TrainX_i, Trainy_i)
            ypred_i = classifier.predict(TestX_i)

            # NOTE(review): sklearn metrics expect (y_true, y_pred); the
            # arguments here are (ypred, ytrue). Accuracy is symmetric, but
            # precision/recall are effectively swapped — kept as-is to
            # preserve existing output; confirm intended.
            scores["Accuracy"].append(accuracy_score(ypred_i, Testy_i))
            scores["F1"].append(f1_score(ypred_i, Testy_i))
            scores["Precision"].append(precision_score(ypred_i, Testy_i))
            scores["Recall"].append(recall_score(ypred_i, Testy_i))
        print("--", str(classifier))
        for key in sorted(scores.keys()):
            currentmetric = np.array(scores[key])
            print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))

    sys.exit(0)
# Example #2
# 0
def main():
    """Cross-validate weighted-instance classifiers on CWI annotator data.

    Reads the all-annotations CoNLL training file (``--train``), builds
    feature dicts for every labelled token along with its positive-vote
    count, then runs 10-fold CV for each learner using per-instance sample
    weights derived from the votes (scheme chosen by
    ``--instance_weighting``). Prints mean/std of accuracy, the shared-task
    F1 (harmonic mean of *accuracy* and recall), precision and recall.
    """
    #global brownclusters, cluster_heights, ave_brown_depth, ave_brown_height, max_brown_depth, embeddings
    #brownclusters, cluster_heights, ave_brown_depth, ave_brown_height, max_brown_depth=read_brown_clusters('/coastal/brown_clusters/rcv1.64M-c1000-p1.paths', 1000)
    #embeddings=read_embeddings('/coastal/mono_embeddings/glove.6B.300d.txt.gz')

    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir+"/../data/cwi_training/cwi_training_allannotations.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    parser.add_argument('--instance_weighting', choices=["uniform","linear","inverse_class_relevance","log_and_mode","tf_idf","log_and_max"], default="uniform")
    args = parser.parse_args()

    labels = []
    featuredicts = []

    count = 0
    positive_votes = []
    for s in readSentences(args.train):
        count += 1
        # "-" marks unlabelled tokens; only labelled ones become instances.
        for l, i in zip(s["label"], s["idx"]):
            if l != "-":
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i], s["ne"][i], positive_votes=l, heads=s["head"], deprels=s["deprel"])
                featuredicts.append(w.featurize())
                labels.append(w.label)
                positive_votes.append(w.positive_votes)
    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)
    positive_votes = np.array(positive_votes)

    learners = [tree.DecisionTreeClassifier(), svm.NuSVC(nu=0.2)]
    for learner in learners:
        scores = defaultdict(list)
        for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):

            TrainX_i = features[TrainIndices]
            Trainy_i = labels[TrainIndices]
            # Instance weights come from annotator vote counts of the
            # *training* fold only.
            sampleweights_i = get_sample_weights(positive_votes[TrainIndices], args.instance_weighting)
            TestX_i = features[TestIndices]
            Testy_i = labels[TestIndices]

            learner.fit(TrainX_i, Trainy_i, sample_weight=sampleweights_i)
            ypred_i = learner.predict(TestX_i)

            # NOTE(review): sklearn metrics expect (y_true, y_pred); the
            # (ypred, ytrue) order here swaps precision/recall — kept as-is
            # to preserve existing output; confirm intended.
            acc = accuracy_score(ypred_i, Testy_i)
            pre = precision_score(ypred_i, Testy_i)
            rec = recall_score(ypred_i, Testy_i)
            # Shared task uses the F1 of *accuracy* and recall.
            # BUG FIX: guard against ZeroDivisionError when both accuracy
            # and recall are 0 for a fold (degenerate predictions).
            f1 = 2 * acc * rec / (acc + rec) if (acc + rec) > 0 else 0.0

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)

        print("--")
        print(learner)
        for key in sorted(scores.keys()):
            currentmetric = np.array(scores[key])
            print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
        print("--")