Example #1
    def train_parse_models(self, examples):
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        weights = []

        for ix in range(xs.shape[0]):
            costs_by_action = {}
            gold_action = ys[ix]
            gold_action_wt = 0
            for action in PARSE_ACTIONS:
                cost = examples.get_weights_for(action)[ix]
                if action == gold_action:
                    gold_action_wt = cost
                else:
                    costs_by_action[action] = cost

            worse_action, worse_cost = max(costs_by_action.items(),
                                           key=lambda tpl: tpl[1])
            assert gold_action_wt >= 0 and worse_cost >= 0, "Costs should be non-negative"
            # The weight of the example is the difference between the gold action and the
            # worst alternative action; as both costs are stored as non-negative
            # magnitudes, we simply add them up
            weight = gold_action_wt + worse_cost
            weights.append(weight)

        mdl = self.base_learner_fact()
        mdl.fit(xs, ys, sample_weight=weights)

        self.current_parser_models = mdl
        self.parser_models.append(mdl)
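The per-example weights computed above are consumed through scikit-learn's standard sample_weight hook. A minimal sketch of the same idea, assuming the base learner is a scikit-learn estimator (the toy features, actions, and weights below are made up for illustration):

import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy feature matrix and gold parse actions (illustrative only)
X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array(["shift", "reduce", "shift", "reduce"])

# Per-example weight = gold action cost + worst alternative cost,
# mirroring the loop above; high-stakes decisions dominate training
weights = np.array([2.0, 0.5, 1.0, 1.0])

clf = LogisticRegression()
clf.fit(X, y, sample_weight=weights)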
Example #2
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    """ TRAIN Tagger """
    tag2word_classifier = train_classifier_per_code(
        td_X,
        wd_td_ys_bytag,
        lambda: LogisticRegression(),
        wd_train_tags,
        verbose=False)
    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(
        td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(
        vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag
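train_classifier_per_code is not shown in these snippets; judging by its call signature, it plausibly fits one binary classifier per tag code. A hedged reconstruction (the body is an assumption, not the original implementation):

def train_classifier_per_code(xs, ys_by_tag, fn_create_cls, tags, verbose=False):
    """ Hypothetical sketch: fit one binary classifier per tag code. """
    tag2classifier = {}
    for tag in tags:
        cls = fn_create_cls()
        cls.fit(xs, ys_by_tag[tag])  # ys_by_tag[tag]: one 0/1 label per word
        if verbose:
            print("Trained classifier for tag %s" % tag)
        tag2classifier[tag] = cls
    return tag2classifier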
Example #3
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(
        wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(
        wd_vd_ys, wd_test_tags)
    """ TRAIN Tagger """
    model = fn_create_wd_cls()  # classifier factory from the enclosing scope
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)
    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
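get_wordlevel_powerset_ys is also not defined here. The label-powerset trick collapses each word's set of tags into a single composite class, so one multiclass model predicts tag combinations jointly. A minimal sketch of the encoding (the helper name and the "O" fallback are assumptions):

def to_powerset_label(word_tags, train_tags):
    # Encode the subset of training tags present on a word as one string,
    # e.g. {"50", "Causer"} -> "50,Causer"; words with no tags map to "O"
    present = sorted(t for t in word_tags if t in train_tags)
    return ",".join(present) if present else "O"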
Example #4
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    return td_X.shape, vd_X.shape
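Note the asymmetry above: only the training features go through fit_transform, while the validation features are passed through transform, so no vocabulary leaks from the validation split. The same pattern with scikit-learn's DictVectorizer as a stand-in for FeatureVectorizer:

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=True)
td_X = vec.fit_transform([{"word=cat": 1}, {"word=dog": 1}])  # learns the vocabulary
vd_X = vec.transform([{"word=cat": 1}, {"word=fish": 1}])     # unseen features are dropped
print(td_X.shape, vd_X.shape)  # (2, 2) (2, 2): both matrices share the training vocabulary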
Example #5
    def train_crel_models(self, examples):

        feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)

        model = self.crel_learner_fact()
        xs = feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()
        # There are no weights here as this is a simple binary classification problem
        model.fit(xs, ys)

        self.crel_models.append(model)
        self.crel_feat_vectorizers.append(feat_vectorizer)
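Each causal-relation model is appended alongside the vectorizer it was fitted with, which suggests prediction reuses the matching pair. A hedged sketch (predict_crels is an assumed method name, not part of the original class):

    def predict_crels(self, examples, ix=-1):
        # Hypothetical: transform with the vectorizer fitted alongside the
        # model at the same index, then predict the binary relation labels
        xs = self.crel_feat_vectorizers[ix].transform(examples.xs)
        return self.crel_models[ix].predict(xs)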
Example #6
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags, dual,
                 C, penalty, fit_intercept, multi_class):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(
        wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(
        wd_vd_ys, wd_test_tags)
    """ TRAIN Tagger """

    solver = 'liblinear'
    if multi_class == 'multinomial':
        solver = "lbfgs"
    model = LogisticRegression(dual=dual,
                               C=C,
                               penalty=penalty,
                               fit_intercept=fit_intercept,
                               multi_class=multi_class,
                               solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)
    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
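The solver switch above is needed because liblinear cannot optimize the multinomial loss. The combinations are constrained further in scikit-learn: lbfgs supports only the primal l2 penalty, and liblinear's dual formulation also requires l2. A small guard capturing those constraints (the helper name is ours):

def pick_solver(multi_class, penalty, dual):
    # liblinear: one-vs-rest only, but handles l1/l2 and the dual form (l2 only)
    # lbfgs: multinomial-capable, but primal l2 only
    if multi_class == "multinomial":
        assert penalty == "l2" and not dual, "lbfgs supports only primal l2"
        return "lbfgs"
    return "liblinear"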
Example #7
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):

    wd_train_tags = set(wd_train_tags)

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"

    # Tag frequencies over the full data set (essay_feats comes from the enclosing scope)
    _, lst_every_tag = flatten_to_wordlevel_feat_tags(essay_feats)
    tag_freq = Counter(flatten(lst_every_tag))

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    #TODO: compute most common tags per word for training only (but not for evaluation)
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    # Get actual ys by code (a dict mapping label to predictions)
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    #TODO: get most common tags for each word, predict from that using multi class method

    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
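get_wordlevel_mostfrequent_ys is not shown; per the TODO above, the intent is to label each word with its single most frequent training tag. A hedged sketch of the per-word selection (the function name and the "O" fallback are assumptions):

from collections import Counter

def most_frequent_tag(word_tags, train_tags, tag_freq):
    # Of the training tags present on this word, keep the one that is
    # most frequent globally, so each word gets exactly one label
    candidates = [t for t in word_tags if t in train_tags]
    if not candidates:
        return "O"
    return max(candidates, key=lambda t: tag_freq[t])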
Example #8
    def train_parse_models(self, examples):
        models = {}
        feat_vectorizer = FeatureVectorizer(min_feature_freq=self.min_feature_freq, sparse=self.sparse)

        xs = feat_vectorizer.fit_transform(examples.xs)
        for action in PARSE_ACTIONS:
            ys = [1 if i > 0 else 0 for i in examples.get_labels_for(action)]
            weights = examples.get_weights_for(action)

            # filter out zero cost actions
            # triples = zip(xs, ys, weights)
            # triple_no_zeros = [(x,y,c) for (x,y,c) in triples if c > 0.0]
            # tmp_xs, ys, weights = zip(*triple_no_zeros)
            # # need to re-constitute the matrix
            # xs = scipy.sparse.vstack(tmp_xs)

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys, sample_weight=weights)

            models[action] = mdl

        self.parser_models.append(models)
        self.parser_feature_vectorizers.append(feat_vectorizer)
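With one binary model per parse action, decoding presumably scores every action and takes the best. A hedged sketch (best_action is an assumed name; predict_proba assumes the base learner is probabilistic):

    def best_action(self, feats_vector, ix=-1):
        # Hypothetical: score the positive class under each action's model
        # and return the highest-scoring action
        models = self.parser_models[ix]
        scores = {action: mdl.predict_proba(feats_vector)[0, 1]
                  for action, mdl in models.items()}
        return max(scores, key=scores.get)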
Example #9
    def train_parse_models(self, examples):
        models = {}
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)

        for action in PARSE_ACTIONS:
            # positive examples have negative cost, negative examples have positive cost
            lbls = [
                -1 if i > 0 else 1 for i in examples.get_labels_for(action)
            ]  # type: List[int]
            costs = examples.get_weights_for(action)  # type: List[float]

            # Ensure the cost is > 0 so that low-cost examples still provide some signal
            #ys = [lbl * max(0.1, cost) for (lbl,cost) in zip(lbls, costs)]
            ys = [lbl * cost for (lbl, cost) in zip(lbls, costs)]

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys)

            models[action] = mdl

        self.current_parser_models = models
        self.parser_models.append(models)
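Here ys are signed costs rather than class labels, so base_learner_fact must return a regressor; the action with the highest predicted value then wins at decode time. A hedged sketch (the method name is an assumption):

    def predict_parse_action(self, feats_vector):
        # Hypothetical: each per-action regressor was fit on lbl * cost,
        # so the action with the largest predicted value wins
        scores = {action: mdl.predict(feats_vector)[0]
                  for action, mdl in self.current_parser_models.items()}
        return max(scores, key=scores.get)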
Example #10
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(essay_feats, CV_FOLDS)
#TODO Parallelize
for i,(essays_TD, essays_VD) in enumerate(folds):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % i
    print "Training Tagging Model"
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    """ TRAIN Tagger """
    tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_wd_cls, wd_train_tags)

    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)

    print "\nTraining Sentence Model"
    """ SENTENCE LEVEL PREDICTIONS FROM STACKING """
    sent_td_xs, sent_td_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(
        sent_input_feat_tags, sent_input_interaction_tags, essays_TD, td_X,
        wd_td_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)
    sent_vd_xs, sent_vd_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(
        sent_input_feat_tags, sent_input_interaction_tags, essays_VD, vd_X,
        wd_vd_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)
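get_sent_feature_for_stacking_from_tagging_model builds the stacked sentence model's inputs from the word tagger's outputs. A hedged sketch of the core construction, aggregating per-word decision scores into one vector per sentence (the function name and the max-pooling choice are assumptions):

import numpy as np

def sentence_stacking_features(sent_X, tag2word_classifier, tags):
    # Hypothetical: for each tag, max-pool the word-level decision scores
    # over the sentence's rows to get one sentence-level feature per tag
    feats = []
    for tag in sorted(tags):
        scores = tag2word_classifier[tag].decision_function(sent_X)
        feats.append(scores.max())
    return np.array(feats)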