Example #1
0
    def train_parse_models(self, examples):
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        weights = []

        for ix in range(xs.shape[0]):
            costs_by_action = {}
            gold_action = ys[ix]
            gold_action_wt = 0
            for action in PARSE_ACTIONS:
                cost = examples.get_weights_for(action)[ix]
                if action == gold_action:
                    gold_action_wt = cost
                else:
                    costs_by_action[action] = cost

            worse_action, worse_cost = max(costs_by_action.items(),
                                           key=lambda tpl: tpl[1])
            assert gold_action_wt >= 0 and worse_cost >= 0, "Costs should be non-negative"
            # Weight of the example is the difference between the best action and the worst action;
            # as both are non-negative, we simply add them up
            weight = gold_action_wt + worse_cost
            weights.append(weight)

        mdl = self.base_learner_fact()
        mdl.fit(xs, ys, sample_weight=weights)

        #cost = examples.get_weights_for(action)[ix]
        self.current_parser_models = mdl
        self.parser_models.append(mdl)
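
The weighting rule above can be exercised on its own. Below is a minimal sketch, assuming a scikit-learn learner stands in for base_learner_fact and toy action names stand in for PARSE_ACTIONS (both are assumptions; the real objects come from the surrounding project).

# Minimal sketch of the cost-sensitive example weighting used in train_parse_models.
from sklearn.linear_model import LogisticRegression

# Toy costs per parse action for two examples (hypothetical action names).
costs = [
    {"SHIFT": 0.0, "REDUCE": 2.0, "ARC": 1.0},
    {"SHIFT": 3.0, "REDUCE": 0.0, "ARC": 1.5},
]
gold = ["SHIFT", "REDUCE"]                               # gold action per example

weights = []
for ix, gold_action in enumerate(gold):
    gold_cost = costs[ix][gold_action]
    worst_other = max(c for a, c in costs[ix].items() if a != gold_action)
    weights.append(gold_cost + worst_other)              # same rule as the loop above

xs = [[1.0, 0.0], [0.0, 1.0]]                            # toy feature vectors
mdl = LogisticRegression()
mdl.fit(xs, gold, sample_weight=weights)                 # weights emphasise examples where mistakes are costly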
Example #2
0
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(
        td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(
        wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(
        wd_vd_ys, wd_test_tags)
    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)
    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
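
get_wordlevel_powerset_ys is not defined in this section; a hedged sketch of the label-powerset idea it presumably implements (encode each word's set of tags as a single composite class label) is shown below. The helper name and the "O" fallback label are assumptions for illustration only.

# Hedged sketch of a label-powerset encoding (an assumption about what
# get_wordlevel_powerset_ys does; not the project's actual implementation).
def powerset_ys(word_tag_sets, train_tags):
    train_tags = set(train_tags)
    labels = []
    for tags in word_tag_sets:
        kept = sorted(t for t in tags if t in train_tags)
        labels.append(",".join(kept) if kept else "O")   # "O" = no tag (assumed convention)
    return labels

print(powerset_ys([{"1", "3"}, set(), {"3"}], {"1", "3", "50"}))   # ['1,3', 'O', '3']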
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(
        td_feats), feature_transformer.transform(vd_feats)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    """ TRAIN Tagger """
    tag2word_classifier = train_classifier_per_code(
        td_X,
        wd_td_ys_bytag,
        lambda: LogisticRegression(),
        wd_train_tags,
        verbose=False)
    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(
        td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(
        vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(wd_vd_ys, wd_test_tags)

    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags,
                 dual, C, penalty, fit_intercept, multi_class):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    """ compute most common tags per word for training only (but not for evaluation) """
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    """ TRAIN Tagger """
    solver = 'liblinear'
    if multi_class == 'multinomial':
        solver = "lbfgs"
    model = LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept, multi_class=multi_class, solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    """ Get Actual Ys by code (dict of label to predictions """
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    return td_X.shape, vd_X.shape
Example #7
0
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(
        td_feats), feature_transformer.transform(vd_feats)
    return td_X.shape, vd_X.shape
    def train_crel_models(self, examples):

        feat_vectorizer = FeatureVectorizer(min_feature_freq=self.min_feature_freq,
                                            sparse=self.sparse)

        model = self.crel_learner_fact()
        xs = feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()
        # There are no weights here as this is a simple binary classification problem
        model.fit(xs, ys)

        self.crel_models.append(model)
        self.crel_feat_vectorizers.append(feat_vectorizer)
    def train_crel_models(self, examples):

        feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)

        model = self.crel_learner_fact()
        xs = feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()
        # There are no weights here as this is a simple binary classification problem
        model.fit(xs, ys)

        self.crel_models.append(model)
        self.crel_feat_vectorizers.append(feat_vectorizer)
    def train_parse_models(self, examples):
        self.current_parser_feat_vectorizer = FeatureVectorizer(min_feature_freq=self.min_feature_freq,
                                                                sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        weights = []

        for ix in range(xs.shape[0]):
            costs_by_action = {}
            gold_action = ys[ix]
            gold_action_wt = 0
            for action in PARSE_ACTIONS:
                cost = examples.get_weights_for(action)[ix]
                if action == gold_action:
                    gold_action_wt = cost
                else:
                    costs_by_action[action] = cost

            worse_action, worse_cost = max(costs_by_action.items(), key=lambda tpl: tpl[1])
            assert gold_action_wt >= 0 and worse_cost >= 0, "Costs should be non-negative"
            # Weight of the example is the difference between the best action and the worst action;
            # as both are non-negative, we simply add them up
            weight = gold_action_wt + worse_cost
            weights.append(weight)

        mdl = self.base_learner_fact()
        mdl.fit(xs, ys, sample_weight=weights)

        #cost = examples.get_weights_for(action)[ix]
        self.current_parser_models = mdl
        self.parser_models.append(mdl)
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags, dual,
                 C, penalty, fit_intercept, multi_class):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(
        td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(
        wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(
        wd_vd_ys, wd_test_tags)
    """ TRAIN Tagger """

    solver = 'liblinear'
    if multi_class == 'multinomial':
        solver = "lbfgs"
    model = LogisticRegression(dual=dual,
                               C=C,
                               penalty=penalty,
                               fit_intercept=fit_intercept,
                               multi_class=multi_class,
                               solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)
    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(
        wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    """ TRAIN Tagger """
    tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                    wd_train_tags, verbose=False)
    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags):

    wd_train_tags = set(wd_train_tags)

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % fold
    print "Training Tagging Model"

    _, lst_every_tag = flatten_to_wordlevel_feat_tags(essay_feats)
    tag_freq = Counter(flatten(lst_every_tag))

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    #TODO: compute most common tags per word for training only (but not for evaluation)
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    # Get Actual Ys by code (dict of label to predictions)
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    #TODO: get most common tags for each word, predict from that using multi class method

    """ TRAIN Tagger """
    model = fn_create_wd_cls()
    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
Example #14
0
    def train_parse_models(self, examples):
        models = {}
        feat_vectorizer = FeatureVectorizer(min_feature_freq=self.min_feature_freq, sparse=self.sparse)

        xs = feat_vectorizer.fit_transform(examples.xs)
        for action in PARSE_ACTIONS:
            ys = [1 if i > 0 else 0 for i in examples.get_labels_for(action)]
            weights = examples.get_weights_for(action)

            # filter out zero cost actions
            # triples = zip(xs, ys, weights)
            # triple_no_zeros = [(x,y,c) for (x,y,c) in triples if c > 0.0]
            # tmp_xs, ys, weights = zip(*triple_no_zeros)
            # # need to re-constitute the matrix
            # xs = scipy.sparse.vstack(tmp_xs)

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys, sample_weight=weights)

            models[action] = mdl

        self.parser_models.append(models)
        self.parser_feature_vectorizers.append(feat_vectorizer)
    def train_parse_models(self, examples):
        models = {}
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)

        for action in PARSE_ACTIONS:
            # positive examples have negative cost, negative examples have positive cost
            lbls = [
                -1 if i > 0 else 1 for i in examples.get_labels_for(action)
            ]  # type: List[int]
            costs = examples.get_weights_for(action)  # type: List[float]

            # Ensure the cost is > 0 so that the low-cost examples provide some more info
            #ys = [lbl * max(0.1, cost) for (lbl,cost) in zip(lbls, costs)]
            ys = [lbl * cost for (lbl, cost) in zip(lbls, costs)]

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys)

            models[action] = mdl

        self.current_parser_models = models
        self.parser_models.append(models)
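
A tiny sketch of the signed regression targets built above, using toy label and cost lists (not project data): positive examples of an action get a negative target scaled by their cost, negative examples a positive one.

# Sketch of the signed-cost target construction used in train_parse_models above.
labels_raw = [2, 0, 1, 0]                      # examples.get_labels_for(action), toy values
costs = [0.5, 1.0, 2.0, 0.0]                   # examples.get_weights_for(action), toy values
lbls = [-1 if i > 0 else 1 for i in labels_raw]
ys = [lbl * cost for (lbl, cost) in zip(lbls, costs)]
print(ys)                                      # [-0.5, 1.0, -2.0, 0.0]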
class SearnModelTemplateFeaturesRegression(SearnModelTemplateFeatures):
    def __init__(self,
                 ngram_extractor,
                 feature_extractor,
                 cost_function,
                 min_feature_freq,
                 cr_tags,
                 base_learner_fact,
                 crel_learner_fact,
                 beta=0.2,
                 positive_val=1,
                 sparse=True,
                 log_fn=lambda s: print(s)):

        super(SearnModelTemplateFeaturesRegression,
              self).__init__(ngram_extractor=ngram_extractor,
                             feature_extractor=feature_extractor,
                             cost_function=cost_function,
                             min_feature_freq=min_feature_freq,
                             cr_tags=cr_tags,
                             base_learner_fact=base_learner_fact,
                             beta=beta,
                             positive_val=positive_val,
                             sparse=sparse,
                             log_fn=log_fn)
        self.crel_learner_fact = crel_learner_fact

    def train_parse_models(self, examples):
        models = {}
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)

        for action in PARSE_ACTIONS:
            # positive examples have negative cost, negative examples have positive cost
            lbls = [
                -1 if i > 0 else 1 for i in examples.get_labels_for(action)
            ]  # type: List[int]
            costs = examples.get_weights_for(action)  # type: List[float]

            # Ensure the cost is > 0 so that the low-cost examples provide some more info
            #ys = [lbl * max(0.1, cost) for (lbl,cost) in zip(lbls, costs)]
            ys = [lbl * cost for (lbl, cost) in zip(lbls, costs)]

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys)

            models[action] = mdl

        self.current_parser_models = models
        self.parser_models.append(models)

    def predict_parse_action(self, feats, tos):
        xs = self.current_parser_feat_vectorizer.transform(feats)
        pred_by_label = {}
        for action in self.randomize_actions():
            if not allowed_action(action, tos):
                continue

            pred_by_label[action] = self.current_parser_models[action].predict(
                xs)[0]

        # Get label with the lowest cost
        min_act, min_val = min(pred_by_label.items(), key=lambda tpl: tpl[1])
        return min_act
Example #17
0
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(essay_feats, CV_FOLDS)
#TODO Parallelize
for i,(essays_TD, essays_VD) in enumerate(folds):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects
    print "\nFold %s" % i
    print "Training Tagging Model"
    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    """ TRAIN Tagger """
    tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_wd_cls, wd_train_tags)

    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)

    print "\nTraining Sentence Model"
    """ SENTENCE LEVEL PREDICTIONS FROM STACKING """
    sent_td_xs, sent_td_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(sent_input_feat_tags, sent_input_interaction_tags, essays_TD, td_X, wd_td_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)
    sent_vd_xs, sent_vd_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(sent_input_feat_tags, sent_input_interaction_tags, essays_VD, vd_X, wd_vd_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)
class SearnModelTemplateFeaturesMultinomialLogisticRegression(SearnModelTemplateFeatures):
    def __init__(self, ngram_extractor, feature_extractor, cost_function, min_feature_freq, cr_tags,
                 base_learner_fact, crel_learner_fact,
                 beta=0.2, positive_val=1, sparse=True, log_fn=lambda s: print(s)):

        super(SearnModelTemplateFeaturesMultinomialLogisticRegression, self).__init__(ngram_extractor=ngram_extractor,
                                                                                      feature_extractor=feature_extractor,
                                                                                      cost_function=cost_function,
                                                                                      min_feature_freq=min_feature_freq,
                                                                                      cr_tags=cr_tags,
                                                                                      base_learner_fact=base_learner_fact,
                                                                                      beta=beta,
                                                                                      positive_val=positive_val,
                                                                                      sparse=sparse,
                                                                                      log_fn=log_fn)
        self.crel_learner_fact = crel_learner_fact

    def train_parse_models(self, examples):
        self.current_parser_feat_vectorizer = FeatureVectorizer(min_feature_freq=self.min_feature_freq,
                                                                sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        weights = []

        for ix in range(xs.shape[0]):
            costs_by_action = {}
            gold_action = ys[ix]
            gold_action_wt = 0
            for action in PARSE_ACTIONS:
                cost = examples.get_weights_for(action)[ix]
                if action == gold_action:
                    gold_action_wt = cost
                else:
                    costs_by_action[action] = cost

            worse_action, worse_cost = max(costs_by_action.items(), key=lambda tpl: tpl[1])
            assert gold_action_wt >= 0 and worse_cost >= 0, "Costs should be non-negative"
            # Weight of the example is the difference between the best action and the worst action;
            # as both are non-negative, we simply add them up
            weight = gold_action_wt + worse_cost
            weights.append(weight)

        mdl = self.base_learner_fact()
        mdl.fit(xs, ys, sample_weight=weights)

        #cost = examples.get_weights_for(action)[ix]
        self.current_parser_models = mdl
        self.parser_models.append(mdl)

    def predict_parse_action(self, feats, tos):
        model = self.current_parser_models

        xs = self.current_parser_feat_vectorizer.transform(feats)
        # get first row, as just looking at one data point
        ys_probs = model.predict_proba(xs)[0]

        prob_by_label = {}
        for action, prob in zip(model.classes_, ys_probs):
            if not allowed_action(action, tos):
                continue
            prob_by_label[action] = prob

        items = list(prob_by_label.items())
        # randomize order so that max returns different items in the case of a tie
        np.random.shuffle(items)
        max_act, max_prob = max(items, key=lambda tpl: tpl[1])
        return max_act
Example #19
0
class SearnModelTemplateFeaturesMultinomialLogisticRegression(
        SearnModelTemplateFeatures):
    def __init__(self,
                 ngram_extractor,
                 feature_extractor,
                 cost_function,
                 min_feature_freq,
                 cr_tags,
                 base_learner_fact,
                 crel_learner_fact,
                 beta=0.2,
                 positive_val=1,
                 sparse=True,
                 log_fn=lambda s: print(s)):

        super(SearnModelTemplateFeaturesMultinomialLogisticRegression,
              self).__init__(ngram_extractor=ngram_extractor,
                             feature_extractor=feature_extractor,
                             cost_function=cost_function,
                             min_feature_freq=min_feature_freq,
                             cr_tags=cr_tags,
                             base_learner_fact=base_learner_fact,
                             beta=beta,
                             positive_val=positive_val,
                             sparse=sparse,
                             log_fn=log_fn)
        self.crel_learner_fact = crel_learner_fact

    def train_parse_models(self, examples):
        self.current_parser_feat_vectorizer = FeatureVectorizer(
            min_feature_freq=self.min_feature_freq, sparse=self.sparse)
        xs = self.current_parser_feat_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        weights = []

        for ix in range(xs.shape[0]):
            costs_by_action = {}
            gold_action = ys[ix]
            gold_action_wt = 0
            for action in PARSE_ACTIONS:
                cost = examples.get_weights_for(action)[ix]
                if action == gold_action:
                    gold_action_wt = cost
                else:
                    costs_by_action[action] = cost

            worse_action, worse_cost = max(costs_by_action.items(),
                                           key=lambda tpl: tpl[1])
            assert gold_action_wt >= 0 and worse_cost >= 0, "Costs should be non-negative"
            # Weight of the example is the difference between the best action and the worst action;
            # as both are non-negative, we simply add them up
            weight = gold_action_wt + worse_cost
            weights.append(weight)

        mdl = self.base_learner_fact()
        mdl.fit(xs, ys, sample_weight=weights)

        #cost = examples.get_weights_for(action)[ix]
        self.current_parser_models = mdl
        self.parser_models.append(mdl)

    def predict_parse_action(self, feats, tos):
        model = self.current_parser_models

        xs = self.current_parser_feat_vectorizer.transform(feats)
        # get first row, as just looking at one data point
        ys_probs = model.predict_proba(xs)[0]

        prob_by_label = {}
        for action, prob in zip(model.classes_, ys_probs):
            if not allowed_action(action, tos):
                continue
            prob_by_label[action] = prob

        items = list(prob_by_label.items())
        # randomize order so that max returns different items in the case of a tie
        np.random.shuffle(items)
        max_act, max_prob = max(items, key=lambda tpl: tpl[1])
        return max_act
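
The shuffle-before-max trick in predict_parse_action can be seen in isolation; a small sketch with toy probabilities (not real model output):

# Shuffling before max() picks a random winner among equal-probability actions.
import numpy as np

prob_by_label = {"SHIFT": 0.4, "REDUCE": 0.4, "ARC": 0.2}   # toy probabilities
items = list(prob_by_label.items())
np.random.shuffle(items)                                    # in-place shuffle of the (action, prob) pairs
max_act, max_prob = max(items, key=lambda tpl: tpl[1])
print(max_act)                                              # SHIFT or REDUCE, chosen at random on ties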
fn_create_sent_cls  = lambda : LogisticRegression(dual=True) # C around 1.0 seems pretty optimal
# NOTE - GBT is stochastic in the SPLITS, and so you will get non-deterministic results

if type(fn_create_sent_cls()) == GradientBoostingClassifier:
    SPARSE_SENT_FEATS = False

#TODO Parallelize
essays_TD = essay_feats

# TD and VD are lists of Essay objects. The sentences are lists
# of featureextractortransformer.Word objects

print("Training Tagging Model")
""" Data Partitioning and Training """
td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)

td_X = feature_transformer.fit_transform(td_feats)
wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)

""" TRAIN Tagger """
tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_wd_cls, wd_train_tags)

print("\nTraining Sentence Model")
""" SENTENCE LEVEL PREDICTIONS FROM STACKING """
sent_td_xs, sent_td_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(sent_input_feat_tags, sent_input_interaction_tags, essays_TD, td_X, wd_td_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)

""" Train Stacked Classifier """
tag2sent_classifier = train_classifier_per_code(sent_td_xs, sent_td_ys_bycode , fn_create_sent_cls, sent_output_train_test_tags)

""" Persist Models """
""" Log Reg + Log Reg is best!!! """
#fn_create_wd_cls    = lambda : LinearSVC(C=1.0)
fn_create_wd_cls = lambda: LogisticRegression() # C=1, dual = False seems optimal

if USE_SVM:
    fn_create_sent_cls  = lambda : LinearSVC(C=1.0)
else:
    fn_create_sent_cls  = lambda : LogisticRegression(dual=True) # C around 1.0 seems pretty optimal

# TD and VD are lists of Essay objects. The sentences are lists
# of featureextractortransformer.Word objects

print "Training Tagging Model"
""" Data Partitioning and Training """
td_feats, td_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)

td_X = feature_transformer.fit_transform(td_feats)
wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)

""" TRAIN Tagger """
tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_wd_cls, wd_train_tags)
train_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)

print "\nTraining Sentence Model"
""" SENTENCE LEVEL PREDICTIONS FROM STACKING """
sent_td_xs, sent_td_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(sent_input_feat_tags, sent_input_interaction_tags, train_essay_feats, td_X, wd_td_ys_bytag, tag2word_classifier, SPARSE_SENT_FEATS, LOOK_BACK)

""" Train Stacked Classifier """
tag2sent_classifier = train_classifier_per_code(sent_td_xs, sent_td_ys_bycode , fn_create_sent_cls, sent_output_train_test_tags)
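
The stacking step above turns the word-level tagger's outputs into sentence-level features and fits a second classifier on them. A hedged sketch of that idea follows; the max/mean pooling here is an assumption for illustration, not the logic inside get_sent_feature_for_stacking_from_tagging_model.

# Hedged sketch of stacking: pool per-word tag probabilities into sentence
# features, then train a second-level classifier on them (toy data only).
import numpy as np
from sklearn.linear_model import LogisticRegression

def sent_features(word_probs):
    # word_probs: (n_words, n_tags) probabilities from the word-level tagger
    return np.hstack([word_probs.max(axis=0), word_probs.mean(axis=0)])

sent_X = np.vstack([
    sent_features(np.array([[0.9, 0.1], [0.8, 0.2]])),   # sentence 1 (toy values)
    sent_features(np.array([[0.2, 0.8], [0.3, 0.7]])),   # sentence 2 (toy values)
])
sent_y = [0, 1]                                          # toy sentence-level labels
stacked_cls = LogisticRegression().fit(sent_X, sent_y)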