class BaselineStruct(BaseArgumentMixin):
    """Baseline that scores links and propositions with two separate linear
    classifiers and optionally decodes them jointly under constraints.

    NOTE(review): this file contains another ``class BaselineStruct``
    definition further down; if both live in the same module, the later one
    is what the name ends up bound to -- confirm which is intended.

    Parameters
    ----------
    alpha_link : float
        Overall regularization strength for the link classifier.
    alpha_prop : float
        Overall regularization strength for the proposition classifier.
    l1_ratio : float
        Share of each strength passed to the classifiers as ``beta``; the
        remaining ``1 - l1_ratio`` share is passed as ``alpha``.
    exact_test : bool, default False
        Forwarded as ``exact=`` to ``self._inference`` during constrained
        decoding.
    """

    def __init__(self, alpha_link, alpha_prop, l1_ratio, exact_test=False):
        self.alpha_link = alpha_link
        self.alpha_prop = alpha_prop
        self.l1_ratio = l1_ratio
        self.exact_test = exact_test
        # This baseline never uses compatibility features.
        self.compat_features = False

    def initialize_labels(self, y_props_flat, y_links_flat):
        """Fit one label encoder per task and record the number of classes."""
        prop_encoder = LabelEncoder().fit(y_props_flat)
        link_encoder = LabelEncoder().fit(y_links_flat)

        self.prop_encoder_ = prop_encoder
        self.link_encoder_ = link_encoder
        self.n_prop_states = len(prop_encoder.classes_)
        self.n_link_states = len(link_encoder.classes_)

    def fit(self, X_link, y_link, X_prop, y_prop):
        """Train the link and proposition classifiers independently.

        The link classifier is fit with 'balanced' sample weights; the
        proposition classifier is fit unweighted. Returns ``self``.
        """
        self.initialize_labels(y_prop, y_link)
        link_targets = self.link_encoder_.transform(y_link)
        prop_targets = self.prop_encoder_.transform(y_prop)

        # Both classifiers share one configuration; only alpha/beta differ.
        self.link_clf_ = SAGAClassifier(
            loss='smooth_hinge',
            penalty='l1',
            tol=1e-4,
            max_iter=500,
            random_state=0,
            verbose=0,
        )
        self.prop_clf_ = clone(self.link_clf_)

        l1_share = self.l1_ratio
        rest_share = 1 - self.l1_ratio

        # Link classifier.
        link_alpha = self.alpha_link * rest_share
        link_beta = self.alpha_link * l1_share
        link_weights = compute_sample_weight('balanced', link_targets)
        self.link_clf_.set_params(alpha=link_alpha, beta=link_beta)
        self.link_clf_.fit(X_link, link_targets, sample_weight=link_weights)

        # Proposition classifier.
        prop_alpha = self.alpha_prop * rest_share
        prop_beta = self.alpha_prop * l1_share
        self.prop_clf_.set_params(alpha=prop_alpha, beta=prop_beta)
        self.prop_clf_.fit(X_prop, prop_targets)

        return self

    def decision_function(self, X_link, X_prop, docs):
        """Score all links and propositions and regroup the scores per document.

        Parameters
        ----------
        X_link, X_prop
            Flat feature matrices handed to the link / proposition
            classifiers.
        docs : sequence
            Documents exposing ``features`` and ``prop_features``; their
            lengths decide how many consecutive score rows belong to each
            document.

        Returns
        -------
        list of DocLabel
            One ``DocLabel(prop_scores, link_scores)`` per document.
        """
        # --- links ---
        link_ends = np.cumsum([len(doc.features) for doc in docs])
        link_scores = self.link_clf_.decision_function(X_link)

        n_rows = len(link_scores)
        n_cols = len(self.link_encoder_.classes_)
        link_marg = np.zeros((n_rows, n_cols))

        # Only the column of the encoded label ``True`` receives the score;
        # every other column stays at zero.
        # NOTE(review): assumes ``True`` is one of the fitted link labels.
        (on_col,) = self.link_encoder_.transform([True])
        link_marg[:, on_col] = link_scores.ravel()

        link_starts = np.append(0, link_ends)
        Y_link = [link_marg[lo:hi] for lo, hi in zip(link_starts, link_ends)]

        # --- propositions ---
        prop_ends = np.cumsum([len(doc.prop_features) for doc in docs])
        prop_marg = self.prop_clf_.decision_function(X_prop)

        prop_starts = np.append(0, prop_ends)
        Y_prop = [prop_marg[lo:hi] for lo, hi in zip(prop_starts, prop_ends)]

        # --- pair them up, one DocLabel per document ---
        Y_pred = [
            DocLabel(prop_part, link_part)
            for link_part, prop_part in zip(Y_link, Y_prop)
        ]
        assert len(Y_pred) == len(docs)
        return Y_pred

    def fast_decode(self, Y_marg, docs, constraints):
        """Turn per-document scores into predictions.

        With falsy ``constraints`` each document is decoded by
        ``self._round``; otherwise ``self._inference`` is run per document
        with an all-zero compatibility tensor and empty extra potentials.
        """
        if not constraints:
            return [
                self._round(marg.nodes, marg.links, inverse_transform=True)
                for marg in Y_marg
            ]

        # No compatibility scores in this baseline: shared all-zero tensor.
        no_compat = np.zeros(
            (self.n_prop_states, self.n_prop_states, self.n_link_states)
        )

        decoded = []
        for doc, marg in zip(docs, Y_marg):
            potentials = (marg.nodes, marg.links, no_compat, [], [], [])
            labels, _ = self._inference(
                doc,
                potentials,
                relaxed=False,
                exact=self.exact_test,
                constraints=constraints,
            )
            decoded.append(labels)
        return decoded

    def predict(self, X_link, X_prop, docs, constraints=""):
        """Score the inputs, then decode them (see ``fast_decode``)."""
        scores = self.decision_function(X_link, X_prop, docs)
        return self.fast_decode(scores, docs, constraints)
class BaselineStruct(BaseArgumentMixin):
    """Baseline that scores links and propositions with two independently
    trained linear classifiers.

    NOTE(review): an earlier ``class BaselineStruct`` in this file also
    accepts ``exact_test`` and defines ``fast_decode`` / ``predict``. If both
    definitions live in the same module, this later one is what the name is
    bound to -- confirm which is intended.

    Parameters
    ----------
    alpha_link : float
        Overall regularization strength for the link classifier.
    alpha_prop : float
        Overall regularization strength for the proposition classifier.
    l1_ratio : float
        Share of each strength passed to the classifiers as ``beta``; the
        remaining ``1 - l1_ratio`` share is passed as ``alpha``.
    """

    def __init__(self, alpha_link, alpha_prop, l1_ratio):
        self.alpha_link = alpha_link
        self.alpha_prop = alpha_prop
        self.l1_ratio = l1_ratio
        # This baseline never uses compatibility features.
        self.compat_features = False

    def initialize_labels(self, y_props_flat, y_links_flat):
        """Fit the proposition and link label encoders and store class counts."""
        self.prop_encoder_ = LabelEncoder().fit(y_props_flat)
        self.n_prop_states = len(self.prop_encoder_.classes_)

        self.link_encoder_ = LabelEncoder().fit(y_links_flat)
        self.n_link_states = len(self.link_encoder_.classes_)

    def fit(self, X_link, y_link, X_prop, y_prop):
        """Fit both classifiers on encoded labels and return ``self``.

        Only the link classifier is trained with 'balanced' sample weights.
        """
        self.initialize_labels(y_prop, y_link)
        encoded_link = self.link_encoder_.transform(y_link)
        encoded_prop = self.prop_encoder_.transform(y_prop)

        # One template configuration, cloned for the proposition task.
        template = SAGAClassifier(loss='smooth_hinge', penalty='l1',
                                  tol=1e-4, max_iter=500, random_state=0,
                                  verbose=0)
        self.link_clf_ = template
        self.prop_clf_ = clone(self.link_clf_)

        link_params = {
            'alpha': self.alpha_link * (1 - self.l1_ratio),
            'beta': self.alpha_link * self.l1_ratio,
        }
        balanced = compute_sample_weight('balanced', encoded_link)
        self.link_clf_.set_params(**link_params)
        self.link_clf_.fit(X_link, encoded_link, sample_weight=balanced)

        prop_params = {
            'alpha': self.alpha_prop * (1 - self.l1_ratio),
            'beta': self.alpha_prop * self.l1_ratio,
        }
        self.prop_clf_.set_params(**prop_params)
        self.prop_clf_.fit(X_prop, encoded_prop)

        return self

    def decision_function(self, X_link, X_prop, docs):
        """Return one ``DocLabel(prop_scores, link_scores)`` per document.

        ``docs`` must be a sequence of objects exposing ``features`` and
        ``prop_features``; the lengths of those attributes determine how the
        flat classifier outputs are cut into per-document slices.
        """

        def per_document(flat, sizes):
            # Cut ``flat`` into consecutive row slices of the given sizes.
            stops = np.cumsum(sizes)
            starts = np.append(0, stops)
            return [flat[a:b] for a, b in zip(starts, stops)]

        # Links: a score matrix that is zero except for the column of the
        # encoded label ``True``, which holds the classifier's decision value.
        # NOTE(review): assumes ``True`` is one of the fitted link labels.
        link_sizes = [len(doc.features) for doc in docs]
        raw_link = self.link_clf_.decision_function(X_link)
        link_table = np.zeros(
            (len(raw_link), len(self.link_encoder_.classes_)))
        true_col, = self.link_encoder_.transform([True])
        link_table[:, true_col] = raw_link.ravel()
        Y_link = per_document(link_table, link_sizes)

        # Propositions: the classifier output is used as-is.
        prop_sizes = [len(doc.prop_features) for doc in docs]
        prop_table = self.prop_clf_.decision_function(X_prop)
        Y_prop = per_document(prop_table, prop_sizes)

        Y_pred = [DocLabel(props, links)
                  for links, props in zip(Y_link, Y_prop)]
        assert len(Y_pred) == len(docs)
        return Y_pred