# One-hot label for each parser transition (shift: 0, right: 1, left: 2).
_TRANSITION_ONE_HOT = {
    'shift': [1, 0, 0],
    'right': [0, 1, 0],
    'left': [0, 0, 1],
}


def _encode_transitions(transitions):
    """Return the one-hot label array for an iterable of transition records.

    Each record must be indexable by the key ``'transition'`` and hold one of
    ``'shift'``, ``'right'`` or ``'left'``.

    Raises:
        ValueError: if a record carries any other transition label. Skipping
            it would leave the labels misaligned with the feature rows.
    """
    encoded = []
    for index, record in enumerate(transitions):
        label = record['transition']
        try:
            encoded.append(_TRANSITION_ONE_HOT[label])
        except KeyError:
            raise ValueError(
                "unknown transition {!r} at index {}".format(label, index)
            ) from None
    return np.array(encoded)


def train(filename):
    """Fit a decision tree on the oracle's transitions and pickle the result.

    Builds an ``Oracle`` from ``filename``, calls ``search_transitions()`` on
    it, then reads ``oracle.features`` and ``oracle.transitions`` as training
    inputs and labels (presumably one feature row per transition -- confirm
    against ``Oracle``).

    Every feature column is replaced by the integer index of each value in
    that column's sorted unique values. The per-column vocabularies are
    written to ``vocs.p`` and the fitted classifier to ``model.p``, both
    relative to the current working directory.

    Args:
        filename: passed straight to ``Oracle``.

    Returns:
        None. The outputs are the two pickle files.

    Raises:
        ValueError: if the oracle yields no features, or a transition label
            other than ``'shift'``, ``'right'`` or ``'left'``.
    """
    oracle = Oracle(filename)
    print('oracle transitions parsing...')
    oracle.search_transitions()

    x = np.array(oracle.features)
    if x.size == 0:
        # Fail here with a clear message instead of on the tuple unpacking
        # below, which reports an unhelpful "not enough values to unpack".
        raise ValueError("oracle produced no features for {!r}".format(filename))
    y = _encode_transitions(oracle.transitions)

    print('sparse encoding of features...')
    # np.unique returns (sorted unique values, index of each original value),
    # so `inverses` holds the integer-coded columns and `vocs` the matching
    # vocabularies needed to encode unseen data the same way.
    vocs, inverses = zip(
        *(np.unique(feature, return_inverse=True) for feature in x.T)
    )
    # NOTE(review): squeeze is kept from the original; it also collapses a
    # single-sample or single-feature matrix to 1-D -- confirm that is wanted.
    x_new = np.squeeze(np.vstack(inverses).T)

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(x_new, y)

    # Context managers ensure both files are flushed and closed.
    with open("vocs.p", "wb") as vocs_file:
        pickle.dump(vocs, vocs_file)
    with open("model.p", "wb") as model_file:
        pickle.dump(clf, model_file)
import numpy as np
import time
from multiprocessing import Pool
from Configuration import Configuration
from Oracle import Oracle
#from Model import TBP_AS_model

# Training treebank; swap in the commented path below for the small test file.
filename = "../UD_French-GSD/UD_French-GSD/fr_gsd-ud-train.conllu"
#filename = "../UD_French-GSD/UD_French-GSD/test.conllu"

# Run the oracle over the treebank to collect features and transitions.
oracle = Oracle(filename)
oracle.search_transitions()

x = np.array(oracle.features)

# One-hot label per transition (shift: 0, right: 1, left: 2).
one_hot_by_transition = {
    'shift': [1, 0, 0],
    'right': [0, 1, 0],
    'left': [0, 0, 1],
}

y = np.array(oracle.transitions)
labels = [record['transition'] for record in y]
# Labels outside the table are skipped, exactly as the if/elif chain did.
new_y = [
    one_hot_by_transition[label]
    for label in labels
    if label in one_hot_by_transition
]
y = np.array(new_y)

# Notebook-style trailing expression: shows the dataset shapes when run in a cell.
x.shape, y.shape