def __init__(self, threshold=0.5, modelpath=''):
    """Store the settings, optionally load a saved booster, build the pipeline.

    Args:
        threshold: Value kept on ``self.threshold``; it is not used anywhere
            else in this constructor.
        modelpath: Path passed to ``xgb.Booster.load_model``. When it is
            empty (the default) nothing is loaded and ``self.model`` stays
            ``None``.
    """
    self.threshold = threshold
    self.modelpath = modelpath

    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    # One extractor instance is shared by every branch below.
    morph_extractor = MorphologyExtractor(sparse=True)

    def branch(item, width):
        # Select `item`, then union its character features with the
        # morphology features.
        # NOTE(review): `width` is forwarded as StringToChar's first
        # positional argument; its exact meaning is defined elsewhere.
        selector = ItemSelector(item)
        features = SparseUnion([
            ('char', StringToChar(width, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    self.pipeline = SparseUnion([
        ('orig', branch('before', 10)),
        ('prev', branch('prev', 5)),
        ('next', branch('next', 5)),
    ])
def __init__(self, modelpath=''):
    """Optionally load a saved booster and build the feature pipeline.

    The pipeline unions a one-hot encoding of the ``'class'`` item with
    character + morphology features for ``'before'`` and for the four
    neighbouring items ``'prev_prev'``, ``'prev'``, ``'next'`` and
    ``'next_next'``.

    Args:
        modelpath: Path passed to ``xgb.Booster.load_model``. When it is
            empty (the default) nothing is loaded and ``self.model`` stays
            ``None``.
    """
    self.modelpath = modelpath

    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    # The position of a label in this list is the code fed to the one-hot
    # encoder, so the order must not change.
    self.class_type = CategoricalDtype(
        categories=[
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ])

    # One extractor instance is shared by every text branch below.
    morph_extractor = MorphologyExtractor(sparse=True, multi_words=True)

    def text_branch(item, width):
        # Select `item`, then union its character features with the
        # morphology features.
        # NOTE(review): `width` is forwarded as StringToChar's first
        # positional argument; what a negative value means is defined
        # elsewhere.
        selector = ItemSelector(item)
        features = SparseUnion([
            ('char', StringToChar(width, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    # NOTE(review): if OneHotEncoder is scikit-learn's, `n_values` only
    # exists in older releases -- confirm the pinned version.
    class_branch = Pipeline([
        ('select', ItemSelector('class')),
        ('codes', ToCategoryCodes(self.class_type)),
        ('reshape', Reshape2d()),
        ('onehot', OneHotEncoder(n_values=len(self.class_type.categories),
                                 sparse=True,
                                 dtype=np.uint8)),
    ])

    steps = [
        ('class', class_branch),
        ('orig', text_branch('before', 10)),
    ]
    steps += [
        (item, text_branch(item, -5))
        for item in ('prev_prev', 'prev', 'next', 'next_next')
    ]
    self.pipeline = SparseUnion(steps)

    self.case_extractor = CaseExtractor(multi_words=True)
def __init__(self, modelpath=''):
    """Optionally load a saved booster and build the feature pipeline.

    The pipeline unions character + morphology features for the
    ``'before'``, ``'prev'`` and ``'next'`` items. ``self.class_type`` is
    created here but is not wired into the pipeline by this constructor.

    Args:
        modelpath: Path passed to ``xgb.Booster.load_model``. When it is
            empty (the default) nothing is loaded and ``self.model`` stays
            ``None``.
    """
    self.modelpath = modelpath

    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    self.class_type = CategoricalDtype(
        categories=[
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ])

    # One extractor instance is shared by every branch below.
    morph_extractor = MorphologyExtractor(sparse=True)

    def branch(item, width):
        # Select `item`, then union its character features with the
        # morphology features.
        # NOTE(review): `width` is forwarded as StringToChar's first
        # positional argument; its exact meaning is defined elsewhere.
        selector = ItemSelector(item)
        features = SparseUnion([
            ('char', StringToChar(width, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    self.pipeline = SparseUnion([
        ('orig', branch('before', 10)),
        ('prev', branch('prev', 5)),
        ('next', branch('next', 5)),
    ])
classes = frozenset([ 'CARDINAL', 'DATE', 'MEASURE', 'DECIMAL', 'MONEY', 'ORDINAL', 'FRACTION', 'TIME' ]) df = df[~(df['before'] == df['after']) & (df['class'].isin(classes))] class_type = CategoricalDtype(categories=[ 'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS', 'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL', 'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH' ]) print(df.info()) morph_extractor = MorphologyExtractor(sparse=True, multi_words=True) before_pipeline = SparseUnion([ ('class', Pipeline([('select', ItemSelector('class')), ('codes', ToCategoryCodes(class_type)), ('reshape', Reshape2d()), ('onehot', OneHotEncoder(n_values=len(class_type.categories), sparse=True, dtype=np.uint8))])), ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)), ('ctx', morph_extractor), ])), ])),
import gc from sklearn.metrics import accuracy_score df = load_train(['before', 'after']).fillna('') df['self'] = (df['before'] == df['after']) df['prev'] = df['before'].shift(1) df['next'] = df['before'].shift(-1) df = df.fillna('') del df['after'] print(df.info()) morph_extractor = MorphologyExtractor(sparse=True) pipeline = SparseUnion([ ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)), ('ctx', morph_extractor), ])), ])), ('prev', Pipeline([ ('select', ItemSelector('prev')), ('features', SparseUnion([ ('char', StringToChar(5, to_coo=True)), ('ctx', morph_extractor), ])), ])),