def __init__(self, threshold=0.5, modelpath=''):
    """Store the threshold, optionally load a booster, build the pipeline.

    Args:
        threshold: Kept unchanged on ``self.threshold``.
        modelpath: Path handed to ``xgb.Booster.load_model``. When it is
            empty (the default) no booster is created and ``self.model``
            stays ``None``.
    """
    self.threshold = threshold
    self.modelpath = modelpath
    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    # One extractor instance is shared by every branch of the union.
    morph_extractor = MorphologyExtractor(sparse=True)

    def column_branch(column, char_arg):
        """Select ``column`` and union its char features with the extractor.

        ``char_arg`` is forwarded as the first positional argument of
        ``StringToChar``.
        """
        selector = ItemSelector(column)
        char_step = StringToChar(char_arg, to_coo=True)
        features = SparseUnion([
            ('char', char_step),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    # (union step name, selected column, StringToChar argument)
    branch_specs = (
        ('orig', 'before', 10),
        ('prev', 'prev', 5),
        ('next', 'next', 5),
    )
    self.pipeline = SparseUnion([
        (step_name, column_branch(column, char_arg))
        for step_name, column, char_arg in branch_specs
    ])
def __init__(self, modelpath=''):
    """Optionally load a booster and assemble the feature pipeline.

    Args:
        modelpath: Path handed to ``xgb.Booster.load_model``. When it is
            empty (the default) no booster is created and ``self.model``
            stays ``None``.

    Besides ``self.model`` this sets ``self.class_type`` (the categorical
    dtype listing the token classes), ``self.pipeline`` (a ``SparseUnion``
    of a one-hot class branch plus five per-column branches) and
    ``self.case_extractor``.
    """
    self.modelpath = modelpath
    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    self.class_type = CategoricalDtype(categories=[
        'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
        'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
        'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
    ])

    # One extractor instance is shared by every token branch of the union.
    morph_extractor = MorphologyExtractor(sparse=True, multi_words=True)

    def column_branch(column, char_arg):
        """Select ``column`` and union its char features with the extractor.

        ``char_arg`` is forwarded as the first positional argument of
        ``StringToChar``.
        """
        selector = ItemSelector(column)
        char_step = StringToChar(char_arg, to_coo=True)
        features = SparseUnion([
            ('char', char_step),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    # Class column -> category codes -> 2-D reshape -> sparse one-hot block
    # with one slot per category in ``self.class_type``.
    class_branch = Pipeline([
        ('select', ItemSelector('class')),
        ('codes', ToCategoryCodes(self.class_type)),
        ('reshape', Reshape2d()),
        ('onehot', OneHotEncoder(
            n_values=len(self.class_type.categories),
            sparse=True,
            dtype=np.uint8)),
    ])

    # (union step name, selected column, StringToChar argument)
    branch_specs = (
        ('orig', 'before', 10),
        ('prev_prev', 'prev_prev', -5),
        ('prev', 'prev', -5),
        ('next', 'next', -5),
        ('next_next', 'next_next', -5),
    )
    union_steps = [('class', class_branch)]
    union_steps.extend(
        (step_name, column_branch(column, char_arg))
        for step_name, column, char_arg in branch_specs
    )
    self.pipeline = SparseUnion(union_steps)

    self.case_extractor = CaseExtractor(multi_words=True)
def __init__(self, modelpath=''):
    """Optionally load a booster and assemble the three-branch pipeline.

    Args:
        modelpath: Path handed to ``xgb.Booster.load_model``. When it is
            empty (the default) no booster is created and ``self.model``
            stays ``None``.

    Also sets ``self.class_type`` (the categorical dtype listing the token
    classes) and ``self.pipeline`` (a ``SparseUnion`` over the ``before``,
    ``prev`` and ``next`` columns).
    """
    self.modelpath = modelpath
    self.model = None
    if self.modelpath:
        self.model = xgb.Booster()
        self.model.load_model(modelpath)

    self.class_type = CategoricalDtype(categories=[
        'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
        'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
        'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
    ])

    # One extractor instance is shared by every branch of the union.
    morph_extractor = MorphologyExtractor(sparse=True)

    def column_branch(column, char_arg):
        """Select ``column`` and union its char features with the extractor.

        ``char_arg`` is forwarded as the first positional argument of
        ``StringToChar``.
        """
        selector = ItemSelector(column)
        char_step = StringToChar(char_arg, to_coo=True)
        features = SparseUnion([
            ('char', char_step),
            ('ctx', morph_extractor),
        ])
        return Pipeline([
            ('select', selector),
            ('features', features),
        ])

    # (union step name, selected column, StringToChar argument)
    branch_specs = (
        ('orig', 'before', 10),
        ('prev', 'prev', 5),
        ('next', 'next', 5),
    )
    self.pipeline = SparseUnion([
        (step_name, column_branch(column, char_arg))
        for step_name, column, char_arg in branch_specs
    ])
# Visible statements, in order:
#   * One-hot encode X with pd.get_dummies(sparse=True, dummy_na=False). When
#     self.category_columns is truthy it is passed as `columns=`; otherwise no
#     `columns` argument is given. Either way the result goes through
#     .to_sparse(fill_value=0).
#   * Result columns whose dtype is not np.number are cast to np.float16 and
#     written back into `res`, which is then returned.
#   * Under __main__: builds a short list of Russian sample strings, prints it,
#     runs MorphologyExtractor().transform on it and prints info()/density,
#     then feeds that output to PandasDummies(category_columns=[...])
#     .fit_transform and prints the result with its info()/density.
# NOTE(review): DataFrame.to_sparse() no longer exists in pandas >= 1.0 —
# confirm which pandas version this code is pinned to.
if self.category_columns: res = pd.get_dummies( X, sparse=True, dummy_na=False, columns=self.category_columns).to_sparse(fill_value=0) else: res = pd.get_dummies(X, sparse=True, dummy_na=False).to_sparse(fill_value=0) conv = res.select_dtypes(exclude=[np.number]).astype(np.float16) res[conv.columns] = conv return res if __name__ == '__main__': data = [ u'в 1905 году' ] + u'Определение частей речи работает не так как задумывалось'.split() print(data) morph = MorphologyExtractor().transform(data) print(morph.info()) print(morph.density) res = PandasDummies(category_columns=[ 'pos', 'animacy', 'aspect', 'case', 'gender', 'involvement', 'mood', 'number', 'person', 'tense', 'transitivity' ]).fit_transform(morph) print(res) print(res.info()) print(res.density)
df['prev'] = df['before'].shift(1).fillna('') df['next'] = df['before'].shift(-1).fillna('') df['next_next'] = df['before'].shift(-2).fillna('') classes = frozenset([ 'CARDINAL', 'DATE', 'MEASURE', 'DECIMAL', 'MONEY', 'ORDINAL', 'FRACTION', 'TIME' ]) df = df[~(df['before'] == df['after']) & (df['class'].isin(classes))] class_type = CategoricalDtype(categories=[ 'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS', 'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL', 'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH' ]) print(df.info()) morph_extractor = MorphologyExtractor(sparse=True, multi_words=True) before_pipeline = SparseUnion([ ('class', Pipeline([('select', ItemSelector('class')), ('codes', ToCategoryCodes(class_type)), ('reshape', Reshape2d()), ('onehot', OneHotEncoder(n_values=len(class_type.categories), sparse=True, dtype=np.uint8))])), ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)),
from transformers.morphology_extractor import MorphologyExtractor from transformers.sparse_union import SparseUnion from transformers.string_to_chars import StringToChar from sparse_helpers import sparse_memory_usage import gc from sklearn.metrics import accuracy_score df = load_train(['before', 'after']).fillna('') df['self'] = (df['before'] == df['after']) df['prev'] = df['before'].shift(1) df['next'] = df['before'].shift(-1) df = df.fillna('') del df['after'] print(df.info()) morph_extractor = MorphologyExtractor(sparse=True) pipeline = SparseUnion([ ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)), ('ctx', morph_extractor), ])), ])), ('prev', Pipeline([ ('select', ItemSelector('prev')), ('features', SparseUnion([
# Visible statements, in order:
#   * For every name in self.columns (iterated through tqdm, labelled
#     "<ClassName> transform"), a fresh LabelEncoder is fit on X[col] and the
#     encoded column is appended to `res`.
#   * The encoded columns are combined with np.column_stack and returned as a
#     DataFrame whose column labels are self.columns.
#   * Under __main__: encodes a two-field structured numpy array with
#     MultiLabelEncoder(['col1', 'col2']).transform and prints it, then runs
#     MorphologyExtractor().transform on Russian sample strings and encodes
#     thirteen named columns of that output with MultiLabelEncoder, printing
#     each intermediate result.
# NOTE(review): the commented-out ndarray branch calls `res(...)`; presumably
# `res.append(...)` was intended — confirm before reviving it.
for col in tqdm(self.columns, f'{self.__class__.__name__} transform'): res.append(LabelEncoder().fit_transform(X[col])) # else: # if isinstance(X, np.ndarray): # for col in X.T: # res(LabelEncoder().fit_transform(np.array(col))) return pd.DataFrame(np.column_stack(res), columns=self.columns) if __name__ == '__main__': np_array1d = np.array([('s', 'dfg'), ('f', 's'), ('H', 'h')], dtype=[('col1', 'O'), ('col2', 'O')]) print(np_array1d) res = MultiLabelEncoder(['col1', 'col2']).transform(np_array1d) print(res, flush=True) from transformers.morphology_extractor import MorphologyExtractor data = [ u'в 1905 году' ] + u'Определение частей речи работает не так как задумывалось в ПП'.split( ) print(data, flush=True) context = MorphologyExtractor().transform(data) print(context, flush=True) res = MultiLabelEncoder(('is_first_upper', 'is_upper', 'pos', 'animacy', 'aspect', 'case', 'gender', 'mood', 'number', 'person', 'tense', 'transitivity', 'voice'))\ .transform(context) print(res)