예제 #1
0
    def __init__(self, threshold=0.5, modelpath=''):
        """Store the settings, optionally load a booster, and build the feature pipeline.

        Args:
            threshold: Stored unchanged on ``self.threshold``; this constructor
                does not use it further.
            modelpath: Path passed to ``xgb.Booster.load_model``. An empty
                (falsy) value skips loading and leaves ``self.model`` as ``None``.
        """
        self.threshold = threshold
        self.modelpath = modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        # A single extractor instance is reused by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        def token_branch(label, column, width):
            # One named branch: select `column`, then union StringToChar(width)
            # with the shared morphology extractor.
            selector = ItemSelector(column)
            features = SparseUnion([
                ('char', StringToChar(width, to_coo=True)),
                ('ctx', shared_morph),
            ])
            return label, Pipeline([('select', selector), ('features', features)])

        # (branch name, selected column, StringToChar width), in pipeline order.
        branch_specs = (
            ('orig', 'before', 10),
            ('prev', 'prev', 5),
            ('next', 'next', 5),
        )
        self.pipeline = SparseUnion([token_branch(*spec) for spec in branch_specs])
    def __init__(self, modelpath=''):
        """Optionally load a booster and assemble the class-aware feature pipeline.

        Besides the pipeline, the constructor sets ``self.class_type`` (the
        categorical dtype of the known classes) and ``self.case_extractor``.

        Args:
            modelpath: Path passed to ``xgb.Booster.load_model``. An empty
                (falsy) value skips loading and leaves ``self.model`` as ``None``.
        """
        self.modelpath = modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        # Category order is fixed here; ToCategoryCodes receives this dtype and
        # the one-hot width is taken from its length.
        self.class_type = CategoricalDtype(categories=[
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ])

        # A single extractor instance is reused by every token branch below.
        shared_morph = MorphologyExtractor(sparse=True, multi_words=True)

        def token_branch(label, column, width):
            # One named branch: select `column`, then union StringToChar(width)
            # with the shared morphology extractor.
            selector = ItemSelector(column)
            features = SparseUnion([
                ('char', StringToChar(width, to_coo=True)),
                ('ctx', shared_morph),
            ])
            return label, Pipeline([('select', selector), ('features', features)])

        # The 'class' column goes through category codes -> 2-D reshape -> one-hot.
        class_branch = Pipeline([
            ('select', ItemSelector('class')),
            ('codes', ToCategoryCodes(self.class_type)),
            ('reshape', Reshape2d()),
            ('onehot', OneHotEncoder(n_values=len(self.class_type.categories),
                                     sparse=True,
                                     dtype=np.uint8)),
        ])

        branches = [('class', class_branch), token_branch('orig', 'before', 10)]
        # Context columns share their branch name and all use width -5.
        # NOTE(review): what a negative width means is defined by StringToChar - confirm.
        branches.extend(
            token_branch(column, column, -5)
            for column in ('prev_prev', 'prev', 'next', 'next_next')
        )
        self.pipeline = SparseUnion(branches)
        self.case_extractor = CaseExtractor(multi_words=True)
    def __init__(self, modelpath=''):
        """Optionally load a booster, define the class dtype, and build the feature pipeline.

        Args:
            modelpath: Path passed to ``xgb.Booster.load_model``. An empty
                (falsy) value skips loading and leaves ``self.model`` as ``None``.
        """
        self.modelpath = modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        # Categorical dtype listing the known classes in a fixed order; it is
        # only stored here, the pipeline below does not reference it.
        self.class_type = CategoricalDtype(categories=[
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ])

        # A single extractor instance is reused by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        def token_branch(label, column, width):
            # One named branch: select `column`, then union StringToChar(width)
            # with the shared morphology extractor.
            selector = ItemSelector(column)
            features = SparseUnion([
                ('char', StringToChar(width, to_coo=True)),
                ('ctx', shared_morph),
            ])
            return label, Pipeline([('select', selector), ('features', features)])

        # (branch name, selected column, StringToChar width), in pipeline order.
        branch_specs = (
            ('orig', 'before', 10),
            ('prev', 'prev', 5),
            ('next', 'next', 5),
        )
        self.pipeline = SparseUnion([token_branch(*spec) for spec in branch_specs])
예제 #4
0
# Class labels retained by the row filter below.
classes = frozenset((
    'CARDINAL',
    'DATE',
    'MEASURE',
    'DECIMAL',
    'MONEY',
    'ORDINAL',
    'FRACTION',
    'TIME',
))
# Keep rows whose class is in `classes` and whose 'before' text differs from 'after'.
df = df[(df['class'].isin(classes)) & ~(df['before'] == df['after'])]
# Full class inventory in a fixed order (a superset of `classes`).
class_type = CategoricalDtype(
    categories=[
        'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
        'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
        'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
    ],
)
# NOTE(review): DataFrame.info() is documented to print its report itself and
# return None, so this print likely also emits "None" - confirm it is intended.
print(df.info())

morph_extractor = MorphologyExtractor(multi_words=True, sparse=True)
before_pipeline = SparseUnion([
    ('class',
     Pipeline([('select', ItemSelector('class')),
               ('codes', ToCategoryCodes(class_type)),
               ('reshape', Reshape2d()),
               ('onehot',
                OneHotEncoder(n_values=len(class_type.categories),
                              sparse=True,
                              dtype=np.uint8))])),
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),
import gc
from sklearn.metrics import accuracy_score

# Load the raw and normalized token columns, with missing values as ''.
df = load_train(['before', 'after']).fillna('')
# True where the token is left unchanged ('before' equals 'after').
df['self'] = df['before'].eq(df['after'])
# Neighbouring tokens: the previous row's and the next row's 'before' value.
df['prev'] = df['before'].shift(periods=1)
df['next'] = df['before'].shift(periods=-1)
# The shifts leave a missing value at each end; blank those out as well.
df = df.fillna('')
# 'after' has been folded into the 'self' column, so the raw column is dropped.
df.drop(columns='after', inplace=True)
# NOTE(review): DataFrame.info() is documented to print its report itself and
# return None, so this print likely also emits "None" - confirm it is intended.
print(df.info())

morph_extractor = MorphologyExtractor(sparse=True)
pipeline = SparseUnion([
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),
    ('prev',
     Pipeline([
         ('select', ItemSelector('prev')),
         ('features',
          SparseUnion([
              ('char', StringToChar(5, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),