# Pattern matching a single decimal digit. Written as a raw string because
# '\d' is not a valid escape sequence in a plain string literal.
CARDINAL_REGEXP = r'\d'
# Size limits; presumably consumed by the model built elsewhere in this
# script -- TODO confirm against the code that uses them.
INPUT_MAX_LEN = 47
OUTPUT_MAX_LEN = 12
INPUT_VOCAB_SIZE = 5000
OUTPUT_VOCAB_SIZE = 257

# Network / training hyper-parameters.
LAYER_NUM = 2
HIDDEN_DIM = 64
# NOTE(review): 0 presumably means "no embedding layer" -- confirm with the
# model-building code, which is not visible here.
EMBEDDING_DIM = 0
BATCH_SIZE = 128
LEARNING_RATE = 0.001
MEM_SIZE = 10000
NB_EPOCH = 100
DROPOUT = 0.0

# Load the 'before', 'after' and 'class' columns of the training data and
# replace missing values with empty strings.
df = load_train(['before', 'after', 'class'], input_path=r'../input/norm_challenge_ru').fillna('')
# Alternative data source, kept for reference:
# df = load_external(['before', 'after'],
#                   only_diff=True,
#                   input_path=r'../input/norm_challenge_ru/ru_with_types')\
#      .fillna('')

# Context columns: the 'before' value of the rows two/one positions above
# and one/two positions below the current row. The shifts must be taken
# before any filtering so that they refer to the original row order.
df['prev_prev'] = df['before'].shift(2)
df['prev'] = df['before'].shift(1)
df['next'] = df['before'].shift(-1)
df['next_next'] = df['before'].shift(-2)

# Keep only rows whose 'after' differs from 'before'; the fillna('') also
# blanks the NaN values introduced at the frame edges by shift().
df = df[~(df['before'] == df['after'])].fillna('')
# Restrict to the DATE class. The explicit copy() makes the column assignment
# below write to an independent frame instead of a filtered slice, which
# would otherwise trigger pandas' SettingWithCopyWarning.
df = df[df['class'] == 'DATE'].copy()

# Rebuild 'before' as: prev_prev, prev, the current value with its characters
# separated by spaces, next, next_next -- all joined by single spaces.
df['before'] = (df['prev_prev'].map(str) + ' '
                + df['prev'].map(str) + ' '
                + df['before'].map(' '.join) + ' '
                + df['next'].map(str) + ' '
                + df['next_next'].map(str))
Пример #2
0
        del threegramms

        if 'after' in X.columns:
            return X.assign(
                after=X['after'].combine_first(pd.Series(data, index=X.index)))
        else:
            return X.assign(after=data)

    def get_params(self):
        """Return the parent's parameters extended with ``mean_confidence``.

        Returns:
            The mapping returned by the parent class's ``get_params()``, with
            the key ``'mean_confidence'`` set to ``self.mean_confidence``.
        """
        # Zero-argument super() is bound to the class this method is defined
        # in. The previous super(self.__class__, self) was bound to the
        # runtime type instead, so on an instance of a subclass it resolved
        # back to this very method and recursed without end.
        params = super().get_params()
        params['mean_confidence'] = self.mean_confidence
        return params


if __name__ == '__main__':
    # Manual smoke test: fit the transformer on the training data and print
    # the share of rows where the predicted 'after' equals the real one.
    df = load_train(columns=['before', 'after'], input_path=INPUT_PATH)
    # Neighbouring 'before' values (previous / next row) used as context.
    df['prev'] = df['before'].shift(1)
    df['next'] = df['before'].shift(-1)
    # shift() leaves NaN at the frame edges; normalise them to ''.
    df = df.fillna('')
    print(df.info())

    # NOTE(review): 0.5 is presumably the mean_confidence threshold --
    # confirm against the DictNBHDTransformer constructor.
    dt = DictNBHDTransformer(0.5)

    # NOTE(review): fit() is deliberately left called twice, as in the
    # original script; it may be a re-fit check or a copy-paste leftover --
    # confirm and drop the second call if it is not needed.
    dt.fit(df.drop(['after'], axis=1), df['after'])
    dt.fit(df.drop(['after'], axis=1), df['after'])

    # Rename the ground truth to 'actual' so that transform() can produce its
    # own 'after' column next to it.
    res_df = dt.transform(df.rename(columns={'after': 'actual'}))
    print('Acc',
          len(res_df[res_df['after'] == res_df['actual']]) / len(res_df))
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from loaders.loading import load_train
from transformers.item_selector import ItemSelector
from transformers.morphology_extractor import MorphologyExtractor
from transformers.sparse_union import SparseUnion
from transformers.string_to_chars import StringToChar
from sparse_helpers import sparse_memory_usage
import gc
from sklearn.metrics import accuracy_score

# Load the 'before'/'after' pairs with missing values replaced by ''.
df = load_train(['before', 'after']).fillna('')
before = df['before']
# 'self' flags the rows whose 'after' is identical to 'before'.
df['self'] = before.eq(df['after'])
# Previous / next row's 'before' value as context columns.
df['prev'] = before.shift(1)
df['next'] = before.shift(-1)
# shift() leaves NaN on the first / last row; blank them out.
df = df.fillna('')
# 'after' is no longer needed once the 'self' flag has been derived from it.
df.drop(columns='after', inplace=True)
print(df.info())

morph_extractor = MorphologyExtractor(sparse=True)
pipeline = SparseUnion([
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
              ('ctx', morph_extractor),
Пример #4
0
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer

from loaders.loading import load_train
from transformers.case_extractor import CaseExtractor
from transformers.item_selector import ItemSelector, Reshape2d, ToCategoryCodes
from transformers.morphology_extractor import MorphologyExtractor
from transformers.sparse_union import SparseUnion
from transformers.string_to_chars import StringToChar
from sparse_helpers import sparse_memory_usage
import gc
from sklearn.metrics import accuracy_score
from pandas.api.types import CategoricalDtype

INPUT_PATH = r'../input/norm_challenge_ru'

# Training data ('before', 'after', 'class') with missing values set to ''.
df = load_train(['before', 'after', 'class'], INPUT_PATH).fillna('')
# Alternative data source, kept for reference:
#df = load_external(['before', 'after', 'class'],
#                   only_diff=True,
#                   input_path=r'../input/norm_challenge_ru/ru_with_types')\
#      .fillna('')

# Context columns: the 'before' value two/one rows above and one/two rows
# below each row; positions that fall outside the frame become ''.
df = df.assign(
    prev_prev=df['before'].shift(2).fillna(''),
    prev=df['before'].shift(1).fillna(''),
    next=df['before'].shift(-1).fillna(''),
    next_next=df['before'].shift(-2).fillna(''),
)

# Classes retained by the filter below.
classes = frozenset((
    'CARDINAL', 'DATE', 'MEASURE', 'DECIMAL',
    'MONEY', 'ORDINAL', 'FRACTION', 'TIME',
))
# Keep rows whose 'after' differs from 'before' and whose class is listed.
is_changed = df['before'] != df['after']
in_kept_class = df['class'].isin(classes)
df = df[is_changed & in_kept_class]
class_type = CategoricalDtype(categories=[
Пример #5
0
            # p.tag.number  # число (единственное, множественное)

        res = pd.DataFrame(data, columns=['case', 'gender', 'number']).fillna('none')
        del data
        res['case'] = res['case'].astype(self.case_type)
        res['gender'] = res['gender'].astype(self.gender_type)
        res['number'] = res['number'].astype(self.number_type)
        return res


if __name__ == '__main__':
    # Manual demo of CaseExtractor.transform on a hand-written token list and
    # on a random sample of the training data.

    def _show(frame):
        """Print the info() summary of ``frame`` followed by the frame."""
        print(frame.info())
        print(frame)

    sentence = u'съешь ещё этих мягких французских булок , ДА выпей чаю брюки брючные'
    data = [u'В 1905 году']
    data.extend(sentence.split())
    print(data)

    # First pass with the extractor's default settings.
    morph = CaseExtractor()
    _show(morph.transform(data))

    # Second pass with multi_words switched on. word_rows is reset first --
    # presumably a per-word cache filled by transform(); confirm in the class.
    morph.multi_words = True
    morph.word_rows = {}
    _show(morph.transform(data))

    # Third pass on 100000 randomly sampled 'after' values.
    df = load_train(columns=['after'], input_path=r'../../input/norm_challenge_ru')
    morph.word_rows = {}
    sampled_after = df.sample(100000)['after']
    _show(morph.transform(sampled_after))