示例#1
0
    def load(self, word2tags_path=None):
        """Load the tagger configuration, grammar dictionary and CRF model.

        The file 'rupostagger.config' is looked up first in the sibling
        '../tmp' directory and, failing that, next to this module; the
        directory that wins is also used to locate 'rupostagger.model'.

        :param word2tags_path: optional path forwarded to RuWord2Tags.load()
        """
        here = str(pathlib.Path(__file__).resolve().parent)
        data_folder = os.path.join(here, '../tmp')
        config_path = os.path.join(data_folder, 'rupostagger.config')
        if not os.path.exists(config_path):
            # Fall back to the module's own directory.
            data_folder = here
            config_path = os.path.join(data_folder, 'rupostagger.config')

        with open(config_path, 'r') as cfg_file:
            self.config = json.load(cfg_file)

        # Feature-extraction settings read from the parsed config.
        self.winspan = self.config['winspan']
        self.use_gren = self.config['use_gren']
        self.use_w2v = self.config['use_w2v']
        self.use_syllabs = self.config['use_syllabs']
        self.ending_len = self.config['ending_len']

        # Grammar dictionary: word form -> morphological tags.
        self.word2tags = ruword2tags.RuWord2Tags()
        self.word2tags.load(word2tags_path)

        # CRF sequence-labelling model.
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(os.path.join(data_folder, 'rupostagger.model'))
示例#2
0
 def __init__(self):
     """Construct and eagerly load the core NLP pipeline components."""
     # Tokenization and lemma lookup resources.
     self.tokenizer = Tokenizer()
     self.tokenizer.load()
     self.lexicon = Word2Lemmas()
     self.language_resources = LanguageResources()
     # Morphology: part-of-speech tagger, chunker, grammar dictionaries.
     self.postagger = rupostagger.RuPosTagger()
     self.chunker = ruchunker.Chunker()
     self.word2tags = ruword2tags.RuWord2Tags()
     self.flexer = ruword2tags.RuFlexer()
     # Syntactic parser is attached later; None until then.
     self.syntan = None
     self.gg_dictionaries = GenerativeGrammarDictionaries()
     self.known_words = set()
     # Mystem-based lemmatizer was replaced by rulemma:
     #self.lemmatizer = Mystem()
     self.lemmatizer = rulemma.Lemmatizer()
     # Word embeddings are loaded on demand; None until then.
     self.word_embeddings = None
示例#3
0
    def load(self, model_dir=None):
        """Load chunker parameters, optional dictionaries and the CRF model.

        When *model_dir* is omitted, the sibling '../tmp' directory is
        tried first, falling back to the module's own directory.

        :param model_dir: directory holding 'chunker_NP.config' and the model
        """
        if model_dir is None:
            package_dir = str(pathlib.Path(__file__).resolve().parent)
            candidate = os.path.join(package_dir, '../tmp')
            model_dir = candidate if os.path.exists(candidate) else package_dir

        params = ChunkerCrfParams.load(os.path.join(model_dir, 'chunker_NP.config'))
        self.chunker_params = params

        # Optional resources, switched on by flags in the config.
        if params.use_gren:
            self.word2tags = ruword2tags.RuWord2Tags()
            self.word2tags.load()

        if params.use_postagger:
            self.postagger = rupostagger.RuPosTagger()
            self.postagger.load()

        # CRF model file name comes from the loaded parameters.
        self.crf_tagger = pycrfsuite.Tagger()
        self.crf_tagger.open(os.path.join(model_dir, params.model_filename))
示例#4
0
    # Detach absl's pre-installed handler and silence its pre-init stderr
    # warnings, so all output goes through the standard logging module.
    logging.root.removeHandler(absl.logging._absl_handler)
    absl.logging._warn_preinit_stderr = False
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')

    # Additionally mirror INFO-and-above records into a fresh log file.
    logfile_path = os.path.join(tmp_dir, 'rupostagger2.trainer.log')
    lf = logging.FileHandler(logfile_path, mode='w')
    lf.setLevel(logging.INFO)
    lf.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
    logging.getLogger('').addHandler(lf)

    logging.info('STARTED')

    trainer = Trainer()

    # Grammar dictionary: word form -> morphological tags.
    logging.info('Loading dictionary...')
    word2tags = ruword2tags.RuWord2Tags()
    word2tags.load()

    # Word embeddings are optional and controlled by the use_w2v flag.
    w2v = None
    if use_w2v:
        w2v_path = os.path.join(tmp_dir, 'w2v.kv')
        #w2v_path = os.path.join('/home/inkoziev/polygon/w2v/fasttext.CBOW=1_WIN=5_DIM=64')
        # NOTE(review): '~' is not expanded here — confirm the consumer
        # applies os.path.expanduser, otherwise this path will not resolve.
        wordchar2vector_path = '~/polygon/chatbot/data/wordchar2vector.dat'

        # FastText binaries need a dedicated loader; anything else is
        # treated as a word2vec keyed-vectors file.
        if 'fasttext' in w2v_path:
            w2v = FastText.load_fasttext_format(w2v_path)
        else:
            if use_wc2v:
                logging.info(u'Loading the wordchar2vector model from "%s"',
                             wordchar2vector_path)
                wc2v = gensim.models.KeyedVectors.load_word2vec_format(
示例#5
0
                        if len(phrase2) == 0 or phrase1 == phrase2:
                            # An empty or unchanged paraphrase marks the phrase
                            # as non-expandable; keep its tokenized form.
                            no_expansion_phrases.append(u' '.join(
                                tokenizer.tokenize(phrase1)))
                lines = []

    print('{} samples, {} no-expansion phrases'.format(
        len(samples), len(no_expansion_phrases)))

    with io.open(no_expansion_path, 'w', encoding='utf=8') as wrt:
        for phrase in no_expansion_phrases:
            wrt.write(phrase + '\n')

    tagger = rupostagger.RuPosTagger()
    tagger.load()

    gren = ruword2tags.RuWord2Tags()
    gren.load()

    # НАЧАЛО ОТЛАДКИ
    #words = tokenizer.tokenize('Тебе нравится пить кофе')
    #tags = list(tagger.tag(words))
    # КОНЕЦ ОТЛАДКИ

    #lemmatizer = rulemma.Lemmatizer()
    #lemmatizer.load()

    all_templates = set()
    template2freq = collections.Counter()
    template2sample = dict()
    all_terms = collections.Counter()
    logging.info('Start "prepare_answer_relevancy_dataset.py"')

    tokenizer = Tokenizer()
    tokenizer.load()

    samples = load_samples(input_paths, tokenizer)

    logging.info('Loading dictionaries...')

    thesaurus = Thesaurus()
    thesaurus.load(os.path.join(data_folder, 'dict/links.csv'))  # , corpus)

    lexicon = Word2Lemmas()
    lexicon.load(os.path.join(data_folder, 'dict/word2lemma.dat'))

    grdict = ruword2tags.RuWord2Tags()
    grdict.load()

    flexer = ruword2tags.RuFlexer()
    flexer.load()

    # Аугментация: генерируем негативных сэмплы через выбор вариантов словоформ, отличающихся
    # от использованных в валидном ответе.
    logging.info('Generating negative samples...')
    all_keys = set(sample.get_key() for sample in samples)
    neg_samples = []
    for sample in samples:
        if sample.label == 1:
            answer_words = tokenizer.tokenize(sample.answer)
            answer_len = len(answer_words)
            if answer_len == 1: