Example #1
def preprocess_eval(input_path: str,
                    output_path: str,
                    locale: str,
                    min_ngram_len: int = 3,
                    max_ngram_len: int = 6):
    """ Preprocess the evalution data where the file is of the format `word1, word2, similarity` """

    word2morphemes = Word2Morph.load_model(locale=locale)
    sentence2tags = Sentence2Tags.load_model(locale=locale)
    get_token = TokenFactory(special_char=SPECIAL_CHAR,
                             word2morphemes=word2morphemes,
                             sentence2tags=sentence2tags,
                             min_ngram_len=min_ngram_len,
                             max_ngram_len=max_ngram_len)

    with open(output_path, 'w', encoding='utf-8') as outf, \
            open(input_path, 'r', encoding='utf-8') as inf:
        for line in tqdm(inf):
            w1, w2, sim = line.replace(',', ' ').split()
            w1_token = get_token.from_word(w1)
            w2_token = get_token.from_word(w2)

            outf.write(f'{token_format(w1_token)} {token_format(w2_token)} {sim}\n')
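A hypothetical invocation for reference (the paths and locale below are illustrative assumptions, not taken from the codebase):

# Illustrative only: file paths and locale are assumptions
preprocess_eval(input_path='datasets/wordsim.csv',
                output_path='datasets/wordsim.processed',
                locale='ru')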
Example #2
def predict(model_path: str, batch_size: int = 1,
            input_path='datasets/rus.test', output_path='logs/rus.predictions'):
    word2morph = Word2Morph.load_model(path=model_path)
    inputs = DataLoader(file_path=input_path).load()
    # evaluate returns the counts of correct/wrong predictions along with all predicted samples
    correct, wrong, predicted_samples = word2morph.evaluate(inputs, batch_size=batch_size)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join([str(sample) for sample in predicted_samples]))
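A minimal usage sketch, assuming a checkpoint saved under the layout used elsewhere in this codebase (the exact path is an assumption):

# Hypothetical checkpoint path; a larger batch_size better utilises a GPU
predict(model_path='logs/checkpoints/best-model.joblib', batch_size=32)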
Example #3
def preprocess_conllu(input_path: str,
                      output_path: str,
                      locale: str = None,
                      min_ngram_len: int = 3,
                      max_ngram_len: int = 6):
    """ Preprocess the data which is in the CONNL-U format """

    print('Processing the file:', input_path)
    print('To save the results in:', output_path)
    word2morphemes = {} if locale is None else Word2Morph.load_model(
        locale=locale)

    sentences = []
    with open(input_path, 'r', encoding='utf-8') as f:
        conll_lines = []
        for line in f:
            if line.strip() == '':
                # A blank line terminates the current sentence block
                if conll_lines:
                    sentences.append(conll_lines)
                    conll_lines = []
            elif line.split('\t')[0].isdigit():
                # Keep only numbered token lines; skip comments and multi-word ranges
                conll_lines.append(line)
        if conll_lines:
            sentences.append(conll_lines)

    print('Processing', len(sentences), 'sentences...', flush=True)
    get_token = TokenFactory(special_char=SPECIAL_CHAR,
                             word2morphemes=word2morphemes,
                             min_ngram_len=min_ngram_len,
                             max_ngram_len=max_ngram_len)

    with open(output_path, 'w', encoding='utf-8') as f:
        for sentence in tqdm(sentences):
            tokens = [get_token.from_conll_line(line) for line in sentence]
            parsed_sentence = ' '.join([token_format(t) for t in tokens])
            f.write(parsed_sentence + '\n')
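A hypothetical invocation (the treebank paths are illustrative); note that with locale=None no morpheme model is loaded and word2morphemes stays an empty dict:

# Illustrative only: any treebank in CoNLL-U format would do
preprocess_conllu(input_path='datasets/ru-ud-train.conllu',
                  output_path='datasets/ru-ud-train.processed',
                  locale='ru')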
Example #4
def load_model(locale: str):
    return Word2Morph2Vec(sentence2tags=Sentence2Tags.load_model(locale=locale),
                          word2morph=Word2Morph.load_model(locale=locale),
                          morph2vec=Morph2Vec.load_model(locale=locale))
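Assuming load_model is exposed as a static method on Word2Morph2Vec, the call site would be a sketch like:

# Hypothetical locale code; bundles the three per-language sub-models into one object
model = Word2Morph2Vec.load_model(locale='ru')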
Example #5
def preprocess_wiki(input_path: str,
                    output_path: str,
                    locale: str,
                    min_ngram_len: int = 3,
                    max_ngram_len: int = 6,
                    max_sentence_len: int = 100,
                    chunk_size: int = 10,
                    batch_size: int = 32):
    """
    Preprocess wiki. As the file is too large (5+GB) there are some optimisations made here.
    We keep a cache of words which were already processed by word2morph not to process the same word twice.
    The sentences are processed in batches rather than one by one to fully utilise the GPUs available.
    We process `chunk_size` sentences at once giving `batch_size` elements to the neural network for each batch in the
    chunk.
    """
    def lemmas_to_morph(lemmas: List[str]) -> List[Sample]:
        # Run the model only on lemmas that are not yet in the cache
        absent_lemmas = [
            lemma for lemma in lemmas if lemma not in lemmas_to_morph.cache
        ]
        input_lemmas = [
            Sample(word=lemma, segments=tuple()) for lemma in absent_lemmas
        ]
        res_samples = word2morphemes.predict(inputs=input_lemmas,
                                             batch_size=batch_size)

        for lemma, sample in zip(absent_lemmas, res_samples):
            lemmas_to_morph.cache[lemma] = sample

        return [lemmas_to_morph.cache[lemma] for lemma in lemmas]

    lemmas_to_morph.cache: Dict[str, Sample] = {}

    def process(chunk_sentences: List[str]):
        """ Process a chunk of sentences, returning the result in the needed format """
        # Sentence to tags
        input_trees = [
            sentence_to_tree(s.split(' ')) for s in chunk_sentences
            if len(s.split(' ')) < max_sentence_len
        ]
        res_trees = sentence2tags.predict(input_trees)
        # Parse all the sentences (trees) to CoNLL-U format
        res_sentences_conllu = [tree_to_conllu_lines(t) for t in res_trees]
        # Get all the tokens from the CoNLL-U formatted sentences
        res_sentences_tokens = [[
            get_token.from_conll_line(line) for line in sentence_conllu
        ] for sentence_conllu in res_sentences_conllu]
        # Parse lemmas to get morphemes here instead of in get_token.from_conll_line
        lemmas = [[t.lemma for t in sentence_tokens]
                  for sentence_tokens in res_sentences_tokens]
        all_lemmas = [l for li in lemmas for l in li]
        all_morphemes = lemmas_to_morph(lemmas=all_lemmas)
        # Add morphemes to the tokens
        idx = 0
        for sentence_tokens in res_sentences_tokens:
            for t in sentence_tokens:
                t.morphemes = all_morphemes[idx].segments
                idx += 1

        parsed_sentences = [
            ' '.join([token_format(t) for t in sentence_tokens])
            for sentence_tokens in res_sentences_tokens
        ]
        return parsed_sentences

    with open(input_path, 'r', encoding='utf-8') as f:
        sentences = [l.strip() for l in tqdm(f)]

    word2morphemes = Word2Morph.load_model(locale=locale)
    sentence2tags = Sentence2Tags.load_model(locale=locale)
    get_token = TokenFactory(special_char=SPECIAL_CHAR,
                             min_ngram_len=min_ngram_len,
                             max_ngram_len=max_ngram_len)

    with open(output_path, 'w', encoding='utf-8') as f:
        for i in tqdm(range(0, len(sentences), chunk_size)):
            for p in process(sentences[i:i + chunk_size]):
                f.write(p + '\n')
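The cache in lemmas_to_morph is simply a dict stored as a function attribute; a minimal self-contained sketch of the same memoisation pattern (the function and its body are illustrative stand-ins):

def normalise(word: str) -> str:
    # Compute only for unseen words and reuse cached results otherwise
    # (stand-in for the word2morphemes.predict call above)
    if word not in normalise.cache:
        normalise.cache[word] = word.lower()
    return normalise.cache[word]

normalise.cache = {}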
Example #6
    def train(self,
              batch_size: int = 32,
              epochs: int = 100,
              lr_multipliers: Tuple[float, ...] = (0.5, 0.75, 0.8, 1, 1.2, 1.5, 2),
              nb_models: int = 3,
              threads: int = 4,
              monitor_metric: str = 'val_word_acc_processed',
              log_dir: str = 'logs',
              **kwargs):
        self.params.update(locals())
        self.params.pop('self')

        # Save all the objects/parameters for reproducibility
        log_dir = Path(log_dir) / datetime.now().replace(microsecond=0).isoformat()
        model_path = log_dir / 'checkpoints' / 'best-model.joblib'
        model_path.parent.mkdir(parents=True, exist_ok=True)
        with open(log_dir / 'params.json', 'w', encoding='utf-8') as f:
            json.dump(
                {
                    'params': self.params,
                    'commandline': sys.argv,
                    'commit': get_current_commit()
                },
                f,
                indent=4)

        train_generator = DataGenerator(dataset=self.train_dataset,
                                        processor=self.processor,
                                        batch_size=batch_size)
        valid_generator = DataGenerator(dataset=self.valid_dataset,
                                        processor=self.processor,
                                        batch_size=batch_size,
                                        with_samples=True)

        best_current_models: List[ModelInstance] = []
        best_prev_models: List[ModelInstance] = []

        for epoch in range(epochs):
            best_prev_models = deepcopy(best_current_models)
            best_current_models = []

            def log_model(score):
                nonlocal best_current_models
                learning_rate = float(K.get_value(self.model.optimizer.lr))
                path = f'{log_dir}/model-epoch:{epoch}-acc:{score:.3f}-lr:{learning_rate:.3f}.joblib'
                best_current_models.append(
                    ModelInstance(performance=score,
                                  path=path,
                                  lr=learning_rate))
                print('Obtained:', str(best_current_models[-1]), flush=True)
                Word2Morph(model=self.model,
                           processor=self.processor).save(path)

                best_current_models = list(set(best_current_models))
                best_current_models = sorted(best_current_models, reverse=True)
                best_current_models, worst = (best_current_models[:nb_models],
                                              best_current_models[nb_models:])
                for model in worst:
                    print('Removing:', model.path, flush=True)
                    os.remove(model.path)

                print('Resulting list:')
                for i, model in enumerate(best_current_models):
                    print(i, ':', str(model))
                print(flush=True)

            # There are no models for the initial epoch => use the initial random model as the base model
            if len(best_current_models) == 0:
                log_model(score=0)

            for base_model in best_prev_models:
                for lr_multiplier in lr_multipliers:
                    print('Trying to modify:', str(base_model), flush=True)

                    # Clean-up the keras session before working with a new model
                    del self.processor
                    del self.model
                    K.clear_session()
                    gc.collect()

                    w2m = Word2Morph.load_model(base_model.path)
                    self.model, self.processor = w2m.model, w2m.processor
                    lr = float(K.get_value(self.model.optimizer.lr))
                    K.set_value(self.model.optimizer.lr, lr * lr_multiplier)

                    history = self.model.fit_generator(
                        generator=train_generator,
                        epochs=epoch + 1,
                        initial_epoch=epoch,
                        callbacks=[
                            Evaluate(data_generator=valid_generator,
                                     to_sample=self.processor.to_sample)
                        ],
                        class_weight=self.class_weights,
                        use_multiprocessing=True,
                        workers=threads,
                    )
                    log_model(score=history.history[monitor_metric][-1])
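In outline, the epoch loop above runs a small population-based search over learning rates: every surviving checkpoint is forked once per multiplier, trained for a single epoch, and only the nb_models best-scoring children are kept. A stripped-down sketch of that selection step (all names here are illustrative, not from the codebase):

def evolve(population, lr_multipliers, nb_models, train_one_epoch):
    # Fork each surviving model with each learning-rate multiplier,
    # train every fork for one epoch, then keep the nb_models best children.
    children = [train_one_epoch(model, lr_scale=m)
                for model in population
                for m in lr_multipliers]
    return sorted(children, key=lambda c: c.score, reverse=True)[:nb_models]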