Exemplo n.º 1
0
def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    """Compute LASER embeddings for a list of sentences.

    Sentences are written to a temporary file, tokenized and BPE-encoded
    in parallel, then embedded with the LASER SentenceEncoder.

    Args:
        sentences: iterable of raw input sentences.
        bpe_codes_path: path to the LASER BPE codes file.
        encoder_path: path to the LASER encoder checkpoint.
        language: input language code ('el' additionally triggers romanization).
        max_tokens: maximum number of tokens per encoder batch.
        normalize_l2: if True, L2-normalize each embedding row.
        n_encoding_jobs: number of parallel tokenization/BPE jobs.

    Returns:
        numpy array with one embedding row per input sentence.
    """
    prepare_laser()
    # LASER modules only become importable after prepare_laser() runs
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        # Wrap the LASER encoder with the batching settings we need
        return SentenceEncoder(encoder_path,
                               max_sentences=None,
                               max_tokens=max_tokens,
                               cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        # Greek is romanized before tokenization, as required by LASER
        Token(str(input_filepath),
              str(tokenized_filepath),
              lang=language,
              romanize=(language == 'el'))
        BPEfastApply(str(tokenized_filepath), str(output_filepath),
                     str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embedding'):  # fixed typo: was 'Geting'
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
            input_filepath.unlink()
            bpe_filepath.unlink()
            assert embeddings.shape[0] == len(sentences)
    # Release the encoder (and its GPU memory) before post-processing
    del encoder
    if normalize_l2:
        embeddings = embeddings / np.expand_dims(
            np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings
Exemplo n.º 2
0
 def encode_file_pair(self, complex_filepath, simple_filepath,
                      output_complex_filepath, output_simple_filepath):
     """Run every preprocessor in order over a (complex, simple) file pair.

     Each stage reads the previous stage's output from temp files; the
     final results are copied to the requested output paths.
     """
     current_complex, current_simple = complex_filepath, simple_filepath
     for preprocessor in self.preprocessors:
         next_complex = get_temp_filepath()
         next_simple = get_temp_filepath()
         preprocessor.encode_file_pair(current_complex, current_simple,
                                       next_complex, next_simple)
         # The outputs of this stage feed the next one
         current_complex, current_simple = next_complex, next_simple
     shutil.copyfile(current_complex, output_complex_filepath)
     shutil.copyfile(current_simple, output_simple_filepath)
Exemplo n.º 3
0
 def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
     """Tokenize input_filepath, then apply BPE codes, writing to output_filepath."""
     tmp_tokenized = get_temp_filepath()
     # Greek is romanized before tokenization, as required by LASER
     Token(str(input_filepath),
           str(tmp_tokenized),
           lang=language,
           romanize=(language == 'el'))
     BPEfastApply(str(tmp_tokenized), str(output_filepath), str(bpe_codes_path))
     # Remove the intermediate tokenized file
     tmp_tokenized.unlink()
Exemplo n.º 4
0
def get_easse_report(simplifier,
                     test_set,
                     orig_sents_path=None,
                     refs_sents_paths=None):
    """Run simplifier on the test set's source sentences and build an EASSE report.

    Returns the path of the generated report file.
    """
    orig_sents, _ = get_orig_and_refs_sents(test_set, orig_sents_path,
                                            refs_sents_paths)
    # Re-materialize the source sentences so the simplifier reads a known file
    orig_sents_path = get_temp_filepath()
    write_lines(orig_sents, orig_sents_path)
    sys_sents_path = simplifier(orig_sents_path)
    report_path = get_temp_filepath()
    report(test_set,
           sys_sents_path=sys_sents_path,
           orig_sents_path=orig_sents_path,
           refs_sents_paths=refs_sents_paths,
           report_path=report_path)
    return report_path
Exemplo n.º 5
0
def write_sentencepiece_vocab_as_fairseq_dict(sentencepiece_model,
                                              fairseq_dict_path=None):
    """Dump the sentencepiece vocabulary as a fairseq dictionary file.

    One line per piece in the format '<piece> <count>'. Pieces that look
    like special tokens (wrapped in angle brackets) are skipped.
    Returns the path written to (a temp file when none was given).
    """
    if fairseq_dict_path is None:
        fairseq_dict_path = get_temp_filepath()
    with open(fairseq_dict_path, 'w') as dict_file:
        for piece_id in range(len(sentencepiece_model)):
            piece = sentencepiece_model.id_to_piece(piece_id)
            # Skip special tokens such as <s>, </s>, <unk>
            if piece.startswith('<') and piece.endswith('>'):
                continue
            dict_file.write(f'{piece} 999\n')  # Use 999 as dummy count
    return fairseq_dict_path
Exemplo n.º 6
0
 def decode_file(self,
                 input_filepath,
                 output_filepath,
                 encoder_filepath=None):
     """Apply every preprocessor's decode step in sequence, then copy the result.

     NOTE(review): decoding iterates self.preprocessors in the same order
     as encoding — confirm this ordering is intended.
     """
     current_filepath = input_filepath
     for preprocessor in self.preprocessors:
         decoded_filepath = get_temp_filepath()
         preprocessor.decode_file(current_filepath, decoded_filepath,
                                  encoder_filepath)
         # Each stage's output becomes the next stage's input
         current_filepath = decoded_filepath
     shutil.copyfile(current_filepath, output_filepath)
Exemplo n.º 7
0
 def encode_file(self,
                 input_filepath,
                 output_filepath,
                 encoder_filepath=None):
     """Encode input_filepath line by line, pairing each line with the
     corresponding line of encoder_filepath (or None when absent)."""
     if encoder_filepath is None:
         # An empty temporary file yields None for every input line
         encoder_filepath = get_temp_filepath(create=True)
     paired_filepaths = [input_filepath, encoder_filepath]
     with open(output_filepath, 'w', encoding='utf-8') as output_file:
         for input_line, encoder_line in yield_lines_in_parallel(
                 paired_filepaths, strict=False):
             encoded = self.encode_sentence(input_line, encoder_line)
             output_file.write(encoded + '\n')
Exemplo n.º 8
0
def apply_line_function_to_file(line_function,
                                input_filepath,
                                output_filepath=None):
    """Apply line_function to every line of input_filepath.

    Lines for which line_function returns None are dropped from the
    output. Returns the output path (a temp file when none was given).
    """
    if output_filepath is None:
        output_filepath = get_temp_filepath()
    with open(input_filepath, 'r') as source, open(output_filepath,
                                                   'w') as sink:
        for raw_line in source:
            result = line_function(raw_line.rstrip('\n'))
            # A None result means "filter this line out"
            if result is not None:
                sink.write(result + '\n')
    return output_filepath
Exemplo n.º 9
0
def download(url, destination_path=None, overwrite=True):
    """Download url to destination_path, rolling back on failure.

    Args:
        url: URL to fetch.
        destination_path: target path; a temp file is used when None.
        overwrite: when False and the destination exists, skip the download.

    Returns:
        The path the file was downloaded to.

    Raises:
        Re-raises any error from the download after removing the partial file.
    """
    if destination_path is None:
        destination_path = get_temp_filepath()
    if not overwrite and destination_path.exists():
        return destination_path
    print('Downloading...')
    try:
        urlretrieve(url, destination_path, reporthook)
        sys.stdout.write('\n')
    except (Exception, KeyboardInterrupt, SystemExit):
        print('Rolling back: remove partially downloaded file')
        # The download may fail before any file is created; don't let the
        # cleanup raise FileNotFoundError and mask the original error.
        if os.path.exists(destination_path):
            os.remove(destination_path)
        raise
    return destination_path
Exemplo n.º 10
0
def simplify_sentences(source_sentences, model_name='muss_en_wikilarge_mined'):
    """Simplify a list of sentences with the given MUSS model and return the predictions."""
    # Best ACCESS parameter values for the en_bart_access_wikilarge_mined model, ideally we would need to use another set of parameters for other models.
    exp_dir = get_model_path(model_name)
    preprocessors = get_muss_preprocessors(model_name)
    if is_model_using_mbart(model_name):
        # mBART models need the translation task and the full language list
        generate_kwargs = {
            'task': 'translation_from_pretrained_bart',
            'langs': 'ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN',  # noqa: E501
        }
    else:
        generate_kwargs = {}
    simplifier = get_preprocessed_simplifier(
        get_fairseq_simplifier(exp_dir, **generate_kwargs),
        preprocessors=preprocessors)
    source_path = get_temp_filepath()
    write_lines(source_sentences, source_path)
    return read_lines(simplifier(source_path))
Exemplo n.º 11
0
def evaluate_simplifier(simplifier,
                        test_set,
                        orig_sents_path=None,
                        refs_sents_paths=None,
                        quality_estimation=False):
    """Run simplifier on the test set and score it with SARI, BLEU and FKGL.

    Returns whatever evaluate_system_output produces for those metrics.
    """
    orig_sents, _ = get_orig_and_refs_sents(test_set,
                                            orig_sents_path=orig_sents_path,
                                            refs_sents_paths=refs_sents_paths)
    # Re-materialize the source sentences so the simplifier reads a known file
    orig_sents_path = get_temp_filepath()
    write_lines(orig_sents, orig_sents_path)
    sys_sents_path = simplifier(orig_sents_path)
    return evaluate_system_output(test_set,
                                  sys_sents_path=sys_sents_path,
                                  orig_sents_path=orig_sents_path,
                                  refs_sents_paths=refs_sents_paths,
                                  metrics=['sari', 'bleu', 'fkgl'],
                                  quality_estimation=quality_estimation)
Exemplo n.º 12
0
 def preprocessed_simplifier(complex_filepath, pred_filepath):
     """Encode the input, simplify it, then decode the prediction in place.

     Closure over `simplifier` and `composed_preprocessor` from the
     enclosing scope.
     """
     encoded_complex_filepath = get_temp_filepath()
     composed_preprocessor.encode_file(complex_filepath, encoded_complex_filepath)
     encoded_pred_filepath = simplifier(encoded_complex_filepath)
     # Decoding needs the original file to recover encoder-side information
     composed_preprocessor.decode_file(encoded_pred_filepath, pred_filepath,
                                       encoder_filepath=complex_filepath)
Exemplo n.º 13
0
 def wrapped(complex_filepath, pred_filepath=None):
     """Run `simplifier`, defaulting the prediction path to a temp file; return that path."""
     output_path = get_temp_filepath() if pred_filepath is None else pred_filepath
     simplifier(complex_filepath, output_path)
     return output_path