# Nested helper: word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
# use_gpu, args, and logger come from the enclosing training script.
def _read_one(path, is_train=False, max_size=None):
    lang_id = guess_language_id(path)
    logger.info("Reading %s (guessed language: %s)." % (path, lang_id))
    one_data = conllx_data.read_data_to_variable(path, word_alphabet, char_alphabet, pos_alphabet,
                                                 type_alphabet, use_gpu=use_gpu, volatile=(not is_train),
                                                 use_bert=args.use_bert, symbolic_root=True, lang_id=lang_id,
                                                 max_size=max_size)
    return one_data
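All three `_read_one` variants below are nested helpers that close over the alphabets, flags, and logger of a surrounding training script. A minimal sketch of how this first variant would typically be called, assuming hypothetical data paths and that the enclosing script has already built the alphabets:

# Hypothetical usage inside the enclosing script; the alphabets, args,
# use_gpu, and logger are assumed to already be defined at this point.
data_train = _read_one("data/train.conllu", is_train=True)
data_dev = _read_one("data/dev.conllu")
# max_size caps how many sentences this variant will read.
data_test = _read_one("data/test.conllu", max_size=10000)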
Example #2
# Variant for the stack-pointer parser: the stacked reader additionally
# takes a prior_order that fixes the decoding order of the stack.
def _read_one(path, is_train):
    lang_id = guess_language_id(path)
    logger.info("Reading %s (guessed language: %s)." % (path, lang_id))
    one_data = conllx_stacked_data.read_stacked_data_to_variable(
        path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        use_gpu=use_gpu,
        volatile=(not is_train),
        prior_order=prior_order,
        lang_id=lang_id)
    return one_data
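The `volatile=(not is_train)` flag dates these readers to PyTorch versions before 0.4, where volatile Variables disabled autograd for inference. A sketch of the modern equivalent, assuming a hypothetical `evaluate` helper over the loaded data:

import torch

data_dev = _read_one("data/dev.conllu", is_train=False)
# Since PyTorch 0.4 the volatile flag is gone; evaluation is wrapped
# in an inference context instead.
with torch.no_grad():
    evaluate(data_dev)  # hypothetical evaluation helper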
Example #3
def _read_one(path, is_train):
    lang_id = guess_language_id(path)
    logger.info("Reading %s (guessed language: %s)." % (path, lang_id))
    one_data = conllx_data.read_data_to_variable(
        path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        use_gpu=use_gpu,
        volatile=(not is_train),
        symbolic_root=True,
        lang_id=lang_id,
        # drop over-long sentences during training only; at evaluation
        # time the threshold is effectively unlimited
        len_thresh=(args.train_len_thresh if is_train else 100000))
    return one_data
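Each variant starts from `guess_language_id`, which is not shown in these examples. A minimal sketch of such a helper, assuming (hypothetically) that the language code is the leading token of a UD-style file name such as `en_ewt-ud-train.conllu`:

import os

def guess_language_id(path):
    # Hypothetical implementation: take the prefix of the file name,
    # e.g. "de_gsd-ud-dev.conllu" -> "de".
    base = os.path.basename(path)
    return base.split("_")[0].split("-")[0].lower()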
Example #4
def augment_with_extra_embedding(the_alphabet, extra_embed_file,
                                 extra_embed_src_file, test_file, logger):
    # Grow the alphabet with test-set words that have a pretrained vector,
    # and collect those vectors so they can be appended to the embed table.
    extra_embeds_arr = []
    if extra_embed_file is not None:
        # reopen the vocab
        the_alphabet.open()
        # read the embed
        extra_word_dict, _ = load_embedding_dict('word2vec', extra_embed_file)
        # default to an empty dict so the lookup below never hits an unbound name
        src_extra_word_dict = {}
        if extra_embed_src_file is not None:
            src_extra_word_dict, _ = load_embedding_dict(
                'word2vec', extra_embed_src_file)
        lang_id = guess_language_id(test_file)
        for one_sent in iter_file(test_file):
            for w in one_sent["word"]:
                # words prefixed with "!en_" are already source-language (English) forms
                already_spec = w.startswith("!en_")
                if already_spec:
                    normed_word = w
                else:
                    # substitute with a str: the bytes literal b"0" would
                    # raise a TypeError when applied to a str word
                    normed_word = DIGIT_RE.sub("0", w)
                    normed_word = lang_specific_word(normed_word,
                                                     lang_id=lang_id)
                if normed_word in the_alphabet.instance2index:
                    continue
                # TODO: assume english is the source for run-translate
                if already_spec:
                    w = w[4:]
                    check_dict = src_extra_word_dict
                else:
                    check_dict = extra_word_dict
                if w in check_dict:
                    new_embed_arr = check_dict[w]
                elif w.lower() in check_dict:
                    new_embed_arr = check_dict[w.lower()]
                else:
                    new_embed_arr = None
                if new_embed_arr is not None:
                    extra_embeds_arr.append(new_embed_arr)
                    the_alphabet.add(normed_word)
        # close the vocab
        the_alphabet.close()
    logger.info("Augmenting the vocab with new words of %s, now vocab is %s." %
                (len(extra_embeds_arr), the_alphabet.size()))
    return extra_embeds_arr
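The returned vectors are meant to be stacked under the existing embedding matrix, in the same order the words were added to the alphabet, so row indices keep matching the grown vocabulary. A minimal sketch, assuming a NumPy `embed_table` that was aligned with `word_alphabet` before augmentation (file names here are hypothetical):

import numpy as np

extra = augment_with_extra_embedding(word_alphabet, "extra.vec", None,
                                     "test.conllu", logger)
if extra:
    # append the new rows below the old table, preserving insertion order
    embed_table = np.concatenate([embed_table, np.stack(extra)], axis=0)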