Example #1
import os

import utils  # project-local helper module providing get_lang_code_dicts()


class Vocab:  # hypothetical class name; the original snippet omits the enclosing class

    def __init__(self, args):
        self.args = args
        # Mappings between language names and language codes, e.g. 'Russian' <-> 'ru'
        self.lang_to_code, self.code_to_lang = utils.get_lang_code_dicts()

        # Collect the .conllu files from each requested language's UD treebank folder
        paths_to_read = []
        langs = args.langs.split("/")
        for lang in langs:
            input_folder = args.treebank_path + "/UD_" + self.code_to_lang[lang] + "/"
            for _, _, files in os.walk(input_folder):
                files.sort()
                for file in files:
                    if file.endswith(".conllu"):
                        path = input_folder + file
                        print("Reading vocab from ", path)
                        paths_to_read.append((path, lang))
                break  # only scan the top-level treebank directory

        # Build tag, word and character vocabularies from the collected files
        self.tag_to_ids, self.word_to_id, self.char_to_id = self.read_files(
            paths_to_read)
        print("Size of vocab before: %d" % len(self.word_to_id))

        # Reserve special tokens for unknown words/characters, sentence boundary and padding
        self.word_to_id['<unk>'] = len(self.word_to_id)
        self.char_to_id['<unk>'] = len(self.char_to_id)
        self.word_to_id['<\s>'] = len(self.word_to_id)
        self.char_to_id['<pad>'] = len(self.char_to_id)
        print("Size of vocab after: %d" % len(self.word_to_id))

        self.word_padding_token = 0
        self.char_padding_token = 0

        # Inverse lookups and per-feature tag vocabulary sizes
        self.id2tags = {}
        self.tag_vocab_sizes = {}
        self.word_freq = {}
        for key, tag2id in self.tag_to_ids.items():
            self.id2tags[key] = {v: k for k, v in tag2id.items()}
            self.tag_vocab_sizes[key] = len(tag2id)
            print("Feat: {0} Size: {1}".format(key, len(tag2id)))
            print(self.tag_to_ids[key])
        self.id_to_word = {v: k for k, v in self.word_to_id.items()}
        self.id_to_char = {v: k for k, v in self.char_to_id.items()}

        self.word_vocab_size = len(self.id_to_word)
        self.char_vocab_size = len(self.id_to_char)
        print("Size of vocab after: %d" % len(self.word_to_id))
        print("Word vocab size=%d, Char Vocab size=%d" %
              (self.word_vocab_size, self.char_vocab_size))
Example #2
import utils
from utils import read_conll  # assumed import; the snippet calls read_conll unqualified


def main():
    lang_to_code, code_to_lang = utils.get_lang_code_dicts()
    # Read annotated training sentences for the listed language codes
    annot_sents = read_conll(['ru', 'bg', 'da', 'sv', 'es', 'pt', 'uk'],
                             code_to_lang,
                             train_or_dev="train")
Example #3
import torch

import utils  # project-local helpers: get_lang_code_dicts(), read_conll()

# `args` is assumed to be the argparse namespace built earlier in the script
print(args)

# Set seed
torch.manual_seed(args.seed)

# Create dictionaries for language codes, morph tags and pos tags
langs = args.langs.split("/")
args.model_name = args.model_type + "".join(["_" + l for l in langs])
if args.sum_word_char:
    args.model_name += "_wc-sum"
if args.sent_attn:
    args.model_name += "_sent-attn"
if args.tgt_size:
    args.model_name += "-" + str(args.tgt_size)

lang_to_code, code_to_lang = utils.get_lang_code_dicts()
print("Reading training data...")

training_data_langwise, train_tgt_labels = utils.read_conll(
    args.treebank_path,
    langs,
    code_to_lang,
    tgt_size=args.tgt_size,
    train_or_dev="train")
training_data = []

# Oversample the target language (listed last in `langs`) when only 100
# target sentences are available; with 1000 sentences it is left unchanged
if args.tgt_size == 100 and args.model_type != "mono":
    training_data_langwise[langs[-1]] = training_data_langwise[langs[-1]] * 10
elif args.tgt_size == 1000 and args.model_type != "mono":
    training_data_langwise[langs[-1]] = training_data_langwise[langs[-1]]
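
All three examples revolve around the pair of dictionaries returned by utils.get_lang_code_dicts(). The sketch below illustrates the structure they appear to assume, with made-up contents rather than the project's actual data: a language-name-to-code mapping and its inverse, where the code-to-name direction is what Example #1 uses to build "UD_<Name>" treebank folder names.

# Hypothetical contents, shown only to illustrate the assumed shape of the
# two dictionaries; the real helper module builds them from its own resources.
lang_to_code = {"Russian": "ru", "Bulgarian": "bg", "Danish": "da"}
code_to_lang = {code: name for name, code in lang_to_code.items()}

# Example #1 derives Universal Dependencies folder names from the codes:
print("UD_" + code_to_lang["ru"])  # -> UD_Russian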