Example #1
def main():
    args = parse_args()

    # input files
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/dev.jsonl'
    # test_file = args.data_dir + '/test.jsonl'
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    # test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens = [[t.lower() for t in tokens] for tokens in \
                                                 (train_tokens, dev_tokens)]
        # (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    # all_tokens = train_tokens + dev_tokens + test_tokens
    all_tokens = train_tokens + dev_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    # datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    datasets = {'train': train_tokens, 'dev': dev_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example #2
def prepare_vocab(data_dir,
                  vocab_dir,
                  spacy_model,
                  glove_dir="dataset/glove",
                  wv_file="glove.840B.300d.txt",
                  wv_dim=300,
                  min_freq=0,
                  lower=True):
    # input files
    train_file = data_dir + '/train.json'
    dev_file = data_dir + '/dev.json'
    test_file = data_dir + '/test.json'
    wv_file = glove_dir + '/' + wv_file

    # output files
    helper.ensure_dir(vocab_dir)
    vocab_file = vocab_dir + '/vocab.pkl'
    emb_file = vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file, spacy_model)
    dev_tokens = load_tokens(dev_file, spacy_model)
    test_tokens = load_tokens(test_file, spacy_model)
    if lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in \
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example #3
def prepare_vocabulary(vocab_params):

    # input files
    train_file = vocab_params.data_dir + '/train.json'
    dev_file = vocab_params.data_dir + '/dev.json'
    test_file = vocab_params.data_dir + '/test.json'
    wv_file = vocab_params.glove_dir + '/' + vocab_params.glove_text_file
    wv_dim = vocab_params.emb_dim

    # output files
    helper.ensure_dir(vocab_params.vocab_dir)
    vocab_file = vocab_params.vocab_dir + vocab_params.vocab_file
    emb_file = vocab_params.vocab_dir + vocab_params.embed_file

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if vocab_params.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))
    
    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, vocab_params.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov*100.0/total))
    
    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
    return v
Example #4
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    schema_file = args.data_dir + '/schemas.json'
    wv_file = args.emb_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_file = args.vocab_dir + '/chars.json'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping embeddings to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    # print("all done.")

    print("building schemas...")
    all_schemas = set()
    subj_type = set()
    obj_type = set()
    min_count = 2
    pos_tags = set()
    chars = defaultdict(int)
    with open(train_file) as f:
        a = json.load(f)
        for ins in a:
            for spo in ins['spo_details']:
                all_schemas.add(spo[3])
                subj_type.add(spo[2])
                obj_type.add(spo[6])
            for pos in ins['pos_tags']:
                pos_tags.add(pos)
            for token in ins['tokens']:
                for char in token:
                    chars[char] += 1
    id2predicate = {i + 1: j for i, j in enumerate(all_schemas)}  # 0 denotes the termination class
    predicate2id = {j: i for i, j in id2predicate.items()}

    id2subj_type = {i + 1: j for i, j in enumerate(subj_type)}  # 0 denotes the termination class
    subj_type2id = {j: i for i, j in id2subj_type.items()}

    id2obj_type = {i + 1: j for i, j in enumerate(obj_type)}  # 0 denotes the termination class
    obj_type2id = {j: i for i, j in id2obj_type.items()}

    with codecs.open(schema_file, 'w', encoding='utf-8') as f:
        json.dump([
            id2predicate, predicate2id, id2subj_type, subj_type2id,
            id2obj_type, obj_type2id
        ],
                  f,
                  indent=4,
                  ensure_ascii=False)

    print("dumping chars to files...")
    with codecs.open(char_file, 'w', encoding='utf-8') as f:
        chars = {i: j for i, j in chars.items() if j >= min_count}
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        id2pos = {i + 2: j
                  for i, j in enumerate(pos_tags)}  # padding: 0, unk: 1
        pos2id = {j: i for i, j in id2pos.items()}
        json.dump([id2char, char2id, id2pos, pos2id],
                  f,
                  indent=4,
                  ensure_ascii=False)
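
The schema and character mappings written above can be read back from schemas.json and chars.json. A short sketch with placeholder paths follows; note that json.dump turns the integer keys of the id-to-label dictionaries into strings, so they need to be converted back after loading.

# Hypothetical loading code for the files written above.
import json

with open('dataset/mydata/schemas.json', encoding='utf-8') as f:
    (id2predicate, predicate2id, id2subj_type, subj_type2id,
     id2obj_type, obj_type2id) = json.load(f)
# JSON keys are strings; restore the integer ids.
id2predicate = {int(i): j for i, j in id2predicate.items()}
id2subj_type = {int(i): j for i, j in id2subj_type.items()}
id2obj_type = {int(i): j for i, j in id2obj_type.items()}

with open('dataset/vocab/chars.json', encoding='utf-8') as f:
    id2char, char2id, id2pos, pos2id = json.load(f)
id2char = {int(i): j for i, j in id2char.items()}
id2pos = {int(i): j for i, j in id2pos.items()}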
Example #5
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + "/rationale_train.json"
    dev_file = args.data_dir + "/rationale_dev.json"
    un_file = args.data_dir + "/rationale_un.json"
    wl_file = args.data_dir + "/rationale_wl.json"
    cts_file = args.data_dir + "/rationale_cts.json"
    bc_file = args.data_dir + "/rationale_bc.json"
    wv_file = args.glove_dir + "/" + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + "/vocab.pkl"
    emb_file = args.vocab_dir + "/embedding.npy"

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    un_tokens = load_tokens(un_file)
    wl_tokens = load_tokens(wl_file)
    cts_tokens = load_tokens(cts_file)
    bc_tokens = load_tokens(bc_file)

    if args.lower:
        train_tokens, dev_tokens, un_tokens, wl_tokens, cts_tokens, bc_tokens = [
            [t.lower() for t in tokens] for tokens in (
                train_tokens,
                dev_tokens,
                un_tokens,
                wl_tokens,
                cts_tokens,
                bc_tokens,
            )
        ]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)

    print("calculating oov...")
    datasets = {
        "train": train_tokens,
        "dev": dev_tokens,
        "un": un_tokens,
        "wl": wl_tokens,
        "cts": cts_tokens,
        "bc": bc_tokens,
    }
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, "wb") as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
Example #6
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/testa.jsonl'
    test_file = args.data_dir + '/testb.jsonl'
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_vocab_file = args.vocab_dir + '/vocab_char.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens, train_chars = load_tokens(train_file)
    dev_tokens, dev_chars = load_tokens(dev_file)
    test_tokens, test_chars = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in\
                (train_tokens, dev_tokens, test_tokens)]
    if args.char_lower and train_chars:
        train_chars, dev_chars, test_chars = [[c.lower() for c in chars] for chars in\
            (train_chars, dev_chars, test_chars)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    if args.all:
        all_tokens = train_tokens + dev_tokens + test_tokens
    else:
        all_tokens = train_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)

    if train_chars:
        print("building vocab for chars...")
        all_chars = train_chars + dev_chars + test_chars
        char_counter = Counter(all_chars)
        #char_vocab = constant.VOCAB_PREFIX + sorted(char_counter.keys(), key=char_counter.get, reverse=True)
        char_vocab = constant.VOCAB_PREFIX + sorted(list(char_counter.keys()))
        print("vocab built with {} chars.".format(len(char_vocab)))
    else:
        char_vocab = None

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    if char_vocab:
        with open(char_vocab_file, 'wb') as outfile:
            pickle.dump(char_vocab, outfile)
    np.save(emb_file, embedding)
    print("all done.")