Example #1
def separate_vocab(vocab_path, roots):
    vocab_name = os.path.splitext(vocab_path)[0]
    vocab = get_vocab(vocab_path)
    print("Got vocabulary with {:d} elements.".format(len(vocab)))

    write_train_test_vocabs(vocab, roots, vocab_name)
    print("Divided vocabulary for train and test.")
Example #2
def data_process(config):
    train_data, test_data = util.get_data(config['data_name'])

    vocab2index = util.get_vocab(
        train_data["text"] + test_data["text"], max_size=config["vocab_size"])

    train_data = train_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    train_data.set_format(type='torch', columns=['input_ids', 'label'])
    test_data = test_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    test_data.set_format(type='torch', columns=['input_ids', 'label'])
    train_dl = DataLoader(
        train_data, batch_size=config['batch_size'], shuffle=True)
    valid_dl = DataLoader(test_data, batch_size=config['batch_size'])

    pretrained_emb = util.load_glove('glove.6B.300d.txt')

    pretrained_embeddings = util.get_emb_matrix(
        pretrained_emb, vocab2index, emb_size=config['embed_dim'])
    keywords_matrix = [pretrained_emb[k] for k in config["keywords"]]
    related_embeddings = util.create_relatedness_matrix(
        keywords_matrix, pretrained_embeddings)

    print(f'embedding matrix shape: {pretrained_embeddings.shape}')
    print(f'relatedness matrix shape: {related_embeddings.shape}')

    return train_dl, valid_dl, pretrained_embeddings, related_embeddings
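A hedged usage sketch of the function above; the concrete config values (dataset name, keywords, sizes) are made-up placeholders, and util and DataLoader are assumed to be importable exactly as in the snippet:

config = {
    "data_name": "imdb",               # placeholder dataset name
    "vocab_size": 20000,
    "batch_size": 32,
    "embed_dim": 300,
    "keywords": ["price", "quality"],  # placeholder keyword list
}
train_dl, valid_dl, pretrained_embeddings, related_embeddings = data_process(config)

# Each batch exposes the columns selected by set_format above.
batch = next(iter(train_dl))
input_ids, labels = batch["input_ids"], batch["label"]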
Example #3
    a label: int 1 or 0

'''
raw_train_vua = data_parser.load_raw_train_vua()
raw_test_vua = data_parser.load_raw_test_vua()

print('VUA dataset division: ', len(raw_train_vua), len(raw_test_vua))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua + raw_test_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', 'r')
# suffix_embeddings: the number of suffix tags is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)
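The inline comments above fix the index convention (<PAD> -> 0, <UNK> -> 1). A minimal sketch of get_word2idx_idx2word consistent with that convention (an assumption, not the original implementation):

def get_word2idx_idx2word(vocab):
    # Reserve index 0 for <PAD> and 1 for <UNK>, then enumerate the remaining words.
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    for word in sorted(vocab):
        word2idx[word] = len(word2idx)
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word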
Example #4
for i in range(len(raw_train_vua)):
    raw_train_vua[i][2] = index_sequence(pos2idx, raw_train_vua[i][2])
for i in range(len(raw_val_vua)):
    raw_val_vua[i][2] = index_sequence(pos2idx, raw_val_vua[i][2])
print('size of training set, validation set: ', len(raw_train_vua),
      len(raw_val_vua))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None
'''
2. 2
embed the datasets
'''
Example #5
        verb_label = int(line[5])
        label_seq[verb_idx] = verb_label
        # idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}
        pos_seq[verb_idx] = 1
        raw_mohx.append([sentence.strip(), label_seq, pos_seq])

print('MOH-X dataset division: ', len(raw_mohx))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_mohx)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
# set elmos_mohx = None to exclude ELMo vectors; the embedding_dim in the later model initialization then needs to be changed accordingly
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')
'''
2. 2
embed the datasets
'''
random.seed(0)
random.shuffle(raw_mohx)
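Here get_vocab receives rows of the form [sentence, label_seq, pos_seq]. A plausible sketch under that assumption (the real helper may tokenize differently):

def get_vocab(raw_dataset):
    # Collect the set of distinct word types across all sentences
    # (the sentence is the first field of each row).
    vocab = set()
    for example in raw_dataset:
        vocab.update(example[0].split())
    return vocab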
Example #6
    for word_seq, publication_id in raw_test_rcc:
        output.append({'publication_id': publication_id, 'sentence': word_seq})
    logging.info("Writing on new csv file...")
    writer.writerows(output)

logging.info(
    'size of test set: {}, annotated by brute-force test set: {}, to-be-found test set: {}'
    .format(
        len(raw_test_rcc) + len(test_annotated), len(test_annotated),
        len(raw_test_rcc)))

# logging.info("Read vocabulary info from {}".format(args.vocab_info_path))
# with open(args.vocab_info_path, "rb+") as infile:
#   word2idx, idx2word = pickle.load(infile)

vocab = get_vocab(raw_test_rcc + test_annotated)
word2idx, idx2word = get_word2idx_idx2word(vocab)
logging.info("Loading glove embeddings")
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        args,
                                        normalization=False)

if using_GPU:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)

############
# labeling #
############
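In the snippet above, the third ElmoEmbedder argument selects the CUDA device (0 = first GPU, -1 = CPU). A short usage sketch, assuming allennlp's ElmoEmbedder (from allennlp.commands.elmo in older allennlp releases) and that the options/weights files exist at the paths shown:

from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", cuda_device=-1)
# embed_sentence takes a tokenized sentence and returns a numpy array of shape
# (3, num_tokens, 1024): one 1024-dim vector per token from each biLM layer.
vectors = elmo.embed_sentence(["The", "dataset", "was", "released", "today", "."])
print(vectors.shape)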
Example #7
def label_data_with_vocab(vocab_path, file_path):
    vocab = get_vocab(vocab_path)
    files = do_dirty_labeling_for_file(file_path, os.path.splitext(file_path)[0], vocab)
    print("Divided dataset into train and test.")
    return files
Example #8
# normal version
with open('../Poetry/poetry.csv') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_poetry.append([line[1].strip(), int(line[2]), int(line[3])])
print('Poetry dataset size: ', len(raw_poetry))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_poetry)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_poetry = None
# suffix_embeddings: the number of suffix tags is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)  # set a seed
Example #9
    a label: int 1 or 0

'''
raw_train_toefl = data_parser.load_raw_train_toefl()[0]
raw_test_toefl = data_parser.load_raw_test_toefl()[0]

print('TOEFL dataset division: ', len(raw_train_toefl), len(raw_test_toefl))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_train_toefl + raw_test_toefl)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_train_toefl = h5py.File('../elmo/TOEFL_train.hdf5', 'r')
# suffix_embeddings: the number of suffix tags is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)
Example #10
        labeled = line[3]
        assert (len(word_seq) == len(label_seq))
        raw_train_bruteforce_rcc.append(
            [publication_id, word_seq, label_seq, labeled])

train_rcc = []
for raw_sent, raw_sent_brute in zip(raw_train_rcc, raw_train_bruteforce_rcc):
    pub_id = raw_sent[0]
    word_seq = raw_sent[1]
    label_seq = raw_sent[2]
    if raw_sent[3] == 'N' and raw_sent_brute[3] == 'Y':
        label_seq = raw_sent_brute[2]
    assert (len(word_seq) == len(label_seq))
    train_rcc.append([word_seq, label_seq, pub_id])

vocab = get_vocab(train_rcc)
word2idx, idx2word = get_word2idx_idx2word(vocab)
logging.info("Loading glove embeddings")
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        args,
                                        normalization=False)

if using_GPU:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)

# logging.info("embedd test data with glove and elmo vectors")
# embedded_rcc = []
# for example in tqdm(train_rcc, total=len(train_rcc)):
Example #11
# normal version
with open('../datasets/TroFi/TroFi_formatted_all3737.csv') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_trofi.append([line[1].strip(), int(line[2]), int(line[3])])
print('TroFi dataset size: ', len(raw_trofi))
"""
2. Data preparation
"""
'''
2. 1
get the vocabulary and GloVe embeddings for the raw dataset
'''
# vocab is a set of words
vocab = get_vocab(raw_trofi)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
# set elmos_trofi = None to exclude ELMo vectors
# elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')
# suffix_embeddings: the number of suffix tags is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
Example #12
import numpy as np
import matplotlib

Gutenberg = []

with open('../Poetry/corpus_to_be_labeled.csv') as t:
    lines = csv.reader(t)
    next(lines)
    for line in lines:
        # Gutenberg.append([line[1].strip(), int(float(line[2])), int(float(line[3]))])
        Gutenberg.append([line[1].strip(), int(line[2])])

print('Poetry dataset size: ', len(Gutenberg))

# vocab is a set of words
vocab = get_vocab(Gutenberg)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding built from GloVe vectors
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_poetry = None
# suffix_embeddings: the number of suffix tags is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
# random.seed(0)  # set a seed
Example #13
# Splitting the data into train, validation and test
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
msk2 = np.random.rand(len(df_train)) < 0.8
df_validate = df_train[~msk2]
df_train = df_train[msk2]
df_test = df[~msk]

print("Train size: %s" % len(df_train))
print("Validation size: %s" % len(df_validate))
print("Test size: %s" % len(df_test))

# Creating the English and Hebrew vocabularies
eng_vocab, rev_eng_vocab = get_vocab(df["english_sentences"],
                                     addtional_tokens=[PAD_TOKEN, OOV_TOKEN],
                                     top=None)
heb_vocab, rev_heb_vocab = get_vocab(df["hebrew_sentences"],
                                     addtional_tokens=[
                                         PAD_TOKEN, OOV_TOKEN,
                                         SENTENCE_START_TOKEN,
                                         SENTENCE_END_TOKEN
                                     ],
                                     top=None)


def vectorize_dataset(df):
    """
    vectorizing the data into encoder input, decoder input and decoder target.
    """
    vect_eng_sentences = vectorize_sentences(df["english_sentences"],
Example #14
df = read_data()

# Splitting the data into train, validation and test
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
msk2 = np.random.rand(len(df_train)) < 0.8
df_validate = df_train[~msk2]
df_train = df_train[msk2]
df_test = df[~msk]

print("Train size: %s" % len(df_train))
print("Validation size: %s" % len(df_validate))
print("Test size: %s" % len(df_test))

# Creating the English and Hebrew vocabularies
eng_vocab, rev_eng_vocab = get_vocab(df["english_sentences"], addtional_tokens=[PAD_TOKEN, OOV_TOKEN], top=None)
heb_vocab, rev_heb_vocab = get_vocab(df["hebrew_sentences"],
                                     addtional_tokens=[PAD_TOKEN, OOV_TOKEN, SENTENCE_START_TOKEN, SENTENCE_END_TOKEN],
                                     top=None)


def vectorize_dataset(df):
    """
    vectorizing the data into encoder input, decoder input and decoder target.
    """
    vect_eng_sentences = vectorize_sentences(df["english_sentences"], eng_vocab, encode=True, reverse=True)
    decoder_input_data = vectorize_sentences(df["hebrew_sentences"], heb_vocab, add_prefix_token=SENTENCE_START_TOKEN,
                                             encode=True)
    decoder_target_data = np.array(
        [np.concatenate((x[1:], [heb_vocab[SENTENCE_END_TOKEN]]), axis=0) for x in decoder_input_data])
    return vect_eng_sentences, decoder_input_data, decoder_target_data
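Examples #13 and #14 use a different get_vocab that accepts a list of special tokens and an optional frequency cutoff, returning both a token-to-index mapping and its reverse. A sketch consistent with that call signature (an assumption, not the original code; the misspelled addtional_tokens keyword is kept to match the calls above):

from collections import Counter

def get_vocab(sentences, addtional_tokens=None, top=None):
    # Count word frequencies, keep the `top` most common words (all of them if top is None),
    # and reserve the leading indices for the special tokens.
    counts = Counter(word for sentence in sentences for word in sentence.split())
    vocab = {token: idx for idx, token in enumerate(addtional_tokens or [])}
    for word, _ in counts.most_common(top):
        vocab[word] = len(vocab)
    rev_vocab = {idx: token for token, idx in vocab.items()}
    return vocab, rev_vocab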