def __init__(self, path_to_glove='glove.6B.200d.txt', embedding_dim=200, prep_Data_from='train', purpose='train_model'):
    """Load the LIAR-PLUS dataset and GloVe embeddings for a fake-news classifier.

    NOTE - Beware of NaNs; rows containing them are dropped on load.

    Dataset: the Liar Dataset, described in "Liar, Liar Pants on Fire: A New
    Benchmark Dataset for Fake News Detection" (https://arxiv.org/abs/1705.00648).
    Download it from https://github.com/Tariq60/LIAR-PLUS.
    GloVe vectors: https://nlp.stanford.edu/projects/glove/ — the 822MB archive
    contains 50d, 100d, 200d and 300d vectors. 300d with a 400K vocabulary takes
    around 1.5GB of RAM; choose the file according to your system. Test cases
    were prepared with the 200d vectors.

    :param path_to_glove: path to the desired GloVe vector .txt file.
    :param embedding_dim: dimension of the chosen vectors.
    :param prep_Data_from: which split to prepare data from ('train'/'test'/'val').
    :param purpose: used only by test.py; DO NOT pass it when making dataloaders.
    :raises ValueError: if prep_Data_from or purpose is not a recognized value.
    """
    # Explicit raises instead of `assert`: asserts are stripped under `python -O`.
    if prep_Data_from not in ('train', 'test', 'val'):
        raise ValueError("prep_Data_from must be one of 'train', 'test', 'val'")
    if purpose not in ('train_model', 'test_class'):
        raise ValueError("purpose must be 'train_model' or 'test_class'")

    if purpose == 'train_model':
        path_to_train, path_to_test, path_to_val = 'train2.tsv', 'test2.tsv', 'val2.tsv'
    else:
        # 'test_class': tiny fixture files used by the test suite.
        path_to_train, path_to_test, path_to_val = 'sample_train.tsv', 'sample_test.tsv', 'sample_val.tsv'

    train_Dataframe = pandas.read_csv(path_to_train, sep='\t', header=None).dropna()
    test_Dataframe = pandas.read_csv(path_to_test, sep='\t', header=None).dropna()
    val_Dataframe = pandas.read_csv(path_to_val, sep='\t', header=None).dropna()

    self.embeddings = create_glove_dict(path_to_glove)
    self.embedding_dim = embedding_dim

    # Max lengths are computed over ALL splits so padding is consistent
    # regardless of which split this instance serves.
    self.dataframe = pandas.concat([train_Dataframe, test_Dataframe, val_Dataframe])
    self.justification_max = get_max_length(self.dataframe, 15)
    self.statement_max = get_max_length(self.dataframe, 3)

    # Keep only the requested split (value already validated above).
    if prep_Data_from == 'train':
        self.dataframe = train_Dataframe
    elif prep_Data_from == 'val':
        self.dataframe = val_Dataframe
    else:
        self.dataframe = test_Dataframe
    del train_Dataframe, test_Dataframe, val_Dataframe

    # Six-way truthfulness label mapping used by the LIAR dataset.
    self.labels = {"true": 0, "mostly-true": 1, "half-true": 2,
                   "barely-true": 3, "false": 4, "pants-fire": 5}
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    """Build padded word/char tensors and encoders for POS-tagger training.

    Returns word/char/mask/label tensors for each split, the label and char
    encoders, both padding lengths, the char embedding table, the label count
    and the number of training examples.

    Depends on module-level globals: unknown_embedd, embedding_words,
    embedding_vectors, embedd_dim, word_end.
    """
    # Raw sentences and gold labels per split.
    words_tr, labels_tr = load_conll_data(train_path)
    words_dv, labels_dv = load_conll_data(dev_path)
    words_te, labels_te = load_conll_data(test_path)

    # Pad every split to the longest sentence seen in any split.
    max_length = max(utils.get_max_length(words_tr),
                     utils.get_max_length(words_dv),
                     utils.get_max_length(words_te))

    # Label alphabet is grown on train ('open'), persisted, then applied
    # closed to dev/test so they cannot introduce unseen labels.
    label_ids_tr, alphabet_label = utils.map_string_2_id_open(labels_tr, 'pos')
    alphabet_label.save('pre-trained-model/pos', name='alphabet_label')
    label_ids_dv = utils.map_string_2_id_close(labels_dv, alphabet_label)
    label_ids_te = utils.map_string_2_id_close(labels_te, alphabet_label)

    word_train, label_train, mask_train = utils.construct_tensor_word(
        words_tr, label_ids_tr, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = utils.construct_tensor_word(
        words_dv, label_ids_dv, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = utils.construct_tensor_word(
        words_te, label_ids_te, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)

    # Character alphabet: seeded with the word-end marker, grown on train,
    # then closed BEFORE building the table and indexing dev/test.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    char_idx_tr, max_char_tr = utils.get_character_indexes(words_tr, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/pos', name='alphabet_char')
    char_idx_dv, max_char_dv = utils.get_character_indexes(words_dv, alphabet_char)
    char_idx_te, max_char_te = utils.get_character_indexes(words_te, alphabet_char)

    max_char_length = max(max_char_tr, max_char_dv, max_char_te)
    char_train = utils.construct_tensor_char(char_idx_tr, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(char_idx_dv, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(char_idx_te, max_length, max_char_length, alphabet_char)

    # -1: the encoder apparently reserves index 0 (see sibling code's note) — confirm.
    num_labels = alphabet_label.size() - 1
    num_data = word_train.shape[0]
    return word_train, word_dev, word_test, char_train, char_dev, char_test, \
        mask_train, mask_dev, mask_test, label_train, label_dev, label_test, \
        alphabet_label, alphabet_char, max_length, max_char_length, \
        char_embedd_table, num_labels, num_data
def train():
    """Prepare Flickr8k training data and build the captioning model.

    NOTE(review): this block appears truncated — `model`, `epochs` and
    `steps` are computed but never used here; the fitting loop is not
    visible in this view.
    """
    filename ='Flick_8k.trainImages.txt'
    # Image ids belonging to the training split.
    train=utils.load_ids(filename)
    # Cleaned captions and pre-extracted photo features, restricted to those ids.
    train_captions=utils.load_clean_captions('descriptions.txt',train)
    train_features=utils.load_photos_features('features.pkl',train)
    # Tokenizer was fitted earlier and pickled; +1 presumably reserves
    # index 0 for padding — confirm against the tokenizer's conventions.
    tokenizer = load(open('tokenizer.pkl','rb'))
    vocab_size = len(tokenizer.word_index)+1
    # Longest caption drives the model's input padding length.
    max_len = utils.get_max_length(train_captions)
    model = caption_model(vocab_size,max_len)
    epochs=20
    # Looks like one step per caption per epoch (steps_per_epoch) — verify downstream.
    steps = len(train_captions)
# read dev data logger.info("Reading data from dev set...") word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = dp.read_conll_sequence_labeling( dev_path, word_alphabet, label_alphabet, word_column, label_column) # close alphabets : by close we mean we cannot add any more words to the word vocabulary. #To DO :change to close this after train set alone word_alphabet.close() label_alphabet.close() # we are doing a -1 because we did not use the zer index. I believe this is to account for unknown word logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) # get maximum length : this is mainly for padding. max_length_train = utils.get_max_length(word_sentences_train) max_length_dev = utils.get_max_length(word_sentences_dev) #max_length_test = utils.get_max_length(word_sentences_test) max_length = min(dp.MAX_LENGTH, max(max_length_train, max_length_dev)) logger.info("Maximum length (i.e max words ) of training set is %d" % max_length_train) logger.info("Maximum length (i.e max words ) of dev set is %d" % max_length_dev) #logger.info("Maximum length (i.e max words ) of test set is %d" % max_length_test) logger.info("Maximum length (i.e max words ) used for training is %d" % max_length) logger.info("Padding training text and lables ...") word_index_sentences_train_pad, train_seq_length = utils.padSequence( word_index_sentences_train, max_length, beginZero=FLAGS.PadZeroBegin) label_index_sentences_train_pad, _ = utils.padSequence(
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    """Build padded word/char tensors plus one-hot POS/chunk features for NER training.

    The word tensors get one-hot POS and chunk features concatenated along
    axis 2, so the returned `embedd_dim_concat` is the word embedding width
    plus the two one-hot widths.

    Depends on module-level globals: unknown_embedd, embedding_words,
    embedding_vectors, embedd_dim, word_end.

    Side effects: saves the four alphabets under 'pre-trained-model/ner' and
    dumps word_train.npy / label_train.npy to the working directory.
    """
    word_sentences_train, pos_sentences_train, chunk_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, pos_sentences_dev, chunk_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, pos_sentences_test, chunk_sentences_test, label_sentences_test = load_conll_data(test_path)

    # Pad every split to the longest sentence seen in any split.
    max_length = max(utils.get_max_length(word_sentences_train),
                     utils.get_max_length(word_sentences_dev),
                     utils.get_max_length(word_sentences_test))

    # Each alphabet is grown on train ('open'), persisted, then applied
    # closed to dev/test so they cannot introduce unseen symbols.
    pos_sentences_id_train, alphabet_pos = utils.map_string_2_id_open(pos_sentences_train, 'pos')
    alphabet_pos.save('pre-trained-model/ner', name='alphabet_pos')
    pos_sentences_id_dev = utils.map_string_2_id_close(pos_sentences_dev, alphabet_pos)
    pos_sentences_id_test = utils.map_string_2_id_close(pos_sentences_test, alphabet_pos)

    chunk_sentences_id_train, alphabet_chunk = utils.map_string_2_id_open(chunk_sentences_train, 'chunk')
    alphabet_chunk.save('pre-trained-model/ner', name='alphabet_chunk')
    chunk_sentences_id_dev = utils.map_string_2_id_close(chunk_sentences_dev, alphabet_chunk)
    chunk_sentences_id_test = utils.map_string_2_id_close(chunk_sentences_test, alphabet_chunk)

    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(label_sentences_train, 'ner')
    alphabet_label.save('pre-trained-model/ner', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(label_sentences_test, alphabet_label)

    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train, label_sentences_id_train,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev, label_sentences_id_dev,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test, label_sentences_id_test,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)

    # Append one-hot POS and chunk features to every word vector (feature axis).
    pos_train = utils.construct_tensor_onehot(pos_sentences_id_train, max_length, alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_sentences_id_dev, max_length, alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_sentences_id_test, max_length, alphabet_pos.size())
    chunk_train = utils.construct_tensor_onehot(chunk_sentences_id_train, max_length, alphabet_chunk.size())
    chunk_dev = utils.construct_tensor_onehot(chunk_sentences_id_dev, max_length, alphabet_chunk.size())
    chunk_test = utils.construct_tensor_onehot(chunk_sentences_id_test, max_length, alphabet_chunk.size())
    word_train = np.concatenate((word_train, pos_train), axis=2)
    word_train = np.concatenate((word_train, chunk_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev), axis=2)
    word_dev = np.concatenate((word_dev, chunk_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test), axis=2)
    word_test = np.concatenate((word_test, chunk_test), axis=2)

    # Character alphabet: seeded with the word-end marker, grown on train,
    # then closed BEFORE building the table and indexing dev/test.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/ner', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(word_sentences_test, alphabet_char)

    max_char_length = max(max_char_length_train, max_char_length_dev, max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length, max_char_length, alphabet_char)

    # -1: the encoder apparently reserves index 0 (see sibling code's note) — confirm.
    num_labels = alphabet_label.size() - 1
    num_data, _, embedd_dim_concat = word_train.shape

    # Debug artefacts kept byte-for-byte for parity with the original pipeline.
    np.save("word_train.npy", word_train)
    np.save("label_train.npy", label_train)
    print("Done")
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
        label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
        char_embedd_table, num_labels, num_data, embedd_dim_concat