def load_and_process(train_data_file, test_data_file=None, train_tokens_file=None,
                     test_tokens_file=None, embed_size=300, max_comment_size=250,
                     label_names=None, fraction_dev=0.3, debug=False):
    """Load comment data, tokenize and pad it, and split into train/dev/test.

    Args:
        train_data_file: path to the raw training data.
        test_data_file: optional path to raw test data; when absent, the
            returned test set is None.
        train_tokens_file / test_tokens_file: optional caches of pre-tokenized
            comments; when given (and not in debug mode for train), tokenizing
            is skipped in favor of loading the cache.
        embed_size: dimensionality of the word embeddings to load.
        max_comment_size: pad/truncate length for every comment.
        label_names: label columns to extract; defaults to the six standard
            toxicity labels.
        fraction_dev: fraction of the training data held out as a dev set.
        debug: load a reduced subset of the data for quick iteration.

    Returns:
        (emb_data, train_dev_set, test_set) where test_set is
        (ids, tokens, masks) or None when no test file was given.
    """
    # Word-embedding data (matrix plus vocab mappings) for the requested size.
    emb_data = preprocess.get_glove(embed_size)

    # --- Training side --------------------------------------------------
    train_data = preprocess.load_data(train_data_file, debug=debug)
    if debug or train_tokens_file is None:
        # In debug mode the data is subset, so a cached token file would
        # not line up with the rows actually loaded — re-tokenize.
        tokens = preprocess.tokenize_df(train_data)
    else:
        tokens = preprocess.load_tokenized_comments(train_tokens_file)
    tokens, masks = preprocess.pad_comments(tokens, max_comment_size)

    if label_names is None:
        label_names = ['toxic', 'severe_toxic', 'obscene',
                       'threat', 'insult', 'identity_hate']
    labels = preprocess.filter_labels(train_data, label_names)

    train_dev_set = preprocess.split_train_dev(tokens, labels, masks,
                                               fraction_dev=fraction_dev)

    # --- Test side (optional) -------------------------------------------
    test_set = None
    if test_data_file:
        test_data = preprocess.load_data(test_data_file, debug=debug)
        if test_tokens_file is None:
            tokens_test = preprocess.tokenize_df(test_data)
        else:
            tokens_test = preprocess.load_tokenized_comments(test_tokens_file)
        # Pad and create masks for test comments.
        tokens_test, masks_test = preprocess.pad_comments(
            tokens_test, max_comment_size)
        test_set = (test_data['id'], tokens_test, masks_test)

    return emb_data, train_dev_set, test_set
def __init__(self, config=None, emb_data=None, glove_dim=None):
    """Initialize the model: set up embeddings, load config, and build.

    Args:
        config: configuration passed through to ``Config``.
        emb_data: pre-loaded embedding data as a tuple/sequence of
            (emb_matrix, word2id, id2word); takes precedence over glove_dim.
        glove_dim: embedding dimensionality used to load GloVe data from
            file when emb_data is not supplied.

    Raises:
        ValueError: if neither emb_data nor glove_dim is provided.
    """
    if emb_data is not None:
        # Embedding data already in memory: (emb_matrix, word2id, id2word).
        self.emb_matrix = emb_data[0].astype('float32')
        self.word2id = emb_data[1]
        self.id2word = emb_data[2]
    elif glove_dim is not None:
        # Load GloVe vectors of the requested dimensionality from file.
        self.emb_matrix, self.word2id, self.id2word = get_glove(glove_dim)
        self.emb_matrix = self.emb_matrix.astype('float32')
    else:
        # Previously this fell through silently, leaving emb_matrix/word2id/
        # id2word unset and deferring failure to an opaque AttributeError
        # inside build(); fail fast with a clear message instead.
        raise ValueError('Either emb_data or glove_dim must be provided.')

    # Load config and build the computation graph.
    self.config = Config(config)
    self.build()
def load_model():
    """Construct a prediction-ready RNN model from module-level settings.

    Reads the module globals ``out_dir``, ``config`` and ``embed_size``.

    Returns:
        A ``rnn_model.PredictWithRNNModel`` wired to the experiment's
        checkpoint prefix and embedding data.
    """
    exp_name = config['exp_name']
    # Checkpoints are stored under <out_dir>/<exp_name>/<exp_name>*.
    save_prefix = os.path.join(out_dir, exp_name, exp_name)
    emb_data = preprocess.get_glove(embed_size)
    return rnn_model.PredictWithRNNModel(config, emb_data, save_prefix)