def load_fire16(data_dir=dataset_dir, filename='fire16_labeled', data_set='train'):
    """Load the FIRE16 labelled dataset mapped onto the shared 4-class label space.

    Uses a cached ``<filename>_4class.csv`` in *data_dir* when present;
    otherwise reads the labelled JSON, maps its labels with
    ``labels_mapper``, drops label-less rows, and writes the cache.

    :param data_dir: Directory holding the dataset files.
    :param filename: Base filename (without the ``_4class.csv`` suffix).
    :param data_set: Which split to read from the JSON ('train' here).
    :return: pandas DataFrame of texts and 4-class labels.
    """
    mapped_file = filename + '_4class.csv'
    if exists(join(data_dir, mapped_file)):
        # Mapped CSV already cached on disk; reuse it instead of re-mapping.
        # data_df = read_labelled_json(data_dir=data_dir, filename=mapped_file, data_set=data_set)
        data_df = read_csv(data_dir=data_dir, data_file=mapped_file,
                           index_col=0, header=0)
    else:
        data_df = read_labelled_json(data_dir=data_dir, filename=filename,
                                     data_set=data_set)
        ## Match label space between two datasets:
        data_df = labels_mapper(data_df)
        # Drop rows with no positive label: comprehension instead of a
        # manual append loop (label columns are assumed to start at
        # position 1 — TODO confirm column layout against labels_mapper).
        irrelevant_rows = [i for i, row in data_df.iterrows()
                           if sum(row[1:]) < 1]
        data_df = data_df.drop(irrelevant_rows)
        # Persist the mapped data so later calls hit the cache branch above.
        data_df.to_csv(join(data_dir, mapped_file))
    return data_df
def load_smerp17(data_dir=dataset_dir, filename='smerp17_labeled', data_set='test'):
    """Load the SMERP17 labelled dataset as a DataFrame.

    :param data_dir: Directory holding the dataset files.
    :param filename: Labelled-data filename (without extension).
    :param data_set: Accepted for API symmetry with ``load_fire16`` but
        currently NOT forwarded to the reader — NOTE(review): confirm
        whether this is intentional.
    :return: DataFrame returned by ``read_labelled_json``.
    """
    return read_labelled_json(data_dir=data_dir, filename=filename)
def split_target(df=None, data_dir=dataset_dir,
                 labelled_dataname=cfg['data']['test'], test_size=0.999,
                 train_size=None, n_classes=cfg['data']['num_classes'],
                 stratified=False):
    """Split labelled target data into a train and a test portion.

    Reads the labelled JSON from disk when *df* is not supplied, carves
    off a test set of size *test_size*, and optionally shrinks the
    remaining train portion to *train_size*.

    :param df: Pre-loaded DataFrame; read from *data_dir* when ``None``.
    :param data_dir: Directory with the labelled data file.
    :param labelled_dataname: Filename of the labelled target data.
    :param test_size: Fraction of rows assigned to the test split.
    :param train_size: Optional fraction kept for the train split.
    :param n_classes: Number of label classes (used for stratification).
    :param stratified: Whether ``split_df`` should stratify by label.
    :return: ``(train_df, test_df)`` tuple of DataFrames.
    """
    logger.info('Splits labelled target data to train and test set.')
    ## Read target data
    if df is None:
        df = read_labelled_json(data_dir, labelled_dataname)
    # Both split calls share everything except test_size.
    common = dict(stratified=stratified, order=2, n_classes=n_classes)
    df, t_lab_test_df = split_df(df, test_size=test_size, **common)
    logger.info(f'Number of TEST samples: [{t_lab_test_df.shape[0]}]')
    if train_size is not None:
        # Second split trims the remaining train data down to train_size.
        _, df = split_df(df, test_size=train_size, **common)
    logger.info(f'Number of TRAIN samples: [{df.shape[0]}]')
    return df, t_lab_test_df
def read_data(self, data_dir=dataset_dir, filename=cfg['data']['train']):
    """Read a labelled JSON dataset and format it for model input.

    :param data_dir: Directory containing the data file.
    :param filename: Name of the labelled data file.
    :return: DataFrame produced by ``format_inputs``.
    """
    raw_df = read_labelled_json(data_dir, filename)
    return format_inputs(raw_df)
def classify(train_df=None, test_df=None, stoi=None, vectors=None,
             n_classes=cfg['data']['num_classes'],
             dim=cfg['embeddings']['emb_dim'],
             data_dir=dataset_dir,
             train_filename=cfg['data']['train'],
             test_filename=cfg['data']['test'],
             cls_thresh=None,
             epoch=cfg['training']['num_epoch'],
             num_layers=cfg['lstm_params']['num_layers'],
             num_hidden_nodes=cfg['lstm_params']['hid_size'],
             dropout=cfg['model']['dropout'],
             default_thresh=0.5,
             lr=cfg['model']['optimizer']['lr'],
             train_batch_size=cfg['training']['train_batch_size'],
             test_batch_size=cfg['training']['eval_batch_size'],
             ):
    """Train a BiLSTM classifier on train data and evaluate on test data.

    Writes both DataFrames to ``*_4class.csv`` in *data_dir*, builds
    torchtext-style datasets/vocab, trains for *epoch* epochs via
    ``trainer`` and returns the metrics from
    ``calculate_performance_pl``.

    :param train_df: Labelled source DataFrame. NOTE(review): the
        ``None``-guard is commented out below, so ``None`` will crash at
        ``train_df.to_csv`` — callers must pass a DataFrame.
    :param test_df: Labelled target DataFrame; read from disk if None.
    :param stoi: Optional token->index map; when given, custom *vectors*
        are installed instead of GLOVE features.
    :param vectors: Custom embedding vectors matching *stoi*.
    :param n_classes: Number of output classes.
    :param dim: Embedding dimensionality.
    :param data_dir: Directory for CSV caches and data files.
    :param train_filename: Base name of the training data file.
    :param test_filename: Base name of the test data file.
    :param cls_thresh: Per-class decision thresholds; defaults to
        ``default_thresh`` for every class.
    :param epoch: Number of training epochs.
    :param num_layers: LSTM layer count.
    :param num_hidden_nodes: LSTM hidden size.
    :param dropout: Dropout probability.
    :param default_thresh: Fallback threshold when *cls_thresh* is None.
    :param lr: Learning rate.
    :param train_batch_size: Training batch size.
    :param test_batch_size: Evaluation batch size.
    :return: Result object from ``calculate_performance_pl``.
    """
    ## Prepare labelled source data:
    # logger.info('Prepare labelled source data')
    # if train_df is None:
    #     train_df = read_labelled_json(data_dir, train_filename)
    #     train_df = labels_mapper(train_df)
    train_dataname = train_filename + "_4class.csv"
    # Cache train data as CSV so get_dataset_fields can read it back.
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        # No custom vocab supplied: fall back to GLOVE features.
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True)
    else:
        # Custom (GCN) features: build fields without a pretrained
        # embedding file, then install the provided vectors.
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True, embedding_file=None, embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir, csv_file=test_dataname,
        # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    # Test data serves as the validation iterator during training.
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab, num_hidden_nodes,
                              num_output_nodes, dim, num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trianable parameters
    logger.info('No. of trianable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding: copy the vocab's vectors
    # (GLOVE or custom) into the model's embedding layer.
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)
    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'], losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    # NOTE(review): predicted_labels is computed but never used below;
    # calculate_performance_pl receives the raw logits instead.
    predicted_labels = logit2label(
        DataFrame(val_preds_trues_best['preds'].cpu().numpy()), cls_thresh,
        drop_irrelevant=False)

    logger.info('Calculate performance')
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      val_preds_trues_best['preds'])

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
def prepare_datasets(train_df=None, test_df=None, stoi=None, vectors=None,
                     dim=cfg['embeddings']['emb_dim'], split_test=False,
                     get_iter=False, data_dir=dataset_dir,
                     train_filename=cfg['data']['train'],
                     test_filename=cfg['data']['test']):
    """Creates train and test dataset from df and returns data loader.

    Caches both DataFrames as ``*_4class.csv`` in *data_dir*, builds
    torchtext-style datasets and vocabs, and optionally bucket
    iterators.

    :param get_iter: If iterator over the text samples should be returned
    :param split_test: Splits the testing data
    :param train_df: Training dataframe; auto-loaded when ``None``
        (``load_fire16`` for ``fire16*`` filenames, else labelled JSON).
    :param test_df: Testing dataframe; auto-loaded when ``None``
        (``load_smerp17`` for ``smerp17*`` filenames, else labelled JSON).
    :param stoi: Optional token->index map; when given, custom *vectors*
        replace the default GLOVE vectors.
    :param vectors: Custom Vectors for each token
    :param dim: Embedding dim
    :param data_dir: Directory for CSV caches and data files.
    :param train_filename: Base name of the training data file.
    :param test_filename: Base name of the test data file.
    :return: ``(train_dataset, test_dataset, train_vocab, test_vocab)``;
        with ``get_iter=True`` also ``train_iter, val_iter`` appended.
    """
    logger.info(f'Prepare labelled train (source) data: {train_filename}')
    if train_df is None:
        if train_filename.startswith('fire16'):
            train_df = load_fire16()
        else:
            train_df = read_labelled_json(data_dir, train_filename)
    train_dataname = train_filename + "_4class.csv"
    # Cache to CSV so get_dataset_fields can read it back.
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('Setting GLOVE vectors:')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True)
    else:
        # Custom vectors: build fields without a pretrained embedding
        # file, then install the provided vectors into the vocab.
        logger.critical('Setting custom vectors:')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True, embedding_file=None, embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    # train_vocab = {
    #     'freqs':       train_vocab.vocab.freqs,
    #     'str2idx_map': dict(train_vocab.vocab.stoi),
    #     'idx2str_map': train_vocab.vocab.itos,
    #     'vectors':     train_vocab.vocab.vectors,
    # }

    ## Prepare labelled target data:
    logger.info(f'Prepare labelled test (target) data: {test_filename}')
    if test_df is None:
        if test_filename.startswith('smerp17'):
            test_df = load_smerp17()
        else:
            test_df = read_labelled_json(data_dir, test_filename,
                                         data_set='test')

    # NOTE(review): placement reconstructed from mangled source — this
    # split is applied at function level, i.e. also when test_df was
    # passed in; confirm against callers. Keeps 40% as the test portion.
    if split_test:
        test_extra_df, test_df = split_target(df=test_df, test_size=0.4)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir, csv_file=test_dataname, labelled_data=True)

    # test_vocab = {
    #     'freqs':       test_vocab.vocab.freqs,
    #     'str2idx_map': dict(test_vocab.vocab.stoi),
    #     'idx2str_map': test_vocab.vocab.itos,
    #     'vectors':     test_vocab.vocab.vectors,
    # }

    logger.info('Get iterator')
    if get_iter:
        # Batch sizes come from config only in the iterator branch.
        train_batch_size = cfg['training']['train_batch_size']
        test_batch_size = cfg['training']['eval_batch_size']
        train_iter, val_iter = dataset2bucket_iter(
            (train_dataset, test_dataset),
            batch_sizes=(train_batch_size, test_batch_size))
        return train_dataset, test_dataset, train_vocab, test_vocab,\
            train_iter, val_iter

    return train_dataset, test_dataset, train_vocab, test_vocab
type=str) parser.add_argument("-ne", "--num_train_epochs", default=cfg['training']['num_epoch'], type=int) parser.add_argument("-c", "--use_cuda", default=cfg['model']['use_cuda'], action='store_true') args = parser.parse_args() from File_Handlers.json_handler import read_labelled_json from Class_mapper.FIRE16_SMERP17_map import labels_mapper data_dir = dataset_dir train_df = read_labelled_json(data_dir, args.dataset_name) train_df = labels_mapper(train_df) test_df = read_labelled_json(data_dir, cfg['data']['target']['labelled']) test_df = test_df.sample(frac=1) result, model_outputs = BERT_classifier(train_df=train_df, test_df=test_df, dataset_name=args.dataset_name, model_name=args.model_name, model_type=args.model_type, num_epoch=args.num_train_epochs, use_cuda=args.use_cuda)