def run(args):
    """Create shuffle-index files for a corpus when --generate_shuffle is set.

    Reads ``generate_shuffle`` (corpus name), ``grid_mode``, ``shuffles_number``
    and ``rewrite_new`` from the parsed CLI ``args`` and delegates the actual
    shuffle-file creation to GridShuffler. No-op when the flag is falsy.
    """
    corpus_name = args.generate_shuffle
    grid_mode = args.grid_mode
    shuffles_number = args.shuffles_number
    rewrite_new = args.rewrite_new

    # Guard clause: nothing to do unless the shuffle flag was supplied.
    if not args.generate_shuffle:
        return

    grids_path = 'data/' + corpus_name + '/' + grid_mode + '/'
    corpus_dct, _ = get_corpus(corpus_name)

    loader = GridLoader(grids_path)
    # NOTE(review): coref/nlp are disabled here — the generator is only needed
    # by the shuffler's interface, not for actual grid generation.
    generator = GridGenerator(coref='no_coref', nlp='no_nlp')
    shuffler = GridShuffler(grids_path,
                            grid_loader=loader,
                            grid_generator=generator)

    shuffler.create_shuffle_index_files(corpus_dct,
                                        corpus_name=corpus_name,
                                        shuffles_number=shuffles_number,
                                        grids_path=grids_path,
                                        rewrite_new=rewrite_new)
def main():
    """Extract entity-grid transition features and write them in SVMlight format.

    Hard-coded experiment parameters (corpus, model type, saliency, transition
    range, task) select which grids are loaded; features are written under
    ``experiments/<corpus>/<task>/<model_type>/`` with one file per data split.
    """
    print("-" * 180)

    ### Params to modify ###
    corpus = 'Oasis'
    model_type = 'egrid_-coref'
    saliency = 1
    transition_range = (2, 2)
    task = 'reordering'  # 'reordering' or 'insertion'
    only_data = ['training', 'test', 'validation']
    ########################

    # Maps split names to the filename suffix conventionally used on disk.
    data_types = {'training': 'train', 'test': 'test', 'validation': 'dev'}
    experiments_path = 'experiments/'
    grids_data_path = 'data/'

    out_path = create_path(experiments_path + corpus + '/' + task + '/' +
                           model_type + '/')
    grids_path = grids_data_path + corpus + '/' + model_type + '/'
    # grids_path.split('/')[1] is the corpus segment of the path.
    data_filename = (grids_path.split('/')[1] + '_sal' + str(saliency) +
                     '_range' + str(transition_range[0]) + "_" +
                     str(transition_range[1]))

    print('Grid folder: ', grids_path)
    grid_loader = GridLoader(grids_path)
    corpus_dct, corpus_loader = get_corpus(corpus)

    # Get train/validation/test splits (pd.DataFrame with one column per split).
    experiments_split = grid_loader.get_training_test_splits(corpus_name=corpus)
    print('Train Test split', experiments_split.shape)
    print('Training data', len(experiments_split['training']))
    print('Test data', len(experiments_split['test']))

    # AMI grid files carry a trailing '.' in their keys; build a set for O(1)
    # membership tests when filtering the corpus dict below.
    if corpus == 'AMI':
        selected_files = {
            grid_name + '.'
            for data in only_data
            for grid_name in experiments_split[data].tolist()
        }
    else:
        selected_files = {
            grid_name
            for data in only_data
            for grid_name in experiments_split[data].tolist()
        }
    corpus_dct = {k: v for k, v in corpus_dct.items() if k in selected_files}

    feature_extractor = EntitiesFeatureExtractor(grid_folder=grids_path,
                                                 grid_loader=grid_loader)

    print('Corpus name: ', corpus)
    print('Length selected files: ', len(selected_files))
    print('Model type: ', model_type)
    print('Data type: ', only_data)
    print('Task: ', task)
    print('Saliency: ', saliency)
    print('Transition_range: ', transition_range)

    grids_transitions_dict = feature_extractor.extract_transitions_probs(
        corpus_dct=corpus_dct,
        transition_range=transition_range,
        saliency=saliency,
        logprobs=True,
        corpus_name=corpus,
        task=task)
    print('Grid trans dct len: ', len(grids_transitions_dict))
    print('Grid trans key example: ', list(grids_transitions_dict.keys())[0])

    for data_type in only_data:
        filename = data_types[data_type]
        grids_transitions_test = get_grids_transitions_data(
            grids_transitions_dict, experiments_split, data_type, corpus)
        # Skip empty splits instead of writing empty feature files
        # (mirrors the guard in the argparse-driven run() variant).
        if len(grids_transitions_test) == 0:
            print("no data for type", data_type, " found!")
            continue
        if task == 'reordering':
            feature_extractor.featurize_transitions_dct_svmlightformat(
                grids_transitions_test,
                out_path + data_filename + '_' + filename)
        elif task == 'insertion':
            feature_extractor.featurize_transitions_dct_svmlightformat_insertion(
                grids_transitions_test,
                out_path + data_filename + '_' + filename)
def run(args):
    """Argparse-driven feature extraction when --generate_feature_vectors is set.

    Same pipeline as ``main()`` but parameterized from the CLI: ``args``
    supplies the corpus name, grid mode (model type), task, saliency and the
    transition range. No-op when the flag is falsy.
    """
    corpus = args.generate_feature_vectors
    model_type = args.grid_mode
    task = args.task
    saliency = args.saliency
    # NOTE(review): assumed to be a 2-element (min, max) range — it is indexed
    # with [0] and [1] below; confirm against the argparse definition.
    transition_range = args.number_transitions

    if not args.generate_feature_vectors:
        return

    print("-" * 180)
    only_data = ['training', 'test', 'validation']

    # Maps split names to the filename suffix conventionally used on disk.
    data_types = {'training': 'train', 'test': 'test', 'validation': 'dev'}
    experiments_path = 'experiments/'
    grids_data_path = 'data/'

    out_path = create_path(experiments_path + corpus + '/' + task + '/' +
                           model_type + '/')
    grids_path = grids_data_path + corpus + '/' + model_type + '/'
    # grids_path.split('/')[1] is the corpus segment of the path.
    data_filename = (grids_path.split('/')[1] + '_sal' + str(saliency) +
                     '_range' + str(transition_range[0]) + "_" +
                     str(transition_range[1]))

    print('Grid folder: ', grids_path)
    grid_loader = GridLoader(grids_path)
    corpus_dct, corpus_loader = get_corpus(corpus)

    # Get train/validation/test splits (pd.DataFrame with one column per split).
    experiments_split = grid_loader.get_training_test_splits(corpus_name=corpus)
    print('Train Test split', experiments_split.shape)
    print('Training data', len(experiments_split['training']))
    print('Test data', len(experiments_split['test']))

    # AMI grid files carry a trailing '.' in their keys; build a set for O(1)
    # membership tests when filtering the corpus dict below.
    if corpus == 'AMI':
        selected_files = {
            grid_name + '.'
            for data in only_data
            for grid_name in experiments_split[data].tolist()
        }
    else:
        selected_files = {
            grid_name
            for data in only_data
            for grid_name in experiments_split[data].tolist()
        }
    corpus_dct = {k: v for k, v in corpus_dct.items() if k in selected_files}

    feature_extractor = EntitiesFeatureExtractor(grid_folder=grids_path,
                                                 grid_loader=grid_loader)

    print('Corpus name: ', corpus)
    print('Length selected files: ', len(selected_files))
    print('Model type: ', model_type)
    print('Data type: ', only_data)
    print('Task: ', task)
    print('Saliency: ', saliency)
    print('Transition_range: ', transition_range)

    grids_transitions_dict = feature_extractor.extract_transitions_probs(
        corpus_dct=corpus_dct,
        transition_range=transition_range,
        saliency=saliency,
        logprobs=True,
        corpus_name=corpus,
        task=task)
    print('Grid trans dct len: ', len(grids_transitions_dict))
    print('Grid trans key example: ', list(grids_transitions_dict.keys())[0])

    for data_type in only_data:
        filename = data_types[data_type]
        grids_transitions_test = get_grids_transitions_data(
            grids_transitions_dict, experiments_split, data_type, corpus)
        # BUGFIX: original printed len(data_type) — the length of the split
        # *name* string — instead of the size of the extracted data, which is
        # what the emptiness check just below inspects.
        print(data_type, ",", len(grids_transitions_test))
        if len(grids_transitions_test) == 0:
            print("no data for type", data_type, " found!")
            continue
        if task == 'reordering':
            feature_extractor.featurize_transitions_dct_svmlightformat(
                grids_transitions_test,
                out_path + data_filename + '_' + filename)
        elif task == 'insertion':
            feature_extractor.featurize_transitions_dct_svmlightformat_insertion(
                grids_transitions_test,
                out_path + data_filename + '_' + filename)