def main():
    """Load an extraction cache, preprocess its classes and save the result.

    Command-line arguments:
        sys.argv[1]: path handed to ``Extractor.load`` (presumably a cache
            previously written by the extractor -- TODO confirm).
        sys.argv[2]: value forwarded to ``Preprocessor`` as ``pkg_start``.

    The preprocessed data is saved to ``dataset/cache/<name>_prep.pckl``,
    where ``<name>`` is the file name of ``sys.argv[1]`` without its final
    extension. Every label is then printed on its own line, followed by the
    total number of labels and the number of distinct labels.

    Exits with a usage message (instead of an ``IndexError`` traceback) when
    fewer than two arguments are supplied.
    """
    # Local import: the file's import header is outside this block.
    import os.path

    if len(sys.argv) < 3:
        sys.exit('usage: {} EXTRACTION_CACHE PKG_START'.format(sys.argv[0]))

    source_path, pkg_start = sys.argv[1], sys.argv[2]

    extractor = Extractor()
    extractor.load(source_path)

    preprocessor = Preprocessor(extractor.classes, type='class',
                                pkg_start=pkg_start)

    # Strip only the last extension so that dataset names containing dots
    # (e.g. "proj-1.2.pckl") keep their full name in the cache file, and let
    # os.path deal with the platform's path separators.
    dataset_name = os.path.splitext(os.path.basename(source_path))[0]
    save_path = 'dataset/cache/' + dataset_name + '_prep.pckl'
    preprocessor.save(save_path)

    for label in preprocessor.labels:
        print(label)

    print('Number of classes: {}'.format(len(preprocessor.labels)))
    print('Number of packages: {}'.format(len(set(preprocessor.labels))))
console.setLevel(logging.INFO) formatter = logging.Formatter('%(name)-2s: %(levelname)-2s %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) # Extract the classes from the dataset. Use cached version if available or not specified otherwise. logging.info('Using dataset {}'.format(args.PROJECT)) dataset_name = args.PROJECT.split(os.sep)[-1] t0 = time() a = Extractor() cache_path = '../dataset/cache/' + dataset_name + '.pckl' if os.path.isfile(cache_path) and args.reload_extraction is False: logging.info('########## LOADING PROJECT FROM CACHE ##########') a.load(cache_path) else: logging.info('########## EXTRACTING PROJECT ##########') dataset_path = args.PROJECT if not os.path.exists(dataset_path): sys.exit('Specified dataset not found in dataset folder. Aborting') a.clean_dataset(dataset_path) a.extr_folder_classes(dataset_path) a.save(cache_path) logging.info('Finished extracting {0:.4f}s'.format(time() - t0)) # Preprocess extracted dataset. Use cached version if available or not specified otherwise. t0 = time() cache_path = '../dataset/cache/' + dataset_name + '_prep.pckl' if os.path.isfile(cache_path) and (args.reload_preprocessing is False and args.reload_extraction is False):