예제 #1
0
def main():
    """Load an extraction, preprocess it with type='class' and print labels.

    Reads two command-line arguments:
      * ``sys.argv[1]`` -- path passed to ``Extractor.load``. The text after
        its last '/', cut at the first '.', names the output file.
      * ``sys.argv[2]`` -- forwarded to ``Preprocessor`` as ``pkg_start``.

    The preprocessor is saved to ``dataset/cache/<name>_prep.pckl``. Every
    label is then printed, followed by the total number of labels and the
    number of distinct labels.
    """
    extractor = Extractor()
    source_path = sys.argv[1]
    extractor.load(source_path)

    preprocessor = Preprocessor(extractor.classes, type='class',
                                pkg_start=sys.argv[2])

    # Output name: last '/'-separated component, truncated at its first '.'.
    stem = source_path.rsplit('/', 1)[-1].partition('.')[0]
    preprocessor.save(''.join(('dataset/cache/', stem, '_prep.pckl')))

    for entry in preprocessor.labels:
        print(entry)
    print('Number of classes:  {}'.format(len(preprocessor.labels)))
    print('Number of packages: {}'.format(len(set(preprocessor.labels))))
예제 #2
0
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-2s: %(levelname)-2s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

# Extraction stage: obtain the project's classes. The pickled cache is reused
# when it exists and re-extraction was not requested; otherwise the project is
# extracted from disk and the cache is (re)written.
logging.info('Using dataset {}'.format(args.PROJECT))
# Name the cache after the last path component. normpath strips a trailing
# separator first; without it 'proj/' would give an empty name and every such
# project would share the single cache file '.pckl'.
dataset_name = os.path.basename(os.path.normpath(args.PROJECT))

t0 = time()
a = Extractor()
cache_path = '../dataset/cache/' + dataset_name + '.pckl'

if os.path.isfile(cache_path) and args.reload_extraction is False:
    logging.info('########## LOADING PROJECT FROM CACHE ##########')
    a.load(cache_path)
else:
    logging.info('##########     EXTRACTING PROJECT     ##########')
    dataset_path = args.PROJECT
    # Abort with a message (non-zero exit status) when the project is missing.
    if not os.path.exists(dataset_path):
        sys.exit('Specified dataset not found in dataset folder. Aborting')
    a.clean_dataset(dataset_path)
    a.extr_folder_classes(dataset_path)
    # Persist the fresh extraction so later runs can take the cached branch.
    a.save(cache_path)
logging.info('Finished extracting {0:.4f}s'.format(time() - t0))

# Preprocessing stage: restart the stage timer and build the path of the
# preprocessed-dataset cache file, which the check below consults.
t0 = time()
cache_path = ''.join(('../dataset/cache/', dataset_name, '_prep.pckl'))
if os.path.isfile(cache_path) and (args.reload_preprocessing is False
                                   and args.reload_extraction is False):