def main():
    """
    Command-line entry point: build a ``Dictionary`` from a dataset and save it.

    Options:
      -d / --dataset        name of the dataset, ``newsgroups`` or ``ndt``
      -p / --dataset-path   dataset location (defaults to
                            ``default_dataset_path()``)
      -o / --output         file the dictionary is written to

    Exits with status 1 when ``--output`` or ``--dataset`` is missing, or when
    the dataset name is not recognised.

    :rtype : None
    """
    # NOTE(review): an identical definition of main() follows this one in the
    # file and will shadow it; consider removing one of the two.
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    # This must be a single if/elif/else chain: with two separate ``if``
    # statements the ``else`` below also fired for 'newsgroups', so that
    # dataset was always rejected as unknown.
    if dataset_name == 'newsgroups':
        archive = download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL,
                                dataset_path)
        corpus = (preprocess_ng(doc) for doc in newsgroups.iterator(archive))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()
        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...', dataset_name)
        sys.exit(1)

    # The corpus is a lazy generator; documents are preprocessed as the
    # Dictionary consumes them.
    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def main():
    """
    Command-line entry point: build a ``Dictionary`` from a dataset and save it.

    Options:
      -d / --dataset        name of the dataset, ``newsgroups`` or ``ndt``
      -p / --dataset-path   dataset location (defaults to
                            ``default_dataset_path()``)
      -o / --output         file the dictionary is written to

    Exits with status 1 when ``--output`` or ``--dataset`` is missing, or when
    the dataset name is not recognised.

    :rtype : None
    """
    # NOTE(review): an identical definition of main() precedes this one in the
    # file; this later definition shadows it. Consider removing one of the two.
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    # This must be a single if/elif/else chain: with two separate ``if``
    # statements the ``else`` below also fired for 'newsgroups', so that
    # dataset was always rejected as unknown.
    if dataset_name == 'newsgroups':
        archive = download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL,
                                dataset_path)
        corpus = (preprocess_ng(doc) for doc in newsgroups.iterator(archive))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()
        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...', dataset_name)
        sys.exit(1)

    # The corpus is a lazy generator; documents are preprocessed as the
    # Dictionary consumes them.
    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def install_hunpos():
    """
    Downloads and installs the system appropriate HunPos binaries in the
    default location.

    The archive URL is looked up in ``HUNPOS_URL_MAP`` by ``sys.platform``,
    fetched with ``download_file`` (given the project's ``models`` directory),
    extracted into that directory, and the archive file is removed afterwards.

    :rtype : None
    """
    models_dir = os.path.join(project_path(), 'models')
    # NOTE(review): a platform missing from HUNPOS_URL_MAP presumably raises
    # KeyError here (assuming it is a plain dict) -- confirm the map covers
    # every supported sys.platform value.
    hunpos_archive_fn = download_file(HUNPOS_URL_MAP[sys.platform], models_dir)

    if sys.platform == 'win32':
        with ZipFile(hunpos_archive_fn) as f:
            f.extractall(models_dir)
    else:
        # TarFile.open() detects gzip/bz2/xz compression transparently,
        # whereas the bare TarFile() constructor only reads uncompressed
        # archives and fails with ReadError on e.g. a .tgz download.
        with TarFile.open(hunpos_archive_fn) as f:
            f.extractall(models_dir)

    # The archive is only removed once extraction has succeeded.
    os.remove(hunpos_archive_fn)