def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc) for doc in
                  newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)))
    # elif, not if: otherwise the newsgroups case falls through to the
    # unknown-dataset branch below and exits.
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
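
# A minimal sketch of the preprocessing contract assumed above: gensim's
# Dictionary consumes an iterable of token lists, so preprocess_ng and
# preprocess_ndt must each turn a raw document into a list of normalized
# tokens. The lowercasing-and-regex tokenization below is a hypothetical
# placeholder, not this project's actual implementation.
import re

def preprocess_example(doc):
    # Lowercase the document and extract word tokens.
    return re.findall(r'\w+', doc.lower(), re.UNICODE)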
def main():
    parser = ArgumentParser()
    parser.add_argument('-f', '--features')
    parser.add_argument('-m', '--model-file')
    parser.add_argument('-d', '--dataset-file')
    parser.add_argument('-l', '--language', default='nob')
    args = parser.parse_args()

    features = args.features
    model_fn = args.model_file
    dataset_fn = args.dataset_file
    lang = args.language

    if features not in FEATURES_MAP:
        logging.error('Unknown feature identifier %s (one of <%s>) ...'
                      % (features, '|'.join(FEATURES_MAP.keys())))
        sys.exit(1)

    if dataset_fn and not os.path.exists(dataset_fn):
        logging.error('Could not find NDT dataset archive %s ...' % dataset_fn)
        sys.exit(1)

    if not model_fn:
        # noinspection PyUnresolvedReferences
        model_fn = 'no-ndt-hunpos-%s-%s' % (features, datetime.now().strftime("%Y-%m-%d-%H-%M"))

    if lang not in ['nob', 'nno', 'both']:
        logging.error('Unknown language %s (one of <%s>) ...'
                      % (lang, '|'.join(['nob', 'nno', 'both'])))
        sys.exit(1)

    if lang == 'both':
        lang = None

    if dataset_fn:
        dataset = NDTDataset(dataset_fn=dataset_fn, normalize_func=None, fields=FIELDS, lang=lang)
    else:
        dataset = NDTDataset(normalize_func=None, fields=FIELDS, lang=lang)

    dataset.install()

    pos_norm_func = FEATURES_MAP[features]
    seq_gen = ([(form, pos_norm_func(form, pos, feats)) for form, pos, feats in sent]
               for sent in dataset)

    stats = train_hunpos_model(seq_gen, model_fn)

    # print the stats from the hunpos output
    for k, v in stats.items():
        print '%s:\t%s' % (k, v)
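
# A hedged sketch of what a FEATURES_MAP entry might look like: each value is
# a callable (form, pos, feats) -> tag string deciding how much morphological
# detail goes into the hunpos training tag. Both keys and both functions below
# are illustrative assumptions, not the project's actual feature set.
EXAMPLE_FEATURES_MAP = {
    # Bare POS tag only.
    'pos': lambda form, pos, feats: pos,
    # POS tag with morphological features appended when present.
    'pos+feats': lambda form, pos, feats: '%s|%s' % (pos, feats) if feats else pos,
}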
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--elasticsearch-server', default='localhost:9200')
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-s', '--sections')
    opts = parser.parse_args()

    es_hosts = [opts.elasticsearch_server]
    dataset_name = opts.dataset
    dataset_sections = opts.sections

    es = Elasticsearch(hosts=es_hosts, timeout=120)

    if dataset_name == 'newsgroups':
        dataset = NewsgroupsDataset()
    elif dataset_name == 'aviskorpus':
        sections = None
        sources = None

        if dataset_sections:
            try:
                sections, sources = dataset_sections.split('-')
                sections = [int(s) for s in sections.split('|')]
                sources = sources.split('|')
            except Exception:
                logging.error('Malformed section specification "%s" ...' % dataset_sections)
                sys.exit(1)

        dataset = AviskorpusDataset(sections=sections, sources=sources)
    elif dataset_name == 'ndt':
        sections = None
        lang = None

        if dataset_sections:
            try:
                sections, lang = dataset_sections.split('-')
                sections = [int(s) for s in sections.split('|')]
                lang = lang.split('|')
            except Exception:
                logging.error('Malformed section specification "%s" ...' % dataset_sections)
                sys.exit(1)

        dataset = NDTDataset(lang=lang, sections=sections)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    dataset.install(es)
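
# A sketch of the --sections format the parsing above expects: two '|'-separated
# lists joined by a '-', e.g. '1|2-ap|vg' for Aviskorpus sections 1 and 2 from
# sources ap and vg, or '1|2-nob' for NDT. The helper and the concrete section
# and source names are illustrative, not part of the original scripts.
def parse_section_spec(spec):
    # Split '1|2-ap|vg' into ([1, 2], ['ap', 'vg']); raises ValueError on
    # malformed input, mirroring the inline try/except handling in main().
    sections, sources = spec.split('-')
    return [int(s) for s in sections.split('|')], sources.split('|')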