def main(args):
    """Run the metadata actions requested on the command line.

    The corpus named by ``[main] corpus_file`` in ``args.config_file`` is
    loaded once and then, depending on which options are set:

    * ``args.add``     -- metadata parsed from that CSV file is merged into
      the corpus, which is saved back to its original path;
    * ``args.list``    -- labels are extracted via ``extract_labels``;
    * ``args.extract`` -- metadata is extracted via ``extract_metadata``;
    * ``args.htrc``    -- ``add_htrc_metadata`` updates the configuration,
      which is then rewritten to ``args.config_file``.
    """
    from vsm.corpus import Corpus

    config = topicexplorer.config.read(args.config_file)
    args.corpus_path = config.get("main", "corpus_file")
    corpus = Corpus.load(args.corpus_path)
    context_type = config.get('main', 'context_type')

    if args.add:
        corpus = add_metadata(
            corpus,
            context_type,
            parse_metadata_from_csvfile(args.add, context_type),
            force=args.force,
            rename=args.rename)
        corpus.save(args.corpus_path)

    if args.list:
        extract_labels(corpus, context_type, args.list)

    if args.extract:
        extract_metadata(corpus, context_type, args.extract)

    # Only the HTRC step touches the configuration file.
    if not args.htrc:
        return

    config = add_htrc_metadata(config, corpus=corpus)
    with open(args.config_file, "w") as configfh:
        config.write(configfh)
def get_host_port(args):
    """
    Returns the hostname and port number

    The port is ``args.port`` when given, otherwise the ``[www] port``
    setting of the configuration file ``args.config``. If something already
    accepts connections on that port the user is prompted for another one
    (or, when ``args.quiet`` is set, an error is raised). When the chosen
    port differs from the configured one the user may store it as the new
    default in the configuration file.

    :param args: parsed arguments providing ``config``, ``host``, ``port``
        and ``quiet``.
    :returns: ``(host, port)`` tuple.
    :raises IOError: in quiet mode, when the port is already in use.
    """
    import topicexplorer.config
    config = topicexplorer.config.read(args.config)

    def port_in_use(port):
        """Return True when a connection to `port` succeeds."""
        host = args.host or config.get("www", "host")
        if host == '0.0.0.0':
            host = 'localhost'
        try:
            s = socket.create_connection((host, port), 2)
        except socket.error:
            # nothing is listening (or the host is unreachable): port is free
            return False
        s.close()
        return True

    # automatic port assignment
    def test_port(port):
        # The conflict is signalled outside of the ``except socket.error``
        # handler: on Python 3 both socket.error and IOError are aliases of
        # OSError, so raising inside that try block would be swallowed.
        while port_in_use(port):
            if args.quiet:
                raise IOError(
                    "Conflict on port {0}. Try running with -p to manually set new port.".format(port))
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))
        return port

    port = args.port or int(config.get('www', 'port').format(0))
    port = test_port(port)

    # prompt to save
    if (int(config.get("www", "port").format(0))) != port:
        if not args.quiet and bool_prompt(
                "Change default baseport to {0}?".format(port), default=True):
            config.set("www", "port", text(port))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy (into the copy, not back into `config`,
            # otherwise an empty configuration would be written below)
            new_config = ConfigParser()
            new_config.read_file(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'w') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')
    return host, port
def absolutize_config_file(config_file, output_dir):
    """Rewrite the path options of a config file as absolute paths.

    ``config_file`` is resolved relative to ``output_dir``; every path option
    it contains is joined onto ``output_dir``, made absolute and stored back,
    and the file is rewritten in place (UTF-8).

    ``corpus_file`` and ``model_pattern`` are always rewritten. The remaining
    options are rewritten only when they hold a value other than ``None`` or
    the string ``'None'``.
    """
    config_file = os.path.join(output_dir, config_file)
    config = topicexplorer.config.read(config_file)

    def anchored(value):
        # absolute form of `value` taken relative to output_dir
        return os.path.abspath(os.path.join(output_dir, value))

    # path variables that are always present
    for option in ('corpus_file', 'model_pattern'):
        config.set('main', option, anchored(config.get('main', option)))

    # path variables that may be unset
    optional_paths = (
        ('main', 'cluster'),
        ('main', 'path'),
        ('main', 'raw_corpus'),
        ('main', 'corpus_desc'),
        ('www', 'htrc_metadata'),
    )
    for section, option in optional_paths:
        value = config.get(section, option)
        if value is None or value == 'None':
            continue
        config.set(section, option, anchored(value))

    with open(config_file, 'w', encoding='utf8') as configfile:
        config.write(configfile)
def cluster(n_clusters, config_file):
    """Fit Isomap and k-means on the configured models and record the output.

    The cluster file name is the corpus file name with its last
    dot-separated component replaced by ``-cluster.csv``. That name is stored
    as ``[main] cluster`` in ``config_file``, the fitted model is written to
    it, and the name is returned.
    """
    from .cluster import dimensionReduce

    reducer = dimensionReduce(config_file)
    reducer.fit_isomap()
    reducer.fit_kmeans(int(n_clusters))

    print("writing model files for Isomap and kmeans\n")
    config = topicexplorer.config.read(config_file)

    # everything before the final '.' (empty when the name has no dot)
    stem = config.get("main", "corpus_file").rpartition('.')[0]
    filename = stem + '-cluster.csv'

    config.set("main", "cluster", filename)
    with open(config_file, "w") as configfh:
        config.write(configfh)

    # read the value back through the config, as it was just stored
    reducer.write(config.get("main", "cluster"))
    return filename
def create_relative_config_file(config_file, manifest, include_corpus=False):
    """Write a copy of ``config_file`` whose paths are relative to the manifest root.

    The common root of the (absolutized) ``manifest`` entries is stripped
    from every path option of the configuration. The result is written to a
    new temporary file, which is NOT deleted automatically
    (``delete=False``).

    :param config_file: path of the configuration file to read.
    :param manifest: iterable of file paths; their common root is removed.
    :param include_corpus: when false, ``[main] raw_corpus`` is set to
        ``None`` instead of being relativized.
    :returns: path of the temporary configuration file.
    """
    absolute_paths = [os.path.abspath(entry) for entry in manifest]
    if sys.version_info[0] == 3:
        root = os.path.commonpath(absolute_paths) + '/'
    else:
        root = os.path.commonprefix(absolute_paths)

    config = topicexplorer.config.read(config_file)

    # path variables
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')
    cluster_path = config.get('main', 'cluster')
    path = config.get('main', 'path')
    raw_corpus = config.get('main', 'raw_corpus')
    corpus_desc = config.get('main', 'corpus_desc')

    config.set('main', 'corpus_file', corpus_file.replace(root, ''))
    config.set('main', 'model_pattern', model_pattern.replace(root, ''))
    if cluster_path is not None:
        config.set('main', 'cluster', cluster_path.replace(root, ''))
    if path is not None:
        config.set('main', 'path', path.replace(root, ''))
    if raw_corpus is not None and include_corpus:
        config.set('main', 'raw_corpus', raw_corpus.replace(root, ''))
    else:
        config.set('main', 'raw_corpus', None)
    if corpus_desc is not None:
        config.set('main', 'corpus_desc', corpus_desc.replace(root, ''))

    try:
        if config.getboolean('main', 'htrc'):
            htrc_metapath = config.get('www', 'htrc_metadata')
            if htrc_metapath is not None:
                config.set('www', 'htrc_metadata',
                           htrc_metapath.replace(root, ''))
    except Exception:
        # HTRC settings are optional: a missing or malformed option is
        # ignored, but KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    # Use only the file name in the prefix: a prefix containing directory
    # components would point into a non-existent directory below the
    # temporary directory and make the file creation fail.
    tempfh = NamedTemporaryFile(
        prefix='tez.' + os.path.basename(config_file), delete=False)
    temp_config_file = tempfh.name
    tempfh.close()

    with open(temp_config_file, 'w', encoding='utf-8') as temp_out:
        config.write(temp_out)

    return temp_config_file
def main(args=None, launch=True):
    """Build the Associated Press demo and optionally launch the server.

    Downloads and extracts the demo data, then runs the ``init``, ``prep``
    and ``train`` commands with fixed arguments, patches ``ap.ini`` with the
    demo-specific settings, copies ``ap.md`` next to it and, when ``launch``
    is true, starts the server on ``ap.ini``.

    The incoming ``args`` value is ignored; the name is rebound for each
    stage.
    """
    download_and_extract()
    # value is unused; the call is retained to keep behavior unchanged
    pwd = os.getcwd()

    def parsed(command, argv):
        # build a fresh parser for `command` and parse the canned arguments
        parser = ArgumentParser()
        command.populate_parser(parser)
        return parser.parse_args(argv)

    args = parsed(
        init,
        ['ap', '--name', '"Associated Press 88-90 sample"', '--rebuild', '-q'])
    init.main(args)

    args = parsed(prep, 'ap.ini --lang en --high 2000 --low 5 -q'.split())
    prep.main(args)

    args = parsed(
        train,
        "ap.ini -k 20 40 60 --context-type article --iter 20".split())
    train.main(args)

    import topicexplorer.config
    config = topicexplorer.config.read('ap.ini')
    demo_settings = (
        ("main", "label_module", "topicexplorer.extensions.ap"),
        ("main", "corpus_desc", "ap.md"),
        ("www", "icons", "ap,fingerprint,link"),
        ("www", "fulltext", "True"),
    )
    for section, option, value in demo_settings:
        config.set(section, option, value)

    shutil.copyfile(get_static_resource_path('demo/ap.md'), 'ap.md')

    with open("ap.ini", "w") as configfh:
        config.write(configfh)

    if not launch:
        return

    args = parsed(server, ['ap.ini'])
    server.main(args)
def main(args=None, launch=True):
    """Prepare the Associated Press demo corpus and optionally serve it.

    Steps, in order: download and extract the demo data; run ``init``,
    ``prep`` and ``train`` with canned command lines; add the demo-specific
    options to ``ap.ini``; copy the corpus description ``ap.md`` into the
    working directory; and, if ``launch`` is true, run the server on
    ``ap.ini``.

    ``args`` is accepted but its incoming value is not used.
    """
    download_and_extract()
    # value is unused; the call is retained to keep behavior unchanged
    pwd = os.getcwd()

    # (command module, argument vector) for each pipeline stage
    stages = (
        (init, ['ap', '--name', '"Associated Press 88-90 sample"',
                '--rebuild', '-q']),
        (prep, 'ap.ini --lang en --high 2000 --low 5 -q'.split()),
        (train, "ap.ini -k 20 40 60 --context-type article --iter 20".split()),
    )
    for command, argv in stages:
        stage_parser = ArgumentParser()
        command.populate_parser(stage_parser)
        args = stage_parser.parse_args(argv)
        command.main(args)

    import topicexplorer.config
    config = topicexplorer.config.read('ap.ini')
    config.set("main", "label_module", "topicexplorer.extensions.ap")
    config.set("main", "corpus_desc", "ap.md")
    config.set("www", "icons", "ap,fingerprint,link")
    config.set("www", "fulltext", "True")

    description_source = get_static_resource_path('demo/ap.md')
    shutil.copyfile(description_source, 'ap.md')

    with open("ap.ini", "w") as configfh:
        config.write(configfh)

    if launch:
        launch_parser = ArgumentParser()
        server.populate_parser(launch_parser)
        server.main(launch_parser.parse_args(['ap.ini']))
def main(args):
    """Train LDA topic models for the configured corpus, or continue training.

    When ``args.cluster`` is set the function only runs ``cluster()`` and
    returns. Otherwise it reads ``args.config_file``, resolves the topic
    counts (``args.k``), the iteration count (``args.iter``) and, for new
    models, the context type (``args.context_type``) -- asking on stdin for
    values that were not supplied -- and then either continues training the
    existing models (``continue_training``) or builds new ones
    (``build_models``).

    Afterwards ``model_pattern``, ``context_type`` and ``topics`` are stored
    in the configuration. Unless ``args.dry_run`` is set, any ``cluster``
    option is dropped (and the file it names removed, best effort) and the
    configuration is written back to ``args.config_file``.
    """
    # Clustering is a separate mode; no training happens in that case.
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    # The corpus class depends on the "sentences" setting.
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    # Resolve the topic counts when -k was not given.
    if args.k is None:
        try:
            if config.get("main", "topics"):
                # NOTE(review): eval() executes whatever expression the
                # config file holds; if the value is always a list literal,
                # ast.literal_eval would be the safer choice -- confirm.
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
                # quiet mode: take the configured topics without prompting
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                # an empty value is treated like a missing option
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        # Prompt until a parseable answer is given.
        # NOTE(review): a whitespace-only answer is truthy and splits into an
        # empty list, which ends this loop with args.k == [] -- confirm
        # whether that is intended.
        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]
                if args.k:
                    print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                    print(" topicexplorer train %s -k %s\n" %\
                          (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    # A negative process count is taken relative to the CPU count.
    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    # Continue training only when models exist, --rebuild was not requested,
    # and either quiet/--continue is set or the user confirms at the prompt.
    if (model_pattern is not None and not args.rebuild and
            (args.quiet or args.cont or bool_prompt("""Existing topic models found. You can continue training or start a new model. Do you want to continue training your existing models? 
""", default=True))):
        from vsm.model.lda import LDA
        # Load the first model only to learn how many iterations it has had.
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet: # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)
            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print(" topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet: # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones
        # NOTE(review): eval() on a config value, see the note above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)
            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed,
                         dry_run=args.dry_run)
            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes,
                                              dry_run=args.dry_run)
        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet: # pragma: no cover
            args.iter = int_prompt("Number of training iterations:", default=200)
            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print(" topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet: # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            # Order by number of contexts so the type with the fewest
            # contexts comes first; it is offered (upper-cased) as default.
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " % contexts)
                    # empty answer or the upper-cased default selects ctxs[0]
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]
                print("\nTIP: context type can be specified with argument '--context-type TYPE':")
                print(" topicexplorer train --context-type %s %s\n" % (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print(" topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
              (args.config_file, args.iter, args.context_type,
               ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)

    # Record the outcome of this run in the configuration.
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        # Drop the cluster option and the file it points to.
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass

        with open(args.config_file, "w") as configfh:
            config.write(configfh)
def main(args):
    """Collect stopwords for the configured corpus, apply them and save the result.

    Stopword candidates come from, in order: the language stoplists in
    ``args.lang``, the custom file ``args.stopword_file``, words shorter than
    ``args.min_word_len``, and the high/low frequency filters (given as
    absolute thresholds or percentages, or obtained from ``get_high_filter``
    / ``get_low_filter`` when not in quiet mode).

    If nothing is to be removed the process exits with status 0. Otherwise
    the stoplist is applied in place, the corpus is saved under a name
    derived from the applied filters, ``[main] corpus_file`` is pointed at
    it, ``[main] model_pattern`` is removed, and ``args.config_file`` is
    rewritten. Exits with status 1 when the corpus length differs from its
    ``original_length``.
    """
    config = topicexplorer.config.read(args.config_file)

    # The corpus class depends on the "sentences" setting.
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # A length differing from original_length is treated as "already prepared".
    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print(" topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    # (disabled: the code below is kept as an inert string literal)
    """ new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang] if new_langs: args.lang.extend(new_langs) """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        # NOTE(review): this rebinds the local name `locale` from the module
        # to a language-code string; presumably getdefaultlocale() can return
        # (None, None), which would make .split() fail -- confirm.
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    # (keep only languages for which stop_language returns something truthy)
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]
            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".
                      format(len(candidates), 's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    # Words shorter than the minimum length.
    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".
                  format(len(candidates), 's' if len(candidates) > 1 else '',
                         args.min_word_len))
            stoplist.update(candidates)

    # cache item counts
    items, counts = get_corpus_counts(c)

    # High-frequency filter: obtained from get_high_filter when nothing was
    # specified (non-quiet), skipped in quiet mode, otherwise taken from the
    # absolute threshold or derived from the percentage.
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        args.high_filter = get_closest_bin(c, 1 - (args.high_percent / 100.), counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # Low-frequency filter: same decision structure as above; the threshold
    # is passed negated to get_candidate_words.
    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)
    elif args.low_percent:
        args.low_filter = get_closest_bin(c, 1 - (args.low_percent / 100.), reverse=True, counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # Nothing to remove: leave the corpus and configuration untouched.
    if not stoplist:
        print("No stopwords applied.\n\n")
        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(
            len(stoplist), 's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        # Build "<dirname>[-nltk-<langs>][-freq<low>][-N<high>].npz".
        # NOTE(review): the `languages` parameter is not used; the language
        # codes are read from args.lang in the enclosing scope.
        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)
        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    # Base name of the current corpus file without any '-nltk-...' suffix
    # and without the '.npz' extension.
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace(
        '.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter, args.high_filter)

    # Save next to the original corpus file.
    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    # Point the configuration at the new corpus and drop the model pattern.
    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)
def main(args):
    """Train LDA topic models for the configured corpus, or continue training.

    When ``args.cluster`` is set the function only runs ``cluster()`` and
    returns. Otherwise it reads ``args.config_file``, resolves the topic
    counts (``args.k``), the iteration count (``args.iter``) and, for new
    models, the context type (``args.context_type``) -- asking on stdin for
    values that were not supplied -- and then either continues training the
    existing models (``continue_training``) or builds new ones
    (``build_models``).

    Afterwards ``model_pattern``, ``context_type`` and ``topics`` are stored
    in the configuration. Unless ``args.dry_run`` is set, any ``cluster``
    option is dropped (and the file it names removed, best effort) and the
    configuration is written back to ``args.config_file``.
    """
    # Clustering is a separate mode; no training happens in that case.
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    # The corpus class depends on the "sentences" setting.
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    # Resolve the topic counts when -k was not given.
    if args.k is None:
        try:
            if config.get("main", "topics"):
                # NOTE(review): eval() executes whatever expression the
                # config file holds; if the value is always a list literal,
                # ast.literal_eval would be the safer choice -- confirm.
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
                # quiet mode: take the configured topics without prompting
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                # an empty value is treated like a missing option
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        # Prompt until a parseable answer is given.
        # NOTE(review): a whitespace-only answer is truthy and splits into an
        # empty list, which ends this loop with args.k == [] -- confirm
        # whether that is intended.
        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]
                if args.k:
                    print(
                        "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    )
                    print(" topicexplorer train %s -k %s\n" %\
                          (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    # A negative process count is taken relative to the CPU count.
    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    # Continue training only when models exist, --rebuild was not requested,
    # and either quiet/--continue is set or the user confirms at the prompt.
    if (model_pattern is not None and not args.rebuild and
            (args.quiet or args.cont or bool_prompt(
                """Existing topic models found. You can continue training or start a new model. Do you want to continue training your existing models? 
""", default=True))):
        from vsm.model.lda import LDA
        # Load the first model only to learn how many iterations it has had.
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet: # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)
            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print(" topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet: # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones
        # NOTE(review): eval() on a config value, see the note above.
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)
            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed,
                         dry_run=args.dry_run)
            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet: # pragma: no cover
            args.iter = int_prompt("Number of training iterations:", default=200)
            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print(" topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet: # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            # Order by number of contexts so the type with the fewest
            # contexts comes first; it is offered (upper-cased) as default.
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " % contexts)
                    # empty answer or the upper-cased default selects ctxs[0]
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]
                print(
                    "\nTIP: context type can be specified with argument '--context-type TYPE':"
                )
                print(" topicexplorer train --context-type %s %s\n" % (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print(" topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
              (args.config_file, args.iter, args.context_type, ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed,
                                     dry_run=args.dry_run)

    # Record the outcome of this run in the configuration.
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        # Drop the cluster option and the file it points to.
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass

        with open(args.config_file, "w") as configfh:
            config.write(configfh)
def main(args):
    """Collect stopwords for the configured corpus, apply them and save the result.

    Stopword candidates come from, in order: the language stoplists in
    ``args.lang``, the custom file ``args.stopword_file``, words shorter than
    ``args.min_word_len``, and the high/low frequency filters (given as
    absolute thresholds or percentages, or obtained from ``get_high_filter``
    / ``get_low_filter`` when not in quiet mode).

    If nothing is to be removed the process exits with status 0. Otherwise
    the stoplist is applied in place, the corpus is saved under a name
    derived from the applied filters, ``[main] corpus_file`` is pointed at
    it, ``[main] model_pattern`` is removed, and ``args.config_file`` is
    rewritten. Exits with status 1 when the corpus length differs from its
    ``original_length``.
    """
    config = topicexplorer.config.read(args.config_file)

    # The corpus class depends on the "sentences" setting.
    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # A length differing from original_length is treated as "already prepared".
    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print(" topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    # (disabled: the code below is kept as an inert string literal)
    """ new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang] if new_langs: args.lang.extend(new_langs) """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        # NOTE(review): this rebinds the local name `locale` from the module
        # to a language-code string; presumably getdefaultlocale() can return
        # (None, None), which would make .split() fail -- confirm.
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    # (keep only languages for which stop_language returns something truthy)
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]
            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".format(
                    len(candidates), 's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    # Words shorter than the minimum length.
    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".format(
                len(candidates), 's' if len(candidates) > 1 else '', args.min_word_len))
            stoplist.update(candidates)

    # cache item counts
    items, counts = get_corpus_counts(c)

    # High-frequency filter: obtained from get_high_filter when nothing was
    # specified (non-quiet), skipped in quiet mode, otherwise taken from the
    # absolute threshold or derived from the percentage.
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        args.high_filter = get_closest_bin(c, 1 - (args.high_percent / 100.), counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # Low-frequency filter: same decision structure as above; the threshold
    # is passed negated to get_candidate_words.
    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                              's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)
    elif args.low_percent:
        args.low_filter = get_closest_bin(c, 1 - (args.low_percent / 100.), reverse=True, counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                              's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    # Nothing to remove: leave the corpus and configuration untouched.
    if not stoplist:
        print("No stopwords applied.\n\n")
        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(len(stoplist),
                                                  's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        # Build "<dirname>[-nltk-<langs>][-freq<low>][-N<high>].npz".
        # NOTE(review): the `languages` parameter is not used; the language
        # codes are read from args.lang in the enclosing scope.
        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)
        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    # Base name of the current corpus file without any '-nltk-...' suffix
    # and without the '.npz' extension.
    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter, args.high_filter)

    # Save next to the original corpus file.
    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    # Point the configuration at the new corpus and drop the model pattern.
    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)