def main():
    from argparse import ArgumentParser
    from topicexplorer.lib.util import is_valid_configfile

    # Construct argument parser
    parser = ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=8000)
    # parser.add_argument('config', help="Configuration File",
    #                     type=is_valid_configfile)
    parser.add_argument('corpus')
    args = parser.parse_args()

    """
    # load in the configuration file
    config = ConfigParser({
        'raw_corpus': None,
        'fulltext': 'false'})
    config.read(args.config)

    # path variables
    corpus_file = config.get('main', 'corpus_file')

    # Load text model objects
    corpus = Corpus.load(corpus_file)
    """

    global corpus
    corpus = Corpus.load(args.corpus)

    from argparse import Namespace
    # bibtex.init(None, None, Namespace(bibtex='library.bib'))

    # Launch server
    port = args.port
    host = '0.0.0.0'
    root.run(server='paste', host=host, port=port)
def main(args): from vsm.corpus import Corpus config = ConfigParser({"htrc": False, "sentences": "False"}) config.read(args.config_file) args.corpus_path = config.get("main", "corpus_file") c = Corpus.load(args.corpus_path) context_type = config.get('main', 'context_type') if args.add: metadata = parse_metadata_from_csvfile(args.add, context_type) c = add_metadata(c, context_type, metadata, force=args.force, rename=args.rename) c.save(args.corpus_path) if args.list: extract_labels(c, context_type, args.list) if args.extract: extract_metadata(c, context_type, args.extract) if args.htrc: config = add_htrc_metadata(config, corpus=c) with open(args.config_file, "w") as configfh: config.write(configfh)
def walk_corpus(walk_dir, chunk_name='document', encoding='utf8',
                ignore=IGNORE, nltk_stop=True, stop_freq=1, add_stop=None,
                decode=False, verbose=1, simple=False,
                tokenizer=word_tokenize):

    filenames = []
    for root, dirs, files in os.walk(walk_dir):
        for file in files:
            filenames.append(os.path.join(root, file))

    # filter the blacklist (typically .json, .log, etc.)
    filenames = filter_by_suffix(filenames, ignore)
    files = []
    for filename in filenames:
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    files.append(f.read())

    words, tok = dir_tokenize(files, filenames, chunk_name=chunk_name,
                              paragraphs=False, verbose=verbose,
                              simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)

    return c
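
# --- Illustrative usage (added; not part of the original source). A minimal
# sketch assuming a hypothetical directory 'corpus_txt/' of plain-text files.
# Unlike dir_corpus, walk_corpus recurses into subdirectories.
def _demo_walk_corpus():
    c = walk_corpus('corpus_txt/', chunk_name='document',
                    nltk_stop=True, stop_freq=1)
    # each file under corpus_txt/ becomes one 'document' context
    print(len(c.view_contexts('document')))
    return c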
def empty_corpus(context_type='document'):
    """
    Creates an empty Corpus with defined context_type.

    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string

    :returns: An empty Corpus with no words or context_data.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    return Corpus([],
                  context_data=[np.array([], dtype=[('idx', np.int)])],
                  context_types=[context_type])
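
# --- Illustrative usage (added). empty_corpus is mainly useful as a
# placeholder value; the resulting Corpus is expected to have no words
# and only the requested context type.
def _demo_empty_corpus():
    c = empty_corpus(context_type='sentence')
    assert len(c.words) == 0            # no vocabulary
    assert c.context_types == ['sentence']
    return c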
def corpus_fromlist(ls, context_type='context', remove_empty=True):
    """
    Takes a list of lists or arrays containing strings or integers
    and returns a Corpus object. The label associated to a given
    context is `context_type` prepended to the context index.

    :param ls: List of lists or List of arrays containing strings
        or integers.
    :type ls: list
    :param context_type: A type of tokenization.
    :type context_type: string, optional

    :returns: A Corpus object built from `ls`.

    :See Also: :class:`vsm.corpus.Corpus`

    **Examples**

    >>> ls = [['a', 'b'], ['c'], ['d', 'e']]
    >>> c = corpus_fromlist(ls, context_type='sentence')
    >>> c.view_contexts('sentence', as_strings=True)
    [array(['a', 'b'], dtype='|S1'), array(['c'], dtype='|S1'),
     array(['d', 'e'], dtype='|S1')]
    >>> c.context_data
    [array([(2, 'sentence_0'), (3, 'sentence_1'), (5, 'sentence_2')],
           dtype=[('idx', '<i8'), ('sentence_label', '|S10')])]
    """
    corpus = chain.from_iterable(ls)  # [w for ctx in ls for w in ctx]
    indices = np.cumsum([len(sbls) for sbls in ls])
    metadata = ['{0}_{1}'.format(context_type, i)
                for i in range(len(indices))]
    md_type = np.array(metadata).dtype
    md_type = np.object_
    dtype = [('idx', np.int), (context_type + '_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    return Corpus(corpus, context_data=context_data,
                  context_types=[context_type],
                  words_corpus=chain.from_iterable(copy(ctx) for ctx in ls),
                  remove_empty=remove_empty)
def corpus_from_strings(strings, metadata=[], decode=False,
                        nltk_stop=True, stop_freq=0, add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object whose document
    tokens are the strings.

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens
    """
    if decode:
        for i in range(len(strings)):
            if isinstance(strings[i], str):
                strings[i] = unidecode(strings[i])

    documents = [tokenizer(s) for s in strings]
    corpus = sum(documents, [])
    indices = np.cumsum([len(d) for d in documents])
    del documents

    if len(metadata) == 0:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]
    md_type = np.array(metadata).dtype
    md_type = np.object_
    dtype = [('idx', np.int), ('document_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)
    return c
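
# --- Illustrative usage (added). A minimal sketch: each string becomes one
# 'document' context, and labels default to 'document_0', 'document_1', ...
def _demo_corpus_from_strings():
    docs = ["the cat sat on the mat",
            "the dog chased the cat"]
    c = corpus_from_strings(docs, nltk_stop=False)
    print(c.view_contexts('document', as_strings=True))
    return c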
def main(args):
    global context_type, lda_c, lda_m, lda_v, label, id_fn

    # load in the configuration file
    config = ConfigParser({
        'certfile': None,
        'keyfile': None,
        'ca_certs': None,
        'ssl': False,
        'port': '8000',
        'host': '0.0.0.0',
        'topic_range': '{0},{1},1'.format(args.k, args.k + 1),
        'icons': 'link',
        'corpus_link': None,
        'doc_title_format': None,
        'doc_url_format': None,
        'topics': None})
    config.read(args.config)

    # path variables
    path = config.get('main', 'path')
    context_type = config.get('main', 'context_type')
    corpus_file = config.get('main', 'corpus_file')
    model_pattern = config.get('main', 'model_pattern')

    # automatic port assignment
    def test_port(port):
        try:
            host = args.host or config.get("www", "host")
            if host == '0.0.0.0':
                host = 'localhost'
            try:
                s = socket.create_connection((host, port), 2)
                s.close()
                raise IOError("Socket connectable on port {0}".format(port))
            except socket.error:
                pass
            return port
        except IOError:
            port = int_prompt(
                "Conflict on port {0}. Enter new port:".format(port))
            return test_port(port)

    port = args.port or int(config.get('www', 'port').format(0)) + args.k
    port = test_port(port)

    # prompt to save
    if (int(config.get("www", "port").format(0)) + args.k) != port:
        if bool_prompt("Change default baseport to {0}?".format(port - args.k),
                       default=True):
            config.set("www", "port", str(port - args.k))

            # create deep copy of configuration
            # see http://stackoverflow.com/a/24343297
            config_string = StringIO()
            config.write(config_string)

            # skip DEFAULT section
            config_string.seek(0)
            idx = config_string.getvalue().index("[main]")
            config_string.seek(idx)

            # read deep copy
            new_config = ConfigParser()
            new_config.readfp(config_string)

            # write deep copy without DEFAULT section
            # this preserves DEFAULT for rest of program
            with open(args.config, 'wb') as configfh:
                new_config.write(configfh)

    # hostname assignment
    host = args.host or config.get('www', 'host')

    # LDA objects
    lda_c = Corpus.load(corpus_file)
    lda_m = None
    lda_v = None

    def load_model(k):
        global lda_m, lda_v
        lda_m = LDA.load(model_pattern.format(k))
        lda_v = LDAViewer(lda_c, lda_m)

    load_model(args.k)

    # label function imports
    try:
        label_module = config.get('main', 'label_module')
        label_module = import_module(label_module)
        print "imported label module"
        label_module.init(config.get('main', 'path'), lda_v, context_type)
    except (ImportError, NoOptionError, AttributeError):
        pass

    try:
        label = label_module.label
        print "imported label function"
    except (AttributeError, UnboundLocalError):
        label = lambda x: x
        print "using default label function"

    try:
        id_fn = label_module.id_fn
        print "imported id function"
    except (AttributeError, UnboundLocalError):
        id_fn = def_label_fn
        print "using default id function"

    config_icons = config.get('www', 'icons').split(",")

    @route('/icons.js')
    def icons():
        with open(resource_filename(__name__, '../www/icons.js')) as icons:
            text = '{0}\n var icons = {1};'\
                .format(icons.read(), json.dumps(config_icons))
        return text

    # index page parameterization
    corpus_name = config.get('www', 'corpus_name')
    corpus_link = config.get('www', 'corpus_link')
    doc_title_format = config.get('www', 'doc_title_format')
    doc_url_format = config.get('www', 'doc_url_format')

    if config.get('main', 'topic_range'):
        topic_range = map(int, config.get('main', 'topic_range').split(','))
        topic_range = range(*topic_range)
    if config.get('main', 'topics'):
        topic_range = eval(config.get('main', 'topics'))
    topic_range = [{'k': k,
                    'port': int(config.get('www', 'port').format(0)) + k}
                   for k in topic_range]

    renderer = pystache.Renderer(escape=lambda u: u)

    @route('/')
    def index():
        response.set_header('Expires', _cache_date())

        with open(resource_filename(__name__, '../www/index.mustache.html'),
                  encoding='utf-8') as tmpl_file:
            template = tmpl_file.read()

        return renderer.render(template, {
            'corpus_name': corpus_name,
            'corpus_link': corpus_link,
            'context_type': context_type,
            'topic_range': topic_range,
            'doc_title_format': doc_title_format,
            'doc_url_format': doc_url_format})

    @route('/<filename:path>')
    @_set_acao_headers
    def send_static(filename):
        return static_file(filename,
                           root=resource_filename(__name__, '../www/'))

    if args.ssl or config.get('main', 'ssl'):
        certfile = args.certfile or config.get('ssl', 'certfile')
        keyfile = args.keyfile or config.get('ssl', 'keyfile')
        ca_certs = args.ca_certs or config.get('ssl', 'ca_certs')
        run(host=host, port=port, server=SSLWSGIRefServer,
            certfile=certfile, keyfile=keyfile, ca_certs=ca_certs)
    else:
        run(host=host, port=port)
def coll_corpus(coll_dir, encoding='utf8', ignore=IGNORE,
                nltk_stop=True, stop_freq=1, add_stop=None, decode=False,
                verbose=1, simple=False, tokenizer=word_tokenize):
    """
    `coll_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files. It will also strip
    punctuation and arabic numerals outside the range 1-29. All letters
    are made lowercase.

    :param coll_dir: Directory containing a collection of books,
        where each book contains pages as plain-text files.
    :type coll_dir: string-like
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.json', '.log', '.pickle', '.DS_Store'].
    :type ignore: list of strings, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional
    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the plain-text files
        in `coll_dir`. Document tokens are named `documents`.
    """
    books = []
    book_names = os.listdir(coll_dir)
    book_names = filter_by_suffix(book_names, ignore)
    book_names.sort()

    for book_name in book_names:
        pages = []
        book_path = os.path.join(coll_dir, book_name)
        page_names = os.listdir(book_path)
        page_names = filter_by_suffix(page_names, ignore)
        page_names.sort()

        for page_name in page_names:
            page_file = book_name + '/' + page_name
            page_name = os.path.join(book_path, page_name)
            if encoding == 'detect':
                encoding = detect_encoding(page_name)
            try:
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))
            except UnicodeDecodeError:
                encoding = detect_encoding(page_name)
                if decode:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((unidecode(f.read()), page_file))
                else:
                    with open(page_name, mode='r', encoding=encoding) as f:
                        pages.append((f.read(), page_file))

        books.append(pages)

    words, tok = coll_tokenize(books, book_names, simple=simple,
                               tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    in_place_stoplist(c, nltk_stop=nltk_stop, freq=stop_freq,
                      add_stop=add_stop)

    return c
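
# --- Illustrative usage (added). A sketch assuming a hypothetical layout
# 'my_collection/<book>/<page>.txt'; the exact context names produced by
# coll_tokenize (e.g. page- and book-level tokenizations) can be inspected
# via c.context_types.
def _demo_coll_corpus():
    c = coll_corpus('my_collection/', encoding='utf8',
                    nltk_stop=True, stop_freq=1)
    print(c.context_types)
    return c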
def json_corpus(json_file, doc_key, label_key, encoding='utf8',
                nltk_stop=False, stop_freq=0, add_stop=None,
                tokenizer=word_tokenize):
    """
    `json_corpus` is a convenience function for generating Corpus
    objects from a json file. It constructs a corpus, document labels
    and metadata respectively from the specified fields in the json file.

    `json_corpus` will perform word-level tokenization. It will also
    strip punctuation and arabic numerals outside the range 1-29.
    All letters are made lowercase.

    :param json_file: Json file name containing documents and metadata.
    :type json_file: string-like
    :param doc_key: Name of the key for documents.
    :type doc_key: string-like
    :param label_key: Name of the key used for document labels. Labels are
        used when a viewer function outputs a list of documents. Any field
        other than `doc_key` and `label_key` is stored as metadata.
    :type label_key: string-like
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 0.
    :type stop_freq: int, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`vsm.corpus.util.paragraph_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    import json

    if encoding == 'detect':
        encoding = detect_encoding(json_file)
    with open(json_file, 'r', encoding=encoding) as f:
        json_data = json.load(f)

    docs = []
    label = []
    metadata = []
    for i in json_data:
        docs.append(i.pop(doc_key, None).encode('ascii', 'ignore'))
        label.append(i.pop(label_key, None))
        metadata.append(i)    # metadata are all the rest

    docs = [tokenizer(d) for d in docs]
    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    # add document label and metadata
    dtype = [('idx', np.array(tok).dtype),
             ('document_label', np.object_),
             ('metadata', np.array(metadata).dtype)]
    # todo: create separate dtype for each key?
    tok = np.array(list(zip(tok, label, metadata)), dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)

    return c
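
# --- Illustrative usage (added). A sketch assuming a hypothetical file
# 'articles.json' shaped like the list below: 'text' supplies the document
# body, 'title' the label, and any remaining keys become metadata.
#
#   [{"title": "Doc A", "text": "Some text ...", "year": 1999},
#    {"title": "Doc B", "text": "More text ...", "year": 2001}]
def _demo_json_corpus():
    c = json_corpus('articles.json', doc_key='text', label_key='title')
    print(c.view_metadata('document')['document_label'])
    return c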
from collections import defaultdict
import numpy as np
from hyperbrain.parse import *
from vsm.corpus import Corpus
import sys

c = Corpus.load(sys.argv[-1])

# get all terms in corpus
abi_vocab = [word for word in c.words if word.startswith('abi:')]

# get all counts
abi_counts = defaultdict(int)
for word in abi_vocab:
    id = int(word.replace('abi:', ''))
    count = (c.corpus == c.words_int[word]).sum()
    abi_counts[id] = count


# calculate how many children there are of each node
def get_child_counts(key):
    if children[key]:
        return abi_counts[key] + sum([get_child_counts(child_key)
                                      for child_key in children[key]
                                      if child_key != key])
    else:
        return abi_counts[key]
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8',
               paragraphs=True, word_len=2, nltk_stop=True, stop_freq=1,
               add_stop=None, corpus_sent=True,
               ignore=['.log', '.pickle', '.xml'],
               decode=False, simple=False):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: Path to a directory containing a plain-text corpus.
    :type plain_dir: string-like
    :param chunk_name: The name of the tokenization corresponding to
        individual files. For example, if the files are pages of a book,
        one might set `chunk_name` to `pages`. Default is `article`.
    :type chunk_name: string-like, optional
    :param paragraphs: If `True`, a paragraph-level tokenization is
        included. Defaults to `True`.
    :type paragraphs: boolean, optional
    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise a Corpus object is returned. Default is `True`.
    :type corpus_sent: boolean, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.log', '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`Corpus`, :class:`CorpusSent`,
        :meth:`dir_tokenize`, :meth:`apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                                    paragraphs=paragraphs)
    names, data = zip(*tok.items())

    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
                       remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)

    in_place_stoplist(c, nltk_stop=nltk_stop, add_stop=add_stop,
                      freq=stop_freq)

    return c
def _load_corpus(self, corpus_file):
    self.c = Corpus.load(corpus_file, load_corpus=False)
    self.labels = self.c.view_metadata(self.context_type)[self.label_name]
def nested_arr_to_np(arr, arrarr=False):
    outli = []
    for r in arr:
        inli = []
        for c in r:
            inli.append(c)
        if arrarr:
            inli = np.array(inli)
        outli.append(inli)
    return np.array(outli)


if __name__ == '__main__':
    from vsm.corpus import Corpus

    path = '../org/knowceans/gibbstest/'
    c = Corpus.load(path + 'church_corp.npz')
    writepath = '/home/doori/inpho/org/knowceans/gibbstest/'
    ctx = 'document'
    # java can't process '..' in the path.
    gw, m = lda_run(c, path + 'churchcorp.txt', ctx, 10000, 2,
                    writepath + 'church-meta.txt', 0.01, 0.01)
    save(m, ctx, writepath + 'church_lda.npz', writepath + 'church-meta.txt')
prop = []
for i in range(0, len(sttest)):
    if sttest[i][1] == 'start':
        prop = []
        inprop = True
    if endtest[i][1] == 'end':
        if prop != []:
            prop.append(sttest[i][0])
            props.append(prop)
        prop = []
        inprop = False
    if inprop:
        prop.append(sttest[i][0])

# Get topics for each prop
c = Corpus.load(exepath + sys.argv[4])
m = LDA.load(exepath + sys.argv[5])
v = LDAViewer(c, m)

stopwords = stopwords.words('english')
allowed_chars = string.ascii_letters
trans_table = string.maketrans('', '')

print "Applying topic model"

# Remove props with only words in stoplist
vsmprops = []
np = []
for p in props:
    np = [w.lower() for w in p
          if w.lower() not in stopwords
          and not w.translate(trans_table, allowed_chars)]
    if len(np) > 0:
def toy_corpus(plain_corpus, is_filename=False, encoding='utf8',
               nltk_stop=False, stop_freq=0, add_stop=None, decode=False,
               metadata=None, autolabel=False, tokenizer=word_tokenize,
               simple=False):
    """
    `toy_corpus` is a convenience function for generating Corpus
    objects from a given string or a single file.

    `toy_corpus` will perform both word and document-level
    tokenization. It will also strip punctuation and arabic numerals
    outside the range 1-29. All letters are made lowercase.

    Document tokens are delimited by two or more line breaks. E.g.,

        <document 0>

        <document 1>

        ...

        <document n>

    where <document i> is any chunk of text to be tokenized by word.

    :param plain_corpus: String containing a plain-text corpus or a
        filename of a file containing one.
    :type plain_corpus: string-like
    :param is_filename: If `True` then `plain_corpus` is treated like
        a filename. Otherwise, `plain_corpus` is presumed to contain
        the corpus. Default is `False`.
    :type is_filename: boolean, optional
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `False`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 0.
    :type stop_freq: int, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional
    :param metadata: A list of strings providing metadata about the
        documents. If provided, must have length equal to the number
        of documents. Default is `None`.
    :type metadata: array-like, optional
    :param autolabel: A boolean specifying whether to automatically
        label documents by position in file. Default is `False`.
    :type autolabel: boolean, optional
    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`vsm.corpus.util.paragraph_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if is_filename:
        if encoding == 'detect':
            encoding = detect_encoding(plain_corpus)
        with open(plain_corpus, 'rb', encoding=encoding) as f:
            plain_corpus = f.read()

    if decode:
        plain_corpus = unidecode(plain_corpus)

    docs = paragraph_tokenize(plain_corpus)
    docs = [tokenizer(d) for d in docs]

    corpus = sum(docs, [])
    tok = np.cumsum(np.array([len(d) for d in docs]))

    if not metadata and autolabel:
        metadata = ['Document {0}'.format(i) for i in range(len(tok))]

    if metadata:
        if not len(metadata) == len(tok):
            msg = 'Metadata mismatch: metadata length is {0} and number '\
                  'of documents is {1}'.format(len(metadata), len(tok))
            raise Exception(msg)
        else:
            md_type = np.object_
            dtype = [('idx', np.array(tok).dtype), ('document_label', md_type)]
            tok = np.array(list(zip(tok, metadata)), dtype=dtype)
    else:
        dtype = [('idx', np.array(tok).dtype)]
        tok = np.array([(i, ) for i in tok], dtype=dtype)

    c = Corpus(corpus, context_data=[tok], context_types=['document'])
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)

    return c
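
# --- Illustrative usage (added). A minimal sketch: documents are separated
# by blank lines, and autolabel=True generates 'Document 0', 'Document 1', ...
def _demo_toy_corpus():
    text = "the cat sat on the mat\n\nthe dog chased the cat"
    c = toy_corpus(text, autolabel=True)
    print(c.view_metadata('document')['document_label'])
    return c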
def main(args):
    from vsm.corpus import Corpus
    from vsm.model.lda import LDA

    config = ConfigParser()
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
            else:
                # NoOptionError requires (option, section) arguments
                raise NoOptionError('topics', 'main')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "    vsm train %s -k %s\n" %\
                        (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"

    if args.processes < 0:
        args.processes = multiprocessing.cpu_count() + args.processes

    print "Loading corpus... "
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "    vsm train --iter %d %s\n" % (args.iter, args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones
        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes)
        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes)
    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "    vsm train --iter %d %s\n" % (args.iter, args.config_file)

        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : "
                                              % contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "    vsm train --context-type %s %s\n" %\
                (args.context_type, args.config_file)

        print "\nTIP: This configuration can be automated as:"
        print "    vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
             ' '.join(map(str, args.k)))

        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)

    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)
def file_corpus(filename, encoding='utf8', nltk_stop=True, stop_freq=1,
                add_stop=None, decode=False, simple=False,
                tokenizer=word_tokenize):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain-text corpus contained in a single file.

    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, :meth:`file_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    if encoding == 'detect':
        encoding = detect_encoding(filename)

    try:
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()
    except UnicodeDecodeError:
        # re-read with the detected encoding so `text` is always defined
        encoding = detect_encoding(filename)
        with open(filename, mode='r', encoding=encoding) as f:
            text = f.read()

    if decode:
        text = unidecode(text)

    words, tok = file_tokenize(text, simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)

    return c
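
# --- Illustrative usage (added). A sketch assuming a hypothetical plain-text
# file 'corpus.txt'; encoding='detect' lets chardet guess the file encoding.
def _demo_file_corpus():
    c = file_corpus('corpus.txt', encoding='detect',
                    nltk_stop=True, stop_freq=1)
    print(c.context_types)
    return c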
def dir_corpus(plain_dir, chunk_name='article', encoding='utf8',
               paragraphs=True, ignore=IGNORE, nltk_stop=True, stop_freq=1,
               add_stop=None, decode=False, verbose=1, simple=False,
               tokenizer=word_tokenize):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: Path to a directory containing a plain-text corpus.
    :type plain_dir: string-like
    :param chunk_name: The name of the tokenization corresponding to
        individual files. For example, if the files are pages of a book,
        one might set `chunk_name` to `pages`. Default is `article`.
    :type chunk_name: string-like, optional
    :param encoding: A string indicating the file encoding or 'detect',
        in which case `chardet` is used to automatically guess the
        encoding. Default is `utf8`.
    :type encoding: string, optional
    :param paragraphs: If `True`, a paragraph-level tokenization is
        included. Defaults to `True`.
    :type paragraphs: boolean, optional
    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.json', '.log', '.pickle', '.DS_Store'].
    :type ignore: list of strings, optional
    :param nltk_stop: If `True` then the corpus object is masked using
        the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional
    :param stop_freq: The upper bound for a word to be masked on the basis
        of its collection frequency. Default is 1.
    :type stop_freq: int, optional
    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional
    :param decode: If `True` then unicode characters are converted to
        ASCII. Default is `False`.
    :type decode: boolean, optional
    :param verbose: Verbosity level. 1 prints a progress bar.
    :type verbose: int, default 1

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        corpus. Document tokens are named `documents`.

    :See Also: :class:`vsm.corpus.Corpus`, :meth:`dir_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        if encoding == 'detect':
            encoding = detect_encoding(filename)
        try:
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())
        except UnicodeDecodeError:
            encoding = detect_encoding(filename)
            if decode:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(unidecode(f.read()))
            else:
                with open(filename, mode='r', encoding=encoding) as f:
                    chunks.append(f.read())

    words, tok = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                              paragraphs=paragraphs, verbose=verbose,
                              simple=simple, tokenizer=tokenizer)
    names, data = list(zip(*list(tok.items())))

    c = Corpus(words, context_data=data, context_types=names)
    if nltk_stop or stop_freq or add_stop:
        c = apply_stoplist(c, nltk_stop=nltk_stop,
                           freq=stop_freq, add_stop=add_stop)

    return c
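
# --- Illustrative usage (added). A sketch assuming a hypothetical directory
# 'articles_txt/' of plain-text files; with paragraphs=True both a file-level
# ('article') and a paragraph-level tokenization are produced.
def _demo_dir_corpus():
    c = dir_corpus('articles_txt/', chunk_name='article',
                   paragraphs=True, nltk_stop=True, stop_freq=1)
    print(c.context_types)
    return c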
def test_LdaCgsQuerySampler_init(self):
    old_corp = Corpus([], remove_empty=False)
    old_corp.corpus = np.array([0, 1, 1, 0, 0, 1], dtype='i')
    old_corp.context_data = [np.array([(3, ), (3, )], dtype=[('idx', 'i')])]
    old_corp.context_types = ['document']
    old_corp.words = np.array(['0', '1'], dtype='i')
    old_corp.words_int = {'0': 0, '1': 1}

    new_corp = Corpus([], remove_empty=False)
    new_corp.corpus = np.array([0, 0], dtype='i')
    new_corp.context_data = [np.array([(2, )], dtype=[('idx', 'i')])]
    new_corp.context_types = ['document']
    new_corp.words = np.array(['0', '1'], dtype='i')
    new_corp.words_int = {'0': 0, '1': 1}

    m = LdaCgsSeq(corpus=old_corp, context_type='document', K=2, V=2)
    m.Z[:] = np.array([0, 0, 0, 1, 1, 1], dtype='i')
    m.word_top[:] = np.array([[1.01, 2.01], [2.01, 1.01]], dtype='d')
    m.top_doc[:] = np.array([[3.01, 0.01], [0.01, 3.01]], dtype='d')
    m.inv_top_sums[:] = 1. / m.word_top.sum(0)

    q = LdaCgsQuerySampler(m, new_corpus=new_corp, old_corpus=old_corp)
    self.assertTrue(q.V == 2)
    self.assertTrue(q.K == 2)
    self.assertTrue(len(q.corpus) == 2)
    self.assertTrue((q.corpus == new_corp.corpus).all())
    self.assertTrue(len(q.indices) == 1)
    self.assertTrue((q.indices ==
                     new_corp.view_metadata('document')['idx']).all())
    self.assertTrue(q.word_top.shape == (2, 2))
    self.assertTrue((q.word_top == m.word_top).all())
    self.assertTrue(q.top_doc.shape == (2, 1))
    self.assertTrue((q.top_doc == [[0.01], [0.01]]).all())
    self.assertTrue(q.inv_top_sums.shape == (2, ))
    self.assertTrue((q.inv_top_sums == m.inv_top_sums).all())
    self.assertTrue(q.alpha.shape == (2, 1))
    self.assertTrue((q.alpha == m.alpha).all())
    self.assertTrue(q.beta.shape == (2, 1))
    self.assertTrue((q.beta == m.beta).all())
def random_corpus(corpus_len, n_words, min_token_len, max_token_len,
                  context_type='document', metadata=False, seed=None):
    """
    Generates a random integer corpus.

    :param corpus_len: Size of the Corpus.
    :type corpus_len: int
    :param n_words: Number of words to draw random integers from.
    :type n_words: int
    :param min_token_len: minimum token length used to create indices
        for corpus.
    :type min_token_len: int
    :param max_token_len: maximum token length used to create indices
        for corpus.
    :type max_token_len: int
    :param context_type: A type of tokenization. Default is 'document'.
    :type context_type: string, optional
    :param metadata: If `True` generates metadata. If `False` the only
        metadata for the corpus is the index information.
    :type metadata: boolean, optional
    :param seed: Seed for the random number generator, for reproducible
        corpora. Default is `None`.
    :type seed: int, optional

    :returns: Corpus object with random integers as its entries.

    :See Also: :class:`vsm.corpus.Corpus`
    """
    random_state = np.random.RandomState(seed)
    corpus = random_state.randint(n_words, size=corpus_len)
    corpus = [str(word) for word in corpus]

    # draw context boundaries from the same seeded generator so that
    # `seed` makes the whole corpus reproducible
    indices = []
    i = random_state.randint(min_token_len, max_token_len)
    while i < corpus_len:
        indices.append(i)
        i += random_state.randint(min_token_len, max_token_len)
    indices.append(corpus_len)

    if metadata:
        metadata_ = ['{0}_{1}'.format(context_type, i)
                     for i in range(len(indices))]
        dtype = [('idx', np.array(indices).dtype),
                 (context_type + '_label', np.object_)]
        rand_tok = np.array(list(zip(indices, metadata_)), dtype=dtype)
    else:
        rand_tok = np.array([(i, ) for i in indices],
                            dtype=[('idx', np.array(indices).dtype)])

    return Corpus(corpus, context_types=[context_type],
                  context_data=[rand_tok])
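
# --- Illustrative usage (added). A minimal sketch: a 1000-token corpus drawn
# from 50 distinct "words", with contexts of 10-20 tokens; fixing seed makes
# the draw reproducible.
def _demo_random_corpus():
    c = random_corpus(1000, 50, 10, 20, metadata=True, seed=42)
    print(len(c.view_contexts('document')))
    return c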
def main(args): from vsm.corpus import Corpus config = ConfigParser({"htrc": False}) config.read(args.config_file) if args.lang is None: args.lang = [] args.corpus_path = config.get("main", "corpus_file") c = Corpus.load(args.corpus_path) # check for htrc metadata if args.htrc or config.get("main","htrc"): htrc_langs = get_htrc_langs(args) if htrc_langs: args.lang.extend(new_langs) # auto-guess a language """ new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang] if new_langs: args.lang.extend(new_langs) """ # check for any new candidates args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])] if args.lang and not args.quiet: args.lang = lang_prompt(args.lang) stoplist = set() # Apply stop words print " " for lang in args.lang: print "Applying", langs[lang], "stopwords" candidates = stop_language(c, langs[lang]) if len(candidates): stoplist.update(candidates) # Apply custom stopwords file if args.stopword_file: with open(args.stopword_file, encoding='utf8') as swf: candidates = [unidecode(word.strip()) for word in swf] if len(candidates): print "Applying custom stopword file to remove {} word{}.".format(len(candidates), 's' if len(candidates) > 1 else '') stoplist.update(candidates) if args.min_word_len: candidates = get_small_words(c, args.min_word_len) if len(candidates): print "Filtering {} small word{} with less than {} characters.".format(len(candidates), 's' if len(candidates) > 1 else '', args.min_word_len) stoplist.update(candidates) if not args.special_chars: candidates = get_special_chars(c) if len(candidates): print "Filtering {} word{} with special characters.".format(len(candidates), 's' if len(candidates) > 1 else '') stoplist.update(candidates) if not args.high_filter: high_filter, candidates = get_high_filter(args, c, words=stoplist) if len(candidates): print "Filtering {} high frequency word{}.".format(len(candidates), 's' if len(candidates) > 1 else '') stoplist.update(candidates) else: high_filter = args.high_filter candidates = get_candidate_words(c,args.high_filter, sort=False) if len(candidates): print "Filtering {} high frequency word{}.".format(len(candidates), 's' if len(candidates) > 1 else '') stoplist.update(candidates) if not args.low_filter: low_filter, candidates = get_low_filter(args, c, words=stoplist) if len(candidates): print "Filtering {} low frequency word{}.".format(len(candidates), 's' if len(candidates) > 1 else '') stoplist.update(candidates) else: low_filter = args.low_filter candidates = get_candidate_words(c, -1*args.low_filter, sort=False) if len(candidates): print "Filtering {} low frequency words.".format(len(candidates)) stoplist.update(candidates) if stoplist: print "\n\nApplying {} stopword{}".format(len(stoplist), 's' if len(stoplist) > 1 else '') c.in_place_stoplist(stoplist) print "\n" def name_corpus(dirname, languages, lowfreq=None, highfreq=None): items, counts = get_items_counts(c.corpus) corpus_name = [dirname] if args.lang: corpus_name.append('nltk') corpus_name.append(''.join(args.lang)) if lowfreq > 0: corpus_name.append('freq%s'%lowfreq) else: corpus_name.append('freq%s'%min(counts)) if highfreq > 0: corpus_name.append('N%s'%highfreq) else: corpus_name.append('freq%s'%max(counts)) corpus_name = '-'.join(corpus_name) corpus_name += '.npz' return corpus_name dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz','') corpus_name = name_corpus(dirname, ['en'], low_filter, high_filter) model_path = os.path.dirname(args.corpus_path) args.corpus_path = 
os.path.join(model_path, corpus_name) c.save(args.corpus_path) config.set("main", "corpus_file", args.corpus_path) config.remove_option("main", "model_pattern") with open(args.config_file, 'wb') as configfh: config.write(configfh)