def get_docs(docs=None, id_as_key=False, query=None):
    ctx_md = lda_v.corpus.view_metadata(context_type)

    if docs:
        # filter to metadata for selected docs
        ids = [lda_v.corpus.meta_int(context_type,
                                     {doc_label_name(context_type): doc})
               for doc in docs]
        ctx_md = ctx_md[ids]
    else:
        # get metadata for all documents
        docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)]

    js = dict() if id_as_key else list()
    for doc, md in zip(docs, ctx_md):
        if query is None or query.lower() in label(doc).lower():
            struct = {
                'id': doc,
                'label': label(doc),
                'metadata': dict(zip(md.dtype.names, [str(m) for m in md]))
            }
            if id_as_key:
                js[doc] = struct
            else:
                js.append(struct)

    return js
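# Usage sketch (not from the source): get_docs above depends on the module
# globals lda_v, context_type and label(). The underlying idiom is a plain
# metadata lookup on a vsm Corpus; the corpus path below is a placeholder.
from vsm.corpus import Corpus
from vsm.viewer.wrappers import doc_label_name

c = Corpus.load('workspace/corpus.npz')          # hypothetical corpus file
ctx_type = c.context_types[0]
ctx_md = c.view_metadata(ctx_type)
labels = ctx_md[doc_label_name(ctx_type)]        # one label per document
print(labels[:5])

# row index for a single label, mirroring the meta_int call in get_docs
row = c.meta_int(ctx_type, {doc_label_name(ctx_type): labels[0]})
print(ctx_md[row])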
def add_htrc_metadata(config, corpus=None, corpus_filename=None):
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    config.set("www", "doc_title_format", '<a href="{1}">{0}</a>')
    config.set("www", "doc_url_format", 'http://hdl.handle.net/2027/{0}')
    config.set("www", "icons", "htrcbook,link")
    config.set("main", "htrc", True)

    if corpus_filename:
        corpus = Corpus.load(corpus_filename)
        config.set("main", "context_type", corpus.context_types[0])

    if corpus:
        ctx_type = config.get("main", "context_type")
        label_name = doc_label_name(ctx_type)
        ids = corpus.view_metadata(ctx_type)[label_name]

        htrc_metapath = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(htrc_metapath),
            os.path.basename(htrc_metapath) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)

        config.set("www", "htrc_metadata", htrc_metapath)

    return config
def docs(docs=None, q=None):
    response.content_type = 'application/json; charset=UTF8'
    response.set_header('Expires', _cache_date())

    try:
        if request.query.q:
            q = unquote(request.query.q)
    except:
        pass

    try:
        if request.query.id:
            docs = [unquote(request.query.id)]
    except:
        pass

    try:
        response.set_header('Expires', 0)
        response.set_header('Pragma', 'no-cache')
        response.set_header('Cache-Control',
                            'no-cache, no-store, must-revalidate')
        if request.query.random:
            docs = [np.random.choice(
                lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)])]
    except:
        pass

    js = get_docs(docs, query=q)
    return json.dumps(js)
def add_htrc_metadata(config, corpus=None):
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    config.set("www", "doc_title_format", '<a href="{1}">{0}</a>')
    config.set("www", "doc_url_format", 'http://hdl.handle.net/2027/{0}')
    config.set("www", "icons", "htrcbook,link")
    config.set("main", "htrc", True)

    if corpus:
        ctx_type = config.get("main", "context_type")
        label_name = doc_label_name(ctx_type)
        ids = corpus.view_metadata(ctx_type)[label_name]

        htrc_metapath = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(htrc_metapath),
            os.path.basename(htrc_metapath) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)

        config.set("www", "htrc_metadata", htrc_metapath)

    return config
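# Illustration only: the two format strings set by add_htrc_metadata above,
# filled in the way the web app presumably uses them, with {0} a HathiTrust
# volume id (made-up here) and {1} the resolved handle URL.
vol_id = 'mdp.39015012345678'                    # hypothetical volume id
url = 'http://hdl.handle.net/2027/{0}'.format(vol_id)
title = '<a href="{1}">{0}</a>'.format(vol_id, url)
print(title)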
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True,
                 tokenizer='default'):
    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop, stop_freq=stop_freq,
                      ignore=ignore, decode=decode, simple=simple,
                      tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label)
                      for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
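# Usage sketch: the directory, model path, and keyword values are
# placeholders, not from the source. build_corpus returns the filename of the
# saved corpus, which other snippets in this listing reload with Corpus.load().
corpus_file = build_corpus('data/my_texts/', 'models/',
                           nltk_stop=True, stop_freq=5,
                           tokenizer='default')
print("Corpus written to", corpus_file)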
def extract_labels(corpus, ctx_type, filename):
    """
    Creates a new csv where each row is a label in the corpus.
    """
    label_name = doc_label_name(ctx_type)
    labels = corpus.view_metadata(ctx_type)[label_name]

    with open(filename, 'w') as outfile:
        outfile.write(label_name + '\n')
        for label in labels:
            outfile.write(label + '\n')
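# Usage sketch: writes one label per line, headed by the label field name.
# The corpus and output paths are placeholders.
from vsm.corpus import Corpus

c = Corpus.load('workspace/corpus.npz')
extract_labels(c, c.context_types[0], 'labels.csv')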
def parse_metadata_from_csvfile(filename, context_type):
    """
    Takes a csvfile where the first column in each row is the label.
    Returns a dictionary of dictionaries where each key is the label,
    and each value is a dictionary of field values.
    """
    label_name = doc_label_name(context_type)

    with open(filename, encoding='utf8') as csvfile:
        reader = UnicodeDictReader(csvfile)
        metadata = SortedDict()
        for row in reader:
            metadata[row[label_name]] = row

    return metadata
def parse_metadata_from_csvfile(filename, context_type):
    """
    Takes a csvfile where the first column in each row is the label.
    Returns a dictionary of dictionaries where each key is the label,
    and each value is a dictionary of field values.
    """
    label_name = doc_label_name(context_type)

    with open(filename, encoding='utf8') as csvfile:
        reader = UnicodeDictReader(csvfile, delimiter='\t',
                                   quoting=csv.QUOTE_NONE)
        metadata = SortedDict()
        for row in reader:
            metadata[row[label_name]] = row

    return metadata
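# Sketch of the expected input for the tab-separated variant above: the first
# column header must equal doc_label_name(context_type). UnicodeDictReader and
# SortedDict are project helpers; csv.DictReader and a plain dict stand in for
# them here purely for illustration, and 'document_label' is a hypothetical
# header name.
#
#   document_label<TAB>title<TAB>year
#   vol1.txt<TAB>First Volume<TAB>1900
#   vol2.txt<TAB>Second Volume<TAB>1910
import csv

with open('metadata.tsv', encoding='utf8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
    metadata = {row['document_label']: row for row in reader}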
def docs(request):
    """!
    Function that serves the documents as JSON.

    @author Jorge Redondo (jredondo at cenditel.gob.ve)
    @copyright GNU/GPLv2
    @param request <b>{object}</b> Object holding the request
    @return Returns the JSON object
    """
    global lda_v
    try:
        docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)]
        js = list()
        for doc in docs:
            js.append({
                'id': doc,
                'label': label(doc)
            })
        return HttpResponse(json.dumps(js))
    except:
        return dump_exception()
def add_metadata(corpus, ctx_type, new_metadata, force=False, rename=False):
    import sys
    import vsm.corpus

    # get existing metadata
    i = corpus.context_types.index(ctx_type)
    md = corpus.context_data[i]
    fields = md.dtype.fields.keys()

    # sort new_metadata according to existing md order
    # Note: this may raise a KeyError - in which case there's not md
    # for each entry.
    label_name = doc_label_name(ctx_type)
    labels = md[label_name]
    if rename:
        new_data = new_metadata.values()
    else:
        try:
            if force:
                new_data = [new_metadata.get(id, {}) for id in labels]
            else:
                new_data = [new_metadata[id] for id in labels]

            if not new_data:
                print("No metadata labels match existing labels.")
                print("If you changed labels, run with the `--rename` flag.")
                sys.exit(0)
            elif not force and len(new_data) != len(labels):
                raise KeyError
        except KeyError:
            print("New metadata does not span all documents in corpus.")
            print("If you changed labels, run with the `--rename` flag.")
            print("If you wish to force insertion of blank metadata,")
            print("run with the `--force` flag.")
            sys.exit(1)

    # look for new fields
    new_fields = set()
    for vals in new_metadata.values():
        new_fields.update(vals.keys())

    # differentiate new and updated fields
    updated_fields = new_fields.intersection(fields)
    if not rename:
        updated_fields.remove(label_name)
    new_fields = new_fields.difference(fields)
    if None in new_fields:
        new_fields.remove(None)

    # process new fields
    for field in new_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            # new_data is a sorted list of metadata dictionaries
            data = [d[field] for d in new_data]
        corpus = vsm.corpus.add_metadata(corpus, ctx_type, field, data)

    # process existing fields
    for field in updated_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            data = [d[field] for d in new_data]
        corpus.context_data[i][field] = data

    return corpus
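# Workflow sketch combining the helpers above: export the labels, add columns
# to the file by hand or script, then fold the new fields back into the
# corpus. Paths are placeholders; the context type is taken from the corpus.
from vsm.corpus import Corpus

c = Corpus.load('workspace/corpus.npz')
ctx_type = c.context_types[0]
extract_labels(c, ctx_type, 'metadata.csv')        # writes the label column
# ... add extra metadata columns to metadata.csv ...
new_md = parse_metadata_from_csvfile('metadata.csv', ctx_type)
c = add_metadata(c, ctx_type, new_md)
c.save('workspace/corpus.npz')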
def load_corpus(self):
    self.c = Corpus.load(self.config.get('main', 'corpus_file'))
    self.context_type = self.config.get('main', 'context_type')
    self.ctx_metadata = self.c.view_metadata(self.context_type)
    self.all_ids = self.ctx_metadata[doc_label_name(self.context_type)]
config_file = r"$config_file"
config = ConfigParser({
    'topic_range': None,
    'topics': None,
    'sentences': 'false'})
config.read(config_file)

# load the corpus
if config.getboolean('main', 'sentences'):
    from vsm.extensions.ldasentences import CorpusSent
    c = CorpusSent.load(config.get('main', 'corpus_file'))
else:
    c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
if config.get('main', 'topic_range'):
    topic_range = map(int, config.get('main', 'topic_range').split(','))
    topic_range = range(*topic_range)
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))


# load the topic models
class keydefaultdict(defaultdict):
    """ Solution from: http://stackoverflow.com/a/2912455 """
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
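# For reference, the complete keydefaultdict recipe from the Stack Overflow
# answer cited above: the snippet ends after the error branch, while the full
# recipe also passes the missing key to the factory and caches the result.
from collections import defaultdict

class keydefaultdict(defaultdict):
    """ Solution from: http://stackoverflow.com/a/2912455 """
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        ret = self[key] = self.default_factory(key)
        return ret

squares = keydefaultdict(lambda k: k * k)
print(squares[4])    # 16, computed on first access and cached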
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True,
                 tokenizer='default'):
    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop, stop_freq=stop_freq,
                      ignore=ignore, decode=decode, simple=simple,
                      tokenizer=tokenizer)

    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop,
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path, obj) for obj in contents
                    if not any([obj.endswith(suffix) for suffix in ignore])]

        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode,
                               simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type = 'book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore,
                            decode=decode, simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for sentence parsing.
            Reduce your corpus to a single folder or file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label)
                      for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
from vsm.viewer.wrappers import doc_label_name

import os.path

# load in the configuration file
from ConfigParser import ConfigParser as ConfigParser
config_file = r"$config_file"
config = ConfigParser({'topic_range': None, 'topics': None})
config.read(config_file)

# load the corpus
c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
if config.get('main', 'topic_range'):
    topic_range = map(int, config.get('main', 'topic_range').split(','))
    topic_range = range(*topic_range)
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))

# load the topic models
lda_m = dict()
lda_v = dict()
print topic_range
print pattern
for k in topic_range:
def id_fn(md):
    context_md = lda_v.corpus.view_metadata(context_type)
    ctx_label = doc_label_name(context_type)
    return context_md[ctx_label]