def process_pdfs(corpus_path, ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Extract plaintext from a PDF file or from a directory of PDFs.

    * A single file is handed to ``pdf.main`` and the returned path is
      ``corpus_path`` with ``.pdf`` replaced by ``.txt``.
    * A directory must contain either only files or only subdirectories
      (entries whose name ends in one of ``ignore`` are skipped).  Output goes
      to ``<corpus_path>-txt``, which is the returned path.
    * A path that is neither a file nor a directory is returned unchanged.

    Raises:
        IOError: if the directory has no usable entries, or mixes files and
            subdirectories at its top level.
    """
    is_file = os.path.isfile(corpus_path)
    if not is_file and not os.path.isdir(corpus_path):
        # Nothing to convert, so do not load the PDF extraction module.
        return corpus_path

    from topicexplorer.lib import pdf

    if is_file:
        txt_path = corpus_path.replace('.pdf', '.txt')
        print("PDF file detected, extracting plaintext to", txt_path)
        pdf.main(corpus_path)
        return txt_path

    # Strip the trailing slash *before* reporting, so the message names the
    # directory that is actually written ("foo-txt", not "foo/-txt").
    if corpus_path.endswith('/'):
        corpus_path = corpus_path[:-1]
    txt_root = corpus_path + '-txt'
    print("PDF files detected, extracting plaintext to", txt_root)

    # TODO: Add processing of collections
    ignored_suffixes = tuple(ignore)
    contents = [
        os.path.join(corpus_path, name)
        for name in listdir_nohidden(corpus_path)
        if not name.endswith(ignored_suffixes)
    ]
    subdirs = [path for path in contents if os.path.isdir(path)]
    files = [path for path in contents if os.path.isfile(path)]

    if files and not subdirs:
        # process all files
        pdf.main(corpus_path, txt_root)
    elif subdirs and not files:
        # process each subdirectory
        for directory in subdirs:
            # Swap only the leading corpus_path; an unbounded replace would
            # also rewrite matches inside the subdirectory name itself.
            pdf.main(directory, directory.replace(corpus_path, txt_root, 1))
    elif not files and not subdirs:
        raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid Path: directory mixes files and subdirectories")

    return txt_root
def get_corpusbuilder_fn(corpus_path, sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Return the streaming corpus builder, ``corpus_from_files``.

    ``corpus_path`` and ``ignore`` are accepted so existing callers keep
    working, but this version does not use them: the same builder is returned
    whatever the directory layout.  The previous implementation walked the
    whole tree into a list it never read; that walk has been removed.

    Raises:
        NotImplementedError: if ``sentences`` is true.
    """
    if sentences:
        raise NotImplementedError(
            "Collection corpuses are too large for sentence parsing. "
            "Reduce your corpus to a single folder or file.")

    from vsm.extensions.corpusbuilders.corpusstreamers import corpus_from_files
    return corpus_from_files
def get_corpusbuilder_fn(corpus_path, sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Pick the vsm corpus builder that matches the layout under ``corpus_path``.

    Files are listed with ``listdir_nohidden(corpus_path, recursive=True)``;
    those ending in one of ``ignore`` are skipped.  The remaining files are
    grouped by their directory relative to ``corpus_path``:

    * exactly one file -> ``toy_corpus`` wrapped in a ``functools.partial``
      with ``is_filename=True, autolabel=True``;
    * all files in a single directory -> ``dir_corpus``;
    * several directories, none of whose relative paths contains a path
      separator -> ``coll_corpus``;
    * anything else -> ``walk_corpus``.

    With ``sentences`` true the first two cases use the
    ``vsm.extensions.ldasentences`` variants.

    Raises:
        IOError: if no usable file is found under ``corpus_path``.
        NotImplementedError: if ``sentences`` is true and the files span more
            than one directory.
    """
    ignored_suffixes = tuple(ignore)
    relpaths = [
        os.path.relpath(path, start=corpus_path)
        for path in listdir_nohidden(corpus_path, recursive=True)
        if os.path.isfile(path) and not path.endswith(ignored_suffixes)
    ]
    if not relpaths:
        # Previously this fell through to max() on an empty list and died
        # with an unrelated ValueError.
        raise IOError("Invalid Path: no corpus files found")

    dirs = {os.path.dirname(path) for path in relpaths}
    # NOTE(review): files directly in corpus_path ('') and files one folder
    # down ('a') both count as level 1 here -- confirm that is intended.
    populated_levels = [1 + dirname.count(os.path.sep) for dirname in dirs]
    levels = max(populated_levels) - min(populated_levels)
    print("{} files, {} dirs, {} levels".format(len(relpaths), len(dirs), levels))

    if len(relpaths) == 1:
        if sentences:
            from vsm.extensions.ldasentences import toy_corpus
        else:
            from vsm.extensions.corpusbuilders import toy_corpus
        import functools
        toy_partial = functools.partial(toy_corpus, is_filename=True, autolabel=True)
        # partial objects have no __name__ of their own; give it the builder's.
        toy_partial.__name__ = 'toy_corpus'
        return toy_partial
    elif len(dirs) <= 1:
        if sentences:
            from vsm.extensions.ldasentences import dir_corpus
        else:
            from vsm.extensions.corpusbuilders import dir_corpus
        return dir_corpus
    elif sentences:
        raise NotImplementedError(
            "Collection corpuses are too large for sentence parsing. "
            "Reduce your corpus to a single folder or file.")
    elif levels == 0 and max(populated_levels) == 1:
        from vsm.extensions.corpusbuilders import coll_corpus
        return coll_corpus
    else:
        from vsm.extensions.corpusbuilders import walk_corpus
        return walk_corpus
def get_corpusbuilder_fn(corpus_path, sentences=False,
                         ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Return the streaming corpus builder, ``corpus_from_files``.

    ``corpus_path`` and ``ignore`` are kept in the signature for existing
    callers but are not consulted: the builder does not depend on the
    directory layout.  The earlier recursive listing of ``corpus_path`` only
    filled a list that was never read, so it has been dropped.

    Raises:
        NotImplementedError: if ``sentences`` is true.
    """
    if sentences:
        raise NotImplementedError(
            "Collection corpuses are too large for sentence parsing. "
            "Reduce your corpus to a single folder or file.")

    from vsm.extensions.corpusbuilders.corpusstreamers import corpus_from_files
    return corpus_from_files
def get_corpusbuilder_fn(corpus_path, sentences=False, ignore=()):
    """Pick the vsm corpus builder that matches the layout under ``corpus_path``.

    Files are listed with ``listdir_nohidden(corpus_path, recursive=True)``;
    those ending in one of ``ignore`` are skipped.  The remaining files are
    grouped by their directory relative to ``corpus_path``:

    * exactly one file -> ``functools.partial`` of ``toy_corpus`` with
      ``is_filename=True, autolabel=True``;
    * all files in a single directory -> ``dir_corpus``;
    * several directories, all at the same depth -> ``coll_corpus``;
    * anything else -> ``walk_corpus``.

    With ``sentences`` true the first two cases use the
    ``vsm.extensions.ldasentences`` variants.

    Written to run unchanged on Python 2 and Python 3.

    Raises:
        IOError: if no usable file is found under ``corpus_path``.
        NotImplementedError: if ``sentences`` is true and the files span more
            than one directory.
    """
    ignored_suffixes = tuple(ignore)
    relpaths = [
        os.path.relpath(path, start=corpus_path)
        for path in listdir_nohidden(corpus_path, recursive=True)
        if os.path.isfile(path) and not path.endswith(ignored_suffixes)
    ]
    if not relpaths:
        # Previously this reached max() on an empty list and failed with an
        # unrelated ValueError.
        raise IOError("Invalid Path: no corpus files found")

    dirs = {os.path.dirname(path) for path in relpaths}
    # Depth of each populated directory = number of separators in its path.
    populated_levels = [dirname.count(os.path.sep) for dirname in dirs]
    levels = max(populated_levels) - min(populated_levels)
    # Single-argument call form prints identically under Python 2 and 3.
    print("{} files, {} dirs, {} levels".format(len(relpaths), len(dirs), levels))

    if len(relpaths) == 1:
        if sentences:
            from vsm.extensions.ldasentences import toy_corpus
        else:
            from vsm.extensions.corpusbuilders import toy_corpus
        import functools
        return functools.partial(toy_corpus, is_filename=True, autolabel=True)
    elif len(dirs) <= 1:
        if sentences:
            from vsm.extensions.ldasentences import dir_corpus
        else:
            from vsm.extensions.corpusbuilders import dir_corpus
        return dir_corpus
    elif sentences:
        raise NotImplementedError(
            "Collection corpuses are too large for sentence parsing. "
            "Reduce your corpus to a single folder or file.")
    elif levels == 0:
        from vsm.extensions.corpusbuilders import coll_corpus
        return coll_corpus
    else:
        from vsm.extensions.corpusbuilders import walk_corpus
        return walk_corpus
def process_pdfs(corpus_path, ignore=('.json', '.log', '.err', '.pickle', '.npz')):
    """Extract plaintext from a PDF file or from a directory of PDFs.

    * A single file is handed to ``pdf.main`` and the returned path is
      ``corpus_path`` with ``.pdf`` replaced by ``.txt``.
    * A directory must contain either only files or only subdirectories
      (entries whose name ends in one of ``ignore`` are skipped).  Output goes
      to ``<corpus_path>-txt``, which is the returned path.
    * A path that is neither a file nor a directory is returned unchanged.

    Raises:
        IOError: if the directory has no usable entries, or mixes files and
            subdirectories at its top level.
    """
    is_file = os.path.isfile(corpus_path)
    if not is_file and not os.path.isdir(corpus_path):
        # Nothing to convert, so do not load the PDF extraction module.
        return corpus_path

    from topicexplorer.lib import pdf

    if is_file:
        txt_path = corpus_path.replace('.pdf', '.txt')
        print("PDF file detected, extracting plaintext to", txt_path)
        pdf.main(corpus_path)
        return txt_path

    # Strip the trailing slash *before* reporting, so the message names the
    # directory that is actually written ("foo-txt", not "foo/-txt").
    if corpus_path.endswith('/'):
        corpus_path = corpus_path[:-1]
    txt_root = corpus_path + '-txt'
    print("PDF files detected, extracting plaintext to", txt_root)

    # TODO: Add processing of collections
    ignored_suffixes = tuple(ignore)
    contents = [
        os.path.join(corpus_path, name)
        for name in listdir_nohidden(corpus_path)
        if not name.endswith(ignored_suffixes)
    ]
    subdirs = [path for path in contents if os.path.isdir(path)]
    files = [path for path in contents if os.path.isfile(path)]

    if files and not subdirs:
        # process all files
        pdf.main(corpus_path, txt_root)
    elif subdirs and not files:
        # process each subdirectory
        for directory in subdirs:
            # Swap only the leading corpus_path; an unbounded replace would
            # also rewrite matches inside the subdirectory name itself.
            pdf.main(directory, directory.replace(corpus_path, txt_root, 1))
    elif not files and not subdirs:
        raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid Path: directory mixes files and subdirectories")

    return txt_root
def main(args):
    """Build (or reuse) the corpus described by ``args`` and write its config.

    Steps, in order:

    * decode ``args.corpus_path`` from UTF-8 when it is a byte string;
    * expand a ``.bib`` path through ``process_bibtex``;
    * derive ``args.corpus_name`` and prompt for ``args.corpus_print_name``
      when it is not set;
    * for ``args.htrc``, run ``htrc.proc_htrc_coll`` and dump per-volume
      metadata to ``<corpus_path>/../metadata.json``;
    * choose and create ``args.model_path``;
    * if the corpus file already exists and no rebuild was requested, ask
      whether to rebuild; then call ``build_corpus`` when a rebuild is wanted.

    Written to run unchanged on Python 2 and Python 3.

    Returns:
        Whatever ``write_config(args, args.config_file)`` returns.

    Exits the process with status 74 when ``build_corpus`` raises ``IOError``.
    """
    # convert to unicode to avoid windows errors; a path that is already text
    # is left alone (decoding it again would raise TypeError)
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the .bib path; corpus_path becomes whatever process_bibtex returns
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # set corpus_name; basename is empty when the path ends in a separator
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # NOTE(review): isdir() is given the bare listing entry -- confirm
        # listdir_nohidden returns paths that resolve from the current directory.
        data = {
            volume: htrc.metadata(volume)
            for volume in listdir_nohidden(args.corpus_path)
            if os.path.isdir(volume)
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump writes str, which a 'wb' file rejects on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(data, outfile)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        # Loop until the answer is y, n or empty (empty means "no").
        while args.rebuild not in ['y', 'n', True]:
            args.rebuild = read_line("\nCorpus file found. Rebuild? [y/N] ")
            args.rebuild = args.rebuild.lower().strip()
            if args.rebuild == 'y':
                args.rebuild = True
            elif args.rebuild == '':
                args.rebuild = 'n'
    else:
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                sentences=args.sentences, simple=args.simple,
                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)
def main(args):
    """Build (or reuse) the corpus described by ``args`` and write its config.

    Steps, in order:

    * expand a ``.bib`` path through ``process_bibtex``;
    * derive ``args.corpus_name`` and, unless ``args.quiet``, prompt for
      ``args.corpus_print_name`` when it is not set;
    * choose and create ``args.model_path``;
    * if the corpus file already exists and no rebuild was requested, either
      exit with status 1 (quiet mode) or ask whether to rebuild;
    * for ``args.htrc``, record the metadata path (directory input) or build
      and save the corpus from a file of volume ids (file input);
    * otherwise call ``build_corpus`` when a rebuild is wanted;
    * write the config file and, unless the user keeps an existing one, a
      default corpus description next to it.

    Returns:
        The config file path returned by ``write_config``.

    Raises:
        IOError: re-raised from ``build_corpus`` after a usage hint is printed.
    """
    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the .bib path; corpus_path becomes whatever process_bibtex returns
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; basename is empty when the path ends in a separator
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # Cannot ask in quiet mode, so refuse to overwrite.
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ", default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            # NOTE(review): ids is not used after this point in the directory case.
            ids = [entry.replace('.txt', '')
                   for entry in listdir_nohidden(args.corpus_path)]

            # corpus_name rather than basename(corpus_path): the latter is
            # empty when corpus_path ends in a separator.
            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                args.htrc_metapath, args.corpus_name + '.metadata.json')
        else:
            # corpus_path is a text file listing one volume id per line.
            # NOTE(review): this branch builds and saves the corpus regardless
            # of args.rebuild -- confirm that is intended.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids, nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=args.stop_freq,
                decode=args.decode, nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # The sys.exit(74) that used to follow was unreachable; the
            # exception has always propagated to the caller.
            raise

    args.config_file = write_config(args, args.config_file)

    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input("\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()
        # Only an explicit "y" replaces the existing description.  "n" used to
        # be left in args.corpus_desc, which is truthy, so a file literally
        # named "n" was written.
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        description = (
            "This is an instance of the [InPhO Topic Explorer]"
            "(http://inphodata.cogs.indiana.edu/). If you would like to add a "
            "custom corpus description, either:\n"
            "\n"
            "- Modify the contents of the file `{}`\n"
            "- Change the main:corpus_desc path in `{}` to an existing "
            "Markdown file.\n"
        ).format(os.path.abspath(args.corpus_desc),
                 os.path.abspath(args.config_file))
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(description)

    return args.config_file
def main(args):
    """Build (or reuse) the corpus described by ``args`` and write its config.

    Steps, in order:

    * expand a ``.bib`` path through ``process_bibtex``;
    * derive ``args.corpus_name`` and, unless ``args.quiet``, prompt for
      ``args.corpus_print_name`` when it is not set;
    * choose and create ``args.model_path``;
    * if the corpus file already exists and no rebuild was requested, either
      exit with status 1 (quiet mode) or ask whether to rebuild;
    * for ``args.htrc``, record the metadata path (directory input) or build
      and save the corpus from a file of volume ids (file input);
    * otherwise call ``build_corpus`` when a rebuild is wanted;
    * write the config file and, unless the user keeps an existing one, a
      default corpus description next to it.

    Returns:
        The config file path returned by ``write_config``.

    Raises:
        IOError: re-raised from ``build_corpus`` after a usage hint is printed.
    """
    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the .bib path; corpus_path becomes whatever process_bibtex returns
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # set corpus_name; basename is empty when the path ends in a separator
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(os.path.abspath(args.model_path)):
        os.makedirs(os.path.abspath(args.model_path))

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(os.path.abspath(args.corpus_filename)):
        if args.quiet:
            # Cannot ask in quiet mode, so refuse to overwrite.
            print("Path exists: {}".format(args.corpus_filename))
            sys.exit(1)
        else:
            args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ", default=False)
    else:
        args.rebuild = True

    if args.htrc:
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            # NOTE(review): ids is not used after this point in the directory case.
            ids = [entry.replace('.txt', '')
                   for entry in listdir_nohidden(args.corpus_path)]

            # corpus_name rather than basename(corpus_path): the latter is
            # empty when corpus_path ends in a separator.
            args.htrc_metapath = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                args.htrc_metapath, args.corpus_name + '.metadata.json')
        else:
            # corpus_path is a text file listing one volume id per line.
            # NOTE(review): this branch builds and saves the corpus regardless
            # of args.rebuild -- confirm that is intended.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            c = htrc_features.create_corpus(ids, nltk_stop=args.nltk,
                                            freq=args.stop_freq)
            c.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=args.stop_freq,
                decode=args.decode, nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # The sys.exit(74) that used to follow was unreachable; the
            # exception has always propagated to the caller.
            raise

    args.config_file = write_config(args, args.config_file)

    desc_path = args.config_file + '.md'
    args.corpus_desc = desc_path
    if not args.quiet and os.path.exists(desc_path):
        answer = None
        while answer not in ('y', 'n', ''):
            answer = input("\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip()
        # Only an explicit "y" replaces the existing description.  "n" used to
        # be left in args.corpus_desc, which is truthy, so a file literally
        # named "n" was written.
        args.corpus_desc = desc_path if answer == 'y' else False

    if args.corpus_desc:
        description = (
            "This is an instance of the [InPhO Topic Explorer]"
            "(http://inphodata.cogs.indiana.edu/). If you would like to add a "
            "custom corpus description, either:\n"
            "\n"
            "- Modify the contents of the file `{}`\n"
            "- Change the main:corpus_desc path in `{}` to an existing "
            "Markdown file.\n"
        ).format(os.path.abspath(args.corpus_desc),
                 os.path.abspath(args.config_file))
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(description)

    return args.config_file
def main(args):
    """Build (or reuse) the corpus described by ``args`` and write its config.

    Steps, in order:

    * decode ``args.corpus_path`` from UTF-8 when it is a byte string;
    * expand a ``.bib`` path through ``process_bibtex``;
    * derive ``args.corpus_name`` and prompt for ``args.corpus_print_name``
      when it is not set;
    * for ``args.htrc``, run ``htrc.proc_htrc_coll`` and dump per-volume
      metadata to ``<corpus_path>/../metadata.json``;
    * choose and create ``args.model_path``;
    * if the corpus file already exists and no rebuild was requested, ask
      whether to rebuild; then call ``build_corpus`` when a rebuild is wanted.

    Written to run unchanged on Python 2 and Python 3.

    Returns:
        Whatever ``write_config(args, args.config_file)`` returns.

    Exits the process with status 74 when ``build_corpus`` raises ``IOError``.
    """
    # convert to unicode to avoid windows errors; a path that is already text
    # is left alone (decoding it again would raise TypeError)
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # config corpus_path
    # process bibtex files
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        # keep the .bib path; corpus_path becomes whatever process_bibtex returns
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # set corpus_name; basename is empty when the path ends in a separator
    args.corpus_name = os.path.basename(args.corpus_path)
    if not args.corpus_name:
        args.corpus_name = os.path.basename(os.path.dirname(args.corpus_path))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name", default=args.corpus_name)

    if args.htrc:
        import vsm.extensions.htrc as htrc
        htrc.proc_htrc_coll(args.corpus_path)

        import json
        # NOTE(review): isdir() is given the bare listing entry -- confirm
        # listdir_nohidden returns paths that resolve from the current directory.
        data = {
            volume: htrc.metadata(volume)
            for volume in listdir_nohidden(args.corpus_path)
            if os.path.isdir(volume)
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump writes str, which a 'wb' file rejects on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(data, outfile)

    # configure model-path
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)
    if not args.rebuild and os.path.exists(args.corpus_filename):
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        # Loop until the answer is y, n or empty (empty means "no").
        while args.rebuild not in ['y', 'n', True]:
            args.rebuild = read_line("\nCorpus file found. Rebuild? [y/N] ")
            args.rebuild = args.rebuild.lower().strip()
            if args.rebuild == 'y':
                args.rebuild = True
            elif args.rebuild == '':
                args.rebuild = 'n'
    else:
        args.rebuild = True

    # At this point args.rebuild is either True or the string 'n'.
    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                sentences=args.sentences, simple=args.simple,
                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)