def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Missing settings on ``args`` are filled in place (``model_path``,
    ``corpus_name``, ``corpus_print_name``, ``corpus_filename``,
    ``rebuild``).  When ``args.htrc`` is set, the HTRC preprocessing step is
    run and a ``metadata.json`` file is written next to the corpus folder.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``model_path``, ``corpus_print_name``, ``htrc``, ``rebuild`` and
        ``config_file``.
    :returns: the result of ``write_config(args, args.config_file)``.

    Exits the process with status 74 when ``build_corpus`` raises
    ``IOError``.
    """
    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import json
        import vsm.extensions.htrc as htrc

        htrc.proc_htrc_coll(args.corpus_path)

        # Each entry must be tested relative to the corpus directory: a
        # bare os.path.isdir(name) is resolved against the current working
        # directory and would normally reject every entry.
        metadata = {
            entry: htrc.metadata(entry)
            for entry in os.listdir(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, entry))
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump() writes str, which a binary handle rejects
        # on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(metadata, outfile)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=5)

    if args.rebuild or not os.path.exists(args.corpus_filename):
        args.rebuild = True
    else:
        # raw_input only exists on Python 2; input is its Python 3 name.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = read_line("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip() or 'n'
        # A declined rebuild is stored as the string 'n' (not False) so that
        # write_config() receives the same value as before.
        args.rebuild = True if answer == 'y' else 'n'

    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=5)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text file,")
            print(" * a folder of plain-text files, or")
            print(" * a folder of folders of plain-text files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)
def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Missing settings on ``args`` are filled in place (``model_path``,
    ``corpus_name``, ``corpus_print_name``, ``corpus_filename``,
    ``rebuild``).  When ``args.htrc`` is set, the HTRC preprocessing step is
    run and a ``metadata.json`` file is written next to the corpus folder.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``model_path``, ``corpus_print_name``, ``htrc``, ``rebuild`` and
        ``config_file``.
    :returns: the result of ``write_config(args, args.config_file)``.

    Exits the process with status 74 when ``build_corpus`` raises
    ``IOError``.
    """
    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import json
        import vsm.extensions.htrc as htrc

        htrc.proc_htrc_coll(args.corpus_path)

        # Each entry must be tested relative to the corpus directory: a
        # bare os.path.isdir(name) is resolved against the current working
        # directory and would normally reject every entry.
        metadata = {
            entry: htrc.metadata(entry)
            for entry in os.listdir(args.corpus_path)
            if os.path.isdir(os.path.join(args.corpus_path, entry))
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump() writes str, which a binary handle rejects
        # on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(metadata, outfile)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=5)

    if args.rebuild or not os.path.exists(args.corpus_filename):
        args.rebuild = True
    else:
        # raw_input only exists on Python 2; input is its Python 3 name.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = read_line("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip() or 'n'
        # A declined rebuild is stored as the string 'n' (not False) so that
        # write_config() receives the same value as before.
        args.rebuild = True if answer == 'y' else 'n'

    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path, stop_freq=5)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text file,")
            print(" * a folder of plain-text files, or")
            print(" * a folder of folders of plain-text files.")
            print("\nExiting...")
            sys.exit(74)

    return write_config(args, args.config_file)
def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Settings on ``args`` are filled in place (``corpus_path``, ``bibtex``,
    ``corpus_name``, ``corpus_print_name``, ``model_path``,
    ``corpus_filename``, ``rebuild``).  A ``.bib`` corpus path is first
    passed through ``process_bibtex``.  When ``args.htrc`` is set, the HTRC
    preprocessing step is run and a ``metadata.json`` file is written next
    to the corpus folder.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``corpus_print_name``, ``htrc``, ``model_path``, ``stop_freq``,
        ``rebuild``, ``decode``, ``sentences``, ``simple``, ``tokenizer``
        and ``config_file``.
    :returns: the result of ``write_config(args, args.config_file)``.

    Exits the process with status 74 when ``build_corpus`` raises
    ``IOError``.
    """
    # Convert to unicode to avoid Windows errors.  Only byte strings are
    # decoded: unicode(<unicode object>, 'utf-8') raises TypeError.
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # BibTeX input: remember the .bib path and let process_bibtex() supply
    # the corpus path that is used from here on.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import json
        import vsm.extensions.htrc as htrc

        htrc.proc_htrc_coll(args.corpus_path)

        # NOTE(review): isdir() is applied to whatever listdir_nohidden()
        # yields; if those are bare names they resolve against the current
        # working directory rather than corpus_path -- confirm the helper's
        # return shape before changing this.
        metadata = {
            entry: htrc.metadata(entry)
            for entry in listdir_nohidden(args.corpus_path)
            if os.path.isdir(entry)
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump() writes str, which a binary handle rejects
        # on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(metadata, outfile)

    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)

    if args.rebuild or not os.path.exists(args.corpus_filename):
        args.rebuild = True
    else:
        # raw_input only exists on Python 2; input is its Python 3 name.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = read_line("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip() or 'n'
        # A declined rebuild is stored as the string 'n' (not False) so that
        # write_config() receives the same value as before.
        args.rebuild = True if answer == 'y' else 'n'

    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                sentences=args.sentences, simple=args.simple,
                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)
        # NOTE: a LookupError handler for missing NLTK data ('punkt' /
        # 'stopwords', installable with `python -m nltk.downloader <name>`)
        # was sketched here inside a string literal; it was never active.

    return write_config(args, args.config_file)
def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Settings on ``args`` are filled in place (``bibtex``, ``corpus_path``,
    ``corpus_name``, ``corpus_print_name``, ``model_path``,
    ``corpus_filename``, ``rebuild``, ``htrc_metapath``, ``config_file``,
    ``corpus_desc``).  After the configuration is written, a default
    Markdown corpus description is created at ``<config_file>.md`` unless
    the user chooses to keep an existing one.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``quiet``, ``corpus_print_name``, ``model_path``, ``stop_freq``,
        ``rebuild``, ``htrc``, ``nltk``, ``decode``, ``simple``,
        ``sentences``, ``tokenizer`` and ``config_file``.
    :returns: the config file path returned by ``write_config``.
    :raises IOError: re-raised from ``build_corpus`` after usage guidance
        has been printed.

    Exits the process with status 1 in quiet mode when the corpus file
    already exists and no rebuild was requested.
    """
    # TODO: confirm the former unicode conversion of corpus_path (a Windows
    # workaround) is no longer needed, and cover it with a unit test.

    # BibTeX input: remember the .bib path and let process_bibtex() supply
    # the corpus path that is used from here on.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path:
        model_dir = os.path.abspath(args.model_path)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)

    if args.rebuild or not os.path.exists(
            os.path.abspath(args.corpus_filename)):
        args.rebuild = True
    elif args.quiet:
        # Quiet mode cannot ask, so refuse to overwrite the existing file.
        print("Path exists: {}".format(args.corpus_filename))
        sys.exit(1)
    else:
        args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                                   default=False)

    if args.htrc:
        # NOTE(review): nothing below uses this module directly; the import
        # is kept in case it is needed for its import-time effects.
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            # The metadata file is a sibling of the corpus directory:
            # <parent>/<corpus dir name>.metadata.json
            parent_dir = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                parent_dir,
                os.path.basename(args.corpus_path) + '.metadata.json')
        elif args.rebuild:
            # corpus_path is a file with one id per line.  The corpus is
            # only regenerated when a rebuild is due, so declining the
            # rebuild prompt leaves the existing corpus file untouched.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            corpus = htrc_features.create_corpus(
                ids, nltk_stop=args.nltk, freq=args.stop_freq)
            corpus.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # Propagate the error with its original traceback.
            raise
        # NOTE: a LookupError handler for missing NLTK data ('punkt' /
        # 'stopwords', installable with `python -m nltk.downloader <name>`)
        # was sketched here inside a string literal; it was never active.

    args.config_file = write_config(args, args.config_file)

    # The default description lives next to the config file.
    args.corpus_desc = args.config_file + '.md'
    if not args.quiet and os.path.exists(args.corpus_desc):
        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = input(
                "\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip() or 'n'
        if answer == 'n':
            # Keep the existing description.  The flag must be falsy here:
            # storing the raw answer 'n' would be truthy and the template
            # below would be written to a file literally named "n".
            args.corpus_desc = False

    if args.corpus_desc:
        # List items sit on their own lines so they render as a Markdown
        # list.
        template = (
            "This is an instance of the [InPhO Topic Explorer]"
            "(http://inphodata.cogs.indiana.edu/). If you would like "
            "to add a custom corpus description, either:\n"
            "- Modify the contents of the file `{}`\n"
            "- Change the main:corpus_desc path in `{}` to an existing "
            "Markdown file.\n")
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(template.format(
                os.path.abspath(args.corpus_desc),
                os.path.abspath(args.config_file)))

    return args.config_file
def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Settings on ``args`` are filled in place (``bibtex``, ``corpus_path``,
    ``corpus_name``, ``corpus_print_name``, ``model_path``,
    ``corpus_filename``, ``rebuild``, ``htrc_metapath``, ``config_file``,
    ``corpus_desc``).  After the configuration is written, a default
    Markdown corpus description is created at ``<config_file>.md`` unless
    the user chooses to keep an existing one.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``quiet``, ``corpus_print_name``, ``model_path``, ``stop_freq``,
        ``rebuild``, ``htrc``, ``nltk``, ``decode``, ``simple``,
        ``sentences``, ``tokenizer`` and ``config_file``.
    :returns: the config file path returned by ``write_config``.
    :raises IOError: re-raised from ``build_corpus`` after usage guidance
        has been printed.

    Exits the process with status 1 in quiet mode when the corpus file
    already exists and no rebuild was requested.
    """
    # TODO: confirm the former unicode conversion of corpus_path (a Windows
    # workaround) is no longer needed, and cover it with a unit test.

    # BibTeX input: remember the .bib path and let process_bibtex() supply
    # the corpus path that is used from here on.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path, args.quiet)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name and not args.quiet:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path:
        model_dir = os.path.abspath(args.model_path)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)

    if args.rebuild or not os.path.exists(
            os.path.abspath(args.corpus_filename)):
        args.rebuild = True
    elif args.quiet:
        # Quiet mode cannot ask, so refuse to overwrite the existing file.
        print("Path exists: {}".format(args.corpus_filename))
        sys.exit(1)
    else:
        args.rebuild = bool_prompt("\nCorpus file found. Rebuild? ",
                                   default=False)

    if args.htrc:
        # NOTE(review): nothing below uses this module directly; the import
        # is kept in case it is needed for its import-time effects.
        import vsm.extensions.htrc as htrc
        if os.path.isdir(args.corpus_path):
            # The metadata file is a sibling of the corpus directory:
            # <parent>/<corpus dir name>.metadata.json
            parent_dir = os.path.abspath(args.corpus_path + '/../')
            args.htrc_metapath = os.path.join(
                parent_dir,
                os.path.basename(args.corpus_path) + '.metadata.json')
        elif args.rebuild:
            # corpus_path is a file with one id per line.  The corpus is
            # only regenerated when a rebuild is due, so declining the
            # rebuild prompt leaves the existing corpus file untouched.
            import topicexplorer.extensions.htrc_features as htrc_features
            with open(args.corpus_path) as idfile:
                ids = [row.strip() for row in idfile if row.strip()]

            corpus = htrc_features.create_corpus(
                ids, nltk_stop=args.nltk, freq=args.stop_freq)
            corpus.save(args.corpus_filename)

    if args.rebuild and (not args.htrc or os.path.isdir(args.corpus_path)):
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                nltk_stop=args.nltk, simple=args.simple,
                sentences=args.sentences, tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            # Propagate the error with its original traceback.
            raise
        # NOTE: a LookupError handler for missing NLTK data ('punkt' /
        # 'stopwords', installable with `python -m nltk.downloader <name>`)
        # was sketched here inside a string literal; it was never active.

    args.config_file = write_config(args, args.config_file)

    # The default description lives next to the config file.
    args.corpus_desc = args.config_file + '.md'
    if not args.quiet and os.path.exists(args.corpus_desc):
        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = input(
                "\nExisting corpus description found. Remove? [y/N] ")
            answer = answer.lower().strip() or 'n'
        if answer == 'n':
            # Keep the existing description.  The flag must be falsy here:
            # storing the raw answer 'n' would be truthy and the template
            # below would be written to a file literally named "n".
            args.corpus_desc = False

    if args.corpus_desc:
        # List items sit on their own lines so they render as a Markdown
        # list.
        template = (
            "This is an instance of the [InPhO Topic Explorer]"
            "(http://inphodata.cogs.indiana.edu/). If you would like "
            "to add a custom corpus description, either:\n"
            "- Modify the contents of the file `{}`\n"
            "- Change the main:corpus_desc path in `{}` to an existing "
            "Markdown file.\n")
        with open(args.corpus_desc, 'w') as outfile:
            outfile.write(template.format(
                os.path.abspath(args.corpus_desc),
                os.path.abspath(args.config_file)))

    return args.config_file
def main(args):
    """Initialise a corpus: derive paths/names, build the corpus, write config.

    Settings on ``args`` are filled in place (``corpus_path``, ``bibtex``,
    ``corpus_name``, ``corpus_print_name``, ``model_path``,
    ``corpus_filename``, ``rebuild``).  A ``.bib`` corpus path is first
    passed through ``process_bibtex``.  When ``args.htrc`` is set, the HTRC
    preprocessing step is run and a ``metadata.json`` file is written next
    to the corpus folder.

    :param args: mutable settings namespace.  Reads ``corpus_path``,
        ``corpus_print_name``, ``htrc``, ``model_path``, ``stop_freq``,
        ``rebuild``, ``decode``, ``sentences``, ``simple``, ``tokenizer``
        and ``config_file``.
    :returns: the result of ``write_config(args, args.config_file)``.

    Exits the process with status 74 when ``build_corpus`` raises
    ``IOError``.
    """
    # Convert to unicode to avoid Windows errors.  Only byte strings are
    # decoded: unicode(<unicode object>, 'utf-8') raises TypeError.
    if isinstance(args.corpus_path, bytes):
        args.corpus_path = args.corpus_path.decode('utf-8')

    # BibTeX input: remember the .bib path and let process_bibtex() supply
    # the corpus path that is used from here on.
    args.bibtex = args.corpus_path.endswith('.bib')
    if args.bibtex:
        args.bibtex = args.corpus_path
        args.corpus_path = process_bibtex(args.corpus_path)

    # basename() is empty for a path with a trailing separator; fall back
    # to the name of the directory itself.
    args.corpus_name = (
        os.path.basename(args.corpus_path)
        or os.path.basename(os.path.dirname(args.corpus_path)))

    if not args.corpus_print_name:
        args.corpus_print_name = prompt("Corpus Name",
                                        default=args.corpus_name)

    if args.htrc:
        import json
        import vsm.extensions.htrc as htrc

        htrc.proc_htrc_coll(args.corpus_path)

        # NOTE(review): isdir() is applied to whatever listdir_nohidden()
        # yields; if those are bare names they resolve against the current
        # working directory rather than corpus_path -- confirm the helper's
        # return shape before changing this.
        metadata = {
            entry: htrc.metadata(entry)
            for entry in listdir_nohidden(args.corpus_path)
            if os.path.isdir(entry)
        }
        md_filename = os.path.join(args.corpus_path, '../metadata.json')
        # Text mode: json.dump() writes str, which a binary handle rejects
        # on Python 3.
        with open(md_filename, 'w') as outfile:
            json.dump(metadata, outfile)

    # Default model location: "../models/" relative to a corpus directory,
    # or the containing folder when the corpus is a single file.
    if args.model_path is None:
        if os.path.isdir(args.corpus_path):
            args.model_path = os.path.join(args.corpus_path, '../models/')
        else:
            args.model_path = os.path.dirname(args.corpus_path)
    if args.model_path and not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    args.corpus_filename = get_corpus_filename(
        args.corpus_path, args.model_path, stop_freq=args.stop_freq)

    if args.rebuild or not os.path.exists(args.corpus_filename):
        args.rebuild = True
    else:
        # raw_input only exists on Python 2; input is its Python 3 name.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        # Keep asking until the answer is 'y' or 'n'; empty input means 'n'.
        answer = None
        while answer not in ('y', 'n'):
            answer = read_line("\nCorpus file found. Rebuild? [y/N] ")
            answer = answer.lower().strip() or 'n'
        # A declined rebuild is stored as the string 'n' (not False) so that
        # write_config() receives the same value as before.
        args.rebuild = True if answer == 'y' else 'n'

    if args.rebuild is True:
        try:
            args.corpus_filename = build_corpus(
                args.corpus_path, args.model_path,
                stop_freq=args.stop_freq, decode=args.decode,
                sentences=args.sentences, simple=args.simple,
                tokenizer=args.tokenizer)
        except IOError:
            print("ERROR: invalid path, please specify either:")
            print(" * a single plain-text or PDF file,")
            print(" * a single bibtex (.bib) file with 'file' fields,")
            print(" * a folder of plain-text or PDF files, or")
            print(" * a folder of folders of plain-text or PDF files.")
            print("\nExiting...")
            sys.exit(74)
        # NOTE: a LookupError handler for missing NLTK data ('punkt' /
        # 'stopwords', installable with `python -m nltk.downloader <name>`)
        # was sketched here inside a string literal; it was never active.

    return write_config(args, args.config_file)