def ESA_1(fn="output/CV_Gab2005_STOP_CATS.json"):
    """Run an interactive load / tf-idf / save loop over the HGW dump.

    The data is loaded once through ``t.process_hgw_xml()``.  After that the
    function loops forever; each cycle it waits for a line of input, reloads
    the module ``t`` (so edits to it are picked up without reloading the
    data), optionally recomputes tf-idf and then saves the result.

    Input per cycle:
        ``1``          -- run ``t.tfidf_normalize`` and then save.
        anything else  -- only save the most recently computed matrix
                          (a plain Enter counts as "anything else").

    Errors raised by the processing steps are logged and the loop carries on.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately *not* caught, so
    Ctrl-C is the way to leave the loop.

    :param fn: output path handed to ``t.save_CV_all``.
    """
    _log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO,
                        format='%(name)s: %(levelname)-8s %(message)s')

    cnt_articles, article_ids, train_set_dict = t.process_hgw_xml()
    _log.info("*** Data loaded. ***")

    # Filled in by the tf-idf step; None until that step succeeded once.
    CSC_matrix = None
    word_index = None

    while True:
        # enter 1 if you want tfidf processing to be performed,
        # else only saving of the data will be done
        s = raw_input("Press enter to start a process cycle:\n")

        # A bare Enter (or any non-number) simply means "no tf-idf this
        # cycle"; it is not an error condition.
        try:
            choice = int(s)
        except ValueError:
            choice = None

        # reload() is a builtin only on Python 2; elsewhere the name is
        # missing and the cycle continues with the already loaded module.
        try:
            reload(t)
        except NameError:
            _log.error("Could not reload the module.")

        if choice == 1:
            try:
                _, _, CSC_matrix, word_index = t.tfidf_normalize(
                    cnt_articles, article_ids, train_set_dict)
                _log.info("*** Tfidf calculation successfully done. ***")
            except Exception:
                # Exception (not bare except) so Ctrl-C still aborts.
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())

        if CSC_matrix is None:
            # Nothing computed yet: say so instead of failing on an
            # unbound variable inside the save step.
            _log.error("No tfidf matrix available yet; "
                       "enter 1 to compute it before saving.")
        else:
            try:
                t.save_CV_all(CSC_matrix, fn, word_index, article_ids)
                _log.info("*** Data successfully saved. ***")
            except Exception:
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())

        _log.info("Cycle ready.\n")
def ESA_selected():
    """Run an interactive tf-idf / save loop over the selected articles.

    The data is loaded once through ``t.process_hgw_xml(selected=True, ...)``
    from the preprocessed 2005-11-05 dump; the article list is written to
    ``output/AID_titles_test_selected.tsv`` by that call (as requested via
    ``f_articles_out``).  After that the function loops forever; each cycle
    it waits for a line of input, reloads the module ``t`` and performs the
    selected action.

    Input per cycle:
        ``1`` -- run ``t.tfidf_normalize``.
        ``2`` -- ``t.save_CV_all`` to
                 ``output/CV_Gab2005_true_selected_by_Gab.json``.
        ``3`` -- ``t.save_CV_with_sliding_window_pruning`` to
                 ``output/CV_Gab2005_true_selected_by_Gab_pruned.json``.
        anything else (including a plain Enter) -- only reload ``t``.

    Errors raised by the processing steps are logged and the loop carries on.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately *not* caught, so
    Ctrl-C is the way to leave the loop.
    """
    _log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO,
                        format='%(name)s: %(levelname)-8s %(message)s')

    cnt_articles, article_ids, train_set_dict = t.process_hgw_xml(
        selected=True,
        f_in="Gabrliovich_preprocessed/20051105_pages_articles.hgw.xml",
        f_articles_out="output/AID_titles_test_selected.tsv")
    _log.info("*** Data loaded. ***")

    # Filled in by option 1; None until that step succeeded once.
    CSC_matrix = None
    word_index = None

    def _log_failure():
        # Must be called from inside an ``except`` block.
        _log.error("Caught exception from the process\n%s\n%s",
                   sys.exc_info()[0], traceback.format_exc())

    while True:
        # enter 1 for tfidf processing, 2 to save everything,
        # 3 to save with sliding window pruning
        s = raw_input("Press enter to start a process cycle:\n")

        # Parse the selection once; a bare Enter or a non-number selects
        # nothing rather than producing one traceback per step.
        try:
            choice = int(s)
        except ValueError:
            choice = None

        # reload() is a builtin only on Python 2; elsewhere the name is
        # missing and the cycle continues with the already loaded module.
        try:
            reload(t)
        except NameError:
            _log.error("Could not reload the module.")

        if choice == 1:
            try:
                _, _, CSC_matrix, word_index = t.tfidf_normalize(
                    cnt_articles, article_ids, train_set_dict)
                _log.info("*** Tfidf calculation successfully done. ***")
            except Exception:
                # Exception (not bare except) so Ctrl-C still aborts.
                _log_failure()
        elif choice in (2, 3):
            if CSC_matrix is None:
                # Nothing computed yet: say so instead of failing on an
                # unbound variable inside the save step.
                _log.error("No tfidf matrix available yet; "
                           "enter 1 to compute it before saving.")
            else:
                try:
                    # Look the savers up on ``t`` here, after the reload,
                    # so the freshly reloaded implementation is used.
                    if choice == 2:
                        fn = "output/CV_Gab2005_true_selected_by_Gab.json"
                        t.save_CV_all(CSC_matrix, fn, word_index,
                                      article_ids)
                    else:
                        fn = ("output/"
                              "CV_Gab2005_true_selected_by_Gab_pruned.json")
                        t.save_CV_with_sliding_window_pruning(
                            CSC_matrix, fn, word_index, article_ids)
                    _log.info("*** Data successfully saved. ***")
                except Exception:
                    _log_failure()

        _log.info("Cycle ready.\n")