예제 #1
0
def ESA_1(fn="output/CV_Gab2005_STOP_CATS.json"):
    """Run an interactive tf-idf / save loop over the HGW XML data.

    The data is loaded once with ``t.process_hgw_xml()``. Every cycle then:

    1. prompts on stdin,
    2. reloads module ``t`` so edited code is picked up,
    3. recomputes tf-idf with ``t.tfidf_normalize`` if the user typed ``1``,
    4. saves the latest tf-idf result to ``fn`` with ``t.save_CV_all``
       (skipped with a warning while no tf-idf result exists yet).

    Any other input, including a bare Enter, only triggers steps 2 and 4.

    The loop never returns normally. It ends when an exception escapes,
    e.g. ``EOFError`` from the prompt or ``KeyboardInterrupt``.

    Targets Python 2 (``raw_input`` and the builtin ``reload``).

    :param fn: path of the output file handed to ``t.save_CV_all``.
    """
    _log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO, format='%(name)s: %(levelname)-8s %(message)s')

    cnt_articles, article_ids, train_set_dict = t.process_hgw_xml()
    _log.info("*** Data loaded. ***")

    # Nothing can be saved until a tf-idf pass has succeeded at least once.
    CSC_matrix = None
    word_index = None

    while True:
        answer = raw_input("Press enter to start a process cycle:\n")
        # Parse the menu choice once; non-numeric input (e.g. a bare Enter)
        # simply means "no tf-idf recomputation this cycle".
        try:
            choice = int(answer)
        except ValueError:
            choice = None

        try:
            reload(t)
        except NameError:
            _log.error("Could not reload the module.")

        if choice == 1:
            # ``except Exception`` keeps the loop alive on processing errors
            # while still letting KeyboardInterrupt / SystemExit through.
            try:
                _m, _n, CSC_matrix, word_index = t.tfidf_normalize(
                    cnt_articles, article_ids, train_set_dict)
                _log.info("*** Tfidf calculation successfully done. ***")
            except Exception:
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())

        if CSC_matrix is None:
            _log.warning("No tf-idf result available yet (enter 1 to compute it); skipping save.")
        else:
            try:
                t.save_CV_all(CSC_matrix, fn, word_index, article_ids)
                _log.info("*** Data successfully saved. ***")
            except Exception:
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())

        _log.info("Cycle ready.\n")
예제 #2
0
파일: ESA.py 프로젝트: sanja7s/SR_Wiki_ESA
def ESA():
    """Build the tf-idf concept vectors once, then serve an interactive loop.

    Reads the module-level settings ``F_IN``, ``F_OUT_AID`` and ``F_OUT_CV``.
    On start-up the articles are read with
    ``r.read_in_wikiextractor_output``, normalised with
    ``t.tfidf_normalize`` and saved with ``s.save_CV_all``.

    Afterwards every cycle prompts on stdin and reloads modules ``t`` and
    ``s`` so edited code is picked up. The typed number selects the action:

    * ``1`` -- recompute tf-idf,
    * ``2`` -- save the current tf-idf result to ``F_OUT_CV``,
    * anything else (including a bare Enter) -- reload only.

    The loop never returns normally. It ends when an exception escapes,
    e.g. ``EOFError`` from the prompt or ``KeyboardInterrupt``.

    Targets Python 2 (``raw_input`` and the builtin ``reload``).
    """
    _log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)-8s %(message)s")

    articles_with_aid = r.read_in_wikiextractor_output(F_IN, F_OUT_AID)
    _log.info("*** Data loaded. ***")

    tf_idf_matrix, word_index, long_articles = t.tfidf_normalize(articles_with_aid)

    s.save_CV_all(tf_idf_matrix, word_index, long_articles, F_OUT_CV)

    while True:
        answer = raw_input("Press enter to start a process cycle:\n")
        # Parse the menu choice once; non-numeric input (e.g. a bare Enter)
        # selects no action instead of being reported as a process error.
        try:
            choice = int(answer)
        except ValueError:
            choice = None

        try:
            reload(t)
        except NameError:
            _log.error("Could not reload the module.")

        if choice == 1:
            # ``except Exception`` keeps the loop alive on processing errors
            # while still letting KeyboardInterrupt / SystemExit through.
            try:
                tf_idf_matrix, word_index, long_articles = t.tfidf_normalize(articles_with_aid)
                _log.info("*** Tfidf calculation successfully done. ***")
            except Exception:
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())

        # The reload of ``s`` stays inside the guarded region so that a
        # failure while re-importing it is logged rather than fatal.
        try:
            try:
                reload(s)
            except NameError:
                _log.error("Could not reload the module.")
            if choice == 2:
                s.save_CV_all(tf_idf_matrix, word_index, long_articles, F_OUT_CV)
                _log.info("*** Data successfully saved. ***")
        except Exception:
            _log.error("Caught exception from the process\n%s\n%s",
                       sys.exc_info()[0], traceback.format_exc())

        _log.info("Cycle ready.\n")
예제 #3
0
def ESA_selected():
    """Run the interactive tf-idf / save loop on the selected-articles subset.

    The data is loaded once with ``t.process_hgw_xml(selected=True, ...)``.
    Every cycle then prompts on stdin and reloads module ``t`` so edited
    code is picked up. The typed number selects the action:

    * ``1`` -- compute tf-idf with ``t.tfidf_normalize``,
    * ``2`` -- save the result with ``t.save_CV_all``,
    * ``3`` -- save the result with
      ``t.save_CV_with_sliding_window_pruning``,
    * anything else (including a bare Enter) -- reload only.

    Choices ``2`` and ``3`` are skipped with a warning until a tf-idf
    result exists.

    The loop never returns normally. It ends when an exception escapes,
    e.g. ``EOFError`` from the prompt or ``KeyboardInterrupt``.

    Targets Python 2 (``raw_input`` and the builtin ``reload``).
    """
    _log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO, format='%(name)s: %(levelname)-8s %(message)s')

    cnt_articles, article_ids, train_set_dict = t.process_hgw_xml(
        selected=True,
        f_in="Gabrliovich_preprocessed/20051105_pages_articles.hgw.xml",
        f_articles_out="output/AID_titles_test_selected.tsv")
    _log.info("*** Data loaded. ***")

    # Nothing can be saved until a tf-idf pass has succeeded at least once.
    CSC_matrix = None
    word_index = None

    while True:
        answer = raw_input("Press enter to start a process cycle:\n")
        # Parse the menu choice once; non-numeric input (e.g. a bare Enter)
        # selects no action instead of being reported as a process error.
        try:
            choice = int(answer)
        except ValueError:
            choice = None

        try:
            reload(t)
        except NameError:
            _log.error("Could not reload the module.")

        if choice == 1:
            # ``except Exception`` keeps the loop alive on processing errors
            # while still letting KeyboardInterrupt / SystemExit through.
            try:
                _m, _n, CSC_matrix, word_index = t.tfidf_normalize(
                    cnt_articles, article_ids, train_set_dict)
                _log.info("*** Tfidf calculation successfully done. ***")
            except Exception:
                _log.error("Caught exception from the process\n%s\n%s",
                           sys.exc_info()[0], traceback.format_exc())
        elif choice in (2, 3):
            if CSC_matrix is None:
                _log.warning("No tf-idf result available yet (enter 1 to compute it); skipping save.")
            else:
                try:
                    if choice == 2:
                        fn = "output/CV_Gab2005_true_selected_by_Gab.json"
                        t.save_CV_all(CSC_matrix, fn, word_index, article_ids)
                    else:
                        fn = "output/CV_Gab2005_true_selected_by_Gab_pruned.json"
                        t.save_CV_with_sliding_window_pruning(CSC_matrix, fn, word_index, article_ids)
                    _log.info("*** Data successfully saved. ***")
                except Exception:
                    _log.error("Caught exception from the process\n%s\n%s",
                               sys.exc_info()[0], traceback.format_exc())

        _log.info("Cycle ready.\n")