data.set_database_filename("data/wikidump.db")
data.build()
print("Done,", time.time() - t, "s")

t = time.time()
print("============== Building list of ambiguous words ===================")
filename_ambiguouswords = "data/ambiguous_words.txt"
with open(filename_ambiguouswords, 'r') as f:
    ambiguous_words = {x.rstrip() for x in f.readlines()}
if "" in ambiguous_words:
    ambiguous_words.remove("")
nb_ambiguous_words = len(ambiguous_words)
print("Done,", time.time() - t, "s")

t = time.time()
print("======================== Build corpora ============================")
for n, w in enumerate(ambiguous_words):
    t2 = time.time()
    print("%s (%d/%d)" % (w, n, nb_ambiguous_words))
    filename = "data/corpora/" + w + ".dump"
    if os.path.isfile(filename):
        print("Already done.")
        continue
    corpus = data.get_corpus(w)
    with open(filename, 'wb') as f:
        pickle.dump(corpus, f)
    print("ok (%f s)" % (time.time() - t2))
print("Done,", time.time() - t, "s")
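# Hedged illustration (not part of the original scripts): each corpus written by
# the loop above is a plain pickle dump, so it can be reloaded later without
# re-running the mining step. The word "bar" is a hypothetical example that
# follows the data/corpora/<word>.dump naming convention used above.
import os
import pickle

example_dump = "data/corpora/bar.dump"
if os.path.isfile(example_dump):
    with open(example_dump, 'rb') as f:
        example_corpus = pickle.load(f)
    print("Reloaded", example_dump, "with", len(example_corpus), "articles")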
import os
import time

from ambiruptor.library.learners.models import DecisionTreeClassifier
from ambiruptor.library.learners.models import RandomForestClassifier
from ambiruptor.library.miners.wiki_miners import DataMining
# Note: `fe` below refers to the project's feature-extraction module; its exact
# import path is not shown in this fragment.

if __name__ == '__main__':
    # Data Mining: build the database from the Wikipedia dump and fetch a sample corpus.
    print("************************** Data mining ***************************")
    t = time.time()
    data = DataMining()
    data.set_wikidump_filename("data/wikidump.xml")
    data.set_database_filename("data/wikidump.db")
    data.build()
    corpus = data.get_corpus("Bar_(disambiguation)")
    print("Size of the corpus:", len(corpus), "articles")
    print("Done,", time.time() - t, "s")

    # Building features: load a cached feature extractor if one exists,
    # otherwise build it from scratch.
    print("********************** Building/Loading features *************************")
    t = time.time()
    feature_extractor = fe.AmbiguousExtraction()
    if os.path.isfile("data/feature_extractors/test.dump"):
        print("Loading feature extractor...")
        feature_extractor.load("data/feature_extractors/test.dump")
        corpus_extractor = fe.CorpusExtraction()
        for f in feature_extractor.features:
            corpus_extractor.add_feature(f)
    else:
        print("Building feature extractor...")