def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Collect, for every yearly file, the topics associated with each classification.

    NOTE(review): this function is defined twice in this file (same name and
    signature); the later definition is the one bound after import.

    Args:
        probDir: Directory passed to ``fileSys.traverseDirectory`` to list the
            per-year probability files.
        modelDir: Directory passed to ``fileSys.traverseTopicDirecotry``; only
            the number of files found is used here (consistency check).
        classDir: Directory passed to ``fileSys.traverseDirectory`` to list the
            per-year classification files.
        clf_dict: Forwarded unchanged to ``classificationOfDocument`` and
            ``topicOfClassification``.
        fun: Mode selector. ``0`` processes every yearly file; ``1`` skips the
            first five files (acm-class start from 1998).

    Returns:
        dict mapping the year string sliced from each probability file name to
        the value returned by ``topicOfClassification`` for that year.

    Raises:
        ValueError: If ``fun`` is neither 0 nor 1.
        SystemExit: If the three directories do not hold the same number of
            files.
    """
    # Reject an unsupported mode before touching the file system; previously
    # this surfaced only later as an UnboundLocalError on the loop range.
    if fun == 0:
        first_index = 0
    elif fun == 1:
        # acm-class start from 1998
        first_index = 5
    else:
        raise ValueError("fun must be 0 or 1, got %r" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first_index, N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])
        # Assumes the path ends in a four-character year followed by a
        # four-character suffix such as ".pkl" -- TODO confirm.
        year = probFiles[i][-8:-4]
        # Index of the highest-scoring column for each row of ``prob``
        # (presumably one row per document, one column per topic).
        doc_topic = np.squeeze(np.array(prob.argmax(1)))
        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)
    return all_clf_topic
def topics_from_to(start, end):
    """Create the topic graph covering the years ``start`` through ``end`` inclusive.

    Topic files are looked up under the module-level ``model_path`` and the
    distance files under ``<root_path>/dtm/distance``; both lookups are
    restricted to the requested years.

    Args:
        start: First year of the span.
        end: Last year of the span (included).

    Returns:
        Whatever ``graph.createGraph`` returns for the selected files in mode 0.
    """
    year_span = range(start, end + 1)
    topic_files = fileSys.traverseTopicDirecotry(model_path, 1, year_span)
    # Directory where the distances are stored.
    distance_dir = path.join(root_path, 'dtm/distance')
    distance_files = fileSys.traverseDistanceDirectory(distance_dir, year_span)
    return graph.createGraph(topic_files, distance_files, 0)
def statistics_for_class(class_mode, class_name):
    """Build a bar chart of per-year topic counts for one class.

    Loads ``class_topic/class_topic_<class_mode>.pkl`` under ``root_path``,
    then for each year from 1993 through 2015 counts the topic entries stored
    for ``class_name``. Years with no entry for the class are reported on
    stdout and left out of the chart.

    Args:
        class_mode: Either ``'acm-class'`` or ``'arxiv-category'``.
        class_name: Key looked up inside each year's mapping.

    Returns:
        Whatever ``graph.createBarChat`` returns for the years that had data.

    Raises:
        ValueError: If ``class_mode`` is not one of the two supported modes.
    """
    # An unknown mode used to leave ``clf_topic`` unbound and crash inside the
    # loop with an UnboundLocalError; fail early with a clear message instead.
    if class_mode not in ('acm-class', 'arxiv-category'):
        raise ValueError(
            "class_mode must be 'acm-class' or 'arxiv-category', got %r"
            % (class_mode,))
    clf_topic = ioFile.load_object(
        path.join(root_path, 'class_topic/class_topic_%s.pkl' % class_mode))

    clf_topic_stat = []
    years = []
    for year in range(1993, 2016):
        # Only the lookup can legitimately miss (unknown year or class).
        try:
            class_topics = clf_topic[str(year)][class_name]
        except KeyError:
            print("No documents belonging to %s in %s " % (class_name, year))
            continue
        clf_topic_stat.append(Counter(class_topics))
        years.append(year)

    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, years)
    topic_bar = graph.createBarChat(topicFiles, clf_topic_stat, years)
    return topic_bar
def topics_for_class(class_mode, class_name, start, end):
    """Build the topic graph of one class over the years ``start``..``end``.

    Loads ``class_topic/class_topic_<class_mode>.pkl`` under ``root_path`` and,
    for every year in the inclusive span that has an entry for ``class_name``,
    records a ``Counter`` of its topic entries and the number of distinct
    topics. Years without an entry are silently dropped.

    Args:
        class_mode: Either ``'acm-class'`` or ``'arxiv-category'``.
        class_name: Key looked up inside each year's mapping.
        start: First year of the span.
        end: Last year of the span (included).

    Returns:
        Tuple ``(topic_graph, years)`` where ``topic_graph`` is the result of
        ``graph.createGraph`` in mode 2 and ``years`` is the set of years that
        had data for the class.

    Raises:
        ValueError: If ``class_mode`` is not one of the two supported modes.
    """
    # An unknown mode used to leave ``clf_topic`` unbound and crash inside the
    # loop with an UnboundLocalError; fail early with a clear message instead.
    if class_mode not in ('acm-class', 'arxiv-category'):
        raise ValueError(
            "class_mode must be 'acm-class' or 'arxiv-category', got %r"
            % (class_mode,))
    clf_topic = ioFile.load_object(
        path.join(root_path, 'class_topic/class_topic_%s.pkl' % class_mode))

    clf_topic_stat = []
    topic_num = []
    # Kept as an ascending list so the years handed to the file helpers line
    # up index-for-index with clf_topic_stat and topic_num; iterating a set
    # gives no such ordering guarantee.
    valid_years = []
    for year in range(start, end + 1):
        try:
            class_topics = clf_topic[str(year)][class_name]
        except KeyError:
            continue
        clf_topic_stat.append(Counter(class_topics))
        topic_num.append(len(set(class_topics)))
        valid_years.append(year)

    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, valid_years)
    distance_dir = path.join(root_path, 'class_topic/distance')
    distanceFiles = fileSys.traverseDistanceDirectory(distance_dir, valid_years)
    topic_graph = graph.createGraph(
        topicFiles, distanceFiles, 2, clf_topic_stat, topic_num)
    # Callers have always received a set here; keep that type.
    return topic_graph, set(valid_years)
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Collect, for every yearly file, the topics associated with each classification.

    NOTE(review): this function is defined twice in this file (same name and
    signature); this later definition replaces the earlier one at import time.

    Args:
        probDir: Directory passed to ``fileSys.traverseDirectory`` to list the
            per-year probability files.
        modelDir: Directory passed to ``fileSys.traverseTopicDirecotry``; only
            the number of files found is used here (consistency check).
        classDir: Directory passed to ``fileSys.traverseDirectory`` to list the
            per-year classification files.
        clf_dict: Forwarded unchanged to ``classificationOfDocument`` and
            ``topicOfClassification``.
        fun: Mode selector. ``0`` processes every yearly file; ``1`` skips the
            first five files (acm-class start from 1998).

    Returns:
        dict mapping the year string sliced from each probability file name to
        the value returned by ``topicOfClassification`` for that year.

    Raises:
        ValueError: If ``fun`` is neither 0 nor 1.
        SystemExit: If the three directories do not hold the same number of
            files.
    """
    # Reject an unsupported mode before touching the file system; previously
    # this surfaced only later as an UnboundLocalError on the loop range.
    if fun == 0:
        first_index = 0
    elif fun == 1:
        # acm-class start from 1998
        first_index = 5
    else:
        raise ValueError("fun must be 0 or 1, got %r" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first_index, N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])
        # Assumes the path ends in a four-character year followed by a
        # four-character suffix such as ".pkl" -- TODO confirm.
        year = probFiles[i][-8:-4]
        # Index of the highest-scoring column for each row of ``prob``
        # (presumably one row per document, one column per topic).
        doc_topic = np.squeeze(np.array(prob.argmax(1)))
        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)
    return all_clf_topic