Exemplo n.º 1
0
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Collect the topics of every classification, keyed by year.

    For each processed index ``i`` the probability object loaded from
    ``probFiles[i]`` is reduced to one topic index per row (the column with
    the largest value), the class data is read from ``classFiles[i]``, and
    both are handed to ``classificationOfDocument`` and
    ``topicOfClassification``.  The result is stored under the year taken
    from the probability file's name.

    Args:
        probDir: directory traversed for the probability files.
        modelDir: directory traversed for the topic files; only their count
            is checked here.
        classDir: directory traversed for the class files.
        clf_dict: passed through unchanged to ``classificationOfDocument``
            and ``topicOfClassification``.
        fun: mode selector.  ``0`` processes every file; ``1`` starts at
            index 5.

    Returns:
        dict mapping the four-character year string to the value returned
        by ``topicOfClassification`` for that file.

    Raises:
        ValueError: if ``fun`` is neither 0 nor 1.
        SystemExit: if the three directories do not yield the same number
            of files.
    """
    # First file index to process for each supported mode.
    # acm-class starts from 1998 (original note); presumably the first five
    # files cover earlier years -- confirm against the data layout.
    first_index = {0: 0, 1: 5}
    # Reject an unsupported mode before any directory is scanned; otherwise
    # the loop range below would never be bound.
    if fun not in first_index:
        raise ValueError("unsupported fun: %r (expected 0 or 1)" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    # The lists are paired by index, so they must have equal length.
    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first_index[fun], N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])

        # Four characters preceding the last four of the path; assumes the
        # name ends in "<YYYY>" plus a four-character suffix -- confirm.
        year = probFiles[i][-8:-4]
        # Per-row index of the largest value, flattened to a plain array.
        # Topics are identified by index only, so the topic file itself
        # does not need to be loaded.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)

    return all_clf_topic
Exemplo n.º 2
0
def topics_from_to(start, end):
    """Create the topic graph for the years ``start`` through ``end``.

    Both bounds are inclusive.  The topic files are looked up under
    ``model_path`` and the distance files under ``<root_path>/dtm/distance``;
    the value returned by ``graph.createGraph`` is passed back unchanged.
    """
    span = range(start, end + 1)

    topic_files = fileSys.traverseTopicDirecotry(model_path, 1, span)

    # Directory where the distances are stored.
    distance_dir = path.join(root_path, 'dtm/distance')
    distance_files = fileSys.traverseDistanceDirectory(distance_dir, span)

    return graph.createGraph(topic_files, distance_files, 0)
Exemplo n.º 3
0
def statistics_for_class(class_mode, class_name, start=1993, end=2015):
    """Build the per-year topic bar chart for one classification.

    Loads ``class_topic/class_topic_<class_mode>.pkl`` below ``root_path``
    and, for every year from ``start`` to ``end`` (inclusive) that has an
    entry for ``class_name``, counts how often each topic value occurs.
    Years without an entry are reported on stdout and skipped.

    Args:
        class_mode: ``'acm-class'`` or ``'arxiv-category'``.
        class_name: key looked up inside each year's entry.
        start: first year to inspect (default 1993).
        end: last year to inspect, inclusive (default 2015).

    Returns:
        Whatever ``graph.createBarChat`` returns for the topic files, the
        per-year counters and the list of years that had data.

    Raises:
        ValueError: if ``class_mode`` is not one of the two supported modes.
    """
    # Fail clearly on an unknown mode instead of hitting an unbound
    # ``clf_topic`` further down.
    if class_mode not in ('acm-class', 'arxiv-category'):
        raise ValueError(
            "unknown class_mode: %r (expected 'acm-class' or "
            "'arxiv-category')" % (class_mode,))

    clf_topic = ioFile.load_object(
        path.join(root_path, 'class_topic/class_topic_%s.pkl' % class_mode))

    clf_topic_stat = []
    years = []
    for year in range(start, end + 1):
        # Only the lookup can raise KeyError, so keep the try body minimal.
        try:
            doc_topics = clf_topic[str(year)][class_name]
        except KeyError:
            print("No documents belonging to %s in %s " % (class_name, year))
            continue
        clf_topic_stat.append(Counter(doc_topics))
        years.append(year)

    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, years)

    topic_bar = graph.createBarChat(topicFiles, clf_topic_stat, years)

    return topic_bar
Exemplo n.º 4
0
def topics_for_class(class_mode, class_name, start, end):
    """Build the topic graph for one classification over a span of years.

    Loads ``class_topic/class_topic_<class_mode>.pkl`` below ``root_path``
    and, for every year from ``start`` to ``end`` (inclusive) that has an
    entry for ``class_name``, records a counter of its topic values and the
    number of distinct topics.  Years without an entry are dropped silently.

    Args:
        class_mode: ``'acm-class'`` or ``'arxiv-category'``.
        class_name: key looked up inside each year's entry.
        start: first year to inspect.
        end: last year to inspect, inclusive.

    Returns:
        Tuple ``(topic_graph, years)``: the value returned by
        ``graph.createGraph`` and the set of years that had data.

    Raises:
        ValueError: if ``class_mode`` is not one of the two supported modes.
    """
    # Fail clearly on an unknown mode instead of hitting an unbound
    # ``clf_topic`` further down.
    if class_mode not in ('acm-class', 'arxiv-category'):
        raise ValueError(
            "unknown class_mode: %r (expected 'acm-class' or "
            "'arxiv-category')" % (class_mode,))

    clf_topic = ioFile.load_object(
        path.join(root_path, 'class_topic/class_topic_%s.pkl' % class_mode))

    clf_topic_stat = []
    topic_num = []
    # Kept as a chronological list so the helpers below receive the years
    # in the same order as clf_topic_stat and topic_num, rather than in the
    # arbitrary iteration order of a set.
    years = []
    for year in range(start, end + 1):
        # Only the lookup can raise KeyError, so keep the try body minimal.
        try:
            doc_topics = clf_topic[str(year)][class_name]
        except KeyError:
            continue
        clf_topic_stat.append(Counter(doc_topics))
        topic_num.append(len(set(doc_topics)))
        years.append(year)

    topicFiles = fileSys.traverseTopicDirecotry(model_path, 1, years)
    distance_dir = path.join(root_path, 'class_topic/distance')
    distanceFiles = fileSys.traverseDistanceDirectory(distance_dir, years)

    topic_graph = graph.createGraph(topicFiles, distanceFiles, 2,
                                    clf_topic_stat, topic_num)

    # Callers have always received a set of years; keep that return type.
    return topic_graph, set(years)
Exemplo n.º 5
0
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict,
                                    fun):
    """Collect the topics of every classification, keyed by year.

    For each processed index ``i`` the probability object loaded from
    ``probFiles[i]`` is reduced to one topic index per row (the column with
    the largest value), the class data is read from ``classFiles[i]``, and
    both are handed to ``classificationOfDocument`` and
    ``topicOfClassification``.  The result is stored under the year taken
    from the probability file's name.

    Args:
        probDir: directory traversed for the probability files.
        modelDir: directory traversed for the topic files; only their count
            is checked here.
        classDir: directory traversed for the class files.
        clf_dict: passed through unchanged to ``classificationOfDocument``
            and ``topicOfClassification``.
        fun: mode selector.  ``0`` processes every file; ``1`` starts at
            index 5.

    Returns:
        dict mapping the four-character year string to the value returned
        by ``topicOfClassification`` for that file.

    Raises:
        ValueError: if ``fun`` is neither 0 nor 1.
        SystemExit: if the three directories do not yield the same number
            of files.
    """
    # First file index to process for each supported mode.
    # acm-class starts from 1998 (original note); presumably the first five
    # files cover earlier years -- confirm against the data layout.
    first_index = {0: 0, 1: 5}
    # Reject an unsupported mode before any directory is scanned; otherwise
    # the loop range below would never be bound.
    if fun not in first_index:
        raise ValueError("unsupported fun: %r (expected 0 or 1)" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    # The lists are paired by index, so they must have equal length.
    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first_index[fun], N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])

        # Four characters preceding the last four of the path; assumes the
        # name ends in "<YYYY>" plus a four-character suffix -- confirm.
        year = probFiles[i][-8:-4]
        # Per-row index of the largest value, flattened to a plain array.
        # Topics are identified by index only, so the topic file itself
        # does not need to be loaded.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)

    return all_clf_topic