import math

import numpy as np

import utils


def get_tf_idf_scores(accu_path, input_data=None, vis=False):
    if input_data is not None:
        (code_book, graphlets, data) = input_data
    else:
        code_book, graphlets, data = utils.load_all_learning_files(accu_path)

    """BINARY COUNTING OF FEATURES:"""
    feature_freq = (data != 0).sum(axis=0)  # TF: document_frequencies
    (N, f) = data.shape                     # Number of documents, and number of features
    print "number of documents = %s, number of features = %s " % (N, f)

    """
    ## Inverse document frequency scores
    ## LONG HAND
    # idf_scores=[]
    # for i in feature_freq:
    #     try:
    #         idf_scores.append(math.log((N /float(i))))
    #     except:
    #         idf_scores.append(0)
    """
    idf_scores = [(math.log((N / float(i)))) if i > 0 else 0 for i in feature_freq]

    # Sub-linear term frequency, log(1 + freq), weighted by each feature's IDF
    tf_idf_scores = np.array(
        [[idf_scores[cnt] * math.log(1 + freq) for cnt, freq in enumerate(histogram)]
         for histogram in data])

    print "tf-idf shape:", tf_idf_scores.shape
    return tf_idf_scores
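

# A minimal, self-contained sketch of the TF-IDF weighting used above, run on
# a toy term-count matrix. The dummy counts and the helper name _tf_idf_sketch
# are illustrative only; they are not part of this project.
def _tf_idf_sketch():
    toy = np.array([[2, 0, 1],
                    [0, 3, 1],
                    [1, 1, 0]])                 # 3 documents x 3 features
    n_docs = toy.shape[0]
    df = (toy != 0).sum(axis=0)                 # document frequency per feature
    idf = [math.log(n_docs / float(i)) if i > 0 else 0 for i in df]
    return np.array([[idf[c] * math.log(1 + f) for c, f in enumerate(row)]
                     for row in toy])           # sub-linear TF * IDF

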
def run_topic_model(accu_path, n_iters, n_topics, create_images, dirichlet_params, class_thresh=0):

    code_book, graphlets, data = utils.load_all_learning_files(accu_path)
    dictionary_codebook = {}
    try:
        # pyLDAvis is only needed for visualisation; degrade gracefully without it
        import pyLDAvis
        dictionary_codebook = get_dic_codebook(code_book, graphlets, create_images)
    except ImportError:
        print "pyLDAvis is not installed. Cannot visualise the topic model."

    print "sum of all data:", data.shape, data.sum()
    vocab = [ "{:20.0f}".format(hash).lstrip() for hash in list(code_book) ]
    # print "vocab:", len(vocab)

    doc_topic, topic_word = learn_topic_model(data, vocab, n_topics, n_iters, dictionary_codebook, dirichlet_params, class_thresh)
    print "per-document topic proportions:", doc_topic.shape
    print "per-topic word distributions:", topic_word.shape

    return doc_topic, topic_word
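

# Hedged usage sketch: the path and hyperparameter values below are
# illustrative assumptions, not taken from this project. dirichlet_params is
# passed straight through to learn_topic_model, so its expected shape depends
# on that function (assumed here to be an (alpha, eta) pair).
# doc_topic, topic_word = run_topic_model(
#     accu_path="/path/to/learning/files",  # hypothetical directory
#     n_iters=1000, n_topics=10, create_images=False,
#     dirichlet_params=(0.1, 0.01), class_thresh=0.3)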