def get_tf_idf_scores(accu_path, input_data=None, vis=False):
    """Compute TF-IDF scores for a document-feature count matrix.

    Term frequency is log-normalised (tf = log(1 + count)); inverse
    document frequency uses binary presence/absence counts:
    idf = log(N / df) for features present in at least one document,
    0 for features that never occur.

    :param accu_path: directory handed to utils.load_all_learning_files
        when input_data is not supplied.
    :param input_data: optional (code_book, graphlets, data) tuple, where
        data is an (N documents x f features) count matrix.
    :param vis: unused; kept for backward compatibility with callers.
    :return: (N x f) numpy array of tf-idf scores.
    """
    # Explicit None check instead of a bare except around tuple unpacking,
    # which would silently hide malformed input_data.
    if input_data is not None:
        code_book, graphlets, data = input_data
    else:
        code_book, graphlets, data = utils.load_all_learning_files(accu_path)

    data = np.asarray(data)
    # BINARY COUNTING OF FEATURES: number of documents each feature occurs in.
    feature_freq = (data != 0).sum(axis=0)
    N, f = data.shape  # number of documents, number of features
    print("number of documents = %s, number of features = %s " % (N, f))

    # Inverse document frequency; absent features score 0 rather than
    # raising a ZeroDivisionError.
    idf_scores = [math.log(N / float(i)) if i > 0 else 0 for i in feature_freq]

    # Log-normalised term frequency times idf, broadcast across rows in one
    # vectorized expression -- replaces the original O(n^2) np.append loop
    # and keeps a proper (N, f) shape even for empty input.
    tf_idf_scores = np.log1p(data) * np.asarray(idf_scores)

    print("tf-idf shape:", tf_idf_scores.shape)
    return tf_idf_scores
def run_topic_model(accu_path, n_iters, n_topics, create_images, dirichlet_params, class_thresh=0):
    """Learn an LDA topic model over the codebook histograms at accu_path.

    :param accu_path: directory holding the learning files (code book,
        graphlets and the document-feature count matrix).
    :param n_iters: number of sampling iterations for the topic model.
    :param n_topics: number of topics to learn.
    :param create_images: forwarded to get_dic_codebook (graphlet images).
    :param dirichlet_params: Dirichlet prior parameters, forwarded to
        learn_topic_model.
    :param class_thresh: threshold on per-document topic proportions,
        forwarded to learn_topic_model.
    :return: (doc_topic, topic_word) -- per-document topic proportions and
        per-topic word distributions.
    """
    code_book, graphlets, data = utils.load_all_learning_files(accu_path)

    # The codebook dictionary is only needed for pyLDAvis visualisation;
    # fall back to an empty dict when pyLDAvis is not installed.
    dictionary_codebook = {}
    try:
        import pyLDAvis
        dictionary_codebook = get_dic_codebook(code_book, graphlets, create_images)
    except ImportError:
        print("No module pyLDAvis. Cannot visualise topic model")

    print("sum of all data:", data.shape, data.sum())
    # Vocabulary terms are the hashed codebook entries rendered as strings.
    # Loop variable renamed from 'hash' to avoid shadowing the builtin.
    vocab = ["{:20.0f}".format(code).lstrip() for code in list(code_book)]

    doc_topic, topic_word = learn_topic_model(data, vocab, n_topics, n_iters,
                                              dictionary_codebook, dirichlet_params,
                                              class_thresh)
    print(" per document topic proportions: ", doc_topic.shape)
    print(" per topic word distributions: ", topic_word.shape)
    return doc_topic, topic_word