    def __init__(self, model_name, func_tokenizer, func_stemmer):
        '''
            - Pre-condition:
                pickle files under data_home/ ('dict_articles.pkl', 'df_articles.pkl',
                self.model_name + 'W_articles.pkl', self.model_name + 'X_articles.csv'),
                or the articles collection in mongodb if the pickles are missing
            - INPUT:
                model_name      str, name of the topic model ('v2_2' if empty)
                func_tokenizer  tokenizer used by model_name
                func_stemmer    stemmer used by model_name
            - Post-condition:
                self.df_articles, self.W_articles, self.X_articles are populated
        '''
        if model_name == '':
            model_name = 'v2_2'
        self.top_k_recommend = 5
        self.max_rank = 50
        self.top_k_topics = 5
        self.max_cosine_sim_tfidf = 0.5

        self.method = None  # content or rating
        self.max_len_body_text = 600  # 3000
        self.model_name = model_name  # 'v2_2'

        t0 = time.time()  # time it

        self.topic_model = load_topic_model(
            model_name, func_tokenizer, func_stemmer)
        t1 = time.time()  # time it
        print "finished in  %4.4f %s " % ((t1 - t0) / 60, 'loading model\n')

        # load all articles
        # these include newest articles, which may not be used in model_name
        # and related H
        t0 = t1
        df_article_fname = data_home + 'df_articles.pkl'
        dict_article_fname = data_home + 'dict_articles.pkl'
        W_article_fname = data_home + self.model_name + 'W_articles.pkl'
        X_article_fname = data_home + self.model_name + 'X_articles.csv'

        if os.path.exists(df_article_fname):
            print 'found pickle files %s' % df_article_fname
            self.load_articles_from_pickle(
                df_article_fname, W_article_fname, X_article_fname, dict_article_fname)
        else:
            print 'no pickle files %s; reading from mongodb' % df_article_fname
            self.df_articles = read_articles()
            self.W_articles, tokenized_articles, self.X_articles = self.topic_model.transform_bodytext2topics(
                self.df_articles.body_text, 1)

            # cache the results so later runs can skip the mongodb read and transform
            with open(df_article_fname, 'wb') as out_fh:
                pickle.dump(self.df_articles, out_fh)
            with open(W_article_fname, 'wb') as out_fh:
                pickle.dump(self.W_articles, out_fh)
            with open(X_article_fname, 'wb') as out_fh:
                pickle.dump(self.X_articles, out_fh)
            with open(dict_article_fname, 'wb') as out_fh:
                pickle.dump(self.df_articles.to_dict(), out_fh)

        # print topic_model.sorted_topics_for_articles(W_articles[:1,:])
        self.sorted_topics_articles = self.topic_model.sorted_topics_for_articles(
            self.W_articles)

        t1 = time.time()  # time it
        print 'topics for articles:'
        print "finished in  %4.4f min for %s " % ((t1 - t0) / 60, 'topics of articles\n')
def make_recommendation(fname, model_name='v2_2'):
    '''
    Test-run content-based recommendation on the content in fname
    (command-line entry point for making recommendations).
        - INPUT:
            fname       str, input file name (in folder data/)
            model_name  str, topic model to load (default 'v2_2')
        - OUTPUT:
            prints the top recommendations and saves similarity diagnostic
            plots (*_similarities.png, *_similarity_hist.png) next to fname
    '''
    # load model
    t0 = time.time()
    func_tokenizer = TfidfVectorizer(stop_words='english').build_tokenizer()
    topic_model = load_topic_model(model_name, func_tokenizer)
    t1 = time.time()  # time it
    print "finished in  %4.4f %s " % ((t1 - t0) / 60, 'loading model\n')

    t0 = t1

    print 'fname: %s' % fname
    #read in input
    cleaned_slack = read_slack_msgs(func_tokenizer, fname=fname)
    # print type(cleaned_slack)
    W, tokenized_slacks2, test_X2 = topic_model.transform_bodytext2topics(
        [cleaned_slack], 1)
    print 'topics for slack messages'
    print topic_model.sorted_topics_for_articles(W)
    t1 = time.time()  # time it
    print "finished in  %4.4f min %s " % ((t1 - t0) / 60, 'topics of slack message\n')

    # load articles
    t0 = t1
    df_articles = read_articles()
    W_articles, tokenized_articles, X_articles = topic_model.transform_bodytext2topics(
        df_articles.body_text, 1)
    # print topic_model.sorted_topics_for_articles(W_articles[:1,:])
    sorted_topics_articles = topic_model.sorted_topics_for_articles(W_articles)
    # print sorted_topics_articles[:1]
    t1 = time.time()  # time it
    print '%i articles processed' % df_articles.shape[0]
    print "finished in  %4.4f min for %s " % ((t1 - t0) / 60, 'topics of articles\n')

    # summary of input
    top_n = 50
    print "top %i most frequenct features in input %s" % (top_n, fname)
    sorted_feature_indexes = np.argsort(test_X2, axis=1)
    # print test_X2[desc_feature_indexes[:top_n]]
    features = topic_model.vectorizer.get_feature_names()
    i_article = 0
    desc_feature_indexes = sorted_feature_indexes[
        i_article, :].getA().flatten()[::-1]
    txt_list = []
    for i in desc_feature_indexes[:top_n]:
        txt_list.append('%s (%.2f)' % (features[i], test_X2[i_article, i]))
    print ', '.join(txt_list)

    # calculate similarity to all articles
    t0 = time.time()
    cosine_similarities = linear_kernel(
        X_articles, test_X2[i_article, :]).flatten()
    cosin_simi_latent_topics = linear_kernel(
        W_articles, W[i_article, :]).flatten()
    cosine_similarities_rank = get_rank(cosine_similarities)
    cosin_simi_latent_topics_rank = get_rank(cosin_simi_latent_topics)
    t1 = time.time()  # time it
    print "finished in  %4.4f min for %s " % ((t1 - t0) / 60, 'calculate cosine similarity\n')

    # diagnostic similarity plots
    fig, ax = plt.subplots(1, 2, figsize=(10, 6))
    ax[0].scatter(cosine_similarities, cosin_simi_latent_topics, alpha=0.2)
    ax[0].set_title('cosine similarity: tfidf vs. latent topic')

    ax[1].scatter(
        cosine_similarities_rank, cosin_simi_latent_topics_rank, alpha=0.2)
    ax[1].set_title('rank cosine similarity: tfidf vs. latent topic')
    # plt.show()
    fig.savefig(fname.replace('.txt', '') + '_similarities.png')
    plt.close(fig)

    fig = plt.figure()
    plt.hist(cosine_similarities, bins=30, alpha=0.2, label='tfidf')
    plt.hist(cosin_simi_latent_topics, bins=30, alpha=0.2, label='topics')
    plt.title('cosine similarity to all articles')
    plt.legend()
    fig.savefig(fname.replace('.txt', '') + '_similarity_hist.png')
    # plt.show()
    plt.close(fig)

    # recommendations
    print '--------------- recommendations --------------'
    desc_sim_indexes = np.argsort(cosine_similarities)[::-1]
    i_print = 0
    top_k = 5

    # walk candidates in descending tf-idf similarity, keeping only those that
    # also rank in the top 50 by latent-topic similarity
    for i in desc_sim_indexes[:50]:
        if cosin_simi_latent_topics_rank[i] < 50 and i_print < top_k:
            url = df_articles.iloc[i].url
            print sorted_topics_articles[i][0:2]
            print cosine_similarities[i], cosine_similarities_rank[i], '***' + df_articles.iloc[i].title + '***'
            print cosin_simi_latent_topics[i], cosin_simi_latent_topics_rank[i]
            print
            body_cleaned = ascii_text(df_articles.iloc[i].body_text[:300])
            print body_cleaned
            print
            i_print = i_print + 1
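

# Illustrative command-line entry point: a minimal sketch, assuming this module
# is run directly as a script. The script name shown in the usage string and the
# positional-argument convention are assumptions, not part of the original code.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print 'usage: python recommend.py <fname> [model_name]'
        sys.exit(1)
    in_fname = sys.argv[1]
    in_model_name = sys.argv[2] if len(sys.argv) > 2 else 'v2_2'
    make_recommendation(in_fname, model_name=in_model_name)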