def train(links):
    from math import exp, fabs, log
    fwords = most_frequent_words()
    classifiers = [PredicateClassifier(HasWordsPredicate([w])) for w in fwords]
    #classifiers.extend(PredicateClassifier(HasWordsPredicate(duo))
    #                   for duo in most_frequent_duos(fwords))
    titles = [mash_post(l) for l in links]
    evaluations = [1. if l.evaluation else -1. for l in links]
    # Start from uniform example weights, as in standard AdaBoost.
    weights = [1. / len(links) for l in links]
    trained = []
    print "Training on %d features..." % len(classifiers)
    while True:
        print ".",
        # Pick the weak classifier with the lowest weighted error.
        min_error = 1e6
        best = None
        for c in classifiers:
            c.train(titles, weights, evaluations)
            # 0.5 * |h(x) - y| is 1 on a misclassification, 0 otherwise,
            # so this is the weighted error rate of classifier c.
            error = sum(weights[n] * 0.5 * fabs(c.predict(t) - evaluations[n])
                        for n, t in enumerate(titles))
            if error < min_error:
                best = c
                min_error = error
        # Stop once no remaining classifier beats random guessing.
        if min_error >= 0.5:
            print min_error
            break
        # Classifier weight: alpha_t = 0.5 * ln((1 - e_t) / e_t).
        alphat = 0.5 * log((1 - min_error) / min_error)
        # Re-weight examples: D_{t+1}(n) = D_t(n) * exp(-alpha_t * y_n * h_t(x_n)) / Z_t,
        # where Z_t normalizes the new weights to sum to 1.
        Zt = sum(weights[n] * exp(-alphat * best.predict(t) * evaluations[n])
                 for n, t in enumerate(titles))
        weights = [weights[n] * exp(-alphat * best.predict(t) * evaluations[n]) / Zt
                   for n, t in enumerate(titles)]
        trained.append((best, alphat))
        classifiers.remove(best)
    for c, alpha in trained:
        print c.predicate, c.wordgood, alpha
    import cPickle
    cPickle.dump(trained, open("adaboost.pck", "wb"), -1)
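# A minimal sketch (a hypothetical helper, not part of the original module) of
# how the pickled (classifier, alpha) pairs can score a new link: the standard
# AdaBoost decision rule takes the sign of the alpha-weighted sum of the weak
# classifiers' votes. It assumes mash_post() produces the same representation
# used in train().
def classify(link, model_path="adaboost.pck"):
    import cPickle
    trained = cPickle.load(open(model_path, "rb"))
    title = mash_post(link)
    score = sum(alpha * c.predict(title) for c, alpha in trained)
    return score > 0  # True -> predicted to be a good link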
import json

import pandas as pd


def text_analysis(
    data_path, column, groups, language, lemmatize, ngram_range,
    num_topics, num_words, manual_mappings, generate_word_cloud,
    word_cloud_filename, frequent_words_filename,
    frequent_words_plot_filename, top_tfidf_words_filename,
    top_tfidf_words_plot_filename, predict_topics, topics_filename,
    predicted_topics_filename, ldavis_filename_prefix, predict_sentiment,
    predicted_sentiment_filename, should_upload_db, account_key_path
):
    """End-to-end text analysis pipeline: cleaning, frequent-word and tf-idf
    statistics (optionally per group), LDA topic modeling, and sentiment
    prediction, with optional upload of each artifact to a database."""
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    print("Loaded data sample")
    print(data_df.head())
    print()

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    print("Clean data sample")
    print(data_df.head())
    print()

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    print("Data sample")
    print(data_df.head())
    print()

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        print("Lemmatized data sample")
        print(data_df.head())
        print()

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        print("Manually mapped data sample")
        print(data_df.head())
        print()

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    # Bag-of-words counts over the whole corpus.
    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_word_count_pair_list = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]

    # Tf-idf scores over the whole corpus.
    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        data_df[column], language, ngram_range
    )
    all_tfidf_pair_list = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    print("Saving frequent words...")
    save_words(all_word_count_pair_list, frequent_words_filename)
    print("Frequent words saved to:", frequent_words_filename)
    print()

    db_client = connect_db(account_key_path) if should_upload_db else None

    if should_upload_db:
        print("Uploading frequent words to db...")
        upload_db(db_client, 'frequent_words', {
            column: {w: int(c) for w, c in word_count_pair_list}
        })
        print('Done')
        print()

    print("Generating frequent word plot...")
    plot_top_words(word_count_pair_list, frequent_words_plot_filename)
    print("Frequent word plot saved to:", frequent_words_plot_filename)
    print()

    print("Saving top tfidf words...")
    save_words(all_tfidf_pair_list, top_tfidf_words_filename)
    print("Top tfidf words saved to:", top_tfidf_words_filename)
    print()

    if should_upload_db:
        print("Uploading top tfidf words to db...")
        upload_db(db_client, 'top_tfidf', {
            column: {w: int(c) for w, c in tfidf_pair_list}
        })
        print('Done')
        print()

    print("Generating top tfidf word plot...")
    plot_top_words(tfidf_pair_list, top_tfidf_words_plot_filename)
    print("Top tfidf word plot saved to:", top_tfidf_words_plot_filename)
    print()

    if groups:
        # Repeat the frequent-word and tf-idf analyses for every combination
        # of values of the grouping columns.
        group_unique_vals = {}
        for group in groups:
            group_unique_vals[group] = data_df[group].unique()

        splits = {}
        for group, unique_vals in group_unique_vals.items():
            for val in unique_vals:
                splits[(group, val)] = data_df[group] == val

        for i in range(len(groups) - 1):
            splits = concat_splits(splits)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split = data_df[split_idcs]
            split_texts = split[column]
            if len(split_texts) > 0 and any(split_texts.str.len() > 0):
                word_cloud_filename_val = add_prefix_to_filename(
                    word_cloud_filename, key
                )
                frequent_words_filename_val = add_prefix_to_filename(
                    frequent_words_filename, key
                )
                frequent_words_plot_filename_val = add_prefix_to_filename(
                    frequent_words_plot_filename, key
                )
                top_tfidf_words_filename_val = add_prefix_to_filename(
                    top_tfidf_words_filename, key
                )
                top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                    top_tfidf_words_plot_filename, key
                )

                if generate_word_cloud:
                    print("Generating word cloud...")
                    plot_word_cloud(split_texts, word_cloud_filename_val, language)
                    print("word_cloud saved to:", word_cloud_filename_val)
                    print()

                try:
                    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_word_count_pair_list = most_frequent_words(
                        count_data, count_vectorizer, count_data.shape[0] + 1
                    )
                    word_count_pair_list = all_word_count_pair_list[:num_words]

                    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
                        split_texts, language, ngram_range
                    )
                    all_tfidf_pair_list = most_frequent_words(
                        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
                    )
                    tfidf_pair_list = all_tfidf_pair_list[:num_words]

                    print("Saving frequent words...")
                    save_words(all_word_count_pair_list, frequent_words_filename_val)
                    print("Frequent words saved to:", frequent_words_filename_val)
                    print()

                    print("Generating frequent word plot...")
                    plot_top_words(word_count_pair_list,
                                   frequent_words_plot_filename_val)
                    print("Frequent word plot saved to:",
                          frequent_words_plot_filename_val)
                    print()

                    print("Saving top tfidf words...")
                    save_words(all_tfidf_pair_list, top_tfidf_words_filename_val)
                    print("Top tfidf words saved to:", top_tfidf_words_filename_val)
                    print()

                    print("Generating top tfidf word plot...")
                    plot_top_words(tfidf_pair_list,
                                   top_tfidf_words_plot_filename_val)
                    print("Top tfidf word plot saved to:",
                          top_tfidf_words_plot_filename_val)
                    print()

                    # key alternates (group, value, group, value, ...);
                    # key[1::2] keeps just the values.
                    grouped_words_counts[key[1::2]] = {
                        w: int(c) for w, c in all_word_count_pair_list
                    }
                    grouped_words_tfidf[key[1::2]] = {
                        w: int(c) for w, c in all_tfidf_pair_list
                    }
                except Exception:
                    print("Error processing", key,
                          "skipping it. Texts are probably all stopwords")

        print("Saving grouped frequent words...")
        group_frequent_words_filename = add_prefix_to_filename(
            frequent_words_filename, groups
        )
        remapped_grouped_words_counts = remap_keys(grouped_words_counts, groups)
        with open(group_frequent_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_counts, f, ensure_ascii=False)
        print("Frequent words saved to:", group_frequent_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_counts to db...")
            upload_db(db_client, 'grouped_words_counts', {
                column: remap_to_dict(remapped_grouped_words_counts)
            })
            print('Done')
            print()

        print("Saving grouped top tfidf words...")
        group_top_tfidf_words_filename = add_prefix_to_filename(
            top_tfidf_words_filename, groups
        )
        remapped_grouped_words_tfidf = remap_keys(grouped_words_tfidf, groups)
        with open(group_top_tfidf_words_filename, 'w', encoding="utf8") as f:
            json.dump(remapped_grouped_words_tfidf, f, ensure_ascii=False)
        print("Top tfidf words saved to:", group_top_tfidf_words_filename)
        print()

        if should_upload_db:
            print("Uploading grouped_words_tfidf to db...")
            upload_db(db_client, 'grouped_words_tfidf', {
                column: remap_to_dict(remapped_grouped_words_tfidf)
            })
            print('Done')
            print()

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            print("Uploading predicted topics to db...")
            upload_db(db_client, 'predicted_topics', {
                column: json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                ))
            })
            print('Done')
            print()

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language == 'it':
            print("Predicting sentiment...")
            predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predicted sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        elif language == 'en':
            print("Predicting sentiment...")
            predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predicted sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                print("Uploading predicted sentiment to db...")
                upload_db(db_client, 'predicted_sentiment', {
                    column: json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    ))
                })
                print('Done')
                print()
        else:
            print("Sentiment analysis on {} language is not supported".format(language))
            print()
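# A hypothetical invocation of the pipeline above, shown as a sketch; every
# path and parameter value here is illustrative, not taken from the original
# code.
if __name__ == '__main__':
    text_analysis(
        data_path='data/reviews.csv',
        column='text',
        groups=['region'],
        language='en',
        lemmatize=True,
        ngram_range=(1, 2),
        num_topics=10,
        num_words=30,
        manual_mappings=None,
        generate_word_cloud=True,
        word_cloud_filename='word_cloud.png',
        frequent_words_filename='frequent_words.json',
        frequent_words_plot_filename='frequent_words.png',
        top_tfidf_words_filename='top_tfidf_words.json',
        top_tfidf_words_plot_filename='top_tfidf_words.png',
        predict_topics=True,
        topics_filename='topics.csv',
        predicted_topics_filename='predicted_topics.csv',
        ldavis_filename_prefix='ldavis_',
        predict_sentiment=False,
        predicted_sentiment_filename='predicted_sentiment.csv',
        should_upload_db=False,
        account_key_path=None,
    )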
import utils
from datamodel import *
import database as db

if __name__ == '__main__':
    s = db.Session()
    links = s.query(Link)

    # Tally the evaluation status of every stored link.
    total = 0; good = 0; bad = 0; hidden = 0
    for l in links:
        total += 1
        if l.evaluation:
            good += 1
        elif l.evaluation is False:  # explicit False = bad; unset links are neither
            bad += 1
        if l.hidden:
            hidden += 1

    # Measure how many titles contain at least one of the frequent words,
    # i.e. how much of the corpus the feature set covers.
    titles = [utils.tokenize(l.title) for l in links]
    fwords = utils.most_frequent_words()
    print fwords
    not_null = 0
    for t in titles:
        for w in fwords:
            if w in t:
                not_null += 1
                break
    print float(not_null) / len(titles) * 100., len(fwords)
    print "%d links, %d good, %d bad, %d hidden" % (total, good, bad, hidden)
    #for d in utils.most_frequent_duos(fwords):
    #    print d
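    # The coverage loop above is equivalent to this more compact expression
    # (a sketch, shown only for clarity); the assert checks the equivalence.
    coverage = 100. * sum(1 for t in titles
                          if any(w in t for w in fwords)) / len(titles)
    assert abs(coverage - float(not_null) / len(titles) * 100.) < 1e-9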