def add_tfidf(self):
    """Append token and bi-gram TF-IDF columns to ``self.df``.

    Every truthy entry of ``self.df['body']`` is turned into one row of
    TF-IDF values: the token values from ``TFIDF('token')`` followed by the
    bi-gram values from ``TFIDF('bi_gram')``. The column labels are
    ``token_tfidf.terms + bi_gram_tfidf.terms``.

    Term frequencies come from one of two places:

    * if ``self.df`` has a ``_data_source`` column, they are looked up with
      ``get_freqs`` using the row's ``_data_source`` and ``_id`` values and
      the dictionary returned by ``get_token_dict()``;
    * otherwise the body text is tokenized with ``Tokenizer``.

    Rows whose body is falsy are skipped and get no TF-IDF row. The result
    is concatenated column-wise onto ``self.df``, which is rebound in place;
    nothing is returned. Progress is printed every 500 rows.
    """
    # Single-argument parenthesised print gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('calculating tfidf...')
    token_tfidf = TFIDF('token')
    bi_gram_tfidf = TFIDF('bi_gram')
    texts = self.df['body']
    columns = token_tfidf.terms + bi_gram_tfidf.terms

    is_from_sql = '_data_source' in self.df.columns
    # The token dictionary is only needed for the SQL-backed lookup.
    token_dict = get_token_dict() if is_from_sql else None

    # Rows are collected here and the frame is built once after the loop.
    # Growing a DataFrame with ``.loc[i] = row`` can reallocate the whole
    # frame on every insertion.
    row_labels = []
    rows = []
    for i, text in enumerate(texts):
        if i > 0 and i % 500 == 0:
            print('%d rows' % i)
        if not text:
            continue

        if is_from_sql:
            # NOTE(review): ``i`` is a position from enumerate() but is used
            # as an index label here and for the new frame below; this
            # presumes self.df carries a default 0..n-1 index -- confirm.
            sql_tbl_id, sql_id = self.df._data_source[i], self.df._id[i]
            token_freqs, bi_gram_freqs = get_freqs(sql_tbl_id, sql_id, token_dict)
        else:
            tokenizer = Tokenizer(text)
            token_freqs = tokenizer.get_token_count()
            bi_gram_freqs = tokenizer.get_n_gram_count(2, True)

        row_labels.append(i)
        rows.append(
            token_tfidf.get_tfidf(token_freqs)
            + bi_gram_tfidf.get_tfidf(bi_gram_freqs)
        )

    tfidf_df = pd.DataFrame(rows, index=row_labels, columns=columns)
    self.df = pd.concat([self.df, tfidf_df], axis=1)
    # Report the rows that actually received TF-IDF values; the previous
    # counter was derived from the loop position, so skipped (empty) bodies
    # were counted as well.
    print("tfidf'd %d rows" % len(rows))