def digest_feeds_entries(feed_url, keywords, posted_titles, words_scores):
    """Score the not-yet-posted entries of a feed against ``keywords``.

    Entries come from ``feedworker.process_feed(feed_url)``; any entry whose
    ``"title"`` is already in ``posted_titles`` is dropped. The ``"content"``
    of the remaining entries is cleaned in one batch by
    ``textworker.purify_texts`` and each cleaned entry is scored with
    ``calcworker.get_document_score`` using ``words_scores``.

    Returns ``None`` when no entry is left after the title filter (including
    when the feed yields nothing). Otherwise returns a list of dicts with the
    keys ``title``, ``link``, ``score``, ``match_count`` and ``matched_words``.
    """
    # NOTE(review): a second definition of digest_feeds_entries appears later
    # in the visible source; if both live in one module the later one wins.
    fresh_entries = [
        item
        for item in feedworker.process_feed(feed_url)
        if item["title"] not in posted_titles
    ]
    if not fresh_entries:
        return None

    # purify_texts takes the whole batch of contents as one list.
    # NOTE(review): the meaning of the two True flags is not visible here.
    raw_contents = [item["content"] for item in fresh_entries]
    cleaned_entries = textworker.purify_texts(raw_contents, True, True)

    # Each cleaned entry is an iterable of strings; flatten it to one string.
    entry_texts = [" ".join(tokens) for tokens in cleaned_entries]

    records = []
    for position, text in enumerate(entry_texts):
        frequency = textworker.count_word_frequency(text, keywords)
        # match_count and matched_words are necessary only for debugging
        score, match_count, matched_words = calcworker.get_document_score(
            frequency, words_scores
        )
        # Records are paired with their source entry by position.
        source = fresh_entries[position]
        records.append(
            {
                "title": source["title"],
                "link": source["link"],
                "score": score,
                "match_count": match_count,
                "matched_words": matched_words,
            }
        )
    return records
def process_user_statuses(screen_name, rank, keywords):
    """Compute local keyword scores from a user's statuses.

    Statuses are fetched with ``get_user_statuses(screen_name, True, 100)``,
    their ``text`` is UTF-8 encoded, cleaned by ``textworker.purify_texts``,
    merged by ``textworker.concat_texts`` and counted against ``keywords``.
    The result of ``calcworker.get_words_local_scores`` for those counts and
    ``rank`` is returned unchanged.
    """
    # NOTE(review): a second definition of process_user_statuses appears later
    # in the visible source; if both live in one module the later one wins.
    # NOTE(review): the meaning of the True / 100 arguments is not visible
    # here -- confirm against get_user_statuses.
    fetched = get_user_statuses(screen_name, True, 100)

    # NOTE(review): under Python 3 .encode() yields bytes, not str -- confirm
    # that textworker.purify_texts expects that.
    encoded_texts = [item.text.encode('utf-8') for item in fetched]

    cleaned_texts = textworker.purify_texts(encoded_texts)
    merged_text = textworker.concat_texts(cleaned_texts)
    frequency = textworker.count_word_frequency(merged_text, keywords)
    return calcworker.get_words_local_scores(frequency, rank)
def process_user_statuses(screen_name, rank, keywords):
    """Compute local keyword scores from a user's statuses.

    Statuses are fetched with ``get_user_statuses(screen_name, True, 100)``,
    their ``text`` is UTF-8 encoded, cleaned by ``textworker.purify_texts``,
    merged by ``textworker.concat_texts`` and counted against ``keywords``.
    The result of ``calcworker.get_words_local_scores`` for those counts and
    ``rank`` is returned unchanged.
    """
    # NOTE(review): this name is also defined earlier in the visible source;
    # if both live in one module this later definition replaces the earlier.
    # NOTE(review): the meaning of the True / 100 arguments is not visible
    # here -- confirm against get_user_statuses.
    fetched = get_user_statuses(screen_name, True, 100)

    # NOTE(review): under Python 3 .encode() yields bytes, not str -- confirm
    # that textworker.purify_texts expects that.
    encoded_texts = [item.text.encode("utf-8") for item in fetched]

    cleaned_texts = textworker.purify_texts(encoded_texts)
    merged_text = textworker.concat_texts(cleaned_texts)
    frequency = textworker.count_word_frequency(merged_text, keywords)
    return calcworker.get_words_local_scores(frequency, rank)
def digest_feeds_entries(feed_url, keywords, posted_titles, words_scores):
    """Score the not-yet-posted entries of a feed against ``keywords``.

    Entries come from ``feedworker.process_feed(feed_url)``; any entry whose
    ``"title"`` is already in ``posted_titles`` is dropped. The ``"content"``
    of the remaining entries is cleaned in one batch by
    ``textworker.purify_texts`` and each cleaned entry is scored with
    ``calcworker.get_document_score`` using ``words_scores``.

    Returns ``None`` when no entry is left after the title filter (including
    when the feed yields nothing). Otherwise returns a list of dicts with the
    keys ``title``, ``link``, ``score``, ``match_count`` and ``matched_words``.
    """
    # NOTE(review): this name is also defined earlier in the visible source;
    # if both live in one module this later definition replaces the earlier.
    fresh_entries = [
        item
        for item in feedworker.process_feed(feed_url)
        if item["title"] not in posted_titles
    ]
    if not fresh_entries:
        return None

    # purify_texts takes the whole batch of contents as one list.
    # NOTE(review): the meaning of the two True flags is not visible here.
    raw_contents = [item["content"] for item in fresh_entries]
    cleaned_entries = textworker.purify_texts(raw_contents, True, True)

    # Each cleaned entry is an iterable of strings; flatten it to one string.
    entry_texts = [" ".join(tokens) for tokens in cleaned_entries]

    records = []
    for position, text in enumerate(entry_texts):
        frequency = textworker.count_word_frequency(text, keywords)
        # match_count and matched_words are necessary only for debugging
        score, match_count, matched_words = calcworker.get_document_score(
            frequency, words_scores
        )
        # Records are paired with their source entry by position.
        source = fresh_entries[position]
        records.append(
            {
                "title": source["title"],
                "link": source["link"],
                "score": score,
                "match_count": match_count,
                "matched_words": matched_words,
            }
        )
    return records