def get():
    """Fetch Google News RSS results for the 'kaine' query and post one headline.

    Pulls the RSS feed, skips incomplete, mostly-all-caps, or offensive
    headlines, strips the trailing "- Source" attribution, and hands each
    surviving candidate to process() until one is accepted.

    On a network failure the error reason is printed and nothing is posted.
    """
    try:
        response = urllib.request.urlopen(
            "http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&q=kaine&output=rss")
    except URLError as e:
        print(e.reason)
    else:
        # Close the HTTP response promptly instead of leaking the socket
        # (the original never closed it).
        with response:
            html = BeautifulSoup(response.read(), "html.parser")
        items = html.find_all('item')
        for item in items:
            headline = item.title.string
            h_split = headline.split()
            # We don't want to use incomplete headlines
            if "..." in headline:
                continue
            # Try to weed out all-caps headlines
            # (tolerates up to 3 lowercase words).
            if count_caps(h_split) >= len(h_split) - 3:
                continue
            # Skip anything too offensive
            if not tact(headline):
                continue
            # Remove attribution string: drop everything after the last "-".
            if "-" in headline:
                headline = headline.split("-")[:-1]
                headline = ' '.join(headline).strip()
            if process(headline):
                break
def get():
    """Fetch the Google News front-page RSS feed and post one usable headline.

    Python 2 variant (urllib2, print statement). Skips incomplete,
    mostly-all-caps, or offensive headlines, strips the trailing
    "- Source" attribution, and hands candidates to process() until one
    is accepted.
    """
    try:
        request = urllib2.Request(
            "http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss")
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        print e.reason
    else:
        # NOTE(review): no explicit parser argument -- BeautifulSoup picks a
        # default and warns on newer versions; the py3 sibling above passes
        # "html.parser" explicitly.
        html = BeautifulSoup(response.read())
        items = html.find_all('item')
        for item in items:
            headline = item.title.string
            h_split = headline.split()
            # We don't want to use incomplete headlines
            if "..." in headline:
                continue
            # Try to weed out all-caps headlines
            # (tolerates up to 3 lowercase words).
            if count_caps(h_split) >= len(h_split) - 3:
                continue
            # Skip anything too offensive
            if not tact(headline):
                continue
            # Remove attribution string: drop everything after the last "-".
            if "-" in headline:
                headline = headline.split("-")[:-1]
                headline = ' '.join(headline).strip()
            if process(headline):
                break
            else:
                continue
def filter_tweets(tweets_):
    """Filter out tweets to avoid mentions, offensive content, etc.

    Pops tweets off the front of *tweets_* (mutating the caller's list) and
    passes the first acceptable tweet's text to process(); stops once
    process() accepts one or the list is exhausted.
    """
    # Guard on the list itself: the original used `while True:` and raised
    # IndexError when called with an empty list.
    while tweets_:
        tweet_ = tweets_.pop(0)
        text = tweet_.text
        # NOTE(review): preserved from the original -- when the popped tweet
        # was the last one it is returned over without being processed.
        # Looks unintentional, but callers may rely on it; confirm before
        # changing.
        if len(tweets_) == 0:
            return
        # Reject retweets, replies, truncated tweets, mentions, hashtags,
        # and anything the blacklist or tact() flags as offensive.
        if not (hasattr(tweet_, "retweeted_status")
                or tweet_.in_reply_to_status_id
                or tweet_.in_reply_to_screen_name
                or tweet_.truncated
                or '@' in text
                or 'RT' in text
                or '#' in text
                or wordfilter.blacklisted(text)
                or not tact(text)):
            if process(text):
                break
def get_news():
    """Pull the Google News RSS feed and post the first headline that
    survives every content filter.

    Network errors are reported via their reason and abort the run.
    """
    feed_url = "http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss"
    try:
        response = urllib2.urlopen(urllib2.Request(feed_url))
    except urllib2.URLError as e:
        print(e.reason)
        return
    html = BeautifulSoup(response.read(), "html.parser")
    for item in html.find_all('item'):
        headline = item.title.string
        words = headline.split()
        # Skip incomplete headlines
        if "..." in headline:
            continue
        # Skip headlines in all caps (up to 3 lowercase words tolerated)
        if count_caps(words) >= len(words) - 3:
            continue
        # Filter for offensive words, twice over
        if wordfilter.blacklisted(headline) or not tact(headline):
            continue
        # Remove article attributions ("Headline - Source")
        if "-" in headline:
            headline = ' '.join(headline.split("-")[:-1]).strip()
        if process(headline):
            return
import offensive

# Build a Markov model from the titles corpus.
with open("/home/staeiou/bots/dystopedia/titles.txt", encoding="utf-8") as f:
    deltext = f.read()

# Periods would terminate markovify "sentences" early; squash them to
# spaces, and strip non-ASCII characters so generated titles stay clean.
deltext = deltext.replace(".", " ")
deltext = deltext.encode('ascii', 'ignore').decode('ascii')
deletion_model = markovify.NewlineText(deltext)

tweet = None
tweets = []
# Generate up to 250 candidate titles, keeping only inoffensive ones.
for i in range(250):
    title = deletion_model.make_short_sentence(90)
    if title is not None and not wordfilter.blacklisted(title) and offensive.tact(title):
        tweets.append(title)

# Longest titles first; pick randomly from the top of the list.
tweets = sorted(tweets, key=len, reverse=True)
if tweets:
    # Bound the random index by the list length: the original always drew
    # from range(0, 25) and raised IndexError whenever fewer than 25
    # candidates survived the filters.
    rand_num = random.randrange(0, min(25, len(tweets)))
    print(tweets[rand_num])

CONSUMER_KEY = twitter_login.CONSUMER_KEY
CONSUMER_SECRET = twitter_login.CONSUMER_SECRET
ACCESS_TOKEN = twitter_login.ACCESS_TOKEN
ACCESS_TOKEN_SECRET = twitter_login.ACCESS_TOKEN_SECRET
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
def _process_tweet(self, tweet):
    '''
    Translate a tweet and post the translated one.

    tweet: The tweet to translate.
    '''
    # Structured key/value pairs accumulated throughout the method and
    # flushed to the log at the very end.
    log_details = [
        ('original-id', tweet.id),
        ('original-url', self._get_tweet_url(self._target_user_name, tweet.id)),
        ('original-time', tweet.created_at.isoformat()),
        ('original-text', tweet.full_text),
    ]
    if hasattr(tweet, 'retweeted_status'):
        # Note that retweets with an extra comment don't have retweeted_status, but
        # they have quoted_status, so we don't skip them.
        log_details += [
            ('skipped-because-retweet', True),
        ]
        # Set sentinels so the shared logging code at the bottom still works.
        intermediate_translations = None
        new_tweet = None
    else:
        self._follow_mentions(tweet)
        intermediate_translations = []

        def translation_cb(counter, language, intermediate_text):
            # Records every intermediate hop of the round-trip translation.
            intermediate_translations.append(
                collections.OrderedDict([
                    ('counter', counter),
                    ('language', language),
                    ('text', intermediate_text),
                ]))

        # Sanitize, translate en<->ja until the text stabilizes, then restore
        # the sanitized pieces and post the result as a reply.
        sanitized_text = self._sanitize_tweet(tweet)
        equilibrium_reached, sanitized_translated_text = equilibrium.find_equilibrium(
            self._translator, 'en', 'ja', sanitized_text, translation_cb)
        translated_text = self._unsanitize_tweet_text(
            sanitized_translated_text)
        new_tweet = self._post_tweet(translated_text, tweet.id)
        # Only log the sanitized form when sanitization actually changed it.
        if tweet.full_text != sanitized_text:
            log_details += [
                ('original-sanitized-text', sanitized_text),
            ]
        log_details += [
            ('translated-id', new_tweet.id),
            ('translated-url', self._get_tweet_url(self._my_user.id, new_tweet.id)),
            ('translated-time', new_tweet.created_at.isoformat()),
        ]
        # Maybe the tweet was shortened or mangled in some other way by Twitter.
        if translated_text != new_tweet.full_text:
            log_details += [
                ('translated-initial-text', translated_text),
            ]
        log_details += [
            ('translated-text', new_tweet.full_text),
            ('equilibrium-reached', equilibrium_reached),
            ('translator', self._translator.name),
        ]
        # For now we just log about offensiveness.
        # Later I can verify how useful this check is and, if needed, not post the
        # tweets.
        original_offensive = not offensive.tact(tweet.full_text)
        new_offensive = not offensive.tact(translated_text)
        if original_offensive and new_offensive:
            offensiveness = 'both'
        elif original_offensive:
            offensiveness = 'original'
        elif new_offensive:
            offensiveness = 'retranslated'
        else:
            offensiveness = 'none'
        log_details += [
            ('offensiveness', offensiveness),
        ]
    self._last_processed.set_last_processed(tweet.id_str)
    # We save logs after the ID, so there's a chance we actually fail to save logs for
    # this tweet. This is better than retweeting the same thing twice.
    self._log(self._serialize_list_to_ordered_dict(log_details))
    self._log_tweet_json(tweet)
    self._log_tweet_json(new_tweet)
    if intermediate_translations:
        json_text = self._serialize_json(intermediate_translations)
        extra_name = '{}-{}-translations.json'.format(
            tweet.user.screen_name, tweet.id)
        self._log(json_text, extra_name)