def find_locations(self, poi_data):
    """Find location candidates for a POI by cleaned name and coordinates.

    Parameters
    ----------
    poi_data : dict
        Expected to contain "name" (str) and "coordinates" (lat, lng pair).
        # NOTE(review): schema inferred from the keys read here — confirm.

    Returns
    -------
    Whatever ``self.find_location`` returns for the cleaned name at
    (lat, lng).
    """
    # NOTE(review): the original also pulled poi_data['source_ids']
    # ('foursquare'/'facebook') into locals that were never used;
    # that dead code is removed here.
    poi_name = text_utils.clean_text(poi_data["name"])
    lat, lng = poi_data["coordinates"]
    return self.find_location(poi_name, lat, lng)
def _extract_feature_features(df_test): new_df = pd.DataFrame() temp_features = df_test["features"].apply( lambda x: ['none'] if not x else [clean_text(y).replace(' ', '') for y in x]) features_dummmified = pd.get_dummies(temp_features.apply( pd.Series).stack(), prefix="feature").sum(level=0) new_df = pd.concat([new_df, features_dummmified], axis=1) return new_df
import pandas as pd
from utils import text_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model, model_selection

# Show up to 100 characters per column instead of the default 50.
# (The original comment said 40 but the code sets 100.)  Use the fully
# qualified option name: abbreviated names rely on pandas' partial option
# matching and can break if a new option makes them ambiguous.
pd.set_option('display.max_colwidth', 100)

# read the data
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")


def _clean_tweets(series):
    """Lowercase each tweet, then remove punctuation, tokenize, remove stop
    words and lemmatize via text_utils.clean_text."""
    return series.apply(lambda x: text_utils.clean_text(x.lower()))


# Clean the tweets in the training and test data (same pipeline for both;
# the original duplicated the lambda inline).
train_df['text_clean'] = _clean_tweets(train_df['text'])
test_df['text_clean'] = _clean_tweets(test_df['text'])

# Look at first 5 rows of training and test data
print("First 5 rows in training data:")
print(train_df['text'].head(5))
print("First 5 rows in training data - cleaned:")
print(train_df['text_clean'].head(5))
print("First 5 rows in test data:")
print(test_df['text'].head(5))
print("First 5 rows in test data - cleaned:")
print(test_df['text_clean'].head(5))

# Apply Count Vectorizer
import pandas as pd
from utils import text_utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# Show up to 100 characters per column instead of the default 50.
# (The original comment said 40 but the code sets 100.)  Use the fully
# qualified option name: abbreviated names rely on pandas' partial option
# matching and can break if a new option makes them ambiguous.
pd.set_option('display.max_colwidth', 100)

# read the data
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")


def _clean_tweets(series):
    """Lowercase each tweet, then remove punctuation, tokenize, remove stop
    words and lemmatize via text_utils.clean_text."""
    return series.apply(lambda x: text_utils.clean_text(x.lower()))


# Clean the tweets in the training and test data (same pipeline for both;
# the original duplicated the lambda inline).
train_df['text_clean'] = _clean_tweets(train_df['text'])
test_df['text_clean'] = _clean_tweets(test_df['text'])

# Look at first 5 rows of training and test data
print("First 5 rows in training data:")
print(train_df['text'].head(5))
print("First 5 rows in training data - cleaned:")
print(train_df['text_clean'].head(5))
print("First 5 rows in test data:")
print(test_df['text'].head(5))
print("First 5 rows in test data - cleaned:")
print(test_df['text_clean'].head(5))

# Apply Tfidf Vectorizer, using the shared cleaning pipeline as the analyzer
# so the vectorizer tokenizes raw text the same way 'text_clean' was built.
tfidf_vect = TfidfVectorizer(analyzer=text_utils.clean_text)
def on_success(self, status):
    """Process one streamed tweet: filter it, detect a haiku, store and post.

    Pipeline:
      1. Reset the stream's sleep/backoff exponent.
      2. Drop the tweet unless it passes tweet-, profile-, and text-level
         checks.
      3. Try to extract a haiku from the cleaned text; store it if found.
      4. Pick the best recent haiku and (config permitting) tweet it with
         attribution, then optionally follow the original poet.

    Parameters
    ----------
    status : dict
        Tweet payload from the streaming API (accessed with keys such as
        status['user']['screen_name']).

    NOTE(review): depends on many module-level names (check_tweet, logger,
    Haiku, twitter, session, and the upper-case config constants) defined
    elsewhere in this file.
    """
    # Reset sleep seconds exponent
    self.sleep_exponent = 0

    # Tweet-level filtering (ignore lists, language, blocked users).
    tweet_passes = check_tweet(
        status,
        ignore_tweet_list=ignore_tweet_list,
        language=LANGUAGE,
        ignore_user_screen_names=IGNORE_USER_SCREEN_NAMES,
        ignore_user_id_str=IGNORE_USER_ID_STR,
    )
    if not tweet_passes:
        return

    # Author-profile filtering.
    profile_passes = check_profile(status, ignore_profile_list)
    if not profile_passes:
        logger.info(
            f"Failed check_profile: {status['user']['screen_name']}:"
            + f" {status['user']['description']}"
        )
        return

    # Clean the tweet body and run the text-level filter.
    tweet_body = get_tweet_body(status)
    text = clean_text(tweet_body)
    text_passes = check_text_wrapper(text, ignore_tweet_list)
    if not text_passes:
        logger.info(f"Failed check_text_wrapper: {text}")
        return

    # Attempt to read the cleaned text as a haiku (syllable analysis).
    haiku = get_haiku(
        text,
        inflect_p,
        pronounce_dict,
        syllable_dict,
        emoticons_list,
        GUESS_SYL_METHOD,
    )
    if not haiku:
        return

    # Add it to the database
    tweet_haiku = Haiku.add_haiku(session, status, text, haiku, log_haiku=LOG_HAIKU)
    logger.info("=" * 50)
    logger.info(f"Found new haiku:\n{tweet_haiku.haiku}")

    if not DEBUG_RUN:
        # Get haikus from the last hour
        haikus = Haiku.get_haikus_unposted_timedelta(
            session, td_seconds=EVERY_N_SECONDS
        )
        # Delete old data by row count
        Haiku.keep_haikus_n_rows(session, n=ROWS_TO_KEEP)
        # Delete old data by timestamp
        Haiku.delete_haikus_unposted_timedelta(session, days=DELETE_OLDER_THAN_DAYS)
        Haiku.delete_haikus_posted_timedelta(session, days=DELETE_OLDER_THAN_DAYS)
    else:
        # Use the current haiku
        haikus = [tweet_haiku]
        # # Get all unposted haikus
        # haikus = Haiku.get_haikus_unposted(session)

    if len(haikus) == 0:
        logger.info("No haikus to choose from")
        return

    # Get the haiku to post
    haiku_to_post = get_best_haiku(haikus, twitter, session)
    if haiku_to_post["status_id_str"] == "":
        return

    # Re-fetch the source tweet; 'status' now refers to the haiku's tweet,
    # not the streamed one that triggered this handler.
    status = twitter.show_status(id=haiku_to_post["status_id_str"])

    # Format the haiku with attribution
    haiku_attributed = (
        f"{haiku_to_post['haiku']}\n\n"
        + f"A haiku by @{status['user']['screen_name']}"
    )

    tweet_url = (
        f"https://twitter.com/{status['user']['screen_name']}"
        + f"/status/{status['id_str']}"
    )

    logger.info("=" * 50)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(pformat(status))
        logger.debug(tweet_url)
        logger.debug(f"Original: {haiku_to_post['text_original']}")
        logger.debug(f"Cleaned: {haiku_to_post['text_clean']}")
    logger.info(f"Haiku to post:\n{haiku_attributed}")

    # Try to post haiku (client checks rate limit time internally)
    if not POST_HAIKU:
        logger.debug(f"Found haiku but did not post: {haiku_attributed}")
        return

    if POST_AS_REPLY:
        logger.info("Attempting to post haiku as reply...")
        # Post a tweet, sending as a reply to the coincidental haiku
        posted_status = twitter.update_status_check_rate(
            status=haiku_attributed,
            in_reply_to_status_id=status["id_str"],
            attachment_url=tweet_url,
        )
    else:
        logger.info("Attempting to post haiku, but not as reply...")
        # Post a tweet, but not as a reply to the coincidental haiku
        # The user will not get a notification
        posted_status = twitter.update_status_check_rate(
            status=haiku_attributed,
            attachment_url=tweet_url,
        )
    if posted_status:
        logger.info("Attempting to follow this poet...")
        Haiku.update_haiku_posted(session, haiku_to_post["status_id_str"])

        # follow the user
        if FOLLOW_POET:
            try:
                followed = twitter.create_friendship(
                    screen_name=haiku_to_post["user_screen_name"],
                    # follow: enable notifications
                    # NOTE(review): passed as the string "false", not a bool —
                    # presumably what the API client expects; confirm.
                    follow="false",
                )
                if followed["following"]:
                    logger.info("Success")
                else:
                    logger.info("Could not follow")
            except TwythonError as e:
                logger.info(e)
def clean_annotations(annotations):
    """Clean every annotation's caption and return the annotations.

    Each caption is run through ``clean_text`` and the resulting tokens are
    re-joined with single spaces.  The caption is updated on the original
    annotation dicts (in-place mutation), and the same dicts are returned in
    a new list.  Progress is reported via tqdm.
    """
    processed = []
    for ann in tqdm(annotations):
        tokens = clean_text(ann['caption'])
        ann['caption'] = ' '.join(tokens)
        processed.append(ann)
    return processed