def find_locations(self, poi_data):
    """Find location candidates for a POI by cleaned name and coordinates.

    Parameters
    ----------
    poi_data : dict
        Expected to contain "name" (str) and "coordinates" (lat, lng pair).
        # NOTE(review): schema inferred from the keys read here — confirm.

    Returns
    -------
    Whatever ``self.find_location`` returns for the cleaned name at
    (lat, lng).
    """
    # NOTE(review): the original also pulled poi_data['source_ids']
    # ('foursquare'/'facebook') into locals that were never used;
    # that dead code is removed here.
    poi_name = text_utils.clean_text(poi_data["name"])
    lat, lng = poi_data["coordinates"]
    return self.find_location(poi_name, lat, lng)
def _extract_feature_features(df_test): new_df = pd.DataFrame() temp_features = df_test["features"].apply( lambda x: ['none'] if not x else [clean_text(y).replace(' ', '') for y in x]) features_dummmified = pd.get_dummies(temp_features.apply( pd.Series).stack(), prefix="feature").sum(level=0) new_df = pd.concat([new_df, features_dummmified], axis=1) return new_df
import pandas as pd
from utils import text_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model, model_selection

# Show up to 100 characters per column instead of the default 50.
# (The original comment said 40 but the code sets 100.)  Use the fully
# qualified option name: abbreviated names rely on pandas' partial option
# matching and can break if a new option makes them ambiguous.
pd.set_option('display.max_colwidth', 100)

# read the data
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")


def _clean_tweets(series):
    """Lowercase each tweet, then remove punctuation, tokenize, remove stop
    words and lemmatize via text_utils.clean_text."""
    return series.apply(lambda x: text_utils.clean_text(x.lower()))


# Clean the tweets in the training and test data (same pipeline for both;
# the original duplicated the lambda inline).
train_df['text_clean'] = _clean_tweets(train_df['text'])
test_df['text_clean'] = _clean_tweets(test_df['text'])

# Look at first 5 rows of training and test data
print("First 5 rows in training data:")
print(train_df['text'].head(5))
print("First 5 rows in training data - cleaned:")
print(train_df['text_clean'].head(5))
print("First 5 rows in test data:")
print(test_df['text'].head(5))
print("First 5 rows in test data - cleaned:")
print(test_df['text_clean'].head(5))

# Apply Count Vectorizer
import pandas as pd
from utils import text_utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# Show up to 100 characters per column instead of the default 50.
# (The original comment said 40 but the code sets 100.)  Use the fully
# qualified option name: abbreviated names rely on pandas' partial option
# matching and can break if a new option makes them ambiguous.
pd.set_option('display.max_colwidth', 100)

# read the data
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")


def _clean_tweets(series):
    """Lowercase each tweet, then remove punctuation, tokenize, remove stop
    words and lemmatize via text_utils.clean_text."""
    return series.apply(lambda x: text_utils.clean_text(x.lower()))


# Clean the tweets in the training and test data (same pipeline for both;
# the original duplicated the lambda inline).
train_df['text_clean'] = _clean_tweets(train_df['text'])
test_df['text_clean'] = _clean_tweets(test_df['text'])

# Look at first 5 rows of training and test data
print("First 5 rows in training data:")
print(train_df['text'].head(5))
print("First 5 rows in training data - cleaned:")
print(train_df['text_clean'].head(5))
print("First 5 rows in test data:")
print(test_df['text'].head(5))
print("First 5 rows in test data - cleaned:")
print(test_df['text_clean'].head(5))

# Apply Tfidf Vectorizer, using the shared cleaning pipeline as the analyzer
# so the vectorizer tokenizes raw text the same way 'text_clean' was built.
tfidf_vect = TfidfVectorizer(analyzer=text_utils.clean_text)
def on_success(self, status):
    """Process one streamed tweet: filter it, detect a haiku, store and post.

    Pipeline:
      1. Reset the stream's sleep/backoff exponent.
      2. Drop the tweet unless it passes tweet-, profile-, and text-level
         checks.
      3. Try to extract a haiku from the cleaned text; store it if found.
      4. Pick the best recent haiku and (config permitting) tweet it with
         attribution, then optionally follow the original poet.

    Parameters
    ----------
    status : dict
        Tweet payload from the streaming API (accessed with keys such as
        status['user']['screen_name']).

    NOTE(review): depends on many module-level names (check_tweet, logger,
    Haiku, twitter, session, and the upper-case config constants) defined
    elsewhere in this file.
    """
    # Reset sleep seconds exponent
    self.sleep_exponent = 0

    # Tweet-level filtering (ignore lists, language, blocked users).
    tweet_passes = check_tweet(
        status,
        ignore_tweet_list=ignore_tweet_list,
        language=LANGUAGE,
        ignore_user_screen_names=IGNORE_USER_SCREEN_NAMES,
        ignore_user_id_str=IGNORE_USER_ID_STR,
    )
    if not tweet_passes:
        return

    # Author-profile filtering.
    profile_passes = check_profile(status, ignore_profile_list)
    if not profile_passes:
        logger.info(
            f"Failed check_profile: {status['user']['screen_name']}:"
            + f" {status['user']['description']}"
        )
        return

    # Clean the tweet body and run the text-level filter.
    tweet_body = get_tweet_body(status)
    text = clean_text(tweet_body)
    text_passes = check_text_wrapper(text, ignore_tweet_list)
    if not text_passes:
        logger.info(f"Failed check_text_wrapper: {text}")
        return

    # Attempt to read the cleaned text as a haiku (syllable analysis).
    haiku = get_haiku(
        text,
        inflect_p,
        pronounce_dict,
        syllable_dict,
        emoticons_list,
        GUESS_SYL_METHOD,
    )
    if not haiku:
        return

    # Add it to the database
    tweet_haiku = Haiku.add_haiku(session, status, text, haiku, log_haiku=LOG_HAIKU)
    logger.info("=" * 50)
    logger.info(f"Found new haiku:\n{tweet_haiku.haiku}")

    if not DEBUG_RUN:
        # Get haikus from the last hour
        haikus = Haiku.get_haikus_unposted_timedelta(
            session, td_seconds=EVERY_N_SECONDS
        )
        # Delete old data by row count
        Haiku.keep_haikus_n_rows(session, n=ROWS_TO_KEEP)
        # Delete old data by timestamp
        Haiku.delete_haikus_unposted_timedelta(session, days=DELETE_OLDER_THAN_DAYS)
        Haiku.delete_haikus_posted_timedelta(session, days=DELETE_OLDER_THAN_DAYS)
    else:
        # Use the current haiku
        haikus = [tweet_haiku]
        # # Get all unposted haikus
        # haikus = Haiku.get_haikus_unposted(session)

    if len(haikus) == 0:
        logger.info("No haikus to choose from")
        return

    # Get the haiku to post
    haiku_to_post = get_best_haiku(haikus, twitter, session)
    if haiku_to_post["status_id_str"] == "":
        return

    # Re-fetch the source tweet; 'status' now refers to the haiku's tweet,
    # not the streamed one that triggered this handler.
    status = twitter.show_status(id=haiku_to_post["status_id_str"])

    # Format the haiku with attribution
    haiku_attributed = (
        f"{haiku_to_post['haiku']}\n\n"
        + f"A haiku by @{status['user']['screen_name']}"
    )

    tweet_url = (
        f"https://twitter.com/{status['user']['screen_name']}"
        + f"/status/{status['id_str']}"
    )

    logger.info("=" * 50)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(pformat(status))
        logger.debug(tweet_url)
        logger.debug(f"Original: {haiku_to_post['text_original']}")
        logger.debug(f"Cleaned: {haiku_to_post['text_clean']}")
    logger.info(f"Haiku to post:\n{haiku_attributed}")

    # Try to post haiku (client checks rate limit time internally)
    if not POST_HAIKU:
        logger.debug(f"Found haiku but did not post: {haiku_attributed}")
        return

    if POST_AS_REPLY:
        logger.info("Attempting to post haiku as reply...")
        # Post a tweet, sending as a reply to the coincidental haiku
        posted_status = twitter.update_status_check_rate(
            status=haiku_attributed,
            in_reply_to_status_id=status["id_str"],
            attachment_url=tweet_url,
        )
    else:
        logger.info("Attempting to post haiku, but not as reply...")
        # Post a tweet, but not as a reply to the coincidental haiku
        # The user will not get a notification
        posted_status = twitter.update_status_check_rate(
            status=haiku_attributed,
            attachment_url=tweet_url,
        )
    if posted_status:
        logger.info("Attempting to follow this poet...")
        Haiku.update_haiku_posted(session, haiku_to_post["status_id_str"])

        # follow the user
        if FOLLOW_POET:
            try:
                followed = twitter.create_friendship(
                    screen_name=haiku_to_post["user_screen_name"],
                    # follow: enable notifications
                    # NOTE(review): passed as the string "false", not a bool —
                    # presumably what the API client expects; confirm.
                    follow="false",
                )
                if followed["following"]:
                    logger.info("Success")
                else:
                    logger.info("Could not follow")
            except TwythonError as e:
                logger.info(e)
def clean_annotations(annotations):
    """Clean every annotation's caption and return the annotations.

    Each caption is run through ``clean_text`` and the resulting tokens are
    re-joined with single spaces.  The caption is updated on the original
    annotation dicts (in-place mutation), and the same dicts are returned in
    a new list.  Progress is reported via tqdm.
    """
    processed = []
    for ann in tqdm(annotations):
        tokens = clean_text(ann['caption'])
        ann['caption'] = ' '.join(tokens)
        processed.append(ann)
    return processed