Example #1
def preprocess(data):
    lang = data['lang']
    text_xx = 'text_' + lang

    data['tweet_date'] = datetime.datetime.strptime(data['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%dT%H:00:00.00Z')

    if 'extended_tweet' in data:
        data['tweet_text'] = data['extended_tweet']['full_text']
    else:
        data['tweet_text'] = data['text']
    
    if 'geo' in data and data['geo'] and 'coordinates' in data['geo'] and 'coordinates' in data['geo']['coordinates']:
        data['tweet_loc'] = ','.join(str(x) for x in data['geo']['coordinates']['coordinates'])
    elif 'place' in data and data['place'] and 'bounding_box' in data['place']:
        data['tweet_loc'] = ','.join(str(x) for x in data['place']['bounding_box']['coordinates'][0][0])

    parsed_text = p.parse(data['tweet_text'])
    data['tweet_emoticons'] = [t.match for e in [parsed_text.emojis, parsed_text.smileys] if e is not None for t in e ]
    data[text_xx] = p.clean(data['tweet_text']).lower()
    # words = re.findall(r'\w+', data[text_xx], flags = re.UNICODE)
    # if lang in lang_map:
    #     data[text_xx] = remove_stop_words(stopwords.words(lang_map[lang]), words)
    # elif lang == 'hi':
    #     data[text_xx] = remove_stop_words(hindi_words, words)
    # else:
    #     data[text_xx] = remove_stop_words(thai_words, words)
    return data
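
A minimal usage sketch for preprocess() above; the sample payload is invented, and the commented values are approximate since the cleaned text depends on the options currently set on the tweet-preprocessor instance p:

sample = {
    'lang': 'en',
    'created_at': 'Mon Apr 06 22:19:45 +0000 2020',
    'text': 'Stay safe everyone! #covid19 https://example.com :)',
}
doc = preprocess(sample)
# doc['tweet_date']      -> '2020-04-06T22:00:00.00Z'
# doc['tweet_text']      -> the raw text (no 'extended_tweet' in this sample)
# doc['text_en']         -> p.clean()-ed, lowercased text
# doc['tweet_emoticons'] -> smiley/emoji matches such as [':)']
# no 'tweet_loc' is set, because the sample has neither 'geo' nor 'place'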
Example #2
    def preprocess_data(X):
        """

        :param X:
        :return:
        """

        X_new = np.array(X)
        for i, x in enumerate(X_new):
            # For each sample tweet x in the domain X_new, clean all URLs and numbers
            p.set_options(*CLEAN_OPTION_SET)
            s = p.clean(x)
            # Replace all caps words with placeholder ALL_CAPS_WORD
            split_tweet = s.split()
            for w in split_tweet:
                if w.isupper():
                    s = s.replace(w, "ALL_CAPS_WORD")
            p.set_options(*PARSE_OPTION_SET)
            # Figure out if the tweet is a retweet - if so, replace it with the person being retweeted
            tweet = s
            s = p.parse(tweet)
            p.set_options(*TOKENIZING_OPTION_SET)
            if s.reserved is not None and "RT" in s.reserved[0].match:
                tweet_modified = s.mentions[0].match
            else:
                tweet_modified = p.tokenize(tweet)
            # Replace all special characters with placeholders to make them unique
            for ch in SPECIAL_CHAR_SET:
                tweet_modified = tweet_modified.replace(
                    ch, SPECIAL_CHAR_SET[ch])
            X_new[i] = tweet_modified
        return X_new
Example #3
def clean_tweet_text(tweet_text): 
    tweet_text = tweet_text.replace("’", "'").replace("…", "...")
    tweet_parser = p.parse(tweet_text)
    cleaned_tweet = tweet_text
    hash_tags = tweet_parser.hashtags
    if hash_tags is not None:
        for hash_tag in hash_tags:
            cleaned_tweet = cleaned_tweet.replace(hash_tag.match, " ".join(wordninja.split(hash_tag.match[1:])))
    tweet_urls = tweet_parser.urls
    if tweet_urls is not None:
        for url_link in tweet_urls:
            cleaned_tweet = cleaned_tweet.replace(url_link.match, " url$$ ")
    tweet_emojis = tweet_parser.emojis
    if tweet_emojis is not None:
        for emoji in tweet_emojis:
            cleaned_tweet = cleaned_tweet.replace(emoji.match, " emoji$$ ")
    cleaned_tweet = cleaned_tweet.split("via")[0].split("|")[0].split(" - ")[0].split(" – ")[0]
    cleaned_tweet_tokens = []
    for word_token in cleaned_tweet.split(" "):
        word_token = word_token.strip()
        if word_token.endswith("$$") or word_token in COMMON_ENGLISH_WORDS:
            cleaned_tweet_tokens.append(word_token)
        elif len(word_token) > 0:
            split_tokens = [w for w in wordninja.split(word_token) if w not in string.punctuation]
            cleaned_tweet_tokens += [token for token in split_tokens if not is_number(token)]

    cleaned_tweet = " ".join(cleaned_tweet_tokens)
    return cleaned_tweet
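
A rough usage sketch for clean_tweet_text(); the input is invented, and the exact tokens depend on wordninja's dictionary and on the COMMON_ENGLISH_WORDS set, which is not shown in this example:

raw = "Breaking: #COVID19Update cases rising 😷 https://t.co/abc via @newsdesk"
print(clean_tweet_text(raw))
# The hashtag is split into plain words (purely numeric pieces are filtered out),
# the URL becomes 'url$$', the emoji becomes 'emoji$$', and everything after 'via'
# is dropped.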
Example #4
    def test_set_options(self):
        tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
        p.set_options(p.OPT.URL)
        parsed_tweet = p.parse(tweet)

        self.assertIsNone(parsed_tweet.hashtags)
        self.assertIsNotNone(parsed_tweet.urls)
Example #5
    def test_set_options(self):
        tweet = 'Preprocessor now has custom #options support! https://github.com/s/preprocessor'
        p.set_options(p.OPT.URL)
        parsed_tweet = p.parse(tweet)

        self.assertIsNone(parsed_tweet.hashtags)
        self.assertIsNotNone(parsed_tweet.urls)
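
Both tests above assert the same behaviour; a small illustrative sketch of what they check (the tweet text is made up):

p.set_options(p.OPT.URL)
result = p.parse("Great #news at https://example.com")
result.urls[0].match   # 'https://example.com'
result.hashtags        # None, since only the URL option is active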
Example #6
def clean_tweet(tweet):
    tweet_clean = {key: tweet[key] for key in
                   ['created_at', 'id', 'id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str',
                    'in_reply_to_screen_name',
                    'retweet_count', 'favorite_count', 'lang']}
    if 'full_text' in tweet.keys():
        tweet_clean['text'] = tweet['full_text']
    elif 'extended_tweet' in tweet.keys():
        tweet_clean['text'] = tweet['extended_tweet']['full_text']
    else:
        tweet_clean['text'] = tweet['text']
    if 'quote_count' in tweet.keys(): tweet_clean['quote_count'] = tweet['quote_count']
    if 'reply_count' in tweet.keys(): tweet_clean['reply_count'] = tweet['reply_count']
    tweet_clean['datetime'] = datetime.fromtimestamp(parser.parse(tweet['created_at']).timestamp())
    if 'type' not in tweet.keys(): tweet_clean['type'] = tweet_type(tweet)
    if 'tweet_user_id' not in tweet.keys(): tweet_clean['tweet_user_id'] = tweet_creator(tweet)['id']
    if 'tweet_user_id_str' not in tweet.keys(): tweet_clean['tweet_user_id_str'] = tweet_creator(tweet)['id_str']
    if 'tweet_user_screen_name' not in tweet.keys():
        tweet_clean['tweet_user_screen_name'] = tweet_creator(tweet)['screen_name']

    tweet_clean['timestamp'] = parser.parse(tweet['created_at']).timestamp()

    tweet_clean['text_processed'] = preprocess_text(tweet_clean['text'])
    text = tweetp.parse(tweet_clean['text'])
    tweet_clean['emojis'] = min(length(text.emojis), 127)
    tweet_clean['hashtags'] = min(length(text.hashtags), 127)
    tweet_clean['urls'] = min(length(text.urls), 127)
    tweet_clean['mentions'] = min(length(text.mentions), 127)
    return tweet_clean
Example #7
def cleaning_sentence(sentence):
    stop_free = " ".join(
        [i for i in sentence.lower().split() if i not in stop])
    hashtag_value = p.parse(stop_free).hashtags
    sentence = p.clean(stop_free)
    sentence = re.sub("[^A-Za-z .]+", "", sentence)
    normalized = " ".join(lemma.lemmatize(word) for word in sentence.split())
    return normalized
Example #8
    def _parse(text):
        """
        Parses elements from tweets

        :param text: raw tweet text
        :return: the ParseResult produced by the preprocessor
        """
        return preproc.parse(text)
Example #9
    def updating_columns(self):
        # Creating other columns for post
        self.posts['hashtags'] = self.posts.text.apply(lambda t: [h.match for h in p.parse(t).hashtags] if p.parse(t).hashtags else None)
        self.posts['time'] = self.posts.time.apply(datetime.fromtimestamp)

        # Creating other columns for comments
        self.comments['time'] = self.comments.time.apply(datetime.fromtimestamp)
        self.comments['processed_comment'] = self.comments.text.str.lower()
        self.comments = stopwords_correction(self.comments, 'processed_comment')
def replace_hashtags(tweet):

    p.set_options(p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.URL, p.OPT.RESERVED)
    t = p.parse(tweet)
    if t.hashtags:
        for i in t.hashtags:
            tweet = tweet[:i.start_index] + ' ' + tweet[i.start_index + 1:]

    return tweet
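
A one-line sketch of the effect (illustrative string): each '#' character is replaced by a space in place, so the hashtag word itself stays in the text:

replace_hashtags("loving #python today")  # -> 'loving  python today'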
Example #11
def tweet_cleaning(text):
    parsed = p.parse(text)
    emojis = [x.match
              for x in parsed.emojis] if not parsed.emojis is None else []
    hashtags = [x.match for x in parsed.hashtags
                ] if not parsed.hashtags is None else []
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER)
    text = p.clean(text)
    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.HASHTAG)
    text2 = p.clean(text)
    return [text, text2, emojis, hashtags]
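
A usage sketch for tweet_cleaning(); the sample tweet is invented, and the initial parse runs under whatever options were globally active before the call:

raw = "RT @dev: the new release is out! #python 🎉 https://example.com"
with_hashtags, fully_cleaned, emojis, hashtags = tweet_cleaning(raw)
# with_hashtags -> text with URLs, mentions, reserved words and numbers removed
# fully_cleaned -> the same text additionally stripped of emojis, smileys and hashtags
# emojis        -> roughly ['🎉'], hashtags -> roughly ['#python']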
Example #12
    def insertTweetsIntoDB(self):
        concept = "coronavirus"
        analysisID = "passif"

        preProcessing = TweetsPreProcessing()
        dirPath = "C:/Users/Raouf/PycharmProjects/PFE_SII_M2/TweetFiles/"

        allFiles = [f for f in listdir(dirPath) if isfile(join(dirPath, f))]
        print(allFiles)
        for fileName in allFiles[63:]:
            print(fileName)
            if fileName.startswith("ExtractedTweetsFor"):
                if not fileName.endswith("Loaded.json"):  # check if the file is loaded to the database or not
                    fullFileName = dirPath + fileName
                    tweetsFile = open(fullFileName, 'r', encoding="utf-8")
                    tweets = json.load(tweetsFile)

                    cpt = 0
                    for tweet in tweets['tweets']:
                        tweetLanguage = tweet['lang']
                        if tweetLanguage == "en":
                            allTweets = AllTweets()
                            tweetID = tweet['id_str']
                            parsed_tweet = p.parse(tweet['text'])
                            hashtagsList = parsed_tweet.hashtags
                            print(hashtagsList)
                            hashtagsText = ""
                            if hashtagsList is not None:
                                for hashtag in hashtagsList:
                                    print(hashtag.match)
                                    hashtagsText += " "+ str(hashtag.match)
                            text = p.clean(tweet['text'])
                            text = " ".join(re.findall('\w+', text))
                            text += ", hashtags : "+hashtagsText
                            #text = "text"
                            row = [tweetID, text]
                            row += preProcessing.getLangage(tweet['lang']) + preProcessing.getLocation(
                                tweet['user']['location']) \
                                   + preProcessing.getTime(tweet['created_at']) + preProcessing.getSentimentAnalysis(
                                tweet['text']) \
                                   + preProcessing.getSource(tweet['source'])
                            row += [analysisID, concept]
                            #print(row)
                            try:
                                allTweets.insert(row)
                                cpt += 1
                                print(cpt, "tweets ", sep=" ")
                            except Exception:
                                print("encoding error")

                    print("For the file : ", fileName, ", Tweets number is : ", cpt)
                    tweetsFile.close()
Example #13
def _tweet_preprocessing(tweet_texts):
    """Preprocessing
        1. remove URLs (and return them separately)
        2. remove Emojis

    Returns the cleaned Tweet and parsed URLs
    """
    cleaned_tweet_texts = twitter_prepro.clean(tweet_texts)
    parsed_url = twitter_prepro.parse(tweet_texts).urls
    if parsed_url is not None:
        parsed_url = [u.match for u in parsed_url]

    return cleaned_tweet_texts, parsed_url
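
A brief sketch of calling the helper above on one string; the input is invented and twitter_prepro is the tweet-preprocessor alias this example already uses:

cleaned, urls = _tweet_preprocessing("Full story at https://example.com/article 😀")
# cleaned -> roughly 'Full story at' (URL and emoji stripped by twitter_prepro.clean)
# urls    -> ['https://example.com/article']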
Example #14
    def get_tweet_url(tweet_text: str):
        parsed_tweet = cleaning_processor.parse(tweet_text)
        tweet_url = None
        if not parsed_tweet.urls:
            return tweet_url

        last_index = len(tweet_text)
        for url_info in parsed_tweet.urls:
            if url_info.end_index == last_index:
                tweet_url = url_info.match
                break

        return tweet_url
def parse_mentions(tweet):
    """
    Parses a tweet for mentions.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed mentions (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.mentions == None:
        return []
    else:
        return [x.match for x in parsed_tweet.mentions]
def parse_hashtags(tweet):
    """
    Parses a tweet for hashtags.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed hashtags (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.hashtags == None:
        return []
    else:
        return [x.match for x in parsed_tweet.hashtags]
def parse_reserved_words(tweet):
    """
    Parses a tweet for reserved words ("RT" or "FAV").
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed reserved words (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.reserved == None:
        return []
    else:
        return [x.match for x in parsed_tweet.reserved]
def parse_smileys(tweet):
    """
    Parses a tweet for smiley faces.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed smiley faces (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.smileys == None:
        return []
    else:
        return [x.match for x in parsed_tweet.smileys]
def parse_numbers(tweet):
    """
    Parses a tweet for numbers.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed numbers (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.numbers == None:
        return []
    else:
        return [x.match for x in parsed_tweet.numbers]
def parse_urls(tweet):
    """
    Parses a tweet for URLs.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed URLs (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.urls == None:
        return []
    else:
        return [x.match for x in parsed_tweet.urls]
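
A quick sketch of the helpers above on one made-up tweet; the exact matches depend on which options are currently set on p:

tweet = "RT @user: 3 reasons to love #python :) https://example.com"
parse_mentions(tweet)        # ['@user']
parse_hashtags(tweet)        # ['#python']
parse_reserved_words(tweet)  # ['RT']
parse_numbers(tweet)         # ['3']
parse_smileys(tweet)         # [':)']
parse_urls(tweet)            # ['https://example.com']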
Example #21
def extract_info_from_cluster_table(cluster_edge_table):
	text_list = []
	htag_list = []
	url_list = []
	for index,row in cluster_edge_table.iterrows():
		username = row['user']
		#if not 'tweets' in row:
		#	continue # pass to the next user
		tweet_df = pd.read_json(row['tweets'])
		
		for idx,tweet in tweet_df.iterrows():
			htags = tweet['hashtags']
			urls = tweet['urls']
			text = tweet['text']
			retweet_count = tweet['retweet_count']
			favorite_count = tweet['favorite_count']

			parsed_tweet = tweetpre.parse(text)
			# extract emojis
			emojis = []
			if parsed_tweet.emojis is not None:
				emojis = [emo.match for emo in parsed_tweet.emojis]
			tweetpre.set_options(tweetpre.OPT.MENTION, tweetpre.OPT.URL)
			filtered_text = tweetpre.clean(text)
			tweetpre.set_options()
			#emojis = parsed_tweet.emojis.match ???
			url_c = [twitutils.convert_bitly_url(url_string) for url_string in urls]
			text_list.append({'text': text, 'user': username, 'url': url_c , 'emojis':emojis , 
				'retweet_count':retweet_count, 'favorite_count': favorite_count, 'filtered text': filtered_text, 
				'bcentrality': row['bcentrality']})
			htag_list += htags
			url_list += urls
	if not text_list:
		empty_df = pd.DataFrame()
		return {'text': empty_df, 'hashtags': empty_df, 'words': empty_df, 'urls': empty_df}
	text_df = pd.DataFrame(text_list)
	mostcommon_words_df = most_common_words(text_df['filtered text'])
	hashtags_df = count_order_items(htag_list,'hashtag')
	url_df = count_order_items(url_list,'url')
	url_df = twitutils.convert_bitly_table(url_df)
	filtered_url_df = twitutils.drop_twitter_urls(url_df)
	return {'text': text_df, 'hashtags': hashtags_df, 'words': mostcommon_words_df, 'urls': filtered_url_df}
Example #22
    def test_parse(self):
        tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Example #23
    def test_parse(self):
        tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
        p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Example #24
    def test_parse(self):
        tweet = "A tweet with #hashtag :) @mention 😀 and http://github.com/s."
        p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
tweet = []
hashlist = ""
menlist = ""
urllist = ""
emolist = ""
emolist1 = ""
# jsonFile = open("dilmabr.json", 'a')
with open("ArvindKejriwal.json", 'r') as a:
    # y = json.loads(a)
    for line in a:
        content = json.loads(line)
        x = p.parse(content["text"])

        # Note: json.loads maps a JSON null to Python None, so comparing against
        # the string "null" always evaluates to True here.
        if content["in_reply_to_screen_name"] != "null":
            content["poi_name"] = content["user"]["screen_name"]
            content["poi_id"] = content["user"]["id"]
            poiname = content["poi_name"]
        else:
            content["poi_name"] = content["in_reply_to_screen_name"]
            content["poi_id"] = content["in_reply_to_user_id"]
            poiname = content["poi_name"]

Example #26
def get_usermentions(singletweet):
    return p.parse(singletweet).mentions
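
For reference, a tiny sketch of the return value (illustrative input); the function returns None when the tweet has no mentions:

mentions = get_usermentions("thanks @alice and @bob!")
[m.match for m in mentions]  # ['@alice', '@bob']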
Example #27
# -*- coding: utf-8 -*-

import string

import preprocessor as p
from transliteration import getInstance

text = "RT @Pritpal77777: @Gurmeetramrahim GURU JI!! 61210 peoples take pledge to leave their drug and bad deeds n to adopt path of righteousness i\xe2\x80\xa6"
parsed_tweet = p.parse(text)
print(parsed_tweet.hashtags, parsed_tweet.mentions,
      parsed_tweet.reserved_words)

text = "ke liye best kar rahe hain to apki talash yahan par khatm hoti hai kyonki is post me main apko top video downloader apps ke baare me bataunga jinhe bahut hi aasani se install or use kiya jaa sakta ha."
t = getInstance()
t_text = t.transliterate(text, "hi_IN")
print(t_text)
# urls = None
#     emojis = None
#     smileys = None
#     hashtags = None
#     mentions = None
#     reserved_words = None
Example #28
def prepare_data(folder, input_name2hourList, phase_category2keywords,
                 output_flag):

    print "prepare_data()..."
    tweet_id2content = {}
    tweet_id_set = set()
    chunk2importance = {}
    chunk2IDF = {}

    #hashtag2tweet_num = {}
    mention2tweet_num = {}

    actor2tweet_num = {}
    actor2tweet_ids = {}
    actor2replied_actor = {}
    actor2geo = {}

    nlp = StanfordCoreNLP("../../tools/stanford-corenlp-full-2017-06-09")

    word2lemma = {}

    for input_name in input_name2hourList:

        input_file = open(folder + input_name, "r")
        hourList = input_name2hourList[input_name]

        for line in input_file:
            tweet = ast.literal_eval(line)

            if "body" in tweet and "actor_Username" in tweet:
                actor = "@" + tweet["actor_Username"]
                words = tweet["body"].split()
                if words[0][0] == "@":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[
                            words[0], tweet["postedTime"]
                        ]]
                    else:
                        actor2replied_actor[actor] += [[
                            words[0], tweet["postedTime"]
                        ]]
                elif words[0] == "RT":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[
                            words[1].replace(":", ""), tweet["postedTime"]
                        ]]
                    else:
                        actor2replied_actor[actor] += [[
                            words[1].replace(":", ""), tweet["postedTime"]
                        ]]

            if tweet_filter(tweet, hourList) == True:
                continue

            words = tweet["link"].split("/")
            tweet_id = words[3] + "_" + words[5]

            chunk_set, hashtag_set, mention_set = set(), set(), set()
            tweet_id_set.add(tweet_id)

            actor = "@" + words[3]

            if actor not in actor2tweet_num:
                actor2tweet_num[actor] = 1
            else:
                actor2tweet_num[actor] += 1

            if actor not in actor2tweet_ids:
                actor2tweet_ids[actor] = [tweet_id]
            else:
                actor2tweet_ids[actor] += [tweet_id]

            if "geo" in tweet:
                if actor not in actor2geo:
                    actor2geo[actor] = [tweet["geo"]]
                else:
                    actor2geo[actor] += [tweet["geo"]]

            parsed_tweet = p.parse(tweet["body"])
            mentions = parsed_tweet.mentions
            hashtags = parsed_tweet.hashtags

            sentence_chunks = set()
            for chunk in tweet["chunkList"]:
                if chunk[0] == None:
                    continue

                new_chunk = None
                if chunk[1] in word2lemma:
                    new_chunk = word2lemma[chunk[1]]
                else:
                    new_chunk = lemmatize(nlp, chunk[1])
                    word2lemma[chunk[1]] = new_chunk

                if new_chunk not in chunk2importance:
                    chunk2importance[new_chunk] = list([chunk[2]])
                else:
                    chunk2importance[new_chunk] += [chunk[2]]

                sentence_chunks.add(new_chunk)

                chunk_set.add(new_chunk)

            for new_chunk in sentence_chunks:
                if new_chunk not in chunk2IDF:
                    chunk2IDF[new_chunk] = 1.0
                else:
                    chunk2IDF[new_chunk] += 1.0

            if hashtags != None:
                for hashtag in hashtags:
                    tag = hashtag.match.lower()
                    hashtag_set.add(tag)
                    #if tag not in hashtag2tweet_num:
                    #    hashtag2tweet_num[tag] = 1
                    #else:
                    #    hashtag2tweet_num[tag] += 1

            if mentions != None:
                for mention in mentions:
                    m = mention.match
                    mention_set.add(m)
                    if m not in mention2tweet_num:
                        mention2tweet_num[m] = 1
                    else:
                        mention2tweet_num[m] += 1

            if "geo" in tweet:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "geo": tweet["geo"],
                    "postedTime": tweet["postedTime"]
                }
            else:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "postedTime": tweet["postedTime"]
                }

        input_file.close()

    nlp.close()

    total_doc = len(tweet_id_set)
    for chunk in chunk2IDF:
        chunk2IDF[chunk] = math.log(total_doc / chunk2IDF[chunk])

    pickle.dump(tweet_id_set, open(output_flag + "tweet_id_set.p", "wb"))
    pickle.dump(chunk2importance, open(output_flag + "chunk2importance.p",
                                       "wb"))
    pickle.dump(chunk2IDF, open(output_flag + "chunk2IDF.p", "wb"))
    pickle.dump(tweet_id2content, open(output_flag + "tweet_id2content.p",
                                       "wb"))

    #pickle.dump(hashtag2tweet_num, open(output_flag + "hashtag2tweet_num.p", "wb"))
    pickle.dump(mention2tweet_num,
                open(output_flag + "mention2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_num, open(output_flag + "actor2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_ids, open(output_flag + "actor2tweet_ids.p", "wb"))
    pickle.dump(actor2replied_actor,
                open(output_flag + "actor2replied_actor.p", "wb"))
    pickle.dump(actor2geo, open(output_flag + "actor2geo.p", "wb"))

    for phase_category in phase_category2keywords:
        keywords = set(phase_category2keywords[phase_category])
        category_tweet_id_set = set()

        for tweet_id in tweet_id2content:
            #if len(tweet_id2content[tweet_id]["chunks"] & keywords) != 0:
            cleaned_tweet_words = p.clean(tweet_id2content[tweet_id]["body"])
            cleaned_tweet_words = process_text(cleaned_tweet_words)
            cleaned_tweet_words = set(cleaned_tweet_words.split())

            if len(cleaned_tweet_words & keywords) != 0:
                category_tweet_id_set.add(tweet_id)

        pickle.dump(category_tweet_id_set,
                    open(phase_category + "/tweet_id_set.p", "wb"))
Example #29
                splits.append(split_on_numbers[0])
            else:
                splits.append(upper_split)


tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
word_vectors = []
length_vector = []
index = 0
start = timeit.default_timer()
text_file = open("cleanedTweets.txt", "w")
for tweets in open('tweets.txt', encoding="utf-8"):
    p.set_options(p.OPT.HASHTAG)
    parsed_tweet = p.parse(tweets)
    hashtags = parsed_tweet.hashtags
    splits = []
    if hashtags:
        for hashtag in hashtags:
            split_hashtags(hashtag)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER,
                  p.OPT.HASHTAG)
    stop_words = set(stopwords.words("english"))
    extra_stop_words = [
        '.', '|', '+', '~', '✓', '︎', '“', "'", '—', '⠀', '-', ',', '•', '・',
        '_', '!', '&', ')', '(', '…', '️', ' ', '...', '"', '/', '?', '', '..',
        ':'
    ]
    for symbol in extra_stop_words:
        stop_words.add(symbol)
def extract_elements(tweets):
    """ Function that uses the tweet-preprocessor and emojis libraries
        to identify and save:
            - #Hashtags
            - @Mentions
            - Emojis

    Args:
        tweets: list containing all tweets

    Returns:
        hashtags: list of hashtags
        mentions: list of mentions
        emojis: list of emojis

    Sources:
        https://pypi.org/project/tweet-preprocessor/
        https://github.com/carpedm20/emoji/

    """

    # set the global options for the library. These settings define which
    # elements of the tweet to pay attention to
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.MENTION,
                   tp.OPT.NUMBER, tp.OPT.HASHTAG)

    # create empty lists to store the resuts
    hashtags = []
    mentions = []
    emojis = []

    # iterate over all tweets in the list
    for tweet in tweets:

        # parse tweet to extract the relevant elements defined in the options
        parsed_tweet = tp.parse(tweet)

        # 1. save the hashtags
        h_tweet = []
        if parsed_tweet.hashtags is not None:
            for hashtag in parsed_tweet.hashtags:
                h_tweet.append(hashtag.match)

        # save to the global list as a space separated string
        hashtags.append(' '.join(h_tweet))

        # 2. save the emojis (using the library)
        e_tweet = []
        if len(emoji.emoji_lis(tweet)) > 0:
            for e in emoji.emoji_lis(tweet):
                e_tweet.append(e['emoji'])

        # save to the global list as a space separated string
        emojis.append(' '.join(e_tweet))

        # 3. save the mentions
        m_tweet = []
        if parsed_tweet.mentions is not None:
            for mention in parsed_tweet.mentions:
                m_tweet.append(mention.match)

        # save to the global list as a space separated string
        mentions.append(' '.join(m_tweet))

    return(hashtags, mentions, emojis)
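
A short usage sketch for extract_elements(); the two sample tweets are invented, and each returned list holds one space-separated string per input tweet:

hashtags, mentions, emojis = extract_elements([
    "Loving the new release! #python @pydev 🎉",
    "no tags in this one",
])
# hashtags -> ['#python', ''], mentions -> ['@pydev', ''], emojis -> ['🎉', '']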
Example #31
def cleaning_sentence(sentence):
	hashtag_value = p.parse(sentence).hashtags
	sentence = p.clean(sentence)	
	sentence = re.sub("[^A-Za-z .]+","", sentence)
	return sentence
Example #32
def csv_to_json(filename, output_file):

    d = {}

    with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=",", quotechar='"')

        next(reader, None)
        for row in reader:
            link, date, time, author_id, followers, following, *content_rows = row

            content = ""

            for row in content_rows:
                content += f" {row}"

            url, id_str = link.split("statuses/")

            hashtags = []
            mentions = []
            url_strings = []

            parsed_tweet = p.parse(content)

            try:
                for hashtag in parsed_tweet.hashtags:
                    hashtags.append(hashtag.match)
            except TypeError:
                pass

            try:
                for mention in parsed_tweet.mentions:
                    mentions.append(mention.match)
            except TypeError:
                print(f"{author_id}: {content}")

            try:
                for url_string in parsed_tweet.urls:
                    url_strings.append(url_string.match)
            except TypeError:
                pass

            # Record the row regardless of whether any URLs were parsed
            d[id_str] = {
                'id_str': id_str,
                'date': date,
                'text': content,

                # To do: tokenize tweet to get retweets
                'retweet_count': 0,
                'favorite_count': 0,
                'reply_to': 0,
                'coordinates': 0,
                'reply_to_tweet': 0,
                'user_screen_name': f"@{author_id}",
                'quoted_status': 0,
                'lang': 0,
                'entities': 0,
                'urls': url_strings,
                'hashtags': hashtags,
                'user_mentions': mentions,
                'user': author_id,
            }

    with open(os.path.join(RESULTS_DIR, "converted_" + output_file + ".json"),
              'w') as f:
        f.write(json.dumps(d, indent=1))

    print("Success")
    else:
        reply_text = tweet_text
    tweet_lang = tweet['lang']
    hashtags = ''
    mentions = ''
    tweet_urls = ''
    tweet_emoticons = ''
    tweet_date = tweet['created_at']
    tweet_loc = tweet['coordinates']

    #if 'retweeted_status' in tweet:
    #    retweeted_status = tweet['retweeted_status']
    #    isRetweet = True
    #else:
    #    isRetweet = False
    parsedTweet = tpp.parse(tweet_text)

    splitTime = tweet_date.split()
    month = months[splitTime[1]]
    year = splitTime[5]
    day = splitTime[2]
    time = splitTime[3]  #.split(':')[0]+':00:00'
    tweet_date = year + '-' + month + '-' + day + 'T' + time + 'Z'
    try:
        for temp in parsedTweet.hashtags:
            hashtags = hashtags + temp.match + ' '
        hashtags = hashtags.replace('#', '').strip(' ')
    except TypeError as te:
        hashtags = ''
    try:
        for temp in parsedTweet.mentions: