# fragment of the per-user prediction loop: "tweet" iterates over one user's
# timeline; wordDictionary, tweet_threshold and filteredTweets are defined
# earlier in the script

    # remove links starting with "http"
    tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
    # remove links with no http (probably unnecessary)
    tweet = re.sub(
        r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
        " ", tweet)
    # remove mentions
    tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)', " ", tweet)
    # hashtags are removed by CountVectorizer
    filteredTweets.append(tweet)

if len(filteredTweets) == 0:
    print("Not enough tweets for prediction.")
    continue

# now we can process the tweets using embeddings.transformTextForTesting
try:
    tweetEmbeddings = embeddings.transformTextForTesting(
        wordDictionary, tweet_threshold, filteredTweets, "conc")
    print("Embeddings computed.")
except:
    # most tweets are ignored for brevity / no embedding correspondence
    print("Not enough tweets for prediction.")
    continue

scores = {}
# load the saved ML models and score the mean tweet embedding for each trait
for trait in ["O", "C", "E", "A", "N"]:
    model = joblib.load("Models/SVM_fasttext_conc_" + trait + ".pkl")
    mean = np.mean(tweetEmbeddings, axis=0)
    score = model.predict([mean])
    scores[trait] = float(str(score[0])[0:5])
    print("\tScore for", trait, "is:", str(score[0])[0:5])
def calc_tweet_personality(sessionID, screen_name, profile_img):
    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))
    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models, one per Big Five trait
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait + ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
        # skip retweets and empty lines
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        # remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        # remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        # remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)', " ", tweet)
        # hashtags are removed by CountVectorizer

        filteredTweets.append(tweet)
        word_count += len(tweet.split())

    if len(filteredTweets) == 0:
        raise ValueError("Not enough tweets for prediction.")

    # compute embeddings with embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except:
        raise ValueError("Not enough tweets for prediction.")

    # predict using the saved models; scores range from 0 to 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    # map the Big Five scores to a Jungian (MBTI-style) type
    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"
    scores["jung"] = jung

    # assemble the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [{
            'name': 'Openness',
            'percentile': scores['O'] / 5
        }, {
            'name': 'Conscientiousness',
            'percentile': scores['C'] / 5
        }, {
            'name': 'Extraversion',
            'percentile': scores['E'] / 5
        }, {
            'name': 'Agreeableness',
            'percentile': scores['A'] / 5
        }, {
            'name': 'Emotional range',
            'percentile': scores['N'] / 5
        }]
    }

    # save to json and upload to the s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete local files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(
            os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath, screen_name + '_twitPersonality.json'))

    return result
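# Usage sketch (not part of the original module): the values below are
# hypothetical and only illustrate how calc_tweet_personality might be called,
# assuming SESSIONDIR points at a writable working directory, the fastText
# vectors and trait models are on disk, and '<screen_name>_tweets.txt' has
# already been uploaded to the corresponding S3 path.
if __name__ == "__main__":
    os.environ['SESSIONDIR'] = '/tmp/collections'       # hypothetical working dir
    result = calc_tweet_personality(
        "session-001",                                   # hypothetical session ID
        "some_user",                                     # hypothetical screen name
        "https://example.com/avatar.jpg")                # hypothetical profile image URL

    # the returned dict mirrors the JSON uploaded to S3
    for entry in result['personality']['personality']:
        print(entry['name'], round(entry['percentile'], 3))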