Example #1
def main(remoteSavePath):

    output = {}

    for file in listdir('results'):
        if isfile(join('results', file)):
            s3.upload('results', remoteSavePath, file)

            if file == 'config.json':
                output['config'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'div.html':
                output['visualization'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_multi-words.txt':
                output['multi-words'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_single-word.txt':
                output['single-word'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase.txt':
                output['autophrase'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'segmentation.model':
                output['model'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'token_mapping.txt':
                output['token-mapping'] = s3.generate_downloads(remoteSavePath, file)
            else:
                output['misc'] = s3.generate_downloads(remoteSavePath, file)

    return output
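Every example on this page leans on a project-local s3 helper module (upload, generate_downloads, downloadToDisk, checkExist) whose source is not shown. As a rough, hypothetical sketch only: with boto3, upload and generate_downloads could look like the following, where the bucket name and URL expiry are assumptions, not values taken from the project:

import os
import boto3

BUCKET = 'my-output-bucket'  # hypothetical bucket name
client = boto3.client('s3')

def _key(remotePath, filename):
    # S3 keys always use forward slashes
    return remotePath.rstrip('/') + '/' + filename

def upload(localPath, remotePath, filename):
    # copy a local file into the matching S3 "folder"
    client.upload_file(os.path.join(localPath, filename), BUCKET,
                       _key(remotePath, filename))

def generate_downloads(remotePath, filename):
    # return a time-limited download link for the uploaded object
    return client.generate_presigned_url(
        'get_object',
        Params={'Bucket': BUCKET, 'Key': _key(remotePath, filename)},
        ExpiresIn=3600)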
Example #2
def lambda_handler(event, context):

    output = dict()

    uid = event['uid']
    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + event['s3FolderName'] + '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in data.keys():
                if key not in event.keys():
                    event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid

    except Exception:
        raise ValueError('This session ID is invalid!')

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_unlabeled + ' file, and it\'s not found in your '
            'remote directory! It is likely that you have not yet performed step 1 -- split the '
            'dataset into training and predicting sets, or you have provided the wrong sessionID.')

    #download pickle model to local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_pickle + ' file, and it\'s not found in your '
            'remote directory! It is likely that you have not yet performed step 2 -- model '
            'training, or you have provided the wrong sessionID.')

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
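For local testing, the handler can be driven directly with an event dictionary; the keys below mirror the ones the handler reads, and all values are placeholders:

if __name__ == '__main__':
    # hypothetical local invocation; uid and folder name are placeholders
    sample_event = {
        'uid': '0000-demo',
        's3FolderName': 'demo-project',
        # any extra keys are merged into config.json by the handler
    }
    print(lambda_handler(sample_event, context=None))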
Example #3
def save_remote_output(localSavePath, remoteSavePath, fname):
    """

    :param localSavePath:
    :param remoteSavePath:
    :param fname:
    :param output_data:
    :return:
    """
    zipf = zipfile.ZipFile(os.path.join(localSavePath, fname), 'w',
                           zipfile.ZIP_DEFLATED)
    zipdir(os.path.join(localSavePath, 'img'), zipf)
    zipf.close()

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
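zipdir is not defined in this snippet. A minimal sketch of the usual helper, assuming it simply walks the directory and adds every file to the already-open archive, is:

import os

def zipdir(path, zipf):
    # add every file under `path` to the open ZipFile, storing paths relative to it
    for root, dirs, files in os.walk(path):
        for file in files:
            full_path = os.path.join(root, file)
            zipf.write(full_path, os.path.relpath(full_path, path))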
Example #4
    def metrics(self):
        report = np.array(metrics.precision_recall_fscore_support(
            self.target, self.predicted, labels=self.labels)).T
        avg_report = list(metrics.precision_recall_fscore_support(
            self.target, self.predicted, average='weighted'))
        avg_report.insert(0, 'AVG')

        # save metrics report
        fname_metrics = 'classification_report.csv'
        with open(self.localSavePath + fname_metrics, 'w', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['label', 'precision', 'recall', 'f1-score', 'support'])
            for i in range(len(report)):
                writer.writerow([self.labels[i],
                                 round(report[i][0], 4),
                                 round(report[i][1], 4),
                                 round(report[i][2], 4),
                                 round(report[i][3], 4)])
            writer.writerow(avg_report)
        s3.upload(self.localSavePath, self.awsPath, fname_metrics)
        return {'metrics': s3.generate_downloads(self.awsPath, fname_metrics)}
Example #5
    def plot(self):
        y_pred_dict = Counter(self.predicted)
        labels = []
        values = []
        for i in y_pred_dict.keys():
            labels.append("class: " + str(i))
            values.append(y_pred_dict[i])
        trace = go.Pie(labels=labels, values=values, textinfo='label')
        div_category = plot([trace],
                            output_type='div',
                            image='png',
                            auto_open=False,
                            image_filename='plot_img')

        fname_div_category = 'div_category.html'
        with open(self.localSavePath + fname_div_category, "w") as f:
            f.write(div_category)
        s3.upload(self.localSavePath, self.awsPath, fname_div_category)
        return s3.generate_downloads(self.awsPath, fname_div_category)
Example #6
def lambda_handler(event, context):

    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localSavePath = os.path.join('/tmp', event['sessionID'],
                                 event['screen_name'])
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    auth = tweepy.OAuthHandler(event['consumer_key'], event['consumer_secret'])
    auth.set_access_token(event['access_token'], event['access_token_secret'])
    api = tweepy.API(auth)

    tweets = []
    for status in tweepy.Cursor(api.user_timeline,
                                screen_name=event['screen_name'],
                                count=100,
                                tweet_mode="extended").items():
        tweets.append([
            status._json['id'],
            status._json['full_text'].encode('utf-8', 'ignore').decode()
        ])

    if len(tweets) > 0:
        fname = event['screen_name'] + '_tweets.txt'
        with open(os.path.join(localSavePath, fname),
                  'w',
                  encoding='utf-8',
                  newline='') as f:
            header = ['id', 'text']
            writer = csv.writer(f, delimiter=",")
            writer.writerow(header)
            for row in tweets:
                writer.writerow(row)

        s3.upload(localSavePath, awsPath, fname)

        return {'url': s3.generate_downloads(awsPath, fname)}
    else:
        raise ValueError('This user\'s timeline (screen_name: ' +
                         event['screen_name'] +
                         ') is empty. There is nothing to analyze!')
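Long timelines can hit Twitter's rate limits. If pausing is acceptable for this workload, one option is to let tweepy throttle itself when constructing the client:

# optional variation: sleep through rate-limit windows instead of raising
api = tweepy.API(auth, wait_on_rate_limit=True)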
Example #7
def save_remote_output(localSavePath, remoteSavePath, fname, output_data):
    """
    save output in memory first to local file, then upload to remote S3 bucket
    :param localSavePath: local saved file
    :param remoteSavePath: remote save file path
    :param fname: filename
    :param output_data: the actual data
    :return: url of the file saved in S3 bucket
    """

    # json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # # dataframe to csv
    # elif isinstance(output_data, pd.DataFrame):
    #     fname += '.csv'
    #     output_data.to_csv(fname)

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # list(list) to csv
    elif isinstance(output_data, list) and isinstance(output_data[0], (list, tuple)):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname),
                  'w',
                  newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    print(e)

    # generator
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'

        with open(os.path.join(localSavePath, fname), 'w', newline='') as f:
            for line in output_data:
                f.write(line + '\n')

    # else pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
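Hypothetical calls exercising the different branches, with placeholder paths and data:

local_dir = '/tmp/demo/'            # placeholder local folder
remote_dir = 'demo-session/output'  # placeholder S3 folder

config_url = save_remote_output(local_dir, remote_dir, 'config', {'step': 'demo'})        # -> config.json
edges_url = save_remote_output(local_dir, remote_dir, 'edges', [('a', 'b'), ('b', 'c')])  # -> edges.csv
page_url = save_remote_output(local_dir, remote_dir, 'div', '<div>plot goes here</div>')  # -> div.html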
Example #8
    # loop through the ids and store their comments
    for url, id in zip(urls, ids):
        url = "https://www.reddit.com" + url
        try:
            submission = reddit.submission(url=url)
            if not bfs(submission, id, comments_folder):
                # zip goes here
                zipf = zipfile.ZipFile(temp_dir + fname_zip, 'w',
                                       zipfile.ZIP_DEFLATED)
                zipdir(comments_folder + '/', zipf)
                zipf.close()

                # upload this zip to the s3 corresponding folder
                s3.upload(temp_dir, args.remoteReadPath, fname_zip)
                url = s3.generate_downloads(args.remoteReadPath, fname_zip)
                # delete the files
                d.deletedir('/tmp')
                # send out email notification
                n.notification(args.email,
                               case=1,
                               filename=args.remoteReadPath,
                               links=url,
                               sessionURL=args.sessionURL)
                exit(code='Lack of disk space')
        except Exception:
            # skip submissions that can't be fetched from the url
            pass

    # success and send email notification
    # zip goes here
Example #9
    def classify(self, model):

        if model == 'NaiveBayes':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',MultinomialNB())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'Perceptron':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',Perceptron())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)
        elif model == 'SGD':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',SGDClassifier())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)
        elif model == 'RandomForest':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',RandomForestClassifier(n_estimators=100))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'KNN':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',KNeighborsClassifier(n_neighbors=10))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'passiveAggressive':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',PassiveAggressiveClassifier(n_iter=50))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)           
            
        # get 10 fold cross validation accuracy score
        fold_scores = cross_val_score(text_clf, self.data, self.target, cv=10)
        fname_folds = 'accuracy_score.csv'
        with open(self.localSavePath + fname_folds,'w',newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['fold_1','fold_2','fold_3','fold_4','fold_5',
                             'fold_6','fold_7','fold_8','fold_9','fold_10'])
            writer.writerow([ '%.4f' % elem for elem in fold_scores ])
        s3.upload(self.localSavePath, self.awsPath, fname_folds)
        accuracy_url = s3.generate_downloads(self.awsPath, fname_folds)
        
        # pickle the Pipeline for future use
        fname_pickle = 'classification_pipeline.pickle'
        with open(self.localSavePath + fname_pickle,'wb') as f:
            pickle.dump(text_clf,f)
        s3.upload(self.localSavePath, self.awsPath, fname_pickle)
        pickle_url = s3.generate_downloads(self.awsPath, fname_pickle)

        # plotting the roc curve
        self.labels = text_clf.classes_       
        y = label_binarize(self.target,classes = self.labels)

        
        # binary class
        if len(self.labels) <= 2:
            if model == 'Perceptron' or model == 'SGD' or model == 'passiveAggressive':
                fpr, tpr, _ = roc_curve(y[:, 0], y_score)
            else:
                y = []
                for label in self.target:
                    item = []
                    for i in range(len(text_clf.classes_)):
                        if label == text_clf.classes_[i]:
                            item.append(1)
                        else:
                            item.append(0)
                    y.append(item)
                y = np.array(y)
                fpr, tpr, _ = roc_curve(y.ravel(), y_score.ravel())
            
            roc_auc = auc(fpr, tpr)
            trace = go.Scatter(
                x = fpr,
                y = tpr,
                name = 'ROC curve (area =' + str(roc_auc) + ' )',
                line = dict(color=('deeppink'), width = 4)
            )
            data = [trace]

        # multiclasses  
        else:
            fpr = {}
            tpr = {}
            roc_auc = {}
            for i in range(len(self.labels)):
                fpr[self.labels[i]], tpr[self.labels[i]], _ = roc_curve(y[:, i], y_score[:, i])
                roc_auc[self.labels[i]] = auc(fpr[self.labels[i]], tpr[self.labels[i]])
            
            # Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_score.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[self.labels[i]] for i in range(len(self.labels))]))

            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in range(len(self.labels)):
                mean_tpr += interp(all_fpr, fpr[self.labels[i]], tpr[self.labels[i]])

            # Finally average it and compute AUC
            mean_tpr /= len(self.labels)

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

            # plotting
            trace0 = go.Scatter(
                x = fpr['micro'],
                y = tpr['micro'],
                name = 'micro-average ROC curve (area =' + str(roc_auc["micro"]) + ' )',
                line = dict(color=('deeppink'), width = 4)
            )
            trace1 = go.Scatter(
                x = fpr['macro'],
                y = tpr['macro'],
                 name = 'macro-average ROC curve (area =' + str(roc_auc["macro"]) + ' )',
                line = dict(
                    color = ('navy'),
                    width = 4,)
            )
            data = [trace0, trace1]
            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(len(self.labels)), colors):
                trace = go.Scatter(
                    x = fpr[self.labels[i]], 
                    y = tpr[self.labels[i]],
                    name = 'ROC curve of class {0} (area = {1:0.2f})'.format(self.labels[i], roc_auc[self.labels[i]]),
                    line = dict(
                        color = (color),
                        width = 4, 
                        dash = 'dash')
                )
                data.append(trace)

                
        layout = dict(title = model + ' model ROC curve',
              xaxis = dict(title = 'False Positive Rate'),
              yaxis = dict(title = 'True Positive Rate'),
              )

        fig = dict(data=data, layout=layout)
        div = plot(fig, output_type='div',image='png',auto_open=False, image_filename='plot_img')
        
        # print the graph file
        fname_div ='div.html'
        with open(self.localSavePath + fname_div,'w') as f:
            f.write(div)
        s3.upload(self.localSavePath, self.awsPath, fname_div)
        div_url = s3.generate_downloads(self.awsPath, fname_div)

        return {'accuracy':accuracy_url, 'pickle':pickle_url, 'div':div_url }
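One caveat on the pipelines above: PassiveAggressiveClassifier(n_iter=50) only works on older scikit-learn releases, since n_iter was later replaced by max_iter. A version-tolerant sketch of that single construction could be:

from sklearn.linear_model import PassiveAggressiveClassifier

# newer scikit-learn releases renamed n_iter to max_iter
try:
    clf = PassiveAggressiveClassifier(max_iter=50)
except TypeError:
    clf = PassiveAggressiveClassifier(n_iter=50)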
Example #10
        os.makedirs(localSavePath)
    if not os.path.exists(localReadPath):
        os.makedirs(localReadPath)

    fname_config = 'config.json'
    if s3.checkExist(awsPath, fname_config):
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in vars(args).keys():
                if key not in data.keys():
                    data[key] = vars(args)[key]
        with open(localSavePath + fname_config,"w") as f:
            json.dump(data,f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uuid'] = uid

    else:
        raise ValueError('This session ID is invalid!')

    # download the labeled data from s3 to tmp
    classification = Classification(awsPath, localSavePath, localReadPath,
                                    args.remoteReadPath, args.labeledFilename)
    
    output.update(classification.classify(args.model))
    output.update(classification.metrics())
    
    d.deletedir('/tmp')
    n.notification(args.email,case=3,filename=awsPath)
Example #11
    def predict(self):

        # load classification model
        pkl_model = os.path.join(self.localSavePath, 'pipeline.pickle')
        with open(pkl_model, 'rb') as f:
            text_clf = pickle.load(f)

        # load text set
        data = []
        try:
            with open(self.localSavePath + 'testing.csv',
                      'r',
                      encoding='utf-8',
                      errors="ignore") as f:
                reader = list(csv.reader(f))
                for row in reader[1:]:
                    try:
                        data.extend(row)
                    except Exception as e:
                        pass
        except:
            with open(self.localSavePath + 'testing.csv',
                      'r',
                      encoding='ISO-8859-1',
                      errors="ignore") as f:
                reader = list(csv.reader(f))
                for row in reader[1:]:
                    try:
                        data.extend(row)
                    except Exception as e:
                        pass

        # predict using trained model
        self.predicted = text_clf.predict(data)

        # save result
        fname = 'predicting.csv'
        try:
            with open(self.localSavePath + fname,
                      'w',
                      newline="",
                      encoding='utf-8',
                      errors="ignore") as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for i in range(len(data)):
                    try:
                        writer.writerow([data[i], self.predicted[i]])
                    except:
                        pass
        except:
            with open(self.localSavePath + fname,
                      'w',
                      newline="",
                      encoding='ISO-8859-1',
                      errors="ignore") as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for i in range(len(data)):
                    try:
                        writer.writerow([data[i], self.predicted[i]])
                    except:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname)
        return s3.generate_downloads(self.awsPath, fname)
Example #12
    def split(self, ratio):
        training_set = list(
            random.sample(self.corpus, int(len(self.corpus) * ratio / 100)))
        testing_set = [
            item for item in self.corpus if item not in training_set
        ]

        # plot a pie chart of the split
        labels = ['training set data points', 'unlabeled data points']
        values = [len(training_set), len(testing_set)]
        trace = go.Pie(labels=labels, values=values, textinfo='value')
        div_split = plot([trace],
                         output_type='div',
                         image='png',
                         auto_open=False,
                         image_filename='plot_img')
        fname_div_split = 'div_split.html'
        with open(self.localSavePath + fname_div_split, "w") as f:
            f.write(div_split)
        s3.upload(self.localSavePath, self.awsPath, fname_div_split)
        div_url = s3.generate_downloads(self.awsPath, fname_div_split)

        fname1 = 'TRAINING_' + self.filename
        try:
            with open(self.localSavePath + fname1,
                      'w',
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for row in training_set:
                    try:
                        writer.writerow([row])
                    except UnicodeDecodeError:
                        pass
        except:
            with open(self.localSavePath + fname1,
                      'w',
                      newline="",
                      encoding='ISO-8859-1') as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for row in training_set:
                    try:
                        writer.writerow([row])
                    except UnicodeDecodeError:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname1)
        training_url = s3.generate_downloads(self.awsPath, fname1)

        fname2 = 'UNLABELED_' + self.filename
        try:
            with open(self.localSavePath + fname2,
                      'w',
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['text'])
                for row in testing_set:
                    try:
                        writer.writerow([row])
                    except UnicodeDecodeError:
                        pass
        except:
            with open(self.localSavePath + fname2,
                      'w',
                      newline="",
                      encoding='ISO-8859-1') as f:
                writer = csv.writer(f)
                writer.writerow(['text'])
                for row in testing_set:
                    try:
                        writer.writerow([row])
                    except UnicodeDecodeError:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname2)
        unlabeled_url = s3.generate_downloads(self.awsPath, fname2)

        return {
            'div': div_url,
            'training': training_url,
            'testing': unlabeled_url
        }
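A side note on the split itself: the comprehension `item not in training_set` is quadratic and drops every copy of a duplicated row from the testing set. A sketch that samples indices instead (same two variables, placed inside split(), assuming row order carries no meaning) is:

import random

picked = set(random.sample(range(len(self.corpus)),
                           int(len(self.corpus) * ratio / 100)))
training_set = [row for i, row in enumerate(self.corpus) if i in picked]
testing_set = [row for i, row in enumerate(self.corpus) if i not in picked]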
Example #13
def calc_tweet_personality(sessionID, screen_name, profile_img):

    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))

    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait +
                                    ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        #remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        #remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        #remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)',
                       " ", tweet)
        #hashtags are removed by countvectorizer
        filteredTweets.append(tweet)

        word_count += len(tweet.split())

    # abort if no usable tweets were collected
    if len(filteredTweets) == 0:
        print("Not enough tweets for prediction.")
        return None

    # now we can process the tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except Exception:
        print("Not enough tweets for prediction.")
        return None

    # predict using saved models
    # range is 0 ~ 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"

    scores["jung"] = jung

    # sort the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [
            {'name': 'Openness', 'percentile': scores['O'] / 5},
            {'name': 'Conscientiousness', 'percentile': scores['C'] / 5},
            {'name': 'Extraversion', 'percentile': scores['E'] / 5},
            {'name': 'Agreeableness', 'percentile': scores['A'] / 5},
            {'name': 'Emotional range', 'percentile': scores['N'] / 5},
        ]
    }

    # save to json and upload to s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete localPath files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(
            os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except OSError:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath,
                                screen_name + '_twitPersonality.json'))

    return result
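A hypothetical invocation, with placeholder arguments:

# the session ID, screen name and image URL below are placeholders
result = calc_tweet_personality('session-123', 'some_user',
                                'https://example.com/profile.jpg')
print(result['personality'])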