    def __init__(self, awsPath, localSavePath, localReadPath, remoteReadPath, filename):

        self.localSavePath = localSavePath
        self.awsPath = awsPath

        # download the remote social media data into a temp folder,
        # then load the CSV into memory
        s3.downloadToDisk(filename=filename, localpath=localReadPath,
                          remotepath=remoteReadPath)

        # fall back to ISO-8859-1 when the file is not valid UTF-8
        Array = []
        try:
            with open(localReadPath + filename, 'r', encoding='utf-8') as f:
                reader = csv.reader(f)
                for row in reader:
                    Array.append(row)
        except UnicodeDecodeError:
            Array = []  # discard any rows read before the decode error
            with open(localReadPath + filename, 'r',
                      encoding='ISO-8859-1') as f:
                reader = csv.reader(f)
                for row in reader:
                    Array.append(row)

        # skip the header row; keep only rows with exactly two columns (data, target)
        self.data = []
        self.target = []
        for a in Array[1:]:
            if len(a) == 2:
                self.data.append(a[0])
                self.target.append(a[1])
Example No. 2
def main(remoteReadPath, column):
    filename = remoteReadPath.split('/')[-2] + '.csv'
    s3.downloadToDisk(filename=filename, localpath='data/', remotepath=remoteReadPath)

    Array = []
    try:
        with open('data/' + filename, 'r', encoding='utf-8',
                  errors='ignore') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)
    except Exception:
        Array = []  # discard any partially read rows before the fallback
        with open('data/' + filename, 'r', encoding='ISO-8859-1',
                  errors='ignore') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)

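    # first row is the header; keep the non-empty values of the requested
    # column as raw training text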
    df = pd.DataFrame(Array[1:], columns=Array[0])
    df[df[column] != ''][column].dropna().astype('str').to_csv(
        'data/raw_train.txt', index=False)

    return None
Example No. 3
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    filename = key.split("/")[-1]
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe
    df = pd.read_csv(os.path.join(localPath, filename))

    # extract hashtags and their frequencies
    hashtags_df = extract_hashtag(df)

    # plot a bar chart of the 10 most frequent hashtags
    index = hashtags_df['hashtags'].values.tolist()[:10]
    counts = hashtags_df['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + filename.split(".")[0] + ')'
    div = plot.plot_bar_chart(index, counts, title)

    # save the result and write it back to s3
    hash_filename = filename.split(".")[0]

    hashtags_df.to_csv(os.path.join(localPath,
                                    hash_filename + "_extracted_hashtag.csv"),
                       index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag.csv")

    with open(
            os.path.join(localPath,
                         hash_filename + "_extracted_hashtag_frequency.html"),
            'w') as f:
        f.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag_frequency.html")

    return None
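The extract_hashtag helper is not included in this excerpt; a minimal sketch of what it might look like, assuming the dataframe exposes the raw tweet text in a 'text' column (the column name and the exact return layout are assumptions inferred from how the result is used above):

import re
import pandas as pd

def extract_hashtag(df, text_column='text'):
    # collect #hashtags from the text column and count how often each appears
    tags = []
    for text in df[text_column].dropna().astype(str):
        tags.extend(re.findall(r'#(\w+)', text))
    counts = pd.Series(tags).value_counts()
    # return a dataframe with 'hashtags' and 'Freq' columns, most frequent first
    return counts.rename_axis('hashtags').reset_index(name='Freq')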
Example No. 4
def lambda_handler(event, context):

    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localPath = os.path.join('/tmp', event['sessionID'], event['screen_name'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    screen_name = event['screen_name']

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # the timeline file is CSV-formatted: read it and concatenate the text
    # column into a single paragraph for the IBM Personality Insights API
    df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
    tweets = df['text'].tolist()
    body = '. '.join(tweets).encode('utf-8', 'ignore')

    headers = {'Content-Type': 'text/plain', 'Accept': 'application/json'}
    r = requests.post(
        'https://gateway.watsonplatform.net/personality-insights/api/v3/profile?version=2017-10-13&consumption_preferences=true&raw_scores=true',
        headers=headers,
        data=body,
        auth=('apikey', event['apikey']),
        timeout=300)

    if r.status_code == 200:
        data = {'personality': r.json()}

        with open(
                os.path.join(localPath, screen_name + '_personality.json'),
                'w') as outfile:
            json.dump(data, outfile)

        s3.upload(localPath, awsPath, screen_name + '_personality.json')

        return data
    else:
        raise ValueError(r.text)
Example No. 5
def lambda_handler(event, context):

    output = dict()

    uid = event['uid']
    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + event[
        's3FolderName'] + '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in data.keys():
                if key not in event.keys():
                    event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid

    except Exception:
        raise ValueError('This session ID is invalid!')

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_unlabeled + ' file, and it\'s '
            'not found in your remote directory! It is likely that you have '
            'not yet performed step 1 -- splitting the dataset into training '
            'and predicting sets -- or you have provided the wrong sessionID.')

    # download the pickled model to the local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_pickle + ' file, and it\'s '
            'not found in your remote directory! It is likely that you have '
            'not yet performed step 2 -- model training -- or you have '
            'provided the wrong sessionID.')

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
def get_remote_input(remoteReadPath, filename, localReadPath):
    """
    download input file from s3 bucket to a local location, and then load
    it to a pandas dataframe
    :param remoteReadPath: remote path in s3 to store the data
    :param localReadPath: local location to store the data, usually in /tmp
    :return: df: dataframe that contains the complete input file
    """
    s3.downloadToDisk(filename, localReadPath, remoteReadPath)

    # quick fix for decoding error, sometimes the data is coded in ISO-8859-1
    # Array = 2D nested list holding column and row data
    Array = []
    try:
        with open(os.path.join(localReadPath, filename),
                  'r',
                  encoding='utf-8',
                  errors="ignore") as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    Array.append(row)
            except Exception as e:
                print(e)
    except Exception:
        with open(os.path.join(localReadPath, filename),
                  'r',
                  encoding='ISO-8859-1',
                  errors="ignore") as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    Array.append(row)
            except Exception as e:
                print(e)

    # load to pandas dataframe
    df = pd.DataFrame(Array[1:], columns=Array[0])

    return df
    # load url and id
    temp_dir = '/tmp/' + args.s3FolderName + '/' + uid + '/'
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    # configure output directory
    # save it in download/temp/xxx-xxxxxxxxxxxxx-xxxxx/aww-comments

    file = args.remoteReadPath.split('/')[-2]
    comments_folder = temp_dir + file + '-comments/'
    if not os.path.exists(comments_folder):
        os.makedirs(comments_folder)
    fname_zip = file + '.zip'

    s3.downloadToDisk(filename=file + '.csv',
                      localpath=temp_dir,
                      remotepath=args.remoteReadPath)
    Array = []
    try:
        with open(temp_dir + file + '.csv', 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    Array.append(row)
            except Exception as e:
                pass

    except:
        with open(temp_dir + file + '.csv', 'r', encoding="ISO-8859-1") as f:
            reader = csv.reader(f)
            try:
Example No. 8
def lambda_handler(event, context):

    localPath = os.path.join('/tmp', event['sessionID'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # default the algorithm to IBM-Personality to stay compatible with the old version
    if 'algorithm' not in event.keys():
        event['algorithm'] = 'IBM-Personality'

    comparison_table = [[]]

    # download and read personality scores
    if event['algorithm'] == 'IBM-Personality':
        comparison_table = [[
            'screen_name', 'Personality_Openness',
            'Personality_Conscientiousness', 'Personality_Extraversion',
            'Personality_Agreeableness', 'Personality_Emotional_Range',
            'Needs_Challenge', 'Needs_Closeness', 'Needs_Curiosity',
            'Needs_Excitement', 'Needs_Harmony', 'Needs_Ideal',
            'Needs_Liberty', 'Needs_Love', 'Needs_Practicality',
            'Needs_Self_Expression', 'Needs_Stability', 'Needs_Structure',
            'Values_Conservation', 'Values_Openness', 'Values_Hedonism',
            'Values_Self_Enhancement', 'Values_Self_Transcendence'
        ]]

        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(screen_name + '_personality.json', localPath,
                                  awsPath)
            except:
                raise ValueError(
                    'Cannot find the personality in the remote storage!')

            with open(
                    os.path.join(localPath, screen_name + '_personality.json'),
                    'r') as f:
                data = json.load(f)['personality']
                user_info = [screen_name]
                for p in data['personality']:
                    user_info.append(p['percentile'])
                for p in data['needs']:
                    user_info.append(p['percentile'])
                for p in data['values']:
                    user_info.append(p['percentile'])
                comparison_table.append(user_info)

    elif event['algorithm'] == 'TwitPersonality':
        comparison_table = [[
            'screen_name', 'Personality_Openness',
            'Personality_Conscientiousness', 'Personality_Extraversion',
            'Personality_Agreeableness', 'Personality_Emotional_Range'
        ]]

        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(screen_name + '_twitPersonality.json',
                                  localPath, awsPath)
            except:
                raise ValueError(
                    'Cannot find the personality in the remote storage!')

            with open(
                    os.path.join(localPath,
                                 screen_name + '_twitPersonality.json'),
                    'r') as f:
                data = json.load(f)['personality']
                user_info = [screen_name]
                for p in data['personality']:
                    user_info.append(p['percentile'])
                comparison_table.append(user_info)

    elif event['algorithm'] == 'Pamuksuz-Personality':
        comparison_table = [[
            'screen_name', 'sophistication', 'excitement', 'sincerity',
            'competence', 'ruggedness'
        ]]
        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(
                    screen_name + '_utku_personality_average.json', localPath,
                    awsPath)
            except:
                raise ValueError(
                    'Cannot find the personality in the remote storage!')

            with open(
                    os.path.join(
                        localPath,
                        screen_name + '_utku_personality_average.json'),
                    'r') as f:
                data = json.load(f)
                comparison_table.append([
                    screen_name, data['sophistication'], data['excitement'],
                    data['sincerity'], data['competence'], data['ruggedness']
                ])

    # compute correlations
    event['screen_names'].insert(0, 'Correlation')
    correlation_matrix = [event['screen_names']]
    correlation_matrix_no_legends = []
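    # pairwise cosine similarity between every pair of users' score vectors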
    for i in range(1, len(comparison_table)):
        row = [comparison_table[i][0]]
        row_no_legends = []

        for j in range(1, len(comparison_table)):
            vector_a = comparison_table[i][1:]
            vector_b = comparison_table[j][1:]

            row.append(cos_sim(vector_a, vector_b))
            row_no_legends.append(cos_sim(vector_a, vector_b))

        correlation_matrix.append(row)
        correlation_matrix_no_legends.append(row_no_legends)

    return {
        'comparison_table': comparison_table,
        'correlation_matrix': correlation_matrix,
        'correlation_matrix_no_legends': correlation_matrix_no_legends
    }
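The cos_sim helper used above (and in the similarity-score example further below) is not part of these excerpts; a minimal sketch consistent with how it is called, assuming a numpy-based implementation, could be:

import numpy as np

def cos_sim(vector_a, vector_b):
    # cosine similarity between two equal-length numeric vectors
    a = np.asarray(vector_a, dtype=float)
    b = np.asarray(vector_b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))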
    # arranging the paths
    uid = args.uuid
    
    # check whether this awsPath exists; if not, exit with an error
    awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
   
    localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
    localReadPath = '/tmp/' + args.s3FolderName + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)
    if not os.path.exists(localReadPath):
        os.makedirs(localReadPath)

    fname_config = 'config.json'
    if s3.checkExist(awsPath, fname_config):
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in vars(args).keys():
                if key not in data.keys():
                    data[key] = vars(args)[key]
        with open(localSavePath + fname_config,"w") as f:
            json.dump(data,f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uuid'] = uid

    else:
        raise ValueError('This session ID is invalid!')
        
Example No. 10
def lambda_handler(event, context):

    awsUserPath = os.path.join(event['sessionID'], event['user_screen_name'])
    awsBrandPath = os.path.join(event['sessionID'], event['brand_screen_name'])
    localPath = os.path.join('/tmp', event['sessionID'])

    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # default algorithm to IBM-Watson to be compatible with old version
    if 'algorithm' not in event.keys():
        event['algorithm'] = 'IBM-Watson'

    # calculate similarity score
    vector_a = []
    vector_b = []

    # download and read personality scores
    if event['algorithm'] == 'IBM-Watson':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_personality.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_personality.json', localPath, awsBrandPath)

            # open json and read in values
            with open(os.path.join(localPath, event['user_screen_name'] + '_personality.json'), 'r') as f:
                user_data = json.load(f)['personality']
            with open(os.path.join(localPath, event['brand_screen_name'] + '_personality.json'),'r') as f:
                brand_data = json.load(f)['personality']

            if event['option'] == 'personality_sim_score':
                for p in user_data['personality']:
                    vector_a.append(p['percentile'])
                for p in brand_data['personality']:
                    vector_b.append(p['percentile'])

            elif event['option'] == 'needs_sim_score':
                for p in user_data['needs']:
                    vector_a.append(p['percentile'])
                for p in brand_data['needs']:
                    vector_b.append(p['percentile'])

            elif event['option'] == 'values_sim_score':
                for p in user_data['values']:
                    vector_a.append(p['percentile'])
                for p in brand_data['values']:
                    vector_b.append(p['percentile'])
            elif event['option'] == 'consumption_sim_score':
                for p in user_data['consumption_preferences']:
                    for c in p['consumption_preferences']:
                        vector_a.append(c['score'])
                for p in brand_data['consumption_preferences']:
                    for c in p['consumption_preferences']:
                        vector_b.append(c['score'])
        except:
            raise ValueError('Cannot find the timeline in the remote storage!')

    elif event['algorithm'] == 'TwitPersonality':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_twitPersonality.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_twitPersonality.json', localPath, awsBrandPath)

            # open json and read in values
            with open(os.path.join(localPath, event['user_screen_name'] + '_twitPersonality.json'), 'r') as f:
                user_data = json.load(f)['personality']
            with open(os.path.join(localPath, event['brand_screen_name'] + '_twitPersonality.json'),'r') as f:
                brand_data = json.load(f)['personality']

            if event['option'] == 'personality_sim_score':
                for p in user_data['personality']:
                    vector_a.append(p['percentile'])
                for p in brand_data['personality']:
                    vector_b.append(p['percentile'])

        except:
            raise ValueError('Cannot find the timeline in the remote storage!')

    elif event['algorithm'] == 'Pamuksuz-Personality':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_utku_personality_average.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_utku_personality_average.json', localPath, awsBrandPath)

            # open json and read in values
            with open(os.path.join(localPath, event['user_screen_name'] + '_utku_personality_average.json'), 'r') as f:
                user_data = json.load(f)
            with open(os.path.join(localPath, event['brand_screen_name'] + '_utku_personality_average.json'), 'r') as f:
                brand_data = json.load(f)

            for metric in user_data.keys():
                vector_a.append(user_data[metric])
                vector_b.append(brand_data[metric])
        except:
            raise ValueError('Cannot find the timeline in the remote storage!')

    try:
        return {'sim_score': cos_sim(vector_a, vector_b)}
    except:
        raise ValueError(
            'cannot calculate the cosine similarity of these two vectors!')
    parser.add_argument('--s3FolderName', required=True)
    parser.add_argument('--email', required=True)
    args = parser.parse_args()

    uid = args.uuid
    awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    if s3.checkExist(awsPath, fname_config):
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in vars(args).keys():
                if key not in data.keys():
                    data[key] = vars(args)[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(data, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uuid'] = uid

    else:
        raise ValueError('This session ID is invalid!')
Example No. 12
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'frequent_phrases')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])

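    # list the daily files and sort them newest-first so the slices below
    # pick up the most recent 7 and 30 days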
    files = s3.listFiles(bucket, remotePath)
    sorted_files = sorted(files,
                          key=lambda file: file['LastModified'],
                          reverse=True)

    hashtags = ["COVID19", "coronavirus", "COVID_19"]
    date_markers = ["1day", "7days", "30days"]

    for hashtag in hashtags:
        indices_row = []
        counts_row = []
        legends_row = []
        for date_marker in date_markers:
            if date_marker == "1day":
                today = key.split("/")[-1]
                s3.downloadToDisk(bucket, today, localPath, remotePath)
                df_today = pd.read_csv(os.path.join(localPath, today))
                indices, counts = extract_frequent_phrases(
                    df_today, hashtag, date_marker, localPath)
                legends = ["word (1day)", "bigram (1day)", "trigram(1day)"]
            elif date_marker == "7days":
                last_7_days_files = sorted_files[:7]
                last_7_days_list = []
                for file in last_7_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_7_days_list.append(
                        pd.read_csv(os.path.join(localPath, fname)))
                last_7_days_df = pd.concat(last_7_days_list,
                                           axis=0,
                                           ignore_index=True)
                indices, counts = extract_frequent_phrases(
                    last_7_days_df, hashtag, date_marker, localPath)
                legends = ["word (7days)", "bigram (7days)", "trigram(7days)"]
            elif date_marker == "30days":
                last_30_days_files = sorted_files[:30]
                last_30_days_list = []
                for file in last_30_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_30_days_list.append(
                        pd.read_csv(os.path.join(localPath, fname)))
                last_30_days_df = pd.concat(last_30_days_list,
                                            axis=0,
                                            ignore_index=True)
                indices, counts = extract_frequent_phrases(
                    last_30_days_df, hashtag, date_marker, localPath)
                legends = [
                    "word (30days)", "bigram (30days)", "trigram (30days)"
                ]
            else:
                break

            indices_row.append(indices)
            counts_row.append(counts)
            legends_row.append(legends)

        # Plot and save
        title = "Most prevalent 10 frequent words and phrases used in #" + hashtag + " tweets"
        div = plot.plot_multiple_bar_chart(indices_row, counts_row, title,
                                           legends_row)
        with open(
                os.path.join(localPath,
                             hashtag + "_extracted_frequent_phrases.html"),
                'w') as f:
            f.write(div)
        s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
                  hashtag + "_extracted_frequent_phrases.html")

    return None
Example No. 13
    def __init__(self, awsPath, localSavePath, localReadPath, remoteReadPath):

        self.localSavePath = localSavePath
        self.awsPath = awsPath

        # download remote socialmedia data into a temp folder
        # load it into csv
        filename = remoteReadPath.split('/')[-2] + '.csv'
        self.filename = filename  # save it so split function can reuse this name
        s3.downloadToDisk(filename=filename,
                          localpath=localReadPath,
                          remotepath=remoteReadPath)

        # fall back to ISO-8859-1 when the file is not valid UTF-8
        Array = []
        try:
            with open(localReadPath + filename, 'r', encoding='utf-8') as f:
                reader = csv.reader(f)
                for row in reader:
                    Array.append(row)
        except UnicodeDecodeError:
            Array = []  # discard any rows read before the decode error
            with open(localReadPath + filename, 'r',
                      encoding='ISO-8859-1') as f:
                reader = csv.reader(f)
                for row in reader:
                    Array.append(row)

        df = pandas.DataFrame(Array[1:], columns=Array[0])

        # remoteReadPath always follows format of sessionID/folderID/datasetName/
        # example: local/GraphQL/twitter-Tweet/trump/ => ['local','GraphQL', 'twitter-Tweet','trump','']
        source = remoteReadPath.split('/')[2]

        if (source == 'twitter-Tweet') and ('text' in Array[0]):
            self.corpus = list(
                set(df[df['text'] != '']['text'].dropna().astype(
                    'str').tolist()))
        elif (source == 'twitter-Stream') and ('_source.text' in Array[0]):
            self.corpus = list(
                set(df[df['_source.text'] != '']
                    ['_source.text'].dropna().astype('str').tolist()))

        # find the unique content in crimson hexagon
        elif (source == 'crimson-Hexagon') and ('contents' in Array[0]):
            self.corpus = list(
                set(df[df['contents'] != '']['contents'].dropna().astype(
                    'str').tolist()))

        # find the unique title in reddit posts
        elif (source == 'reddit-Search'
              or source == 'reddit-Post') and 'title' in Array[0]:
            self.corpus = list(
                set(df[df['title'] != '']['title'].dropna().astype(
                    'str').tolist()))
        elif source == 'reddit-Historical-Post' and '_source.title' in Array[0]:
            self.corpus = list(
                set(df[df['_source.title'] != '']
                    ['_source.title'].dropna().astype('str').tolist()))

        # find the unique body in reddit comments
        elif (source == 'reddit-Comment'
              or source == 'reddit-Historical-Comment') and 'body' in Array[0]:
            self.corpus = list(
                set(df[df['body'] != '']['body'].dropna().astype(
                    'str').tolist()))

        # TODO: switch reddit comment to elasticsearch endpoint
        # elif source == 'reddit-Historical-Comment' and '_source.body' in Array[0]:
        #     self.corpus = list(set(df[df['_source.body']!='']['_source.body'].dropna().astype('str').tolist()))

        # strip http in the corpus
        self.corpus = [re.sub(r"http\S+", "", text) for text in self.corpus]
Example No. 14
def calc_tweet_personality(sessionID, screen_name, profile_img):

    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))

    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait +
                                    ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        #remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        #remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        #remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)',
                       " ", tweet)
        #hashtags are removed by countvectorizer
        filteredTweets.append(tweet)

        word_count += len(tweet.split())

    # bail out early if no usable tweets were collected
    if len(filteredTweets) == 0:
        raise ValueError("Not enough tweets for prediction.")

    # embed the filtered tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except Exception:
        raise ValueError("Not enough tweets for prediction.")

    # predict using saved models
    # range is 0 ~ 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

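    # map the Big Five scores (0-5 scale) onto a Jungian/MBTI-style four-letter type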
    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"

    scores["jung"] = jung

    # sort the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [
            {'name': 'Openness', 'percentile': scores['O'] / 5},
            {'name': 'Conscientiousness', 'percentile': scores['C'] / 5},
            {'name': 'Extraversion', 'percentile': scores['E'] / 5},
            {'name': 'Agreeableness', 'percentile': scores['A'] / 5},
            {'name': 'Emotional range', 'percentile': scores['N'] / 5}
        ]
    }

    # save to json and upload to s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete localPath files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(
            os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath,
                                screen_name + '_twitPersonality.json'))

    return result
Example No. 15
    parsed, unknown = parser.parse_known_args()
    for arg in unknown:
        if arg.startswith("--"):
            parser.add_argument(arg, required=False)

    params = vars(parser.parse_args())

    awsPath = os.path.join(params['sessionID'], params['screen_name'])
    localPath = os.path.join('/tmp', params['sessionID'],
                             params['screen_name'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    screen_name = params['screen_name']

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # calculate brand personality
    model = MultiLabelClassificationModel('roberta',
                                          'checkpoint-17315-epoch-5',
                                          num_labels=5,
                                          args={
                                              "reprocess_input_data": True,
                                              'use_cached_eval_features': False
                                          },
                                          use_cuda=False)
    df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
    new_df = multiple_sentences(df, model)
    fname_sentences = screen_name + '_utku_personality_sentences.csv'
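The s3 helper module these examples import is not shown. A minimal sketch of the three-argument downloadToDisk/upload variant used in most of them, assuming boto3 and a bucket name taken from an environment variable (both assumptions; some examples instead pass the bucket explicitly as the first argument), could look like:

import os
import boto3

BUCKET = os.environ.get('BUCKET_NAME', 'my-example-bucket')  # assumed configuration
client = boto3.client('s3')

def downloadToDisk(filename, localpath, remotepath):
    # fetch <remotepath>/<filename> from the bucket into <localpath>/<filename>
    key = remotepath.rstrip('/') + '/' + filename
    client.download_file(BUCKET, key, os.path.join(localpath, filename))

def upload(localpath, remotepath, filename):
    # push <localpath>/<filename> back to <remotepath>/<filename> in the bucket
    key = remotepath.rstrip('/') + '/' + filename
    client.upload_file(os.path.join(localpath, filename), BUCKET, key)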