Example #1
def exp(estimators, num_episodes=500, verbose=0):
    """ conducts the whole experiments """

    fn_names, fns = list(), list()

    # Loop for all experiments on all datasets
    for data_name in DATASET_NAMES:
        if data_name == "page-blocks": data_name = "page_blocks"

        def _fn(_data_name):
            dict_bias, dict_rmse = _exp(num_episodes=num_episodes,
                                        estimators=estimators,
                                        data_name=_data_name,
                                        verbose=verbose)
            return dict_bias, dict_rmse

        fn_names.append(data_name)
        _fn = partial(_fn, _data_name=data_name)
        fns.append(_fn)

    # Run the per-dataset experiments in parallel
    results = RunInParallel(fn_names=fn_names, fns=fns)

    # Summarise the results
    df_bias, df_rmse = prep_for_visualisation(results=results,
                                              data_names=DATASET_NAMES,
                                              est_names=estimators.keys())
    summary_in_txt(df=df_bias, _metric_name="bias")
    summary_in_txt(df=df_rmse, _metric_name="rmse")
    plot_bar_chart(df=df_bias, plot_name="Bias", fig_name="./results/bias.png")
    plot_bar_chart(df=df_rmse, plot_name="RMSE", fig_name="./results/rmse.png")
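
# A minimal, hypothetical usage sketch for exp(). The estimator names and the
# constructors below are assumptions made for illustration; the code above only
# implies that `estimators` is a dict keyed by estimator name (see
# estimators.keys()).
if __name__ == "__main__":
    estimators = {
        "DM": make_direct_method(),   # hypothetical estimator constructors,
        "IPW": make_ipw(),            # assumed to be defined elsewhere
    }
    exp(estimators=estimators, num_episodes=500, verbose=1)
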
Example #2
def algorithm(df, params):
    """
    wrapper function to put each individual algorithm inside
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """

    output = {}

    # algorithm-specific code
    # construct the text preprocessing steps for sentiment analysis
    PP = Preprocess(df, params['column'])

    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot
    index = []
    counts = []
    for common in processed_most_common[:50]:  # top 50, matching the chart title
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
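
# A minimal, hypothetical usage sketch. The CSV file name, the column name
# 'text', and the 'porter'/'nltk' choices are assumptions for illustration;
# only the params keys 'column', 'process', and 'tagger' come from the code
# above.
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_csv("tweets.csv")  # hypothetical input file
    params = {'column': 'text', 'process': 'porter', 'tagger': 'nltk'}
    result = algorithm(df, params)
    print(result['most_common'][:10])
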
Example #3
def algorithm(array, params):
    """
    wrapper function to put each individual algorithm inside
    :param array: array that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """

    output = {}

    CF = Classification(array)

    output['uid'] = params['uid']

    fold_scores, text_clf = CF.classify(params['model'])
    output['accuracy'] = fold_scores
    output['pipeline'] = text_clf

    labels = text_clf.classes_
    output['metrics'] = CF.calc_metrics(labels)

    # plot
    output['div_accuracy'] = plot.plot_bar_chart(
        fold_scores[0],
        fold_scores[1],
        title='10 fold cross validation accuracy score')

    return output
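
# A minimal, hypothetical usage sketch. The text/label row layout of `array`
# and the model name 'NaiveBayes' are assumptions for illustration; only the
# params keys 'uid' and 'model' come from the code above.
if __name__ == "__main__":
    array = [
        ["great product, highly recommend", "positive"],   # hypothetical rows
        ["arrived broken and late", "negative"],
    ]
    params = {'uid': 'job-123', 'model': 'NaiveBayes'}      # hypothetical values
    result = algorithm(array, params)
    print(result['accuracy'])
    print(result['metrics'])
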
Example #4
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    filename = key.split("/")[-1]
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe
    df = pd.read_csv(os.path.join(localPath, filename))

    # extract hashtags (renamed from `hash` to avoid shadowing the builtin)
    hashtag_df = extract_hashtag(df)

    # plot bar chart (frequency chart)
    index = hashtag_df['hashtags'].values.tolist()[:10]
    counts = hashtag_df['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + filename.split(".")[0] + ')'
    div = plot.plot_bar_chart(index, counts, title)

    # save result and write back to s3
    hash_filename = filename.split(".")[0]

    hashtag_df.to_csv(os.path.join(localPath,
                                   hash_filename + "_extracted_hashtag.csv"),
                      index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag.csv")

    with open(
            os.path.join(localPath,
                         hash_filename + "_extracted_hashtag_frequency.html"),
            'w') as f:
        f.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag_frequency.html")

    return None
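
# A minimal, hypothetical local invocation of lambda_handler(). The object key
# is made up, and a real run still needs the s3/plot helper modules plus AWS
# credentials; only the Records/s3/bucket/object event layout and the bucket
# name are taken from the handler above.
if __name__ == "__main__":
    fake_event = {
        'Records': [{
            's3': {
                'bucket': {'name': 'macroscope-paho-covid'},
                'object': {'key': 'uploads/tweets.csv'},   # hypothetical key
            }
        }]
    }
    lambda_handler(fake_event, context=None)
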
def algorithm(df=None, params=None):
    """
    wrapper function to put each individual algorithm inside
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """

    output = {}

    # the user specifies which column to use; each row is a sentence, so collect the sentences as a list
    column = params['column']
    sentences = df[df[column] != ''][column].dropna().astype('str').tolist()

    entity_list = []
    entity_freq = {}
    entity_category = {}

    # extract entities in each sentence
    ner = TwitterNER()
    for sentence in sentences:
        tokens = tokenizeRawTweetText(sentence)
        raw_entities = ner.get_entities(tokens)

        entities = []
        for entry in raw_entities:
            # record entities
            entity = " ".join(tokens[entry[0]:entry[1]])
            category = entry[2]
            entities.append((entity, category))

            # record entity frequency
            entity_freq[entity] = entity_freq.get(entity, 0) + 1

            # record category frequency
            entity_category[category] = entity_category.get(category, 0) + 1

        entity_list.append(entities)

    # entities extracted from each sentence
    output['entity'] = entity_list

    # entity frequencies, plus a bar chart of the 30 most frequent entities
    output['freq'] = entity_freq

    top_entities = sorted(entity_freq.items(),
                          key=lambda item: item[1], reverse=True)[:30]
    output['div_freq'] = plot.plot_bar_chart(
        [entity for entity, _ in top_entities],
        [freq for _, freq in top_entities],
        "Top 30 Most Frequent Named Entities")

    # plot pie chart of entity categories
    output['div_category'] = plot.plot_pie_chart(
        list(entity_category.keys()), list(entity_category.values()),
        "Name Entity Category Breakdowns")

    return output
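
# A minimal, hypothetical usage sketch for the NER wrapper. The dataframe
# contents and the column name 'text' are assumptions for illustration; only
# the params key 'column' comes from the code above, and TwitterNER /
# tokenizeRawTweetText must already be importable.
if __name__ == "__main__":
    import pandas as pd

    df = pd.DataFrame({'text': ["WHO issues new travel guidance",
                                "Chicago reports fewer cases this week"]})
    result = algorithm(df=df, params={'column': 'text'})
    print(result['freq'])
    print(result['entity'][0])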