def exp(estimators, num_episodes=500, verbose=0):
    """Conduct the full set of experiments across all datasets."""
    fn_names, fns = list(), list()

    # Loop over all datasets and build one experiment callable per dataset
    for data_name in DATASET_NAMES:
        if data_name == "page-blocks":
            data_name = "page_blocks"

        def _fn(_data_name):
            dict_bias, dict_rmse = _exp(num_episodes=num_episodes,
                                        estimators=estimators,
                                        data_name=_data_name,
                                        verbose=verbose)
            return dict_bias, dict_rmse

        fn_names.append(data_name)
        # Bind the current dataset name now so the closure is not shared across iterations
        _fn = partial(_fn, _data_name=data_name)
        fns.append(_fn)

    # Run the estimators in parallel
    results = RunInParallel(fn_names=fn_names, fns=fns)

    # Summarise the results
    df_bias, df_rmse = prep_for_visualisation(results=results,
                                              data_names=DATASET_NAMES,
                                              est_names=estimators.keys())
    summary_in_txt(df=df_bias, _metric_name="bias")
    summary_in_txt(df=df_rmse, _metric_name="rmse")
    plot_bar_chart(df=df_bias, plot_name="Bias", fig_name="./results/bias.png")
    plot_bar_chart(df=df_rmse, plot_name="RMSE", fig_name="./results/rmse.png")
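# Minimal usage sketch for exp(). `estimators` is assumed to be a mapping from
# estimator name to estimator object, since exp() only uses estimators.keys()
# here and forwards the mapping to _exp(). The constructors below are
# hypothetical placeholders, not names defined in this module.
if __name__ == "__main__":
    estimators = {
        "DM": make_direct_method(),    # hypothetical estimator constructors
        "IPW": make_ipw(),
        "DR": make_doubly_robust(),
    }
    # Runs every dataset in DATASET_NAMES in parallel and writes
    # ./results/bias.png, ./results/rmse.png and the text summaries.
    exp(estimators=estimators, num_episodes=500, verbose=1)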
def algorithm(df, params):
    """
    Wrapper function that hosts an individual algorithm.

    :param df: dataframe that contains the entire input dataset
    :param params: algorithm-specific parameters
    :return: a dictionary of { output name: output content in memory }
    """
    output = {}

    # algorithm-specific code: sentiment-analysis preprocessing
    PP = Preprocess(df, params['column'])
    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot a bar chart of frequent words (skip the first item, take the next 50)
    index = []
    counts = []
    for common in processed_most_common[1:51]:
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
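# Minimal usage sketch for the sentiment-preprocessing wrapper. The column
# name "text" and the "process"/"tagger" values below are illustrative
# assumptions, not values confirmed elsewhere in this code.
import pandas as pd

df = pd.DataFrame({"text": ["I love this product", "This is terrible"]})
params = {
    "column": "text",       # dataframe column holding the sentences
    "process": "stemming",  # option forwarded to Preprocess.stem_lematize
    "tagger": "posTag",     # option forwarded to Preprocess.tagging
}
result = algorithm(df, params)
# result['most_common'] holds the processed word frequencies and
# result['div'] holds the HTML of the frequency bar chart.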
def algorithm(array, params):
    """
    Wrapper function that hosts an individual algorithm.

    :param array: array that contains the entire input dataset
    :param params: algorithm-specific parameters
    :return: a dictionary of { output name: output content in memory }
    """
    output = {}

    CF = Classification(array)
    output['uid'] = params['uid']

    # train the requested model and keep the per-fold scores and fitted pipeline
    fold_scores, text_clf = CF.classify(params['model'])
    output['accuracy'] = fold_scores
    output['pipeline'] = text_clf

    labels = text_clf.classes_
    output['metrics'] = CF.calc_metrics(labels)

    # plot the cross-validation accuracy scores
    output['div_accuracy'] = plot.plot_bar_chart(
        fold_scores[0], fold_scores[1],
        title='10 fold cross validation accuracy score')

    return output
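# Minimal usage sketch for the classification wrapper. The array layout
# (label column followed by text column) and the "NaiveBayes" model key are
# assumptions for illustration only; the real values depend on the
# Classification class.
array = [
    ["positive", "I love this product"],
    ["negative", "This is terrible"],
]
params = {"uid": "job-001", "model": "NaiveBayes"}
result = algorithm(array, params)
# result['accuracy'] holds the per-fold scores, result['pipeline'] the fitted
# classifier, and result['div_accuracy'] the HTML of the accuracy bar chart.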
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    filename = key.split("/")[-1]
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe
    df = pd.read_csv(os.path.join(localPath, filename))

    # extract hashtag
    hash = extract_hashtag(df)

    # plot bar chart (frequency chart)
    index = hash['hashtags'].values.tolist()[:10]
    counts = hash['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + filename.split(".")[0] + ')'
    div = plot.plot_bar_chart(index, counts, title)

    # save result and write back to s3
    hash_filename = filename.split(".")[0]
    hash.to_csv(os.path.join(localPath, hash_filename + "_extracted_hashtag.csv"),
                index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag.csv")

    with open(os.path.join(localPath,
                           hash_filename + "_extracted_hashtag_frequency.html"),
              'w') as f:
        f.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag_frequency.html")

    return None
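# Minimal local-invocation sketch for lambda_handler. The event below mirrors
# the standard S3 put-event shape the handler reads; the bucket name and
# object key are illustrative assumptions.
fake_event = {
    "Records": [{
        "s3": {
            "bucket": {"name": "macroscope-paho-covid"},
            "object": {"key": "uploads/tweets.csv"},
        }
    }]
}
lambda_handler(fake_event, context=None)
# Writes <name>_extracted_hashtag.csv and
# <name>_extracted_hashtag_frequency.html to /tmp/hashtag and uploads both
# under the "hashtags" prefix of the target bucket.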
def algorithm(df=None, params=None):
    """
    Wrapper function that hosts an individual algorithm.

    :param df: dataframe that contains the entire input dataset
    :param params: algorithm-specific parameters
    :return: a dictionary of { output name: output content in memory }
    """
    output = {}

    # the user specifies which column to use; each row is a sentence,
    # so collect the rows of that column as a list of sentences
    column = params['column']
    sentences = df[df[column] != ''][column].dropna().astype('str').tolist()

    entity_list = []
    entity_freq = {}
    entity_category = {}

    # extract entities from each sentence
    ner = TwitterNER()
    for sentence in sentences:
        tokens = tokenizeRawTweetText(sentence)
        raw_entities = ner.get_entities(tokens)

        entities = []
        for entry in raw_entities:
            # record the entity span and its category
            entity = " ".join(tokens[entry[0]:entry[1]])
            category = entry[2]
            entities.append((entity, category))

            # record entity frequency
            if entity not in entity_freq:
                entity_freq[entity] = 1
            else:
                entity_freq[entity] += 1

            # record category frequency
            if category not in entity_category:
                entity_category[category] = 1
            else:
                entity_category[category] += 1

        entity_list.append(entities)

    # entities extracted per sentence
    output['entity'] = entity_list

    # plot bar chart of the most frequent entities
    # (sort by count so the top-30 slice really is the most frequent)
    output['freq'] = entity_freq
    top_entities = sorted(entity_freq.items(), key=lambda kv: kv[1], reverse=True)[:30]
    output['div_freq'] = plot.plot_bar_chart(
        [entity for entity, _ in top_entities],
        [count for _, count in top_entities],
        "Top 30 Most Frequent Named Entities")

    # plot pie chart of entity categories
    output['div_category'] = plot.plot_pie_chart(
        list(entity_category.keys()),
        list(entity_category.values()),
        "Named Entity Category Breakdown")

    return output
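# Minimal usage sketch for the named-entity wrapper. The column name "text"
# and the sample sentences are illustrative assumptions only.
import pandas as pd

df = pd.DataFrame({"text": [
    "Barack Obama visited Chicago today",
    "Apple announced the new iPhone in California",
]})
result = algorithm(df=df, params={"column": "text"})
# result['entity'] lists (entity, category) pairs per sentence,
# result['freq'] maps each entity to its count, and result['div_freq'] /
# result['div_category'] hold the HTML of the frequency and category charts.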