def get_processed_data(ids=None, lang='en'):
    '''
    Loads the preprocessed authors stored as JSON files in
    PREPROCESSED_DATA_PATH.

    Parameters:
        ids (list): List of str ids to acquire. An id is matched against
            the file name up to its first "." (the same text used as the
            key of the returned dict). A single str is treated as a
            one-element list. If None, every author file is loaded.
        lang (str): Accepted for signature compatibility with
            get_raw_data. It is currently NOT used: every file in
            PREPROCESSED_DATA_PATH is considered regardless of language.

    Returns:
        data (dict): Maps the file-name stem of each loaded ".json" file
            to the object decoded from it by jsonpickle.
    '''
    go_to_project_root()

    if ids is None:
        wanted = None
    elif isinstance(ids, str):
        # Iterating a bare string would yield single characters.
        wanted = {ids}
    else:
        wanted = {str(author_id) for author_id in ids}

    authors = {}
    # listdir returns entries in arbitrary order; sorting makes the order of
    # the returned dict identical on every machine.
    for file_name in sorted(listdir(PREPROCESSED_DATA_PATH)):
        path = join(PREPROCESSED_DATA_PATH, file_name)
        if not isfile(path) or file_name.split(".")[-1] != "json":
            continue

        author_id = file_name.split(".")[0]
        if wanted is not None and author_id not in wanted:
            continue

        # The context manager closes the file even if decoding fails.
        with open(path, "r") as handle:
            payload = handle.read()

        # NOTE(review): jsonpickle can instantiate arbitrary classes while
        # decoding, so this must only ever be pointed at trusted files.
        authors[author_id] = jsonpickle.decode(payload, classes=Author)

    return authors
def get_raw_data(ids=None, lang='en'):
    '''
    Fetches the raw author data for one language.

    The project root is made the working directory first, then the
    loading is delegated to __import_from__ using the directory
    RAW_DATA_PATH + lang.

    Parameters:
        ids (list): Forwarded unchanged to __import_from__. None by
            default.
        lang (str): The return language, appended to RAW_DATA_PATH.
            Valid options: 'en', 'es'.

    Returns:
        data: Whatever __import_from__ returns for that directory
            (presumably a dict of raw author information - confirm
            against __import_from__).
    '''
    go_to_project_root()
    language_dir = RAW_DATA_PATH + lang
    raw_authors = __import_from__(language_dir, ids)
    return raw_authors
# Embedding width assumed only when there are no authors to measure it from.
_DEFAULT_EMBEDDING_WIDTH = 300

# (attribute read from the author object, output column name), in column order.
_SCALAR_FEATURES = (
    # Lexical features
    ("readability", "readability"),
    ("TTR", "TTR"),
    # Semantic similarity
    ("max_similar", "max_similar"),
    ("min_similar", "min_similar"),
    ("mean_similar", "mean_similar"),
    ("number_identical", "number_identical"),
    # NER / clustering
    ("most_common_ner_score", "mcts_ner"),
    ("most_common_adj_score", "mcts_adj"),
)

# Keys of author.nonlinguistic_features; each key is also the column name.
_NONLINGUISTIC_KEYS = (
    'url_max', 'url_mean',
    'hashtag_max', 'hashtag_mean',
    'user_max', 'user_mean',
    'emoji_mean', 'emoji_max',
    'exclamation_mean', 'exclamation_max',
    'period_mean', 'period_max',
    'question_mean', 'question_max',
    'comma_mean', 'comma_max',
    'allcaps_ratio', 'allcaps_inclusion_ratio', 'titlecase_ratio',
    'mean_words', 'retweet_percentage',
)

# Column names for POS tags; the value is read from POS_counts["<tag>_mean"].
_POS_TAGS = (
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN',
    'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
)

# Keys of author.emotion; each key is also the column name.
_EMOTION_KEYS = (
    "anger", "fear", "anticipation", "trust", "surprise",
    "sadness", "joy", "disgust", "positive", "negative",
)


def _leading_columns():
    ''' Returns the column names that precede the embedding columns. '''
    return (
        ["author_id"]
        + [column for _, column in _SCALAR_FEATURES]
        + list(_NONLINGUISTIC_KEYS)
        + list(_POS_TAGS)
        + list(_EMOTION_KEYS)
    )


def _author_row(a):
    ''' Flattens one author into a list of cell values in column order. '''
    return [
        a.author_id,
        *(getattr(a, attribute) for attribute, _ in _SCALAR_FEATURES),
        *(a.nonlinguistic_features[key] for key in _NONLINGUISTIC_KEYS),
        *(a.POS_counts[f"{tag}_mean"] for tag in _POS_TAGS),
        *(a.emotion[key] for key in _EMOTION_KEYS),
        *a.embeddings,
        a.truth,
    ]


def convert_to_df(authors, export=False):
    '''
    Converts the given authors to a pandas DataFrame with one row per
    author.

    Columns, in order: "author_id", the lexical / semantic / clustering
    scores, the non-linguistic features, one column per POS tag, one
    column per emotion, "emb_0" ... "emb_<n-1>" for the embedding vector,
    and finally "truth".

    The number of embedding columns is taken from the data, so authors
    with embeddings of any (common) length are supported. With no authors
    the layout falls back to 300 embedding columns.

    As before, every cell is stored as text because the table is a NumPy
    string array; cast the columns you need to a numeric dtype. The array
    is now sized to its longest value, so values longer than 32
    characters are no longer cut off.

    Parameters:
        authors (Author dict): The authors to be converted. Only the
            dict values are used.
        export (Boolean): Whether the function will export to file.
            False by default

    Returns:
        pandas DataFrame

    Raises:
        ValueError: If the authors' embeddings do not all have the same
            length.
    '''
    leading = _leading_columns()
    rows = [_author_row(a) for a in authors.values()]

    if rows:
        row_length = len(rows[0])
        if any(len(row) != row_length for row in rows):
            raise ValueError(
                "all authors must have embeddings of the same length")
        # Everything in a row that is not a leading column or the trailing
        # "truth" cell came from the embedding vector.
        embedding_width = row_length - len(leading) - 1
        table = np.array(rows, dtype=str)
    else:
        embedding_width = _DEFAULT_EMBEDDING_WIDTH
        table = np.empty(
            (0, len(leading) + embedding_width + 1), dtype='<U32')

    columns = (
        leading
        + [f"emb_{i}" for i in range(embedding_width)]
        + ["truth"]
    )
    df = pd.DataFrame(table, columns=columns)

    # Enable to export to file
    if export:
        go_to_project_root()
        # NOTE(review): `filepath` is not defined in this function; confirm
        # that a module-level `filepath` exists and that it points at the
        # file get_csv() reads back.
        df.to_csv(filepath + "processed.csv")

    return df
def get_csv():
    '''
    Loads the processed author table from the default csv location.

    The project root is made the working directory first, then the file
    at CSV_DATA_PATH is read with its first column used as the index.

    Returns:
        pandas DataFrame
    '''
    go_to_project_root()
    processed = pandas.read_csv(CSV_DATA_PATH, index_col=0)
    return processed
for x in xtest: majority_vote_preds += [np.sum(classifier.predict(x).astype(int)) > 1] return majority_vote_preds # Hardcode all the different feature sets feature_sets = { "lexical": [0, 1], "semantic": [2, 3, 4, 5], "clusters": [6, 7], "nonling": list(range(8, 30)), "pos": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], "emotion": [47, 48, 49, 50, 51, 52, 53, 54, 55, 56], "embeddings": list(range(57, 357)) } go_to_project_root() data_root = "data/processed/800/" datasets = [read_data(f"K{k+1}/") for k in range(3)] mean = 0 for i in range(3): xtrain, ytrain, xtest, ytest = datasets[i] c = GradientBoostingClassifier() pred = classify(c, xtrain, ytrain, xtest, ytest) acc = balanced_accuracy_score(ytest, pred) mean += acc print(f"Accuracy: {mean / 3}") ## There is a known bug here- we're getting ~0.5 accuracy on some computers