def create_clusters(wdir, freq_table_df, methods=["KMeans"], min_MFF=0, max_MFFs=[5000], text_representations=["rel-zscores"], ns_clusters=[2], sampling_times=10):
    i = 0
    # Two dictionaries for the results are initialized empty
    clustering_results_dict = {}
    parameters_results_dict = {}
    # Iterate over representations or transformations of the data
    for text_representation in text_representations:
        document_data_model_df = text2features.choose_features(freq_table_df, text_representation)
        # Iterate over amounts of MFFs
        for MFW in max_MFFs:
            print(MFW)
            document_data_model_cut_df = load_data.cut_corpus(document_data_model_df, min_MFF=min_MFF, max_MFF=MFW)
            print(document_data_model_cut_df.head())
            # Iterate over clustering algorithms
            for method in methods:
                print(method)
                # Only some algorithms need the number of clusters defined in advance
                if method not in ["KMeans", "SpectralClustering", "AgglomerativeClustering"]:
                    print(method)
                    actual_ns_clusters = ["-"]
                else:
                    actual_ns_clusters = ns_clusters
                # Iterate over the numbers of clusters (only relevant for the algorithms that must be
                # initialized with a number of subclusters; the rest decide the number themselves)
                for n_clusters in actual_ns_clusters:
                    print(n_clusters)
                    # Iterate over sampling times
                    for j in range(sampling_times):
                        try:
                            # Make labels and take the real number of subclusters (kept in its own
                            # variable so that the loop variable n_clusters is not overwritten)
                            labels_lt = choose_cluster_algorithm(method, n_clusters=n_clusters).fit(document_data_model_cut_df).labels_
                            real_n_clusters = len(set(labels_lt))
                            clustering_results_dict["cluster_" + str(i)] = labels_lt
                            parameters_results_dict["cluster_" + str(i)] = [text_representation, MFW, method, real_n_clusters, j]
                        except Exception:
                            print("problem with ", text_representation, method, n_clusters)
                        i += 1
    clustering_results_df = pd.DataFrame.from_dict(clustering_results_dict)
    parameters_results_df = pd.DataFrame.from_dict(parameters_results_dict)
    print(clustering_results_df.shape)
    print(freq_table_df.shape)
    clustering_results_df.index = freq_table_df.index
    return clustering_results_df, parameters_results_df
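
# Usage sketch for create_clusters (illustrative only): the path and file name are
# hypothetical, and the module-level imports (pandas as pd, plus the local text2features
# and load_data modules) are assumed to be available.
# freq_table_df = pd.read_csv("/path/to/wdir/freq_table.csv", sep="\t", index_col=0)
# clusters_df, params_df = create_clusters("/path/to/wdir/", freq_table_df,
#                                          methods=["KMeans"], max_MFFs=[1000, 5000],
#                                          ns_clusters=[2, 5], sampling_times=5)
# Each column "cluster_<i>" of clusters_df holds one labeling per parameter combination
# and sampling run; the same key in params_df records the parameters that produced it.
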
def get_coef(wdir, wsdir="corpus/", freq_table=[], metadata="metadata.csv", sep="\t", class_="class", verbose=True, method="SVC", max_MFF=5000, text_representation="zscores", problematic_class_values=["n.av."], minimal_value_samples=2, make_relative=True, under_sample_method="None", maximum_cases=5000, sampling_times=1):
    if (type(freq_table) == list) & (type(metadata) == str):
        cut_raw_features, metadata = load_data.load_corpus_metadata(wdir, wsdir, sep, verbose, 0, max_MFF, freq_table, metadata)
    else:
        cut_raw_features = freq_table
    if make_relative == True:
        cut_raw_features = text2features.calculate_relative_frequencies(cut_raw_features)
    filtered_raw_features, labels = cull_data.cull_data(cut_raw_features, metadata, class_, verbose, problematic_class_values=problematic_class_values, minimal_value_samples=minimal_value_samples)
    document_data_model_cut = load_data.cut_corpus(filtered_raw_features, min_MFF=0, max_MFF=max_MFF)
    document_data_model = text2features.choose_features(document_data_model_cut, text_representation)
    coef_df = pd.DataFrame(columns=document_data_model.columns.tolist())
    intercept_lt = []
    print("The ten first MFWs: ", document_data_model.columns.tolist()[0:10])
    print("The ten last MFWs: ", document_data_model.columns.tolist()[-10:])
    # Sampling loop
    for sampling_i in range(sampling_times):
        sampled_labels, sampled_document_data_model = classify.under_sample(labels, document_data_model, under_sample_method, maximum_cases)
        classifier = classify.choose_classifier(method=method)
        model = classifier.fit(sampled_document_data_model, sampled_labels)
        print(model.coef_.shape)
        sampled_coef_df = pd.DataFrame(data=model.coef_.tolist(), columns=sampled_document_data_model.columns.tolist())
        print(model.intercept_)
        intercept_lt.append(float(model.intercept_))
        coef_df = pd.concat([coef_df, sampled_coef_df])
    # Sort the columns by their mean coefficient (reindex_axis was removed from pandas)
    coef_df = coef_df.reindex(coef_df.mean().sort_values().index, axis=1)
    print(coef_df.shape)
    print(intercept_lt)
    return coef_df, intercept_lt
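
# Usage sketch for get_coef (illustrative only): the path and the class name "subgenre"
# are hypothetical; with the defaults, the corpus and metadata.csv are loaded from
# wdir + wsdir. For a linear classifier such as method="SVC", coef_df holds one row of
# feature weights per sampling run, with columns sorted by mean coefficient, and
# intercept_lt the matching intercepts.
# coef_df, intercept_lt = get_coef("/path/to/wdir/", class_="subgenre", method="SVC",
#                                  max_MFF=2000, text_representation="zscores",
#                                  sampling_times=5)
# print(coef_df.mean().head(10))
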
def regressing(wdir, features, outputs, classes, methods_lt, max_MFFs, text_representations, make_relative=False, cv=10):
    results_lt = []
    if features.shape[0] != outputs.shape[0]:
        print("Features and outputs do not have the same number of rows!")
        return
    print("features ", features.head())
    if make_relative == True:
        features = text2features.calculate_relative_frequencies(features)
    for class_ in classes:
        print("\n\nanalysed class:\t", class_)
        for text_representation in text_representations:
            transformed_features = text2features.choose_features(features, text_representation)
            for MFW in max_MFFs:
                print("MFW", MFW)
                transformed_features_cut = load_data.cut_corpus(transformed_features, min_MFF=0, max_MFF=MFW)
                for method_st in methods_lt:
                    try:
                        regression_algorithm = choose_regression_algorithm(method=method_st)
                        # Use the cv parameter instead of a hard-coded value
                        results_dc = cross_validate(regression_algorithm, transformed_features_cut, outputs[class_], cv=cv)
                        mean_results_fl = results_dc["test_score"].mean().round(3)
                        print(mean_results_fl)
                        results_lt.append([class_, text_representation, MFW, method_st, mean_results_fl, "R2"])
                    except Exception:
                        print("problems with ", method_st)
    results_df = pd.DataFrame(results_lt, columns=["class", "text_representation", "MFW", "method", "mean_results", "scoring"])
    results_df.sort_values(by="mean_results", ascending=False, inplace=True)
    return results_df
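
# Usage sketch for regressing (illustrative only): features and outputs are assumed to be
# dataframes sharing the same index, "year" is a hypothetical numeric column of outputs,
# and the accepted method strings depend on choose_regression_algorithm, defined elsewhere
# in this module. Scores come from sklearn's cross_validate, so mean_results is the mean
# R² over the folds.
# results_df = regressing("/path/to/wdir/", features, outputs, classes=["year"],
#                         methods_lt=["SVR"], max_MFFs=[1000, 3000],
#                         text_representations=["zscores"], cv=10)
# print(results_df.head())
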
def evaluate_cluster(wdir, freq_table_df, metadata_df, ground_truths=["author.name", "decade", "subgenre.cligs.important"], methods=["KMeans"], min_MFF=0, max_MFFs=[5000], text_representations=["rel-zscores"], ns_clusters=[30], under_sample_method="author.name", sampling_times=10, method_evaluation="ARI"):
    # A list for the results is initialized empty
    results_lt = []
    # Iterate over representations or transformations of the data
    for text_representation in text_representations:
        document_data_model_df = text2features.choose_features(freq_table_df, text_representation)
        # Iterate over amounts of MFFs
        for MFW in max_MFFs:
            print(MFW)
            document_data_model_cut_df = load_data.cut_corpus(document_data_model_df, min_MFF=min_MFF, max_MFF=MFW)
            # Slicing never raises, even on very short column lists, so no try/except is needed here
            print("first columns ", document_data_model_cut_df.columns.tolist()[0:5])
            print("last columns ", document_data_model_cut_df.columns.tolist()[-5:])
            # Iterate over clustering algorithms
            for method in methods:
                print(method)
                # Only some algorithms need the number of clusters defined in advance
                if method not in ["KMeans", "SpectralClustering", "AgglomerativeClustering"]:
                    print(method)
                    actual_ns_clusters = ["-"]
                else:
                    actual_ns_clusters = ns_clusters
                # Iterate over the numbers of clusters (only relevant for the algorithms that must be
                # initialized with a number of subclusters; the rest decide the number themselves)
                for n_clusters in actual_ns_clusters:
                    print(n_clusters)
                    # Iterate over sampling times
                    for i in range(sampling_times):
                        # Possibility of undersampling, taking only one text per author (or any other class)
                        if under_sample_method in ["author.name", "authorial"]:
                            sampled_data_df, sampled_metadata_df = sample_unique_text_by_class(document_data_model_cut_df, metadata_df, class_="author.name")
                        else:
                            sampled_data_df, sampled_metadata_df = document_data_model_cut_df, metadata_df
                        try:
                            # Make labels and take the real number of subclusters (kept in its own
                            # variable so that the loop variable n_clusters is not overwritten)
                            labels = choose_cluster_algorithm(method, n_clusters=n_clusters).fit(sampled_data_df).labels_
                            real_n_clusters = len(set(labels))
                            # Evaluate against each ground truth
                            for ground_truth in ground_truths:
                                evaluation = evalute_clustering(sampled_metadata_df[ground_truth], labels, method=method_evaluation)
                                # Add everything to the list of results
                                results_lt.append([ground_truth, evaluation, text_representation, method, real_n_clusters, MFW, method_evaluation, sampled_data_df.shape[0]])
                        except Exception:
                            print("problem with ", text_representation, method, ground_truths, n_clusters)
    # Convert the list into a dataframe, shuffle, and sort by the evaluation score
    results_df = pd.DataFrame(results_lt, columns=["ground_truth", "evaluation", "text_representation", "method", "n_clusters", "MFW", "method_evaluation", "sample_size"])
    results_df = results_df.sample(frac=1).sort_values(by=["evaluation"], ascending=[False])
    # Save the results; fall back to a timestamp if the parameter-based name gets too long
    results_file = "results_" + "-".join(ground_truths) + "_" + "-".join(methods) + "_" + "-".join(str(x) for x in max_MFFs) + "_" + "-".join(text_representations)
    if len(results_file) > 100:
        results_file = "results_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    print(results_file)
    results_df.to_csv(wdir + results_file + ".csv", sep="\t")
    return results_df
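
# Usage sketch for evaluate_cluster (illustrative only): the path is hypothetical, and
# metadata_df is assumed to share its index with freq_table_df and to contain the
# ground-truth columns. With method_evaluation="ARI", the evaluation column holds the
# Adjusted Rand Index of each clustering against each ground truth (higher is better),
# and results_df comes back sorted by it and saved as a TSV in wdir.
# results_df = evaluate_cluster("/path/to/wdir/", freq_table_df, metadata_df,
#                               ground_truths=["author.name"], methods=["KMeans"],
#                               max_MFFs=[2000], ns_clusters=[10], sampling_times=5)
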
def predict(wdir, entire_raw_features, metadata, class_="class", predict_class_values=["?"], verbose=True, method="SVC", min_MFF=0, max_MFF=5000, text_representation="relative", make_relative=True, iterations=1, do_scores=False, type_classes="binary"):
    if make_relative == True:
        entire_raw_features = text2features.calculate_relative_frequencies(entire_raw_features)
    entire_raw_features = load_data.cut_corpus(entire_raw_features, min_MFF=min_MFF, max_MFF=max_MFF)
    print(entire_raw_features.columns.tolist()[0:10])
    # The original conditional expression only printed in the positive case; an if/else reports both
    if entire_raw_features.index.tolist() == metadata.index.tolist():
        print("corpus and metadata are coherent")
    else:
        print("corpus and metadata are NOT coherent")
    train_class_values = [set_label for set_label in set(metadata[class_]) if set_label not in predict_class_values]
    print("train classes", train_class_values)
    smallest_class = Counter(metadata.loc[metadata[class_].isin(train_class_values)][class_]).most_common()[-1]
    print("smallest class", smallest_class)
    document_data_model = text2features.choose_features(entire_raw_features, text_representation)
    metadata_predict = metadata.loc[metadata[class_].isin(predict_class_values)].copy()
    metadata_predict_iterations = pd.DataFrame(index=metadata_predict.index, columns=[i for i in range(iterations)])
    if type_classes == "binary":
        metadata_predict["sum_prediction_" + class_] = 0
    document_data_model_predict = document_data_model.loc[metadata_predict.index.tolist()]
    print("metadata and data to predict coherent?", metadata_predict.index.tolist() == document_data_model_predict.index.tolist())
    for i in range(iterations):
        # Balance the training sample: keep all texts of the larger classes, draw a random
        # sample of the smallest class, and shuffle
        metadata_sample = pd.concat([
            metadata.loc[(~metadata[class_].isin(predict_class_values)) & (metadata[class_] != smallest_class[0])],
            metadata.loc[metadata[class_] == smallest_class[0]].sample(n=smallest_class[1])
        ]).sample(frac=1)
        document_data_model_sample = document_data_model.loc[metadata_sample.index.tolist()]
        if metadata_sample.index.tolist() == document_data_model_sample.index.tolist():
            print("metadata and texts coherent")
        else:
            print("metadata and corpus are not coherent")
        print("metadata's shape", metadata_sample.shape)
        classifier = choose_classifier(method=method)
        classifier.fit(document_data_model_sample, metadata_sample[class_].astype(str))
        if do_scores == True:
            scores = classify_cross(document_data_model_sample, metadata_sample[class_].astype(str), classifier, cv=10, scoring="f1")
            print("scores", scores)
        print(document_data_model_predict.index.tolist())
        results = classifier.predict(document_data_model_predict)
        print(i, metadata_sample.index[0:3], results)
        metadata_predict_iterations[i] = results
        if type_classes == "binary":
            metadata_predict["sum_prediction_" + class_] = np.array(results).astype(int) + metadata_predict["sum_prediction_" + class_]
            metadata_predict_iterations[i] = metadata_predict_iterations[i].astype(int)
    if type_classes == "binary":
        metadata_predict["sum_prediction_" + class_] = metadata_predict["sum_prediction_" + class_] / iterations
        print(metadata_predict["sum_prediction_" + class_])
    return metadata_predict, results, metadata_predict_iterations
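
# Usage sketch for predict (illustrative only): the path and the column name
# "protagonist.gender" are hypothetical. It assumes a metadata dataframe whose class_
# column marks the unlabeled texts with "?" (see predict_class_values) and a frequency
# table sharing the same index. With type_classes="binary" and classes coded as "0"/"1",
# sum_prediction_<class_> ends up as the share of iterations that voted "1".
# metadata_predict, last_results, per_iteration = predict(
#     "/path/to/wdir/", entire_raw_features, metadata,
#     class_="protagonist.gender", predict_class_values=["?"], iterations=10)
# print(metadata_predict["sum_prediction_protagonist.gender"])
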
def classify(wdir, wsdir="corpus/", freq_table=[], metadata="metadata.csv", sep="\t", classes=["class"], verbose=True, methods=["SVC"], min_MFF=0, max_MFFs=[5000], text_representations=["zscores"], typographies=[True], sampling_mode="cross", problematic_class_values=["n.av.", "other", "mixed", "?", "unknown", "none", "second-person"], minimal_value_samples=2, make_relative=True, under_sample_method="None", maximum_cases=5000, sampling_times=1, outdir_results="", sort_by="median"):
    """
    * wdir
    * wsdir = "corpus/"
    * freq_table = []
    * metadata = "metadata.csv"
    * sep = "\t"
    * classes = ["class"]
    * verbose = True
    * methods = ["SVC"]
    * min_MFF = 0
    * max_MFFs = [5000]
    * text_representations = ["zscores"]
    * typographies = [True, False]
    * sampling_mode = "cross"
    * problematic_class_values = ["n.av.", "other", "mixed", "?", "unknown", "none", "second-person"]
    * minimal_value_samples = 2
    * make_relative = True
    * under_sample_method = "None"
    * maximum_cases = 5000
    * sampling_times = 1
    * outdir_results = ""
    * sort_by = "median"
    """
    cut_raw_features = freq_table
    print("cut_raw_features ", cut_raw_features.head())
    print("in classify, cut_raw_features, ", cut_raw_features.shape)
    if make_relative == True:
        cut_raw_features = text2features.calculate_relative_frequencies(cut_raw_features)
        print("cut_raw_features after relative normalization", cut_raw_features.head())
    results = []
    for class_ in classes:
        print("\n\nanalysed class:\t", class_)
        # This step deletes classes that are too small
        filtered_raw_features, labels = cull_data.cull_data(cut_raw_features, metadata, class_, verbose, problematic_class_values=problematic_class_values, minimal_value_samples=minimal_value_samples)
        print("size after culling data:", filtered_raw_features.shape, labels.shape)
        for typography in typographies:
            filtered_raw_features_typo = cull_data.cull_typography(filtered_raw_features, keep_typography=typography)
            print("typography ", typography)
            for text_representation in text_representations:
                # The corpus is modeled somehow (raw, relative frequencies, tf-idf, z-scores...)
                document_data_model = text2features.choose_features(filtered_raw_features_typo, text_representation)
                if verbose == True:
                    print(document_data_model.shape)
                for MFW in max_MFFs:
                    print("MFW", MFW)
                    document_data_model_cut = load_data.cut_corpus(document_data_model, min_MFF=min_MFF, max_MFF=MFW, sort_by=sort_by)
                    print("The three first MFWs: ", document_data_model_cut.columns.tolist()[0:3])
                    print("The three last MFWs: ", document_data_model_cut.columns.tolist()[-3:])
                    if len(set(labels.values.tolist())) < 2:
                        print("After culling, the class", class_, "can't be divided into two groups. This category is going to be ignored")
                    else:
                        for method in methods:
                            classifier = choose_classifier(method=method)
                            f1s_over_sampling = np.array([])
                            scores_over_sampling_df = pd.DataFrame(columns=["f1", "rec", "prec"])
                            for sampling_i in range(sampling_times):
                                print(labels.shape)
                                print(document_data_model_cut.shape)
                                sampled_labels, sampled_document_data_model_cut = sampling.under_sample(labels, document_data_model_cut, under_sample_method, maximum_cases)
                                baseline = cull_data.calculate_baseline(sampled_labels)
                                least_frequent_class_value = Counter(sampled_labels).most_common()[-1][1]
                                if sampling_mode == "standard":
                                    print("standard sampling, bug coming!")
                                    results = standard_classification(wdir, least_frequent_class_value, document_data_model_cut, sampled_labels, verbose, classifier, class_)
                                    return results
                                elif sampling_mode == "cross":
                                    cv = cull_data.calculate_cv(least_frequent_class_value)
                                    print("cross validation sampling of ", class_)
                                    scores_df = classify_cross(sampled_document_data_model_cut, sampled_labels, classifier, cv=cv)
                                    f1s_over_sampling = np.append(f1s_over_sampling, scores_df["f1"])
                                    # Accumulate the scores of every sampling iteration (the previous
                                    # version overwrote the accumulator with the last scores and their mean)
                                    scores_over_sampling_df = pd.concat([scores_over_sampling_df, scores_df], axis=0)
                            test_result_param, test_result_pvalue = test_ttest_cross_results_baseline(f1s_over_sampling, baseline)
                            # I think this part should be moved further down into the loops
                            print("Class: \t", class_)
                            print("Scores:\n \t", scores_over_sampling_df.mean().round(3))
                            print("p-value: ", round(test_result_pvalue, 4))
                            print("Baseline: \t\t", round(baseline, 2))
                            print(method)
                            f1_baseline = scores_over_sampling_df.mean()["f1"].round(3) - baseline
                            print(f1_baseline)
                            results.append([class_, scores_over_sampling_df.mean()["f1"].round(3), scores_over_sampling_df.mean()["rec"].round(3), scores_over_sampling_df.mean()["prec"].round(3), scores_over_sampling_df.mean()["f1_macro"].round(3), scores_over_sampling_df.mean()["f1_micro"].round(3), baseline, f1_baseline, method, text_representation, MFW, typography, f1s_over_sampling.round(2), test_result_pvalue, sampled_labels, sampled_labels.shape[0], cv, sampling_times, classifier])
    results_df = pd.DataFrame(results, columns=["class", "mean_f1", "mean_rec", "mean_prec", "f1_macro", "f1_micro", "baseline", "f1-baseline", "classifier_name", "text_representation", "MFW", "typography", "f1s", "test_result_pvalue", "labels", "sample_size", "cv", "sampling_times", "classifier"])
    print(results_df.head())
    # Shuffle, then sort by the improvement over the baseline and by the number of MFWs
    results_df = results_df.sample(frac=1)
    results_df.sort_values(by=["f1-baseline", "MFW"], ascending=[False, True], inplace=True)
    if outdir_results == "":
        outdir_results = wdir
    # Save the results; fall back to a timestamp if the parameter-based name gets too long
    results_file = "results_" + "-".join(classes) + "_" + "-".join(methods) + "_" + "-".join(str(x) for x in max_MFFs) + "_" + "-".join(text_representations)
    if len(results_file) > 100:
        results_file = "results_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    results_df.to_csv(outdir_results + results_file + ".csv", sep="\t")
    print("done!")
    return results_df
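
# Usage sketch for classify (illustrative only): the path is hypothetical, and the
# frequency table and metadata are assumed to be dataframes with a shared index; method
# names other than the default "SVC" depend on choose_classifier, defined elsewhere in
# this module. Every combination of class, typography, text representation, MFW cut and
# classifier is cross-validated, and the mean F1 is compared against the majority baseline.
# results_df = classify("/path/to/wdir/", freq_table=freq_table_df, metadata=metadata_df,
#                       classes=["subgenre"], methods=["SVC"], max_MFFs=[1000, 3000],
#                       text_representations=["zscores"], sampling_times=2)
# print(results_df[["class", "mean_f1", "baseline", "f1-baseline"]].head())
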