def embed():
    """Embed every parsed transcript line with a fastText mean sentence embedding.

    Reads the "0_parsed" frame, applies sister's MeanEmbedding to each line,
    joins the resulting vector columns under a top-level "embedded" key, writes
    the combined frame to "1_embedded_fasttext" and returns it.
    """
    print("Embedding transcripts")
    parsed = fm.get_df("0_parsed")
    embedder = sister.MeanEmbedding(lang="en")
    # One embedding vector per line; keep the original index for the join.
    vectors = parsed["parsed"]["line"].apply(embedder)
    vector_frame = pd.DataFrame.from_records(vectors, index=vectors.index)
    # Nest the vector columns under "embedded" to match the file's MultiIndex layout.
    combined = parsed.join(pd.concat({"embedded": vector_frame}, axis=1))
    fm.write_df(combined, "1_embedded_fasttext")
    return combined
def parse(correct_spelling=False, stemming=False, remove_stopwords=False, expand_contractions=True):
    """Parse all transcripts into a deduplicated line-level DataFrame.

    Each transcript is expanded by parse_episode into
    (character, line, wordcount, stopwordcount) rows. The result is written to
    "0_parsed" under a top-level "parsed" key and returned.
    """
    print("Parsing episodes")
    parsed_rows = []
    for transcript in fm.get_transcripts():
        parsed_rows += parse_episode(transcript, correct_spelling, stemming,
                                     remove_stopwords, expand_contractions)
    frame = pd.DataFrame(parsed_rows,
                         columns=["character", "line", "wordcount", "stopwordcount"])
    # Keep only one row per (character, line) combination.
    frame = frame[~frame.duplicated(subset=["character", "line"])]
    # Drop lines spoken by multiple characters entirely (keep=False removes all copies).
    frame = frame[~frame.duplicated(subset=["line"], keep=False)]
    result = pd.concat({'parsed': frame}, axis=1).reindex()
    fm.write_df(result, "0_parsed")
    return result
def benchmark_change_data(train_or_test="test", random=False, grid=False, min=2, max=30):
    """Benchmark fastText and TF-IDF classifiers while shrinking the data.

    For each minimum wordcount in range(min, max) (max exclusive), either the
    test set or the train set is reduced — by wordcount filtering, or by random
    sampling down to the size that filtering would have produced — and
    accuracy, cross-entropy and mean predicted-character probability are
    recorded per technique. Results are written as
    "4_benchmark_change_testing_data_<train_or_test>[_random]".

    Parameters
    ----------
    train_or_test : which split to shrink ("test" keeps the model fixed,
        "train" retrains per step).
    random : shrink by random sampling instead of wordcount filtering.
    grid : run grid search in src.classify and also record chosen C/max_iter.
    min, max : wordcount range to sweep (NOTE: shadow the builtins).
    """
    print("Benchmarking using new method from " + str(min) + " to " + str(max))
    # Per-technique result accumulators; grid runs also track the chosen hyperparameters.
    if not grid:
        dictionary = {
            "accuracy": [],
            "cross_entropy": [],
            "predict_proba_predicted_character": [],
        }
    else:
        dictionary = {
            "accuracy": [],
            "cross_entropy": [],
            "predict_proba_predicted_character": [],
            "C": [],
            "max_iter": []
        }
    fasttext_dict = deepcopy(dictionary)
    tfidf_dict = deepcopy(dictionary)

    # ---- fastText pass -------------------------------------------------
    data = src.file_manager.get_df("1_embedded_fasttext")
    train, test = train_test_split(data, random_state=1515, train_size=0.8)
    # Precompute, for each threshold, how many rows would survive a wordcount
    # filter — used as the sample size in the `random` mode.
    test_count = {}
    train_count = {}
    for i in range(min, max):
        test_count.update({
            i: test[test["parsed"]["wordcount"] > i].count()["parsed"]["wordcount"]
        })
        train_count.update({
            i: train[train["parsed"]["wordcount"] > i].count()["parsed"]["wordcount"]
        })
    # Train shrinks, data needs to be classified only once
    if train_or_test == "test":
        classified_data, params = src.classify(technique="fasttext",
                                               train_data=train,
                                               test_data=test,
                                               unique=False,
                                               C=10.0,
                                               max_iter=200,
                                               write=False)
    for min_wordcount in range(min, max):
        print(min_wordcount)
        if random:
            if train_or_test == "test":
                # NOTE(review): .sample() is applied to the already-sampled
                # frame each iteration, so the data shrinks cumulatively rather
                # than being resampled from the full test set — confirm intended.
                classified_data = classified_data.sample(
                    n=test_count.get(min_wordcount))
            else:
                train = train.sample(n=train_count.get(min_wordcount))
                classified_data, params = src.classify(technique="fasttext",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        else:
            if train_or_test == "test":
                # Cumulative filtering; equivalent to a single >= filter since
                # the threshold only increases.
                classified_data = classified_data[
                    classified_data["parsed"]["wordcount"] >= min_wordcount]
            else:
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                classified_data, params = src.classify(technique="fasttext",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        labels = classified_data["predict_proba_"].columns
        fasttext_dict.get("accuracy").append(
            accuracy_score(classified_data["parsed"]["character"],
                           classified_data["classified"]["character"]))
        fasttext_dict.get("cross_entropy").append(
            log_loss(classified_data["parsed"]["character"],
                     classified_data["predict_proba_"],
                     labels=labels))
        fasttext_dict.get("predict_proba_predicted_character").append(
            classified_data["predict_proba_specific"]
            ["predicted_character"].mean())
        if grid:
            fasttext_dict.get("C").append(params.get("C"))
            fasttext_dict.get("max_iter").append(params.get("max_iter"))
    print(fasttext_dict)

    # ---- TF-IDF pass (same sweep, starting from a fresh split) ---------
    data = src.file_manager.get_df("0_parsed")
    train, test = train_test_split(data, random_state=1515, train_size=0.8)
    wordcount_range = range(min, max)
    if train_or_test == "test":
        classified_data, params = src.classify(technique="tfidf",
                                               train_data=train,
                                               test_data=test,
                                               unique=False,
                                               C=1.0,
                                               max_iter=500,
                                               write=False)
    for min_wordcount in wordcount_range:
        print(min_wordcount)
        if random:
            if train_or_test == "test":
                # NOTE(review): cumulative sampling, see fastText pass above.
                classified_data = classified_data.sample(
                    n=test_count.get(min_wordcount))
            else:
                train = train.sample(n=train_count.get(min_wordcount))
                classified_data, params = src.classify(technique="tfidf",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        else:
            if train_or_test == "test":
                classified_data = classified_data[
                    classified_data["parsed"]["wordcount"] >= min_wordcount]
            else:
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                classified_data, params = src.classify(technique="tfidf",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        labels = classified_data["predict_proba_"].columns
        tfidf_dict.get("accuracy").append(
            accuracy_score(classified_data["parsed"]["character"],
                           classified_data["classified"]["character"]))
        tfidf_dict.get("cross_entropy").append(
            log_loss(classified_data["parsed"]["character"],
                     classified_data["predict_proba_"],
                     labels=labels))
        tfidf_dict.get("predict_proba_predicted_character").append(
            classified_data["predict_proba_specific"]
            ["predicted_character"].mean())
        if grid:
            tfidf_dict.get("C").append(params.get("C"))
            tfidf_dict.get("max_iter").append(params.get("max_iter"))
    print(tfidf_dict)
    print(wordcount_range)
    print()

    # Assemble one column per metric, indexed by wordcount threshold, nested
    # under the technique name.
    tfidf_df = pd.concat(
        [pd.Series(v, index=wordcount_range) for k, v in tfidf_dict.items()],
        keys=[k for k, v in tfidf_dict.items()],
        axis=1)
    fasttext_df = pd.concat([
        pd.Series(v, index=wordcount_range) for k, v in fasttext_dict.items()
    ],
                            keys=[k for k, v in fasttext_dict.items()],
                            axis=1)
    d = {"tfidf": tfidf_df, "fasttext": fasttext_df}
    df = pd.concat(d, axis=1)
    fm.write_df(
        df, "4_benchmark_change_testing_data_" + train_or_test +
        ("_random" if random else ""))
    return df
def classify(ngrams=None, technique="tfidf", multi_class="multinomial", train_data=None, test_data=None, grid=False, C=None, max_iter=None, cv=None, min_wordcount=None, verbose=0, unique=False, write=True):
    """Train a logistic-regression character classifier and score the test set.

    Features come from TF-IDF over the raw lines (or n-grams) or from the
    precomputed fastText embeddings, depending on `technique`. Returns
    (result_frame, best_params) where best_params is the grid-search winner or
    None when `grid` is False. The result frame nests, under top-level keys,
    the parsed test rows, predictions with correctness flags, per-row
    probabilities for the actual and predicted character, and the full
    predict_proba / decision_function matrices.

    Raises ValueError when `technique` is not in sim_types.
    """
    if technique not in sim_types:
        raise ValueError("Invalid classification type " + technique +
                         ". Expected one of: %s" % sim_types)
    print("Classifying lines using " + technique)
    if ngrams is None:
        if technique == "tfidf":
            print("TF-IDF: data obtained from 0_parsed")
            data = fm.get_df("0_parsed", unique=unique)
        else:
            print("fastText: data obtained from 1_embedded_fasttext")
            data = fm.get_df("1_embedded_" + technique, unique=unique)
            # Lazily (re)build the embeddings if the cached frame is missing.
            if data is None:
                data = embed.embed_transcripts(type=technique)
    else:
        data = fm.get_df("0_parsed_n_grams")
    if train_data is not None and test_data is not None:
        print("Test and train data provided, using those instead")
        train = train_data
        test = test_data
    else:
        train, test = train_test_split(data, random_state=1515, train_size=0.8)
    y_train = train["parsed"]["character"]
    y_test = test["parsed"]["character"]
    if technique == "tfidf":
        tfidf = TfidfVectorizer()
        if ngrams is None:
            x_train = tfidf.fit_transform(train["parsed"]["line"])
            x_test = tfidf.transform(test["parsed"]["line"])
        else:
            x_train = tfidf.fit_transform(train["ngrams"][str(ngrams)])
            x_test = tfidf.transform(test["ngrams"][str(ngrams)])
    else:
        x_train = train["embedded"]
        x_test = test["embedded"]
    if grid:
        params = {
            "C": np.logspace(-5, 5, 11),
            'max_iter': [250, 500, 750, 1000]
        }
        # BUGFIX: previously `C if technique is list else [C]` — `technique`
        # is a string, so the condition was always False and a caller-supplied
        # list of values would have been wrapped into an invalid nested list.
        if C:
            params["C"] = C if isinstance(C, list) else [C]
        if max_iter:
            params["max_iter"] = max_iter if isinstance(max_iter, list) else [max_iter]
        lg = GridSearchCV(LogisticRegression(),
                          params,
                          verbose=verbose,
                          n_jobs=-1,
                          cv=cv)
    else:
        # Fall back to sensible defaults when no hyperparameters are given.
        lg = LogisticRegression(C=C if C else 1,
                                max_iter=max_iter if max_iter else 500)
    lg.fit(x_train, y_train)
    if grid:
        best = lg.best_params_
        print("Best parameters: ", best)
    else:
        best = None
    predict_proba_df = pd.DataFrame(lg.predict_proba(x_test),
                                    columns=lg.classes_,
                                    index=y_test.index)
    decision_function_df = pd.DataFrame(lg.decision_function(x_test),
                                        columns=lg.classes_,
                                        index=y_test.index)
    predict_df = pd.DataFrame(lg.predict(x_test),
                              columns=["character"],
                              index=y_test.index)
    # Probability the model assigned to the row's *actual* character.
    predict_proba_character_series = pd.concat([y_test, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    # Probability the model assigned to its own *predicted* character.
    predict_proba_predicted_series = pd.concat([predict_df, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    predict_proba_specific_df = pd.concat(
        [predict_proba_character_series, predict_proba_predicted_series],
        keys=["actual_character", "predicted_character"],
        axis=1)
    decision_function_character_series = pd.concat([y_test, decision_function_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    decision_function_predicted_series = pd.concat([predict_df, decision_function_df], axis=1) \
        .apply(lambda x: x[x["character"]], axis=1)
    decision_function_specific_df = pd.concat([
        decision_function_character_series, decision_function_predicted_series
    ],
                                              keys=[
                                                  "actual_character",
                                                  "predicted_character"
                                              ],
                                              axis=1)
    is_correct_df = predict_df["character"].eq(y_test).to_frame(
        name="is_correct")
    d = {
        "parsed": test["parsed"],
        "classified": pd.concat([predict_df, is_correct_df], axis=1),
        "predict_proba_specific": predict_proba_specific_df,
        "decision_function_specific": decision_function_specific_df,
        "predict_proba_": predict_proba_df,
        "decision_function": decision_function_df
    }
    data = pd.concat(d, axis=1)
    if write:
        if ngrams is None:
            fm.write_df(data, "2_classified_" + technique)
        else:
            fm.write_df(data,
                        "2_classified_" + technique + "_ngrams_" + str(ngrams))
    return data, best
def classify(technique="tfidf", train_data=None, test_data=None, C=None, max_iter=None, write=False):
    """Simplified classifier used by the STD benchmark.

    Trains a LogisticRegression on TF-IDF features ("tfidf") or precomputed
    fastText embeddings (any other technique in sim_types) and returns
    (result_frame, fitted_model). The result frame nests the parsed test rows,
    predictions with correctness flags, and probability details.

    Raises ValueError when `technique` is not in sim_types.
    """
    if technique not in sim_types:
        raise ValueError("Invalid classification type " + technique +
                         ". Expected one of: %s" % sim_types)
    print("Classifying lines using " + technique)
    if technique == "tfidf":
        data = fm.get_df("0_parsed")
    else:
        data = fm.get_df("1_embedded_fasttext")
    if train_data is not None and test_data is not None:
        print("Test and train data provided, using those instead")
        print("Test size: ", test_data.shape)
        print("Train size: ", train_data.shape)
        train = train_data
        test = test_data
    else:
        train, test = train_test_split(data, random_state=1515, train_size=0.8)
    y_train = train["parsed"]["character"]
    y_test = test["parsed"]["character"]
    if technique == "tfidf":
        tfidf = TfidfVectorizer()
        x_train = tfidf.fit_transform(train["parsed"]["line"])
        x_test = tfidf.transform(test["parsed"]["line"])
    else:
        x_train = train["embedded"]
        x_test = test["embedded"]
    # BUGFIX: the defaults C=None / max_iter=None were previously passed
    # straight into LogisticRegression, which rejects None for both. Guard
    # with the same fallbacks the full classify() uses (C=1, max_iter=500).
    lg = LogisticRegression(C=C if C else 1,
                            max_iter=max_iter if max_iter else 500)
    lg.fit(x_train, y_train)
    predict_proba_df = pd.DataFrame(lg.predict_proba(x_test),
                                    columns=lg.classes_,
                                    index=y_test.index)
    predict_df = pd.DataFrame(lg.predict(x_test),
                              columns=["character"],
                              index=y_test.index)
    # Probability assigned to the actual character of each row.
    predict_proba_character_series = pd.concat([y_test, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    # Probability assigned to the predicted character of each row.
    predict_proba_predicted_series = pd.concat([predict_df, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    predict_proba_specific_df = pd.concat(
        [predict_proba_character_series, predict_proba_predicted_series],
        keys=["actual_character", "predicted_character"],
        axis=1)
    is_correct_df = predict_df["character"].eq(y_test).to_frame(
        name="is_correct")
    d = {
        "parsed": test["parsed"],
        "classified": pd.concat([predict_df, is_correct_df], axis=1),
        "predict_proba_specific": predict_proba_specific_df,
        "predict_proba_": predict_proba_df
    }
    data = pd.concat(d, axis=1)
    if write:
        fm.write_df(data, "2_classified_" + technique)
    return data, lg
def benchmark(train_or_test="test", random=False, min=2, max=30, folds=5):
    """Benchmark both techniques across wordcount thresholds with fold statistics.

    For each technique (fasttext, tfidf) and each minimum wordcount in
    range(min, max), the chosen split is filtered by wordcount, the classifier
    is run `folds` times, and the mean/std of accuracy, predicted-character
    probability and cross-entropy loss are accumulated. Hyperparameters come
    from the "4_benchmark_change_testing_data_train" frame. Results are
    written as "STD" and the accumulator dict returned.

    NOTE(review): `random` is accepted but never used in this body — confirm
    whether a sampling mode was intended here as in benchmark_change_data.
    NOTE(review): `min`/`max` shadow the builtins.
    """
    print("Benchmarking " + train_or_test + " data from " + str(min) + " to " +
          str(max))
    dictionary = {
        "accuracy": [],
        "accuracy_std": [],
        "predict_proba": [],
        "predict_proba_std": [],
        "cross_entropy_loss": [],
        "cross_entropy_loss_std": []
    }
    techniques = {
        'fasttext': deepcopy(dictionary),
        'tfidf': deepcopy(dictionary)
    }
    data = src.file_manager.get_df("1_embedded_fasttext")
    wordcount = src.file_manager.get_df("details_min_wordcount")
    hyperparams = src.file_manager.get_df(
        "4_benchmark_change_testing_data_train")
    for cur_technique in techniques.keys():
        # Fresh split per technique; the same seed keeps splits comparable.
        train, test = train_test_split(data, random_state=1515, train_size=0.8)
        for min_wordcount in range(min, max):
            print("Min wordcount: ", min_wordcount)
            # NOTE(review): data_size is computed but never used below.
            data_size = wordcount["wordcount"][train_or_test].get(
                min_wordcount)
            if train_or_test == "test":
                # Cumulative filter; equivalent to a single >= filter since the
                # threshold only grows. Hyperparameters stay fixed at the
                # lowest threshold (`min`) because the training data is fixed.
                test = test[test["parsed"]["wordcount"] >= min_wordcount]
                C = hyperparams[cur_technique]["C"].get(min)
                max_iter = hyperparams[cur_technique]["max_iter"].get(min)
            if train_or_test == "train":
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                # Training data changes, so look up per-threshold hyperparameters.
                C = hyperparams[cur_technique]["C"].get(min_wordcount)
                max_iter = hyperparams[cur_technique]["max_iter"].get(
                    min_wordcount)
            accuracies = []
            predict_probas = []
            cross_entropy_losses = []
            # NOTE(review): classify() appears deterministic (fixed split and
            # solver inputs), so the folds may all produce identical scores and
            # the std columns could be zero — confirm this is intended.
            for cur_fold in range(0, folds):
                classified, lg = src.classify_std.classify(
                    technique=cur_technique,
                    train_data=train,
                    test_data=test,
                    C=C,
                    max_iter=max_iter)
                accuracies.append(
                    accuracy_score(classified["parsed"]["character"],
                                   classified["classified"]["character"]))
                cross_entropy_losses.append(
                    log_loss(classified["parsed"]["character"],
                             classified["predict_proba_"],
                             labels=classified["predict_proba_"].columns))
                predict_probas.append(classified["predict_proba_specific"]
                                      ["predicted_character"].mean())
            cur_details = techniques.get(cur_technique)
            cur_details.get("accuracy").append(np.mean(accuracies))
            cur_details.get("accuracy_std").append(np.std(accuracies))
            cur_details.get("predict_proba").append(np.mean(predict_probas))
            cur_details.get("predict_proba_std").append(np.std(predict_probas))
            cur_details.get("cross_entropy_loss").append(
                np.mean(cross_entropy_losses))
            cur_details.get("cross_entropy_loss_std").append(
                np.std(cross_entropy_losses))
            techniques.update({cur_technique: cur_details})
            print(techniques)
    print(techniques)
    fm.write_df(pd.DataFrame.from_dict(techniques), "STD")
    return techniques