def sk_classification(df, rm_out=False):
    """Label scores by tercile and evaluate k-nearest-neighbour classifiers.

    Rows are split into three classes on the ``score`` column, divided 80/20
    into train and test sets, and a ``KNeighborsClassifier`` is scored on the
    test set for every ``n_neighbors`` from 3 to 30.  For each neighbour count
    that reaches the best test score, the function prints, per class, how
    often the prediction was right among the test rows predicted as that
    class.

    Args:
        df: DataFrame with a ``score`` column; every other column is used as
            a feature.  A ``class`` column is written to the frame being
            classified, which is the caller's own frame when ``rm_out`` is
            false.
        rm_out: When true, ``df`` is first passed through ``remove_outliers``
            on ``score`` with ``lq=0.05`` and ``uq=0.95``.

    Returns:
        None.  All results are reported with ``print``.
    """
    # Remove outliers
    if rm_out:
        df = remove_outliers(df, 'score', lq=0.05, uq=0.95)
        print("data points below 0.05 or above 0.95 quantiles removed")

    # Classify scores depending on percentile.  The assignments run in this
    # order, so a row matching both conditions ends up in class 2.
    df["class"] = 1  # average translation
    df.loc[df["score"] >= df["score"].quantile(0.67), "class"] = 0  # bad translation
    df.loc[df["score"] <= df["score"].quantile(0.33), "class"] = 2  # good translation

    # Split data into training and test sets, set random_state for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["score", "class"]),
        df["class"],
        test_size=0.2,
        random_state=42,
    )

    print("running k-neighbors classifier...")
    scores = {}
    for n in range(3, 31):
        # Create classifier and fit it to the training data
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)
        scores[n] = neigh.score(X_test, y_test)

    max_score = max(scores.values())
    print("maximum score obtained: %0.2f%%" % (max_score * 100))

    # Several neighbour counts can tie for the best score; report all of them.
    best_neighbours = [n for n, score in scores.items() if score == max_score]

    # Class labels, in the order they are reported
    diff = {
        "bad translation": 0,
        "average translation": 1,
        "good translation": 2,
    }

    for n in best_neighbours:
        # Create classifier and fit it to the training data
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)
        print("\nnumber of neighbours: %d" % n)

        # Predict using test data.  Class probabilities are not requested:
        # they were never used, and naming exactly three probability columns
        # fails when fewer than three classes are present in the training set.
        y_pred = neigh.predict(X_test)

        # Evaluate results
        y_res = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test.values})
        for key, label in diff.items():
            # Only the test rows that were predicted as this class
            key_val = y_res.loc[y_res["y_pred"] == label]
            print("Accuracy for %s: %0.2f%%" % (
                key,
                accuracy_score(key_val["y_test"], key_val["y_pred"]) * 100,
            ))
use_biber = False if use_biber == True: biber = pd.read_csv("data/en-fr-100/en-fr-100.dim", sep='\t') drop_cols = biber.columns[(biber == 0).sum() > 0.5 * biber.shape[0]] biber.drop(drop_cols, axis=1, inplace=True) features = features.merge(biber, left_index=True, right_index=True) # Join data into single dataframe df = ter.merge(features, left_index=True, right_index=True) # Remove outliers rm_out = False if rm_out == True: df = remove_outliers(df, 'score', lq=0.05, uq=0.95) print("data points below 0.05 or above 0.95 quantiles removed") # Classify scores based on percentile def classify_scores(df, num_classes=3): if num_classes == 3: df["class"] = 1 # average translation df.loc[df["score"] >= df["score"].quantile(0.66), "class"] = 0 # bad translation df.loc[df["score"] <= df["score"].quantile(0.33), "class"] = 2 # good translation diff = { "bad translation": 0, "average translation": 1,
timed = pd.read_csv("data/timed-un/reliable.dat", sep=' ') biber = pd.read_csv("data/timed-un/reliable1-dim.dat", sep='\t') encode_category = False if encode_category == True: enc = preprocessing.LabelEncoder() cat = timed['category'] enc.fit(cat) timed['cat'] = enc.transform(cat) # Join data into a single dataframe df = pd.concat([timed['perday'], biber], axis=1) # Remove outliers df = remove_outliers(df, 'perday', lq=0.05, uq=0.95) df.reset_index(inplace=True) # Change regression problem into classification n_class = 3 if n_class == 3: df["class"] = 1 # average df.loc[df["perday"] >= df["perday"].quantile(0.67), "class"] = 0 # easy df.loc[df["perday"] <= df["perday"].quantile(0.33), "class"] = 2 # hard else: df["class"] = 1 # easy df.loc[df["perday"] > df["perday"].quantile(0.75), "class"] = 0 # very easy df.loc[df["perday"] <= df["perday"].quantile(0.25), "class"] = 3 # very hard df.loc[(df["perday"] > df["perday"].quantile(0.25)) &
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.utils import remove_outliers

# WTO data: one outlier-trimmed frame per target language, each without the
# other language's columns.
wto = pd.read_csv("data/wto/wto_timed_all.csv")

wto_french = remove_outliers(wto, 'PERDAY FRENCH', lq=0.05, uq=0.94)
wto_french = wto_french.drop(columns=['DAYS SPANISH', 'PERDAY SPANISH'])

wto_spanish = remove_outliers(wto, 'PERDAY SPANISH', lq=0.05, uq=0.94)
wto_spanish = wto_spanish.drop(columns=['DAYS FRENCH', 'PERDAY FRENCH'])

# UN data, space-separated.  Outlier removal is currently switched off here:
#un = remove_outliers(un, 'perday', lq=0.05, uq=0.94)
un = pd.read_csv("data/timed-un/reliable.dat", sep=' ')


def combined_plot():
    """Plot the distribution of the UN translation rate.

    Creates a 1x3 figure with a shared y axis.  The code here fills only the
    first panel, with a 20-bin histogram and KDE of ``un['perday']``.
    """
    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(15, 15))

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # confirm the installed version before upgrading.
    un_ax = axs[0]
    sns.distplot(
        un['perday'],
        hist=True,
        kde=True,
        bins=20,
        hist_kws={'edgecolor': 'black'},
        kde_kws={'bw': 200},
        ax=un_ax,
    )
    un_ax.set_xlabel("Translation rate (words per day)")
    un_ax.set_ylabel("Density")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.utils import remove_outliers


def _load_scores(path):
    """Read a one-score-per-line file into a ``score`` column and trim outliers.

    The file is read with the default separator: current pandas rejects
    ``sep='\\n'`` with a ValueError.
    NOTE(review): assumes each line holds a single value with no commas.

    Args:
        path: Location of the headerless score file.

    Returns:
        Whatever ``remove_outliers`` returns for the ``score`` column with
        ``lq=0.05`` and ``uq=0.95``.
    """
    scores = pd.read_csv(path, header=None, names=['score'])
    return remove_outliers(scores, 'score', lq=0.05, uq=0.95)


es_df = _load_scores("data/un-parallel/es-mt-score.txt")
fr_df = _load_scores("data/un-parallel/fr-mt-score.txt")


def histogram():
    """Show side-by-side 25-bin histograms of the Spanish and French scores."""
    fig, axs = plt.subplots(1, 2, sharey=True, figsize=(15, 15))

    panels = (
        (es_df, "UN Corpus - Spanish Translations"),
        (fr_df, "UN Corpus - French Translations"),
    )
    for ax, (scores, title) in zip(axs, panels):
        ax.hist(scores.iloc[:, 0], bins=25, edgecolor='black')
        ax.set_xlabel("TER")
        ax.set_title(title)

    # Only the left-hand panel gets a y label; the y axis is shared.
    axs[0].set_ylabel("Frequency (sentences)")

    plt.show()
# Target to predict: time taken ('days') or translation rate ('perday')
target = 'perday'

# Display units and best fit line limits for the chosen target
if target != 'perday':
    units = target
    min_lim, max_lim = 0, 20
else:
    units = 'words per day'
    min_lim, max_lim = 400, 2200

# Join relevant data into one dataframe
reg_df = pd.concat([reliable[[target, 'words']], reliable1_dim], axis=1)

# Optional: remove outliers on the target column
rem_out = False
if rem_out:
    reg_df = remove_outliers(reg_df, target, lq=0.05, uq=0.95)

# Optional: convert the categorical 'category' feature into numerical labels
use_cat = False
if use_cat:
    enc = preprocessing.LabelEncoder()
    cat = reliable['category']
    enc.fit(cat)
    reg_df['category'] = enc.transform(cat)

# Optional: drop columns in which more than half of the rows are zero
drop_zero_cols = False
if drop_zero_cols:
    drop_cols = reg_df.columns[(reg_df == 0).sum() > 0.5 * reg_df.shape[0]]
    reg_df.drop(columns=drop_cols, inplace=True)
sep='\t') ter.columns = ["score"] # Join important columns to single dataframe df = pd.concat([ter, time], axis=1) # Calculate translation rate (and normalise) #df['perms'] = df['words'] / df['time (ms)'] # words per ms df['spw'] = (df['time (ms)']) / 1000 / df['words'] # seconds per word #df['rate'] = (df['perms'] - df['perms'].min()) / (df['perms'].max() - df['perms'].min()) # Remove perfect translations dft = df.loc[df['score'] != 0] # Remove outliers dfr = remove_outliers(df, 'spw', lq=0.05, uq=0.95) # Correlation print(dfr.corr().round(3)['score']) # Quantiles def quantiles(df): """ Output distribution of each quantile in the data set. """ q1 = df.loc[df['perms'] <= df['perms'].quantile(0.25)] q2 = df.loc[(df['perms'] >= df['perms'].quantile(0.25)) & (df['perms'] <= df['perms'].quantile(0.50))] q3 = df.loc[(df['perms'] >= df['perms'].quantile(0.50)) & (df['perms'] <= df['perms'].quantile(0.75))] q4 = df.loc[df['perms'] >= df['perms'].quantile(0.75)]