예제 #1
0
def sk_classification(df, rm_out=False):
    """Bin ``score`` into three classes and evaluate k-NN classifiers on them.

    Rows are labelled from the 0.33 / 0.67 quantiles of ``score``
    (0 = bad, 1 = average, 2 = good translation).  All remaining columns are
    used as features.  A ``KNeighborsClassifier`` is scored on a fixed 80/20
    split for every ``n_neighbors`` from 3 to 30.  The best test score is
    printed, followed by a per-class breakdown for each neighbour count that
    reaches it.

    Args:
        df: DataFrame with a ``score`` column; every other column is treated
            as a feature.  A ``class`` column is written onto the frame being
            classified (the caller's own frame when ``rm_out`` is falsy).
        rm_out: When truthy, the frame is first passed through
            ``remove_outliers(df, 'score', lq=0.05, uq=0.95)``.

    Returns:
        None.  All results are reported through ``print``.
    """

    # Remove outliers
    if rm_out:
        df = remove_outliers(df, 'score', lq=0.05, uq=0.95)
        print("data points below 0.05 or above 0.95 quantiles removed")

    # Classify scores depending on percentile.  The order matters: if the two
    # cut-offs coincide, the later "good" assignment wins for the tied rows.
    df["class"] = 1  # average translation
    df.loc[df["score"] >= df["score"].quantile(0.67),
           "class"] = 0  # bad translation
    df.loc[df["score"] <= df["score"].quantile(0.33),
           "class"] = 2  # good translation

    # Split data into training and test sets, set random_state for
    # reproducibility
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["score", "class"]),
        df["class"],
        test_size=0.2,
        random_state=42)

    print("running k-neighbors classifier...")

    # Test-set score for every candidate neighbour count
    results_dict = {}
    for n in range(3, 31):
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)
        results_dict[n] = neigh.score(X_test, y_test)

    results_df = pd.DataFrame.from_dict(results_dict,
                                        orient='index',
                                        columns=['kn-score'])
    max_score = results_df['kn-score'].max()

    print("maximum score obtained: %0.2f%%" % (max_score * 100))

    # Every neighbour count that ties for the best score is reported
    max_list = results_df.loc[results_df['kn-score'] == max_score]

    # Human-readable label -> class id (loop-invariant, so built once)
    diff = {
        "bad translation": 0,
        "average translation": 1,
        "good translation": 2
    }

    for n in max_list.index:

        # Re-create and re-fit the classifier for this neighbour count
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)

        print("\nnumber of neighbours: %d" % n)

        # Predict using test data.  The class-probability table that used to
        # be built here was never read, and its three hard-coded column names
        # raised a ValueError whenever fewer than three classes were present
        # in the training data, so it has been dropped.
        y_pred = neigh.predict(X_test)

        # Evaluate results: predictions next to the true labels
        y_res = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test.values})

        for key, class_id in diff.items():

            # Only the rows *predicted* as this class are scored, so the
            # figure is the share of those predictions that were correct.
            key_val = y_res.loc[y_res["y_pred"] == class_id]
            print("Accuracy for %s: %0.2f%%" %
                  (key,
                   accuracy_score(key_val["y_test"], key_val["y_pred"]) * 100))
예제 #2
0
# Optional step: extend the feature table with the columns of the `biber`
# table read from the .dim file.
use_biber = False
if use_biber:
    biber = pd.read_csv("data/en-fr-100/en-fr-100.dim", sep='\t')

    # Discard every column in which more than half of the rows are zero
    drop_cols = biber.columns[(biber == 0).sum() > 0.5 * len(biber)]
    biber.drop(columns=drop_cols, inplace=True)

    features = features.merge(biber, left_index=True, right_index=True)

# Join `ter` and `features` on their indexes into a single dataframe
df = ter.merge(features, left_index=True, right_index=True)

# Optional step: trim both tails of the score distribution
rm_out = False
if rm_out:
    df = remove_outliers(df, 'score', lq=0.05, uq=0.95)
    print("data points below 0.05 or above 0.95 quantiles removed")


# Classify scores based on percentile
def classify_scores(df, num_classes=3):

    if num_classes == 3:
        df["class"] = 1  # average translation
        df.loc[df["score"] >= df["score"].quantile(0.66),
               "class"] = 0  # bad translation
        df.loc[df["score"] <= df["score"].quantile(0.33),
               "class"] = 2  # good translation
        diff = {
            "bad translation": 0,
            "average translation": 1,
예제 #3
0
# Load the two input tables (space-separated and tab-separated respectively)
timed = pd.read_csv("data/timed-un/reliable.dat", sep=' ')
biber = pd.read_csv("data/timed-un/reliable1-dim.dat", sep='\t')

# Optionally add a label-encoded copy of the 'category' column as 'cat'
encode_category = False
if encode_category:

    enc = preprocessing.LabelEncoder()
    cat = timed['category']
    timed['cat'] = enc.fit(cat).transform(cat)

# Put the 'perday' column and the `biber` columns side by side
df = pd.concat([timed['perday'], biber], axis=1)

# Remove outliers, then renumber the rows.
# NOTE(review): reset_index() without drop=True keeps the previous index as
# an extra column - confirm later code does not treat it as a feature.
df = remove_outliers(df, 'perday', lq=0.05, uq=0.95)
df.reset_index(inplace=True)

# Change regression problem into classification
n_class = 3
if n_class == 3:
    df["class"] = 1  # average
    df.loc[df["perday"] >= df["perday"].quantile(0.67), "class"] = 0  # easy
    df.loc[df["perday"] <= df["perday"].quantile(0.33), "class"] = 2  # hard
else:
    df["class"] = 1  # easy
    df.loc[df["perday"] > df["perday"].quantile(0.75),
           "class"] = 0  # very easy
    df.loc[df["perday"] <= df["perday"].quantile(0.25),
           "class"] = 3  # very hard
    df.loc[(df["perday"] > df["perday"].quantile(0.25)) &
예제 #4
0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.utils import remove_outliers

wto = pd.read_csv("data/wto/wto_timed_all.csv")

# One frame per target language: filter outliers on that language's rate
# column, then discard the other language's columns.
wto_french = remove_outliers(wto, 'PERDAY FRENCH', lq=0.05, uq=0.94)
wto_french = wto_french.drop(columns=['DAYS SPANISH', 'PERDAY SPANISH'])

wto_spanish = remove_outliers(wto, 'PERDAY SPANISH', lq=0.05, uq=0.94)
wto_spanish = wto_spanish.drop(columns=['DAYS FRENCH', 'PERDAY FRENCH'])

un = pd.read_csv("data/timed-un/reliable.dat", sep=' ')
# Outlier removal is currently disabled for the UN data:
#un = remove_outliers(un, 'perday', lq=0.05, uq=0.94)


def combined_plot():

    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(15, 15))

    sns.distplot(un['perday'],
                 hist=True,
                 kde=True,
                 bins=20,
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'bw': 200},
                 ax=axs[0])
    axs[0].set_xlabel("Translation rate (words per day)")
    axs[0].set_ylabel("Density")
예제 #5
0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.utils import remove_outliers

def _load_scores(path):
    """Read a headerless score file into a one-column frame and trim outliers.

    Args:
        path: Location of the text file to read.

    Returns:
        Whatever ``remove_outliers`` returns for the loaded frame, filtered
        on the ``score`` column with ``lq=0.05`` and ``uq=0.95``.
    """
    # The previous call passed sep='\n', which current pandas releases reject
    # with a ValueError (the line terminator is not allowed as a separator).
    # The default separator is used instead.
    # NOTE(review): assumes each line holds a single value with no commas -
    # confirm against the data files.  If a file has more than one column,
    # the assignment below fails loudly with a length mismatch.
    scores = pd.read_csv(path, header=None)
    scores.columns = ['score']

    return remove_outliers(scores, 'score', lq=0.05, uq=0.95)


es_df = _load_scores("data/un-parallel/es-mt-score.txt")
fr_df = _load_scores("data/un-parallel/fr-mt-score.txt")


def histogram():
    """Show side-by-side histograms of the first column of es_df and fr_df.

    Reads the module-level ``es_df`` and ``fr_df`` frames, draws one 25-bin
    histogram per frame on a shared y axis and displays the figure.
    """

    # (data frame, panel title) for each subplot, left to right
    panels = (
        (es_df, "UN Corpus - Spanish Translations"),
        (fr_df, "UN Corpus - French Translations"),
    )

    fig, axs = plt.subplots(1, 2, sharey=True, figsize=(15, 15))

    for ax, (frame, title) in zip(axs, panels):
        ax.hist(frame.iloc[:, 0], bins=25, edgecolor='black')
        ax.set_xlabel("TER")
        ax.set_title(title)

    # Only the left panel is given a y-label; the y axis is shared.
    axs[0].set_ylabel("Frequency (sentences)")

    plt.show()
예제 #6
0
# Prediction target: either time ('days') or rate ('perday')
target = 'perday'
if target != 'perday':
    units = target
    min_lim, max_lim = 0, 20
else:
    units = 'words per day'
    min_lim, max_lim = 400, 2200  # best fit line limits

# Join relevant data into one dataframe
reg_df = pd.concat([reliable[[target, 'words']], reliable1_dim], axis=1)

# Optionally remove outliers on the target column
rem_out = False
if rem_out:
    reg_df = remove_outliers(reg_df, target, lq=0.05, uq=0.95)

# Optionally convert the categorical 'category' feature into numerical labels
use_cat = False
if use_cat:
    enc = preprocessing.LabelEncoder()
    cat = reliable['category']
    reg_df['category'] = enc.fit(cat).transform(cat)

# Optionally drop columns in which more than half of the rows are zero
drop_zero_cols = False
if drop_zero_cols:
    drop_cols = reg_df.columns[(reg_df == 0).sum() > 0.5 * len(reg_df)]
    reg_df.drop(columns=drop_cols, inplace=True)
예제 #7
0
                      sep='\t')
ter.columns = ["score"]

# Place the score column and the `time` columns side by side
df = pd.concat([ter, time], axis=1)

# Translation effort in seconds per word (ms -> s, then divided by words).
# NOTE(review): the 'perms' column is only created by the disabled line
# below, yet quantiles() further down reads df['perms'] - confirm.
#df['perms'] = df['words'] / df['time (ms)'] # words per ms
df['spw'] = df['time (ms)'] / 1000 / df['words']  # seconds per word
#df['rate'] = (df['perms'] - df['perms'].min()) / (df['perms'].max() - df['perms'].min())

# Rows whose score is non-zero, i.e. without the perfect translations.
# NOTE(review): the outlier step below starts from `df`, not `dft`, so these
# rows are not excluded from `dfr` - confirm this is intended.
dft = df.loc[df['score'].ne(0)]

# Remove outliers on the seconds-per-word column
dfr = remove_outliers(df, 'spw', lq=0.05, uq=0.95)

# Correlation of every column with the score, rounded to 3 decimals
print(dfr.corr().round(3)['score'])


# Quantiles
def quantiles(df):
    """ Output distribution of each quantile in the data set. """

    q1 = df.loc[df['perms'] <= df['perms'].quantile(0.25)]
    q2 = df.loc[(df['perms'] >= df['perms'].quantile(0.25))
                & (df['perms'] <= df['perms'].quantile(0.50))]
    q3 = df.loc[(df['perms'] >= df['perms'].quantile(0.50))
                & (df['perms'] <= df['perms'].quantile(0.75))]
    q4 = df.loc[df['perms'] >= df['perms'].quantile(0.75)]