Exemplo n.º 1
0
def init():
    """Load the ASL database, derive feature columns and gather word data.

    The raw hand columns of ``asl.df`` are renamed to ``raw-<side><axis>``
    (e.g. ``right-x`` becomes ``raw-rx``) and four feature families are
    added as new columns:

    * ``grnd-*``:  raw hand coordinate minus the matching nose coordinate.
    * ``norm-*``:  raw hand coordinate standardised with the mean and
      standard deviation of its speaker.
    * ``delta-*``: difference between consecutive rows of the ``grnd``
      column, with the leading NaN replaced by 0.
    * ``polar-*``: radius and angle computed from the ``grnd`` columns.

    Returns:
        A 6-tuple ``(asl, features, training, samples, stats, separated)``:

        * ``asl``: the ``AslDb`` instance with the extra columns.
        * ``features``: dict mapping family name to its column names.
        * ``training``: dict mapping family name to the result of
          ``asl.build_training`` for that family's columns.
        * ``samples``: ``samples[family][word]`` is whatever ``get_word``
          returns for that family and word.
        * ``stats``: per-word sequence-length statistics joined with the
          ``single``/``double`` sample counts taken from ``separated``.
        * ``separated``: per word, a pair of lists holding the ``norm``
          samples whose ``min(sample.std())`` is below, respectively above,
          the threshold of 1e-9.
    """
    asl = AslDb()

    # dimensions
    hands = ['right', 'left']
    sides = ['r', 'l']
    axes = ['x', 'y']
    polar_axes = ['r', 'theta']

    # rename the raw data for consistency, e.g. 'right-x' -> 'raw-rx'
    raw_names = {}
    for hand in hands:
        for axis in axes:
            raw_names[hand + '-' + axis] = 'raw-' + hand[0] + axis
    asl.df = asl.df.rename(columns=raw_names)

    # column names of every feature family, keyed by family name
    suffixes = [hand[0] + axis for hand in hands for axis in axes]
    features = {}
    for family in ['grnd', 'norm', 'delta']:
        features[family] = [family + '-' + suffix for suffix in suffixes]
    features['polar'] = [
        'polar-' + side + axis for side in sides for axis in polar_axes
    ]

    # derive the features
    # ground: hand position relative to the nose
    for suffix in suffixes:
        nose = 'nose-' + suffix[-1]
        asl.df['grnd-' + suffix] = asl.df['raw-' + suffix] - asl.df[nose]

    # norm: standardise the raw coordinates speaker by speaker
    by_speaker = asl.df.groupby('speaker')
    df_means = by_speaker.mean()
    df_std = by_speaker.std()
    speaker = asl.df['speaker']
    for suffix in suffixes:
        raw = 'raw-' + suffix
        centred = asl.df[raw] - speaker.map(df_means[raw])
        asl.df['norm-' + suffix] = centred / speaker.map(df_std[raw])

    # delta: change of the ground coordinate from one row to the next
    for suffix in suffixes:
        asl.df['delta-' + suffix] = asl.df['grnd-' + suffix].diff().fillna(0)

    # polar: angles first, then radii (keeps the column insertion order);
    # note that arctan2 receives x as its first argument and y as its second
    for side in sides:
        asl.df['polar-' + side + 'theta'] = np.arctan2(
            asl.df['grnd-' + side + 'x'], asl.df['grnd-' + side + 'y'])
    for side in sides:
        asl.df['polar-' + side + 'r'] = np.sqrt(
            asl.df['grnd-' + side + 'x']**2 + asl.df['grnd-' + side + 'y']**2)

    training = {}
    for family, columns in features.items():
        training[family] = asl.build_training(columns)

    # sequence-length statistics per word; element 1 of each entry is used
    # as the collection of sequence lengths
    xlens = training['grnd'].get_all_Xlengths()
    lens_stats = []
    for word, entry in xlens.items():
        lengths = entry[1]
        shortest = min(lengths)
        longest = max(lengths)
        lens_stats.append((word, len(lengths), shortest,
                           sum(lengths) / len(lengths), longest,
                           longest - shortest))
    words_stats = pd.DataFrame.from_records(
        lens_stats,
        columns=['word', 'count', 'min', 'avg', 'max', 'range'])
    words_stats = words_stats.set_index('word')
    words_stats['spread'] = words_stats['range'] / words_stats['avg']

    # include all words, most frequent first
    min_len = 0
    long_enough = words_stats[words_stats['min'] > min_len]
    words = long_enough.sort_values(by='count',
                                    ascending=False).index.tolist()

    samples = {
        family: {
            word: get_word(training, features, family, word)
            for word in words
        }
        for family in features
    }

    threshold = 1e-9

    def lowest_std(sample):
        # smallest of the values returned by sample.std()
        return min(sample.std())

    # split the normalised samples of each word around the threshold
    separated = {}
    for word, word_samples in samples['norm'].items():
        below = [s for s in word_samples if lowest_std(s) < threshold]
        above = [s for s in word_samples if lowest_std(s) > threshold]
        separated[word] = (below, above)

    counts = {
        word: (len(groups[0]), len(groups[1]))
        for word, groups in separated.items()
    }
    separated_stats = pd.DataFrame.from_records(counts).T
    separated_stats = separated_stats.rename(columns={
        0: 'single',
        1: 'double'
    })
    stats = words_stats.join(separated_stats)
    return asl, features, training, samples, stats, separated

# Ground features: hand coordinates relative to the nose.
# Columns are inserted in the order ry, rx, ly, lx.
for _hand, _side in (('right', 'r'), ('left', 'l')):
    for _axis in ('y', 'x'):
        asl.df['grnd-' + _side + _axis] = (
            asl.df[_hand + '-' + _axis] - asl.df['nose-' + _axis])

# collect the ground feature names into a list
features_ground = ['grnd-rx', 'grnd-ry', 'grnd-lx', 'grnd-ly']

# Normalized features: (value - mean) / std computed within each speaker
features_norm = ['norm-rx', 'norm-ry', 'norm-lx', 'norm-ly']

lookup = asl.df.groupby('speaker').transform(
    lambda column: (column - column.mean()) / column.std())
_norm_sources = (
    ('norm-rx', 'right-x'),
    ('norm-lx', 'left-x'),
    ('norm-ry', 'right-y'),
    ('norm-ly', 'left-y'),
)
asl.df = asl.df.assign(
    **{target: lookup[source] for target, source in _norm_sources})

# Polar features: radius and angle of each hand relative to the nose,
# built from the ground columns computed above (same differences).
features_polar = ['polar-rr', 'polar-rtheta', 'polar-lr', 'polar-ltheta']
for _side in ('l', 'r'):
    asl.df['polar-' + _side + 'r'] = np.sqrt(
        asl.df['grnd-' + _side + 'x'] ** 2
        + asl.df['grnd-' + _side + 'y'] ** 2)
# arctan2 receives the x difference first and the y difference second
for _side in ('l', 'r'):
    asl.df['polar-' + _side + 'theta'] = np.arctan2(
        asl.df['grnd-' + _side + 'x'], asl.df['grnd-' + _side + 'y'])

# Delta features: difference between consecutive rows of the raw hand
# coordinates; the leading NaN produced by diff() is replaced with 0.
features_delta = ['delta-rx', 'delta-ry', 'delta-lx', 'delta-ly']

asl.df['delta-rx'] = asl.df['right-x'].diff().fillna(0)
asl.df['delta-ry'] = asl.df['right-y'].diff().fillna(0)
asl.df['delta-lx'] = asl.df['left-x'].diff().fillna(0)