示例#1
0
def subword_enrichment(df, n=4):
    """Enrich every word vector in `df` with character-level information.

    Parameters
    ----------
    df : pd.DataFrame
        VSM whose index entries are the words to enrich (one row per word).
    n : int
        Size of the character n-grams, forwarded to `vsm.ngram_vsm` and
        `vsm.character_level_rep`.

    Returns
    -------
    pd.DataFrame
        The element-wise sum of `df` and the stacked character-level
        vectors; the labels come from `df` because the right-hand
        operand of the addition is a plain array.

    """
    # Step 1: character-level VSM derived from `df`, with n-grams of
    # size `n`.
    char_vsm = vsm.ngram_vsm(df, n)

    # Step 2: one character-level vector per word, in index order.
    # Assumes each vector is 1-D with one entry per column of `df`,
    # so stacking them gives a matrix shaped like `df`.
    subword_rows = [
        vsm.character_level_rep(word, char_vsm, n) for word in df.index]
    subword_matrix = np.stack(subword_rows)

    # Steps 3 and 4: element-wise addition; the dimensionality is
    # unchanged and the result is already a labelled DataFrame.
    return df + subword_matrix
def subword_enrichment(df, n=4):
    """Enrich every word vector in `df` with character-level information.

    Parameters
    ----------
    df : pd.DataFrame
        VSM whose index entries are the words to enrich (one row per word).
    n : int
        Size of the character n-grams, forwarded to `vsm.ngram_vsm` and
        `vsm.character_level_rep`.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the original vector
        plus the character-level vector for that word.

    """
    # Step 1: character-level VSM derived from `df`, with n-grams of
    # size `n`.
    char_vsm = vsm.ngram_vsm(df, n)

    # Step 2: one character-level vector per word, in index order.
    subword_vecs = [
        vsm.character_level_rep(word, char_vsm, n) for word in df.index]

    # Step 3: `DataFrame.add` aligns on labels, so the subword frame must
    # carry `df`'s own column labels. With the default integer columns
    # the sum would contain the union of both column sets filled with
    # NaN whenever `df`'s columns are not 0..k-1.
    subword_df = pd.DataFrame(
        subword_vecs, index=df.index, columns=df.columns)

    # Step 4: both frames share labels, so this is a plain element-wise
    # sum with `df`'s index and columns.
    return df.add(subword_df)
示例#3
0
def subword_enrichment(df, n=4):
    """Enrich every word vector in `df` with character-level information.

    Parameters
    ----------
    df : pd.DataFrame
        VSM whose index entries are the words to enrich (one row per word).
    n : int
        Size of the character n-grams, forwarded to `vsm.ngram_vsm` and
        `vsm.character_level_rep`.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the original vector
        plus the character-level vector for that word.

    """
    # Step 1: character-level VSM derived from `df`, with n-grams of
    # size `n`.
    char_vsm = vsm.ngram_vsm(df, n=n)

    # Steps 2 and 3: pair each word with its own row positionally, so
    # repeated index labels neither collapse into one entry nor make a
    # label lookup return several rows, then add the two vectors
    # element-wise.
    enriched_rows = [
        np.add(vsm.character_level_rep(word, char_vsm, n=n), original)
        for word, original in zip(df.index, df.values)]

    # Step 4: rebuild with the labels of `df`, so the column names are
    # kept rather than replaced by default integer positions.
    return pd.DataFrame(enriched_rows, index=df.index, columns=df.columns)
示例#4
0
def subword_enrichment(df, n=4):
    """Enrich every word vector in `df` with character-level information.

    Parameters
    ----------
    df : pd.DataFrame
        VSM whose index entries are the words to enrich (one row per word).
        It is not modified.
    n : int
        Size of the character n-grams, forwarded to `vsm.ngram_vsm` and
        `vsm.character_level_rep`.

    Returns
    -------
    pd.DataFrame
        New frame with the same index and columns as `df`; each row is
        the original vector plus the character-level vector for that word.

    """
    # Step 1: character-level VSM derived from `df`, with n-grams of
    # size `n`.
    char_vsm = vsm.ngram_vsm(df, n=n)

    # Step 2: one character-level vector per word, in index order.
    subword_matrix = np.array(
        [vsm.character_level_rep(word, char_vsm, n=n) for word in df.index])

    # Step 3: a single out-of-place matrix addition. This avoids the
    # dtype-casting failure an in-place `+=` can hit when the two
    # operands have different dtypes, and it needs no per-label lookups
    # (which are ambiguous for repeated index labels). The reshape is a
    # no-op for well-formed input and fails loudly if the vectors do not
    # match `df`'s shape.
    enriched = df.values + subword_matrix.reshape(df.shape)

    # Step 4: same labels as `df`, new values.
    return pd.DataFrame(enriched, index=df.index, columns=df.columns)
# This has the same column dimension as `imdb5`, but the rows are expanded with all the 4-grams, including the boundary symbols `<w>` and `</w>`.
# 
# `vsm.character_level_rep` is a simple function for creating new word representations from the associated character-level ones. Many variations on that function are worth trying – for example, you could include the original word vector where available, change the aggregation method from `sum` to something else, use a real morphological parser instead of just n-grams, and so on.

# One very powerful thing about this is that we can represent words that are not even in the original VSM:

# In[39]:


# Membership test against the row index of `imdb5`. The surrounding text
# implies "superbly" is absent from it -- confirm when running the cell.
'superbly' in imdb5.index


# In[40]:


# Build a vector for "superbly" from the character-level VSM
# `imdb5_ngrams`; the n-gram size is left at the function's default.
superbly = vsm.character_level_rep("superbly", imdb5_ngrams)


# In[41]:


# Same construction for "superb", so the two vectors are comparable.
superb = vsm.character_level_rep("superb", imdb5_ngrams)


# In[42]:


# Compare the two composed vectors with `vsm.cosine`.
# NOTE(review): presumably a cosine *distance* (smaller = more similar);
# confirm against the `vsm` module.
vsm.cosine(superb, superbly)


# ## Visualization