Example #1
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    # Reweight the raw count matrix with (Positive) PMI.
    count_df_pmi = vsm.pmi(count_df)
    # Reduce the reweighted matrix to k dimensions with LSA.
    count_df_pmi_lsa = vsm.lsa(count_df_pmi, k)
    # Score the reduced space with the full word-similarity evaluation.
    eval_results = full_word_similarity_evaluation(count_df_pmi_lsa)
    return eval_results
Example #2
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    counts_ppmi = vsm.pmi(count_df, positive=True)
    counts_ppmi_lsa = vsm.lsa(counts_ppmi, k=k)
    results = full_word_similarity_evaluation(counts_ppmi_lsa)
    display(results)
    return results
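
# For reference (not part of the submissions above): a minimal sketch of the
# Positive PMI reweighting that vsm.pmi(count_df, positive=True) is assumed to
# perform, written in plain pandas/NumPy. The course's vsm.pmi may differ in
# details such as smoothing, but the core formula is max(0, log2(P(w,c) / (P(w)P(c)))).
import numpy as np
import pandas as pd

def ppmi_sketch(count_df):
    total = count_df.values.sum()
    joint = count_df / total                    # P(w, c)
    expected = np.outer(joint.sum(axis=1),      # P(w)
                        joint.sum(axis=0))      # P(c)
    with np.errstate(divide='ignore'):
        pmi = np.log2(joint.values / expected)
    pmi[np.isinf(pmi)] = 0.0                    # zero counts contribute 0, not -inf
    return pd.DataFrame(np.maximum(pmi, 0.0),
                        index=count_df.index, columns=count_df.columns)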
Example #3
def run_original_system(count_df, k=10):
    wn_edges = get_wordnet_edges()
    wn_index_edges = convert_edges_to_indices(wn_edges, count_df.fillna(0))
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa)
Example #4
def run_ppmi_lsa_pipeline(count_df, k):

    # Part 2: reweight the raw counts with Positive PMI.
    count_ppmi = vsm.pmi(count_df)

    # Part 3: reduce the reweighted matrix to k dimensions with LSA.
    count_ppmi_lsa = vsm.lsa(count_ppmi, k)
    print(count_ppmi_lsa)

    # Part 4: run the full word-similarity evaluation on the reduced space.
    output = full_word_similarity_evaluation(count_ppmi_lsa)
    print(output)
    return output
Example #5
if 'IS_GRADESCOPE_ENV' not in os.environ:
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi = vsm.pmi(giga20, positive=True)
    print("giga20_ppmi")
    display(full_word_similarity_evaluation(giga20_ppmi))

# ### PPMI + LSA

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (5, 10, 20, 50, 100):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (200, 500, 1000):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
Example #6
def test_lsa(df):
    vsm.lsa(df, k=2)
# ### IMDB representations
#
# Our IMDB VSMs seem pretty well attuned to the Stanford Sentiment Treebank, so we might think that they can do even better than the general-purpose GloVe inputs. Here are two quick assessments of that idea:

# In[10]:

imdb20 = pd.read_csv(os.path.join(VSMDATA_HOME, 'imdb_window20-flat.csv.gz'),
                     index_col=0)

# In[11]:

imdb20_ppmi = vsm.pmi(imdb20, positive=True)

# In[12]:

imdb20_ppmi_svd = vsm.lsa(imdb20_ppmi, k=50)

# In[13]:

imdb_lookup = dict(zip(imdb20_ppmi_svd.index, imdb20_ppmi_svd.values))

# In[14]:


def imdb_phi(tree, np_func=np.sum):
    return vsm_leaves_phi(tree, imdb_lookup, np_func=np_func)
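
# For reference: vsm_leaves_phi is defined in the accompanying notebook, not in this
# excerpt. Below is a rough sketch of what such a leaves-to-vector feature function
# typically does (illustrative only; the actual implementation may differ). It assumes
# NLTK-style trees with a .leaves() method and a dict mapping each word to a vector.
def leaves_phi_sketch(tree, lookup, np_func=np.sum):
    dim = len(next(iter(lookup.values())))
    vecs = [lookup[w] for w in tree.leaves() if w in lookup]
    if not vecs:
        return np.zeros(dim)         # no known tokens: back off to an all-zeros vector
    return np_func(vecs, axis=0)     # combine token vectors, e.g. by summing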


# In[15]:

_ = sst.experiment(
Example #8
# In[10]:

vsm.neighbors('gnarly', gnarly_df)

# Reweighting doesn't help. For example, here is the attempt with Positive PMI:

# In[11]:

vsm.neighbors('gnarly', vsm.pmi(gnarly_df))

# However, both words tend to occur with _awesome_ and not with _lame_ or _terrible_, so there is an important sense in which they are similar. LSA to the rescue:

# In[12]:

gnarly_lsa_df = vsm.lsa(gnarly_df, k=2)

# In[13]:

vsm.neighbors('gnarly', gnarly_lsa_df)
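
# As a rough illustration of what the lsa step is doing here: a minimal truncated-SVD
# sketch in plain NumPy (an assumption about the implementation; the course's vsm.lsa
# may differ, e.g. in exactly how it scales the retained dimensions).
import numpy as np
import pandas as pd

def lsa_sketch(df, k=2):
    U, s, _ = np.linalg.svd(df.values, full_matrices=False)
    reduced = U[:, :k] * s[:k]       # rows re-expressed in the top-k latent dimensions
    return pd.DataFrame(reduced, index=df.index)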

# ### Applying LSA to real VSMs
#
# Here's an example that begins to convey the effect that this can have empirically.
#
# First, the original count matrix:

# In[14]:

vsm.neighbors('superb', imdb5).head()
Example #9
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    ppmi_reweight_df = vsm.pmi(count_df)
    lsa_df = vsm.lsa(ppmi_reweight_df, k)
    results = full_word_similarity_evaluation(lsa_df)
    return results
Example #10
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa100 = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa100)
Example #11
            return self.X[index], self.Y[index]

        def __len__(self):
            return self.len

    # Define a cosine-similarity loss function.
    def cosine_loss(output, target):
        num = torch.mm(output, torch.t(target))
        den = torch.sqrt(torch.sum(output**2) * torch.sum(target**2))
        return (1 - num / den)
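
    # Note (illustrative, not part of the original script): cosine_loss above compares
    # every output row against every target row (torch.mm gives the full pairwise
    # dot-product matrix) and normalizes by the global norms, so it returns a matrix.
    # A per-example alternative would pair each output row with its own target row:
    def rowwise_cosine_loss(output, target):
        sims = torch.nn.functional.cosine_similarity(output, target, dim=1)
        return (1 - sims).mean()     # scalar loss averaged over the batch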

    # Load the count data and preprocess it with PMI reweighting and LSA.
    giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                       index_col=0)
    giga = vsm.pmi(giga)
    giga = vsm.lsa(giga, k=750)

    # Training hyperparameters.
    num_epochs = 200
    batch_size = 128
    step_rate = 0.15
    learning_rate = 1e-4
    examples = giga.shape[0]
    features = giga.shape[1]

    # Build the model and set up the loss and optimizer.
    step = int(features * step_rate)
    model = autoencoder(step)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
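
    # A sketch of the training loop that would typically follow this setup (shown as a
    # comment because the Dataset defined above is only partially visible in this
    # excerpt; names such as `dataset` are assumptions, not the author's code):
    # loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # for epoch in range(num_epochs):
    #     for x, y in loader:
    #         optimizer.zero_grad()
    #         loss = criterion(model(x), y)   # reconstruction loss on each batch
    #         loss.backward()
    #         optimizer.step()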