Example #1
def run_giga_ppmi_baseline():

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi = vsm.pmi(giga20)
    return full_word_similarity_evaluation(giga20_ppmi)
Example #2
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    count_df_pmi = vsm.pmi(count_df)
    count_df_pmi_lsa = vsm.lsa(count_df_pmi, k)
    eval_results = full_word_similarity_evaluation(count_df_pmi_lsa)
    return eval_results
Example #3
def run_giga_ppmi_baseline():
    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi_reweights = vsm.pmi(giga20, positive=True)
    result = full_word_similarity_evaluation(giga20_ppmi_reweights)
    return result
Example #4
def run_original_system(count_df, k=10):
    wn_edges = get_wordnet_edges()
    wn_index_edges = convert_edges_to_indices(wn_edges, count_df.fillna(0))
    # Debugging breakpoint left in the original; disabled so the function runs non-interactively.
    # import IPython
    # IPython.embed()
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa100 = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa100)
Example #5
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    counts_ppmi = vsm.pmi(count_df, positive=True)
    counts_ppmi_lsa = vsm.lsa(counts_ppmi, k=k)
    results = full_word_similarity_evaluation(counts_ppmi_lsa)
    display(results)
    return results
Example #6
def model():
    df = pd.read_csv(os.path.join(VSM_HOME, 'imdb_window5-scaled.csv.gz'),
                     index_col=0)
    df = vsm.pmi(df)
    df = ttest(df)
    # df = subword_enrichment(df)
    df = df.apply(vsm.length_norm, axis=1)
    df = TorchAutoencoder(hidden_dim=500, max_iter=200, eta=1e-3).fit(df)
    return full_word_similarity_evaluation(df)
Example #7
def run_giga_ppmi_baseline():

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)
    giga20_pmi = vsm.pmi(giga20)
    eval_results = full_word_similarity_evaluation(giga20_pmi)
    print(eval_results)
    return eval_results
Example #8
def run_ppmi_lsa_pipeline(count_df, k):

    #part2
    count_ppmi = vsm.pmi(count_df)

    #part3
    count_ppmi_lsa = vsm.lsa(count_ppmi, k)
    print(count_ppmi_lsa)

    #part4
    output = full_word_similarity_evaluation(count_ppmi_lsa)
    print(output)
    return output
Example #9
def run_giga_ppmi_baseline():

    #part 1
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)

    #part 2
    giga20_ppmi = vsm.pmi(giga20)

    #part 3 - why PPMI and cosine distance are combined: PPMI reweights the giga20
    #count matrix, and full_word_similarity_evaluation then uses cosine distance to
    #compare the reweighted row vectors of giga20_ppmi against each word-similarity dataset
    output = full_word_similarity_evaluation(giga20_ppmi)
    print(output)
    return output
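
# (Illustration added alongside this example, not part of it: the cosine distance that
# full_word_similarity_evaluation applies to pairs of PPMI row vectors. Assumes scipy plus
# a PPMI DataFrame like the one built inside the function above; 'good' and 'great' stand
# in for any two words in the vocabulary.)
from scipy.spatial.distance import cosine

giga20_ppmi = vsm.pmi(pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                                  index_col=0))
dist = cosine(giga20_ppmi.loc['good'].values, giga20_ppmi.loc['great'].values)
# The evaluation correlates such distances with the human similarity ratings in each dataset.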
Example #10
def test_pmi(arg, expected, positive):
    result = vsm.pmi(arg, positive=positive).round(8)
    assert np.array_equal(result, expected.round(8))
    class_func=sst.ternary_class_func,
    vectorize=False
)  # Tell `experiment` that we already have our feature vectors.

# ### IMDB representations
#
# Our IMDB VSMs seem pretty well attuned to the Stanford Sentiment Treebank, so we might think that they can do even better than the general-purpose GloVe inputs. Here are two quick assessments of that idea:

# In[10]:

imdb20 = pd.read_csv(os.path.join(VSMDATA_HOME, 'imdb_window20-flat.csv.gz'),
                     index_col=0)

# In[11]:

imdb20_ppmi = vsm.pmi(imdb20, positive=False)

# In[12]:

imdb20_ppmi_svd = vsm.lsa(imdb20_ppmi, k=50)

# In[13]:

imdb_lookup = dict(zip(imdb20_ppmi_svd.index, imdb20_ppmi_svd.values))

# In[14]:


def imdb_phi(tree, np_func=np.sum):
    return vsm_leaves_phi(tree, imdb_lookup, np_func=np_func)
# This basic definition runs into a problem for $0$ count cells. The usual response is to set $\log(0) = 0$, but this is arguably confusing – cell counts that are smaller than expected get negative values, cell counts that are larger than expected get positive values, and 0-count values are placed in the middle of this ranking without real justification.
# 
# For this reason, it is more typical to use __Positive PMI__, which maps all negative PMI values to $0$:
# 
# $$\textbf{ppmi}(X, i, j) = 
# \begin{cases}
# \textbf{pmi}(X, i, j) & \textrm{if } \textbf{pmi}(X, i, j) > 0 \\
# 0 & \textrm{otherwise}
# \end{cases}$$
# 
# This is the default for `vsm.pmi`.
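# As a quick numeric illustration of the definition above (a sketch added here, not a
# cell from the original notebook), PPMI computed with plain numpy/pandas on a toy
# count matrix:

import numpy as np
import pandas as pd

toy = pd.DataFrame([[10.0, 0.0], [2.0, 8.0]], index=['a', 'b'], columns=['x', 'y'])
row = toy.sum(axis=1).values          # row marginals
col = toy.sum(axis=0).values          # column marginals
total = toy.values.sum()

with np.errstate(divide='ignore'):
    pmi = np.log(toy.values * total / np.outer(row, col))
pmi[np.isinf(pmi)] = 0.0              # the log(0) = 0 convention described above
ppmi = np.maximum(pmi, 0.0)           # Positive PMI: negative values mapped to 0

# In `toy`, cell ('b', 'x') has a smaller count than expected, so its PMI is negative
# and PPMI maps it to 0; the empty cell ('a', 'y') gets 0 via the log(0) convention.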

# In[31]:


imdb5_pmi = vsm.pmi(imdb5)


# In[32]:


imdb20_pmi = vsm.pmi(imdb20)


# In[33]:


vsm.neighbors('good', imdb5_pmi).head()


# In[34]:
Example #13
             dtype='float64'),
    index=['gnarly', 'wicked', 'awesome', 'lame', 'terrible'])

gnarly_df

# No column context includes both _gnarly_ and _wicked_ together, so our count matrix places them far apart:

# In[10]:

vsm.neighbors('gnarly', gnarly_df)

# Reweighting doesn't help. For example, here is the attempt with Positive PMI:

# In[11]:

vsm.neighbors('gnarly', vsm.pmi(gnarly_df))

# However, both words tend to occur with _awesome_ and not with _lame_ or _terrible_, so there is an important sense in which they are similar. LSA to the rescue:

# In[12]:

gnarly_lsa_df = vsm.lsa(gnarly_df, k=2)

# In[13]:

vsm.neighbors('gnarly', gnarly_lsa_df)

# ### Applying LSA to real VSMs
#
# Here's an example that begins to convey the effect that this can have empirically.
#
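# (The cell that followed in the original notebook is not part of this snippet. As a
# minimal sketch of the kind of comparison meant here, assuming the `vsm` module,
# `VSM_HOME`, and the pandas/os imports used in the other examples:)

giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                     index_col=0)
giga20_ppmi = vsm.pmi(giga20)
giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=100)

# Comparing vsm.neighbors(word, giga20_ppmi) with vsm.neighbors(word, giga20_ppmi_lsa)
# for words in giga20's vocabulary shows how LSA changes the neighborhood structure.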
Example #14
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    ppmi_reweight_df = vsm.pmi(count_df)
    lsa_df = vsm.lsa(ppmi_reweight_df, k)
    results = full_word_similarity_evaluation(lsa_df)
    return results
Example #15
gre = data_loading.GRE()


# In[3]:

dev = gre.dev_sentence_completion()
print(dev.shape)
dev.head()


# In[ ]:

msr = data_loading.MSR()
gutenberg = msr.train_word_word_cooccurence(window=5, vocab_size=30000)
guten_ppmi = vsm.pmi(gutenberg)


# In[ ]:

dev["question"][5]


# In[ ]:

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return synonyms
Example #16
        def __getitem__(self, index):
            return self.X[index], self.Y[index]

        def __len__(self):
            return self.len

    #defining a cosine loss function
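    # (Descriptive note: `num` below is a batch-by-batch matrix of dot products, while
    # `den` is a single scalar built from the summed squares of all entries of both
    # tensors, so the value returned is a matrix-valued, batch-level approximation of
    # 1 - cosine similarity rather than a per-row cosine.)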
    def cosine_loss(output, target):
        num = torch.mm(output, torch.t(target))
        den = torch.sqrt(torch.sum(output**2) * torch.sum(target**2))
        return (1 - num / den)

    #loading data and pre-processing
    giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                       index_col=0)
    giga = vsm.pmi(giga)
    giga = vsm.lsa(giga, k=750)

    #defining parameters
    num_epochs = 200
    batch_size = 128
    step_rate = 0.15
    learning_rate = 1e-4
    examples = giga.shape[0]
    features = giga.shape[1]

    #Preparing data
    step = int(features * step_rate)
    model = autoencoder(step)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
Example #17
# Requirements:
#
# 1. Your code must operate on one of the count matrices in `data/vsmdata`. You can choose which one. __Other pretrained vectors cannot be introduced__.
#
# 2. Your code must be self-contained, so that we can work with your model directly in your homework submission notebook. If your model depends on external data or other resources, please submit a ZIP archive containing these resources along with your submission.
#
# In the cell below, please provide a brief technical description of your original system, so that the teaching team can gain an understanding of what it does. This will help us to understand your code and analyze all the submissions to identify patterns and strategies.

# ### PPMI

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi = vsm.pmi(giga20, positive=True)
    print("giga20_ppmi")
    display(full_word_similarity_evaluation(giga20_ppmi))

# ### PPMI + LSA

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (5, 10, 20, 50, 100):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%
Example #18
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa100 = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa100)
Example #19
    class DistFuncRegressor:
        def __init__(self):
            #self.model = linear_model.LinearRegression()
            self.model = lightgbm.LGBMRegressor()

        def train(self, X, y):
            self.model.fit(X, y)

        def distfunc(self, a, b):
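            # Predict a score for the word pair from the element-wise
            # difference of their vectors.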
            X_test = a - b
            return self.model.predict([X_test])

    giga5 = pd.read_csv(os.path.join(VSM_HOME, "giga_window5-scaled.csv.gz"),
                        index_col=0)
    giga5_pmi = vsm.pmi(giga5)

    # let's try to train a model
    X = []
    y = []
    for reader in READERS:
        y_temp = []
        for w1, w2, score in reader():
            w1_values = giga5_pmi.loc[w1].values
            w2_values = giga5_pmi.loc[w2].values

            item = w1_values - w2_values
            X.append(item)
            y_temp.append(score)

        # normalize y