Example #1
def run_giga_ppmi_baseline():

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi = vsm.pmi(giga20)
    return full_word_similarity_evaluation(giga20_ppmi)
Example #2
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    count_df_pmi = vsm.pmi(count_df)
    count_df_pmi_lsa = vsm.lsa(count_df_pmi, k)
    eval_results = full_word_similarity_evaluation(count_df_pmi_lsa)
    return eval_results
Example #3
def run_giga_ppmi_baseline():
    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi_reweights = vsm.pmi(giga20, positive=True)
    result = full_word_similarity_evaluation(giga20_ppmi_reweights)
    return result
Example #4
def run_original_system(count_df, k=10):
    wn_edges = get_wordnet_edges()
    wn_index_edges = convert_edges_to_indices(wn_edges, count_df.fillna(0))
    # Debugging breakpoint left in the original; disabled so the function runs non-interactively.
    # import IPython
    # IPython.embed()
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa100 = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa100)
Example #5
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    counts_ppmi = vsm.pmi(count_df, positive=True)
    counts_ppmi_lsa = vsm.lsa(counts_ppmi, k=k)
    results = full_word_similarity_evaluation(counts_ppmi_lsa)
    display(results)
    return results
Example #6
def model():
    df = pd.read_csv(os.path.join(VSM_HOME, 'imdb_window5-scaled.csv.gz'),
                     index_col=0)
    df = vsm.pmi(df)
    df = ttest(df)
    # df = subword_enrichment(df)
    df = df.apply(vsm.length_norm, axis=1)
    df = TorchAutoencoder(hidden_dim=500, max_iter=200, eta=1e-3).fit(df)
    return full_word_similarity_evaluation(df)
Example #7
def run_giga_ppmi_baseline():

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)
    giga20_pmi = vsm.pmi(giga20)
    eval_results = full_word_similarity_evaluation(giga20_pmi)
    print(eval_results)
    return eval_results
Example #8
def run_ppmi_lsa_pipeline(count_df, k):

    #part2
    count_ppmi = vsm.pmi(count_df)

    #part3
    count_ppmi_lsa = vsm.lsa(count_ppmi, k)
    print(count_ppmi_lsa)

    #part4
    output = full_word_similarity_evaluation(count_ppmi_lsa)
    print(output)
    return output
Example #9
def run_giga_ppmi_baseline():

    #part 1
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)

    #part 2
    giga20_ppmi = vsm.pmi(giga20)

    #part 3 - why PPMI and cosine distance are combined: PPMI reweights the giga20
    #count matrix, and full_word_similarity_evaluation then uses cosine distance to
    #compare the reweighted row vectors of giga20_ppmi against each word-similarity dataset
    output = full_word_similarity_evaluation(giga20_ppmi)
    print(output)
    return output
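
# (Illustration added alongside this example, not part of it: the cosine distance that
# full_word_similarity_evaluation applies to pairs of PPMI row vectors. Assumes scipy plus
# a PPMI DataFrame like the one built inside the function above; 'good' and 'great' stand
# in for any two words in the vocabulary.)
from scipy.spatial.distance import cosine

giga20_ppmi = vsm.pmi(pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                                  index_col=0))
dist = cosine(giga20_ppmi.loc['good'].values, giga20_ppmi.loc['great'].values)
# The evaluation correlates such distances with the human similarity ratings in each dataset.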
Example #10
def test_pmi(arg, expected, positive):
    result = vsm.pmi(arg, positive=positive).round(8)
    assert np.array_equal(result, expected.round(8))
    class_func=sst.ternary_class_func,
    vectorize=False
)  # Tell `experiment` that we already have our feature vectors.

# ### IMDB representations
#
# Our IMDB VSMs seem pretty well attuned to the Stanford Sentiment Treebank, so we might think that they can do even better than the general-purpose GloVe inputs. Here are two quick assessments of that idea:

# In[10]:

imdb20 = pd.read_csv(os.path.join(VSMDATA_HOME, 'imdb_window20-flat.csv.gz'),
                     index_col=0)

# In[11]:

imdb20_ppmi = vsm.pmi(imdb20, positive=False)

# In[12]:

imdb20_ppmi_svd = vsm.lsa(imdb20_ppmi, k=50)

# In[13]:

imdb_lookup = dict(zip(imdb20_ppmi_svd.index, imdb20_ppmi_svd.values))

# In[14]:


def imdb_phi(tree, np_func=np.sum):
    return vsm_leaves_phi(tree, imdb_lookup, np_func=np_func)
# This basic definition runs into a problem for $0$ count cells. The usual response is to set $\log(0) = 0$, but this is arguably confusing – cell counts that are smaller than expected get negative values, cell counts that are larger than expected get positive values, and 0-count values are placed in the middle of this ranking without real justification.
# 
# For this reason, it is more typical to use __Positive PMI__, which maps all negative PMI values to $0$:
# 
# $$\textbf{ppmi}(X, i, j) = 
# \begin{cases}
# \textbf{pmi}(X, i, j) & \textrm{if } \textbf{pmi}(X, i, j) > 0 \\
# 0 & \textrm{otherwise}
# \end{cases}$$
# 
# This is the default for `vsm.pmi`.
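# As a quick numeric illustration of the definition above (a sketch added here, not a
# cell from the original notebook), PPMI computed with plain numpy/pandas on a toy
# count matrix:

import numpy as np
import pandas as pd

toy = pd.DataFrame([[10.0, 0.0], [2.0, 8.0]], index=['a', 'b'], columns=['x', 'y'])
row = toy.sum(axis=1).values          # row marginals
col = toy.sum(axis=0).values          # column marginals
total = toy.values.sum()

with np.errstate(divide='ignore'):
    pmi = np.log(toy.values * total / np.outer(row, col))
pmi[np.isinf(pmi)] = 0.0              # the log(0) = 0 convention described above
ppmi = np.maximum(pmi, 0.0)           # Positive PMI: negative values mapped to 0

# In `toy`, cell ('b', 'x') has a smaller count than expected, so its PMI is negative
# and PPMI maps it to 0; the empty cell ('a', 'y') gets 0 via the log(0) convention.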

# In[31]:


imdb5_pmi = vsm.pmi(imdb5)


# In[32]:


imdb20_pmi = vsm.pmi(imdb20)


# In[33]:


vsm.neighbors('good', imdb5_pmi).head()


# In[34]:
Example #13
             dtype='float64'),
    index=['gnarly', 'wicked', 'awesome', 'lame', 'terrible'])

gnarly_df

# No column context includes both _gnarly_ and _wicked_ together, so our count matrix places them far apart:

# In[10]:

vsm.neighbors('gnarly', gnarly_df)

# Reweighting doesn't help. For example, here is the attempt with Positive PMI:

# In[11]:

vsm.neighbors('gnarly', vsm.pmi(gnarly_df))

# However, both words tend to occur with _awesome_ and not with _lame_ or _terrible_, so there is an important sense in which they are similar. LSA to the rescue:

# In[12]:

gnarly_lsa_df = vsm.lsa(gnarly_df, k=2)

# In[13]:

vsm.neighbors('gnarly', gnarly_lsa_df)

# ### Applying LSA to real VSMs
#
# Here's an example that begins to convey the effect that this can have empirically.
#
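# (The cell that followed in the original notebook is not part of this snippet. As a
# minimal sketch of the kind of comparison meant here, assuming the `vsm` module,
# `VSM_HOME`, and the pandas/os imports used in the other examples:)

giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                     index_col=0)
giga20_ppmi = vsm.pmi(giga20)
giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=100)

# Comparing vsm.neighbors(word, giga20_ppmi) with vsm.neighbors(word, giga20_ppmi_lsa)
# for words in giga20's vocabulary shows how LSA changes the neighborhood structure.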
Example #14
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    ppmi_reweight_df = vsm.pmi(count_df)
    lsa_df = vsm.lsa(ppmi_reweight_df, k)
    results = full_word_similarity_evaluation(lsa_df)
    return results
Example #15
gre = data_loading.GRE()


# In[3]:

dev = gre.dev_sentence_completion()
print(dev.shape)
dev.head()


# In[ ]:

msr = data_loading.MSR()
gutenberg = msr.train_word_word_cooccurence(window=5, vocab_size=30000)
guten_ppmi = vsm.pmi(gutenberg)


# In[ ]:

dev["question"][5]


# In[ ]:

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return synonyms
Example #16
        def __getitem__(self, index):
            return self.X[index], self.Y[index]

        def __len__(self):
            return self.len

    #defining a cosine loss function
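    # (Descriptive note: `num` below is a batch-by-batch matrix of dot products, while
    # `den` is a single scalar built from the summed squares of all entries of both
    # tensors, so the value returned is a matrix-valued, batch-level approximation of
    # 1 - cosine similarity rather than a per-row cosine.)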
    def cosine_loss(output, target):
        num = torch.mm(output, torch.t(target))
        den = torch.sqrt(torch.sum(output**2) * torch.sum(target**2))
        return (1 - num / den)

    #loading data and pre-processing
    giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                       index_col=0)
    giga = vsm.pmi(giga)
    giga = vsm.lsa(giga, k=750)

    #defining parameters
    num_epochs = 200
    batch_size = 128
    step_rate = 0.15
    learning_rate = 1e-4
    examples = giga.shape[0]
    features = giga.shape[1]

    #Preparing data
    step = int(features * step_rate)
    model = autoencoder(step)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
Example #17
# Requirements:
#
# 1. Your code must operate on one of the count matrices in `data/vsmdata`. You can choose which one. __Other pretrained vectors cannot be introduced__.
#
# 2. Your code must be self-contained, so that we can work with your model directly in your homework submission notebook. If your model depends on external data or other resources, please submit a ZIP archive containing these resources along with your submission.
#
# In the cell below, please provide a brief technical description of your original system, so that the teaching team can gain an understanding of what it does. This will help us to understand your code and analyze all the submissions to identify patterns and strategies.

# ### PPMI

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    giga20_ppmi = vsm.pmi(giga20, positive=True)
    print("giga20_ppmi")
    display(full_word_similarity_evaluation(giga20_ppmi))

# ### PPMI + LSA

# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (5, 10, 20, 50, 100):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%
Example #18
def run_ppmi_lsa_pipeline(count_df, k):

    ##### YOUR CODE HERE
    ppmi_df = vsm.pmi(count_df)
    ppmi_df_lsa100 = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_df_lsa100)
Example #19
    class DistFuncRegressor:
        def __init__(self):
            #self.model = linear_model.LinearRegression()
            self.model = lightgbm.LGBMRegressor()

        def train(self, X, y):
            self.model.fit(X, y)

        def distfunc(self, a, b):
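            # Predict a score for the word pair from the element-wise
            # difference of their vectors.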
            X_test = a - b
            return self.model.predict([X_test])

    giga5 = pd.read_csv(os.path.join(VSM_HOME, "giga_window5-scaled.csv.gz"),
                        index_col=0)
    giga5_pmi = vsm.pmi(giga5)

    # let's try to train a model
    X = []
    y = []
    for reader in READERS:
        y_temp = []
        for w1, w2, score in reader():
            w1_values = giga5_pmi.loc[w1].values
            w2_values = giga5_pmi.loc[w2].values

            item = w1_values - w2_values
            X.append(item)
            y_temp.append(score)

        # normalize y