Example #1
from scipy.spatial import distance
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
import numpy as np
import pyemd
import pickle
import pathlib
import os

import utils  # project-local helper module providing load_corpus()

stop_words = stopwords.words("english")

# Initialize GloVe embeddings
print("Loading GloVe embeddings...")
glove_embeddings = gensim.downloader.load("glove-wiki-gigaword-300")

# Load corpus
corpus = utils.load_corpus()

# Return the mean GloVe vector of a tokenized sentence.
# Note: assumes every token is present in the GloVe vocabulary.
def get_vec(tokenized_sentence):
    return np.mean(np.array([glove_embeddings[word] for word in tokenized_sentence]), axis=0)

def calculate_similarity(cc_embedding, tokenized):
    similarity_scores = []
    for article in tokenized:
        current_article = []
        file = article[0]
        for tokenized_sentence in article[1]:
            sentence_embedding = get_vec(tokenized_sentence)
            score = distance.cosine(cc_embedding, sentence_embedding)
            # Cap the distance at 1.0 (cosine distance can exceed 1.0 for dissimilar vectors).
            if score > 1.0:
                score = 1.0
            current_article.append(score)
        similarity_scores.append((file, current_article))
    return similarity_scores
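
The original snippet breaks off at the distance capping, so the call site below is an assumption rather than part of the example: it builds a toy corpus in the (filename, list-of-tokenized-sentences) shape that calculate_similarity indexes, embeds a reference sentence with get_vec, and prints the per-sentence scores. The sentence text and file name are invented for illustration.

# Hypothetical usage sketch (not part of the original example).
cc_tokens = [w for w in "climate change affects global temperatures".split()
             if w not in stop_words]
cc_embedding = get_vec(cc_tokens)

toy_corpus = [
    ("article_1.txt", [["global", "warming", "raises", "sea", "levels"],
                       ["the", "weather", "was", "nice", "today"]]),
]
print(calculate_similarity(cc_embedding, toy_corpus))
# e.g. [('article_1.txt', [0.43, 0.81])] -- numbers are illustrative only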
Example #2
if args.n_basis_kb < 0:
    args.n_basis_kb = args.n_basis

# Set the random seed manually for reproducibility.
seed_all_randomness(args.seed, args.cuda, randomness=args.randomness)

########################
print("Loading data")
########################

device = torch.device("cuda" if args.cuda else "cpu")

idx2word_freq, target_idx2word_freq, dataloader_train_arr, dataloader_val, dataloader_val_shuffled, max_sent_len = load_corpus(
    args.data,
    args.batch_size,
    args.batch_size,
    device,
    skip_training=args.skip_train,
    want_to_shuffle_val=False)
dataloader_train = dataloader_train_arr[0]

kb_rels = []
kb_markers = []
kb_num_basis = []
kb_rel_idx = set()
# kb relation index to no. of bases mapping
kbidx2num_basis = defaultdict(int)
for batch in tqdm(dataloader_train):
    feature, target, kb_marker, num_basis = batch
    # indices of kb relations
    kb_indices = (kb_marker == 1).nonzero().flatten()
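
The loop body is truncated at this point, but the indexing pattern it relies on is self-contained. A minimal sketch with toy tensors (not the project's dataloader) of how (kb_marker == 1).nonzero().flatten() picks out the knowledge-base rows of a batch:

import torch

# Toy batch of 5 examples; kb_marker flags which rows come from the knowledge base.
kb_marker = torch.tensor([0, 1, 0, 1, 1])
target = torch.tensor([10, 20, 30, 40, 50])

kb_indices = (kb_marker == 1).nonzero().flatten()
print(kb_indices)          # tensor([1, 3, 4])
print(target[kb_indices])  # tensor([20, 40, 50])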
Example #3
            f_log.write(s + '\n')


# Set the random seed manually for reproducibility.
seed_all_randomness(args.seed, args.cuda, randomness=args.randomness)

logging('Args: {}'.format(args))

########################
print("Loading data")
########################

device = torch.device("cuda" if args.cuda else "cpu")

idx2word_freq, target_idx2word_freq, dataloader_train_arr, dataloader_val, dataloader_val_shuffled, max_sent_len = load_corpus(
    args.data,
    args.batch_size,
    args.batch_size,
    device,
    args.tensor_folder,
    args.training_file,
    args.training_split_num,
    args.copy_training,
    skip_val=args.skip_val)


def counter_to_tensor(idx2word_freq, device, rare, smooth_alpha=0):
    total = len(idx2word_freq)
    w_freq = torch.zeros(total,
                         dtype=torch.float,
                         device=device,
                         requires_grad=False)
    for i in range(total):
        # w_freq[i] = math.sqrt(idx2word_freq[x][1])
        if rare:
            if i == 0: print("Emphasizing on the RARE")
            if smooth_alpha == 0:
                if i == 0: print("No alpha-smoothing")
                w_freq[i] = idx2word_freq[i][1]
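
counter_to_tensor is also cut off here, but the expression idx2word_freq[i][1] shows the layout it assumes: a list aligned with word indices whose entries are (word, frequency) pairs. A toy illustration of that layout and of the raw-frequency branch above (data invented for the example):

import torch

# Assumed layout: idx2word_freq[i] == (word, frequency), indexed by word id.
idx2word_freq = [("<eos>", 1000), ("the", 800), ("model", 42), ("rare_word", 3)]

w_freq = torch.zeros(len(idx2word_freq), dtype=torch.float)
for i, (word, freq) in enumerate(idx2word_freq):
    w_freq[i] = freq  # same as the rare / no-smoothing branch above
print(w_freq)  # tensor([1000., 800., 42., 3.])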