def eval_and_log_w2v(t_train, w2v_space_file):
    """
    Evaluates the trained Word2Vec model. The evaluation metric is the
    Spearman correlation. Evaluation is designed to use the MEN dataset
    described in Bruni et al. (2014).
    :param t_train: float -- time taken to train the Word2Vec model
    :param w2v_space_file: str -- file path of the w2v model that was trained
    """
    global performance_summary
    if verbose:
        print("evaluating and logging the Word2Vec model ...")
    # evaluate the w2v model
    try:
        w2v_space = utils.readDM(w2v_space_file)
        spcorr, pairs = MEN.compute_men_spearman(w2v_space, testset_file)
        with open(w2v_results_file, "a+") as f:
            f.write("RUN " + str(run) + ":" +
                    "\tSP_CORR: " + str(spcorr) +
                    "\tTEST_PAIRS: " + str(pairs) +
                    "\tTRAIN_TIME: " + str(t_train) + "\n")
        # keep internal log
        performance_summary[run].extend([spcorr, pairs, t_train])
    except Exception as e:
        with open(errorlog, "a") as f:
            f.write(str(e)[:500] + "\n")
        print("An error occurred while evaluating Word2Vec. Check",
              errorlog, "for further information.")
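# Usage sketch: `run`, `verbose`, `testset_file`, `w2v_results_file`,
# `errorlog` and `performance_summary` are module-level names set by the
# surrounding script; the values below are hypothetical.
#
#   performance_summary = {0: []}
#   eval_and_log_w2v(t_train=132.7, w2v_space_file="spaces/w2v_run0.dm")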
def read_incremental_parts(self, outspace, outcols, flyfile, verbose=False):
    """
    Returns a co-occurrence matrix, a corresponding vocabulary and its
    index, and a Fruitfly object. The matrix and the vocabulary can be
    newly instantiated or taken from existing files. The Fruitfly object
    can optionally be created alongside, also either new or from an
    existing file. All these options are handled by attributes of the
    Incrementor object from which this method is called.
    :param outspace: str -- file path to a co-occurrence count
    :param outcols: str -- file path to the corresponding vocabulary
    :param flyfile: str -- file path to a Fruitfly config (parameters and connections)
    :param verbose: bool -- comment on the workings via print statements
    :return: ndarray [[]] -- co-occurrence matrix (two axes, each of length n)
    :return: {str:int} -- mapping of vocabulary to matrix positions (length: n)
    :return: {int:str} -- mapping of matrix indices to vocabulary (length: n)
    :return: Fruitfly -- Fruitfly object (or None if not wanted)
    """
    if self.is_incremental:
        if verbose:
            print("\nLoading existing co-occurrence count from", outspace, "...")
        # returns dict of word : vector
        unhashed_space = utils.readDM(outspace)
        i_to_words, words_to_i = utils.readCols(outcols)
        dimensions = sorted(words_to_i, key=words_to_i.get)
        cooc = np.stack([unhashed_space[w] for w in dimensions])
    else:
        cooc = np.array([[]])
        words_to_i = {}
        i_to_words = {}
    if self.is_grow_fly:
        if self.is_new_fly:
            if verbose:
                print("creating new fruitfly...")
            # default config: (50,40000,6,5,log)
            fruitfly = Fruitfly.from_scratch(max_pn_size=self.fly_max_pn)
        else:
            if verbose:
                print("loading fruitfly from", flyfile, "...")
            fruitfly = Fruitfly.from_config(flyfile)
            self.fly_max_pn = fruitfly.max_pn_size
    else:
        fruitfly = None
    return cooc, words_to_i, i_to_words, fruitfly
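# Usage sketch, assuming an Incrementor instance `incr` whose attributes
# (is_incremental, is_grow_fly, is_new_fly, fly_max_pn) are already set;
# the file paths are hypothetical:
#
#   cooc, words_to_i, i_to_words, fruitfly = incr.read_incremental_parts(
#       outspace="spaces/count.dm", outcols="spaces/count.cols",
#       flyfile="configs/fly.cfg", verbose=True)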
def prepare_flight():
    """
    Read in the count vectors etc. and choose which ones to fly,
    based on a list of required words.
    :return fly_these: {str:[float]} -- possibly a subset of unhashed_space
    :return unhashed_space: {str:[float]} -- words and their corresponding co-occurrence counts
    :return words_to_i: {str:int} -- mapping of context words to their position in the count
    """
    if verbose:
        print("Preparing hashing ...")
    unhashed_space = utils.readDM(breeder.outspace)
    i_to_words, words_to_i = utils.readCols(breeder.outcols)
    # only select words that will be needed for evaluation:
    if overlap_file is None:
        # in this case, fly() is applied to the whole of unhashed_space
        fly_these = unhashed_space
    else:
        words_for_flight = breeder.read_checklist(overlap_file)
        fly_these = {w: unhashed_space[w]
                     for w in words_for_flight if w in unhashed_space}
    return fly_these, unhashed_space, words_to_i
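# Usage sketch: `verbose`, `breeder` (an Incrementor) and `overlap_file`
# are module-level names set elsewhere in the script. With an overlap file,
# only the words needed for evaluation are selected for hashing; the path
# below is hypothetical:
#
#   overlap_file = "data/MEN_vocab_overlap.txt"
#   fly_these, unhashed_space, words_to_i = prepare_flight()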
print("improvement:", round(internal_log[run[0]][4], 5), "with configuration:", all_ff_specs[run[0]]) """ Parameter Input """ data, column_labels = get_text_resources_from_argv() goldstandard = get_testset_from_argv() log_dest = get_logging_from_argv() flattening = get_flattening_from_argv() kc_factor_min, kc_factor_max, kc_steps = get_ranges_from_argv("-kc") projections_min, projections_max, proj_steps = get_ranges_from_argv("-proj") hash_perc_min, hash_perc_max, hash_steps = get_ranges_from_argv("-hash") # returns {str:[float]} in_space = utils.readDM(data) # returns {int:str} and {str:i} i_to_cols, cols_to_i = utils.readCols(column_labels) # length of word vector (= input dimension) pn_size = len(i_to_cols) # for reporting purposes verbose = "-v" in sys.argv no_overall_summary_wanted = "-no-summary" in sys.argv # {run:ff_specs} all_ff_specs = {} # {run:results} internal_log = {} sp_vals = {} """ Grid Search""" run = 0
num_dims = 4000  # num dims in the BNC dm file
target = sys.argv[1]  # a kind (e.g. toad_N)
chars_file = sys.argv[2]
num_chars = int(sys.argv[3])
context_weight = int(sys.argv[4])

'''Get character name'''
character = ""
m = re.search(".*/(.*).chars", chars_file)
if m:
    character = m.group(1)
else:
    character = chars_file[:-6]
character = character.lower() + "_char_N"

'''Load files'''
background_space = utils.readDM("BNC.w10.4000c.5000r.ppmi.rownorm.dm")
background_cols = utils.readDims("BNC.w10.4000c.5000r.ppmi.rownorm.cols")
chars = utils.readChars(chars_file)

'''Compute contextualisation'''
c = 1
reweighted_vectors = []
for context in sorted(chars, key=chars.get, reverse=True):
    ppmi = chars[context]
    i = 0
    context_vector = np.zeros(num_dims)
    # print("Reweighting vector with context", context)
    for col in background_cols:
        # in case the core space does not include the context
        # (e.g. bnc.2000 does not include 'rat')
        if context in background_space and col in background_space:
            context_vector[i] = pow(
                utils.cosine_similarity(background_space[context],
import numpy as np
import utils
import sys


def mk_training_matrices(pairs, en_dimension, cat_dimension,
                         semanticspace, catalan_space):
    en_mat = np.zeros((len(pairs), en_dimension))
    cat_mat = np.zeros((len(pairs), cat_dimension))
    c = 0
    for p in pairs:
        en_word, cat_word = p.split()
        en_mat[c] = semanticspace[en_word]
        cat_mat[c] = catalan_space[cat_word]
        c += 1
    return en_mat, cat_mat


if len(sys.argv) == 4:
    space = sys.argv[1]
    if space == 'reducedcolors':
        semanticspace = utils.readDM("data/reducedcolors.dm")
    if space == 'fullcolors':
        semanticspace = utils.readDM("data/full.dm")
    word = sys.argv[2]
    num_neighbours = int(sys.argv[3])
    print(utils.neighbours(semanticspace, semanticspace[word], num_neighbours))
    english_neighbours = utils.neighbours(semanticspace, semanticspace[word],
                                          num_neighbours)
    utils.run_PCAneighbours(semanticspace, [word] + english_neighbours,
                            "english_neighbours" + word + ".png")
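# Usage sketch (the script name is hypothetical and 'rose' is assumed to be
# in the space): prints the 10 nearest neighbours of 'rose' in the reduced
# colour space and writes a 2D PCA plot of the word and its neighbours to
# english_neighbours<word>.png:
#
#   python3 colour_neighbours.py reducedcolors rose 10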
print("\nUSAGE: python3 projection.py bnc|wiki [num-kc] [size-proj] [percent-hash]\n\ - num-kc: the number of Kenyon cells\n\ - size-proj: how many projection neurons are used for each projection\n\ - percent-hash: how much of the Kenyon layer to keep in the final hash.\n") sys.exit() if sys.argv[1] == "bnc": data = "data/BNC-MEN.dm" column_labels = "data/BNC-MEN.cols" MEN_annot = "data/MEN_dataset_lemma_form_full" else: data = "data/wiki_all.dm" column_labels = "data/wiki_all.cols" MEN_annot = "data/MEN_dataset_natural_form_full" english_space = utils.readDM(data) i_to_cols, cols_to_i = utils.readCols(column_labels) PN_size = len(english_space.popitem()[1]) KC_size = int(sys.argv[2]) proj_size = int(sys.argv[3]) percent_hash = int(sys.argv[4]) print("SIZES PN LAYER:",PN_size,"KC LAYER:",KC_size) print("SIZE OF PROJECTIONS:",proj_size) print("SIZE OF FINAL HASH:",percent_hash,"%") projection_layer = np.zeros(PN_size) kenyon_layer = np.zeros(KC_size) projection_functions = []
from flask import render_template, request, jsonify
from sklearn.decomposition import PCA
import logging

from utils import sim_to_matrix, sim_to_matrix_url, readDM, make_figure, readUrls
from htmlparser import extract_from_url
import mk_page_vector
from openviz import app

pca = PCA(n_components=2)
target_word = "meaning"

dm_dict_en = readDM("./openviz/spaces/english.dm")
dm_dict_ca = readDM("./openviz/spaces/catalan.dm")
url_dict_en = readUrls("./openviz/spaces/url_english.csv")
url_dict_ca = readUrls("./openviz/spaces/url_catalan.csv")

language_codes = {}
language_codes["English"] = [dm_dict_en, url_dict_en, "en"]
language_codes["Catalan"] = [dm_dict_ca, url_dict_ca, "ca"]


def compute(target_word, language):
    error = ""
    if language != "":
        dm_dict = language_codes[language][0]
        dictionary = language_codes[language][2]
    else:
        dm_dict = language_codes["English"][0]
        dictionary = language_codes["English"][2]
        logging.exception(language)
    if target_word not in dm_dict:
import sys

import MEN
import utils

# parameter input
while True:
    spacefiles = utils.loop_input(rtype=str, default=None,
                                  msg="Space to be used (without file extension): ")
    try:
        data = spacefiles + ".dm"
        column_labels = spacefiles + ".cols"
        # returns {word:word_vector}
        unhashed_space = utils.readDM(data)
        # returns both-ways dicts of the vocabulary (word:index_in_vector)
        i_to_cols, cols_to_i = utils.readCols(column_labels)
    except FileNotFoundError:
        print("Unable to find files for input space and/or vocabulary.\n\
              - correct file path?\n\
              - are the file extensions '.dm' and '.cols'?\n\
              - don't specify the file extension.")
        continue
    else:
        break

MEN_annot = utils.loop_input(rtype=str, default=None, msg="Testset to be used: ")

evaluate_mode = True if input(
    "Only evaluate the space (without flying)? [y/n] ").upper(
        en_word, cat_word = p.split()
        en_mat[c] = english_space[en_word]
        cat_mat[c] = catalan_space[cat_word]
        c += 1
    return en_mat, cat_mat


def linalg(mat_english, mat_catalan):
    # obtaining the parameters of the least-squares mapping
    w = np.linalg.lstsq(mat_english, mat_catalan, rcond=None)[0]
    print(mat_english.shape, mat_catalan.shape, w.shape)
    return w


'''Read semantic spaces'''
english_space = utils.readDM("data/english.subset.dm")
catalan_space = utils.readDM("data/catalan.subset.dm")
utils.run_PCA(english_space, english_space.keys(), "english_space.png")
utils.run_PCA(catalan_space, catalan_space.keys(), "catalan_space.png")

'''Read all word pairs'''
all_pairs = []
f = open("data/pairs.txt")
for l in f:
    l = l.rstrip('\n')
    all_pairs.append(l)
f.close()

'''Make training/test fold'''
training_pairs = all_pairs[:120]
test_pairs = all_pairs[120:]

'''Make training/test matrices'''
en_mat, cat_mat = mk_training_matrices(training_pairs, 400, 300, english_space,
import sys

import numpy as np
from utils import sim_to_matrix, readDM, run_PCA


def visualise(words, space):
    run_PCA(space, words, words[0] + "_space.png")


'''Read semantic space'''
space = readDM(sys.argv[1])
neighbours = sim_to_matrix(space, space[sys.argv[2]], int(sys.argv[3]))
# print(neighbours)
visualise(neighbours, space)
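# Usage sketch (the script name and space file are hypothetical; the output
# file is named after the first returned neighbour, which is usually the
# query word itself):
#
#   python3 visualise.py data/english.dm dog 10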
def main():
    if len(sys.argv) == 3:
        # listtop500 = open('data/listtop500.txt','r')
        # listlast500 = open('data/listlast500.txt','r')
        # ['reykjanes','danielli','underdrawing','halichoerus','hepler','change-']
        # ['widgeon','colourpoint','water-lilies','kingbirds','gallinules','pebbledash']
        # ['flowers.','nard','hearing-aid','filsham','trumpet-shaped','crecca']
        # ['kerchief','kingbirds','cerise','biretta','pale-blue','v-necked','pebbledash']
        # ['crecca','flowers.','corollas','shovelers','supercilium','crocuses']
        listtop500 = ['village', 'ponk', 'catspaw', 'lycaenid', 'orangey-pink', 'saponaria']

        if sys.argv[2] == 'full':
            semanticspace1 = utils.readDM("data/colorswithoutremovedtargets.dm")
        if sys.argv[2] == 'nonzero':
            semanticspace1 = utils.readDM("data/reducedcolors.dm")

        dicttop = defaultdict(list)
        dictlast = defaultdict(list)

        for line in listtop500:
            # word = line.strip()
            word = line
            num_neighbours = int(sys.argv[1])
            neighbours1 = []
            for i in utils.neighbours(semanticspace1, semanticspace1[word], num_neighbours):
                neighbours1.append(i.strip("."))
            neighbours2 = []
            cosinefull = []
            for i in functionneighbours(word, num_neighbours):
                neighbours2.append(i[0])
                cosinefull.append(i[1])
            densityfull = sum(cosinefull) / len(cosinefull)

            # compare neighbours of the two different spaces
            intersection = set(neighbours1) & set(neighbours2)
            print(word, intersection, len(intersection))
            # dicttop[word] = len(intersection)

            # for line in listlast500:
            #     word = line.strip()
            #     num_neighbours = int(sys.argv[1])
            #     neighbours1 = []
            #     for i in utils.neighbours(semanticspace1, semanticspace1[word], num_neighbours):
            #         neighbours1.append(i.strip("."))
            #     neighbours2 = []
            #     cosinefull = []
            #     for i in functionneighbours(word, num_neighbours):
            #         neighbours2.append(i[0])
            #         cosinefull.append(i[1])
            #     densityfull = sum(cosinefull) / len(cosinefull)
            #     # compare neighbours of the two different spaces
            #     intersection = set(neighbours1) & set(neighbours2)
            #     print(intersection, len(intersection))
            #     dictlast[word] = len(intersection)

            # density of the colour space
            listdensity = []
            listcoherence = []
            neighbours = utils.neighbours(semanticspace1, semanticspace1[word], num_neighbours)
            for i in neighbours:
                cosine = utils.cosine_similarity(semanticspace1[word], semanticspace1[i])
                # if np.isnan(cosine):
                #     pass
                # else:
                listdensity.append(cosine)
            density = sum(listdensity) / len(listdensity)
            print('density color space: ', density)
            print('density full space: ', densityfull)
            dicttop[word] = [len(intersection), density]
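# Invocation sketch (the script name is hypothetical): argv[1] is the number
# of neighbours to compare, argv[2] selects the colour space ('full' or
# 'nonzero'):
#
#   python3 compare_spaces.py 10 nonzero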
import sys
sys.path.append('..')

import utils
from scipy.stats import spearmanr

dm_dict = utils.readDM(sys.argv[1])
eval_dataset = sys.argv[2]

system = []
gold = []

if eval_dataset == 'men':
    with open("MEN_dataset_lemma_form_full", 'r') as f:
        lines = f.read().splitlines()
    sep = ' '
elif eval_dataset == 'simlex':
    with open("SimLex-999.txt", 'r') as f:
        lines = f.read().splitlines()[1:]
    sep = '\t'

for l in lines:
    fields = l.rstrip('\n').split(sep)
    # strip the POS suffix from each word of the pair
    w1 = fields[0][:-2]
    w2 = fields[1][:-2]
    score = float(fields[2])
    if w1 in dm_dict and w2 in dm_dict:
        try:
            cos = utils.cosine_similarity(dm_dict[w1], dm_dict[w2])
            system.append(cos)
            gold.append(score)
            print(w1, w2, cos, score)
        except Exception:
            continue
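# The excerpt stops here; presumably the final step correlates the collected
# system scores with the gold scores (spearmanr is imported above). A sketch:
rho, p = spearmanr(system, gold)
print("Spearman correlation:", rho, "p-value:", p)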
# This tests the definitional nonces on sum.
# python3 test_def_nonces.py spaces/ukwac_reduced.txt /definitions/nonce.definitions.300.test

import sys
import re

import utils

background = sys.argv[1]
dataset = sys.argv[2]

mrr = 0.0
human_responses = []
system_responses = []

dm_dict = utils.readDM(background)

c = 0
f = open(dataset)
for l in f:
    # skip the header line
    if c < 1:
        c += 1
        continue
    else:
        fields = l.rstrip('\n').split('\t')
        nonce = fields[0]
        sentence = fields[1].replace("___", "").split()
        print("--")
        print(nonce)
        print("SENTENCE:", sentence)
        if nonce in dm_dict: