def _test_pairwise_parallel(data):
    from scipy.spatial.distance import euclidean
    M = U.pairwise(euclidean, data, normalize=False, dtype=np.float64,
                   parallel=False)
    L = U.pairwise(euclidean, data, normalize=False, dtype=np.float64,
                   parallel=True)
    assert np.all(M == L)
def weighted_average_RSA(directory='.', layers=[], attention='linear',
                         test_size=1/2, attention_hidden_size=None,
                         standardize=False, epochs=1, device='cpu'):
    from sklearn.model_selection import train_test_split
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    splitseed = random.randint(0, 1024)
    result = []
    logging.info("Loading transcription data")
    data = pickle.load(open("{}/global_input.pkl".format(directory), "rb"))
    trans = data['ipa']
    act = [torch.tensor([item[:, :]]).float().to(device)
           for item in data['audio']]
    trans, trans_val, act, act_val = train_test_split(
        trans, act, test_size=test_size, random_state=splitseed)
    if standardize:
        logging.info("Standardizing data")
        act, act_val = normalize(act, act_val)
    logging.info("Computing edit distances")
    edit_sim = torch.tensor(U.pairwise(S.stringsim, trans)).float().to(device)
    edit_sim_val = torch.tensor(
        U.pairwise(S.stringsim, trans_val)).float().to(device)
    logging.info("Training for input features")
    # Forward attention_hidden_size instead of hardcoding None, so the
    # parameter is not silently ignored.
    this = train_wa(edit_sim, edit_sim_val, act, act_val,
                    attention=attention,
                    attention_hidden_size=attention_hidden_size,
                    epochs=epochs, device=device)
    # The input features (MFCCs) are identical for both model conditions,
    # so record the same result under both labels.
    result.append({**this, 'model': 'random', 'layer': 'mfcc'})
    result.append({**this, 'model': 'trained', 'layer': 'mfcc'})
    del act, act_val
    logging.info("Maximum correlation on val: {} at epoch {}".format(
        result[-1]['cor'], result[-1]['epoch']))
    for mode in ["trained", "random"]:
        for layer in layers:
            logging.info("Loading activations for {} {}".format(mode, layer))
            data = pickle.load(
                open("{}/global_{}_{}.pkl".format(directory, mode, layer),
                     "rb"))
            logging.info("Training for {} {}".format(mode, layer))
            act = [torch.tensor([item[:, :]]).float().to(device)
                   for item in data[layer]]
            act, act_val = train_test_split(act, test_size=test_size,
                                            random_state=splitseed)
            if standardize:
                logging.info("Standardizing data")
                act, act_val = normalize(act, act_val)
            this = train_wa(edit_sim, edit_sim_val, act, act_val,
                            attention=attention,
                            attention_hidden_size=attention_hidden_size,
                            epochs=epochs, device=device)
            result.append({**this, 'model': mode, 'layer': layer})
            del act, act_val
            logging.info("Maximum correlation on val: {} at epoch {}".format(
                result[-1]['cor'], result[-1]['epoch']))
    return result
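# A minimal usage sketch for weighted_average_RSA (not part of the original
# source): the layer names below are hypothetical and must match the
# global_{mode}_{layer}.pkl activation dumps present under `directory`.
def _example_weighted_average_RSA():
    results = weighted_average_RSA(directory='.', layers=['rnn0', 'rnn1'],
                                   attention='linear', epochs=5,
                                   device='cpu')
    # Each entry records the model condition, the layer, and the best
    # validation correlation together with the epoch at which it occurred.
    for r in results:
        print(r['model'], r['layer'], r['cor'], r['epoch'])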
def pairwise(self, trees1: Sequence[TreeLike],
             trees2: Optional[Sequence[TreeLike]] = None,
             normalize: bool = False, dtype: type = np.float64):
    """Return the tree-kernel values between the trees in trees1 and
    trees2, using the Fast Tree Kernel algorithm of Moschitti (2006).

    If trees2 is None, the kernel matrix of trees1 with itself is computed.
    """
    nodes1 = [self.nodemap(t) for t in trees1]
    if trees2 is not None:
        nodes2: Optional[List[Dict[tuple, List[TreeLike]]]] = \
            [self.nodemap(t) for t in trees2]
    else:
        nodes2 = None
    # For some reason this doesn't parallelize well, so we call the
    # sequential version of U.pairwise.
    return U.pairwise(self.ftk, nodes1, data2=nodes2, normalize=normalize,
                      dtype=dtype, parallel=False)
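# Usage sketch for Kernel.pairwise (hypothetical data: the concrete TreeLike
# type is defined elsewhere, and `trees` stands in for any sequence that
# Kernel.nodemap accepts). With normalize=True the kernel of a tree with
# itself is 1, so the diagonal of the self-similarity matrix is all ones.
def _example_kernel_pairwise(trees):
    K = Kernel()
    M = K.pairwise(trees, normalize=True)
    assert np.allclose(M.diagonal(), 1.0)
    return M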
def weighted_average_RSA_partial(directory='.', layers=[], test_size=1/2,
                                 standardize=False, epochs=1, device='cpu'):
    from sklearn.model_selection import train_test_split
    from platalea.dataset import Flickr8KData
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    splitseed = random.randint(0, 1024)
    result = []
    logging.info("Loading transcription data")
    data = pickle.load(open("{}/global_input.pkl".format(directory), "rb"))
    trans = data['ipa']
    act = [torch.tensor([item[:, :]]).float().to(device)
           for item in data['audio']]
    val = Flickr8KData(root='/roaming/gchrupal/datasets/flickr8k/',
                       split='val')
    image_map = {item['audio_id']: item['image'] for item in val}
    image = np.stack([image_map[item] for item in data['audio_id']])
    trans, trans_val, act, act_val, image, image_val = train_test_split(
        trans, act, image, test_size=test_size, random_state=splitseed)
    if standardize:
        logging.info("Standardizing data")
        act, act_val = normalize(act, act_val)
    logging.info("Computing edit distances")
    edit_sim = torch.tensor(U.pairwise(S.stringsim, trans)).float().to(device)
    edit_sim_val = torch.tensor(
        U.pairwise(S.stringsim, trans_val)).float().to(device)
    logging.info("Computing image similarities")
    image = torch.tensor(image).float()
    image_val = torch.tensor(image_val).float()
    sim_image = S.cosine_matrix(image, image)
    sim_image_val = S.cosine_matrix(image_val, image_val)
    logging.info("Computing partial correlation for input features "
                 "(mean pooling)")
    wa = platalea.attention.MeanPool().to(device)
    avg_pool = torch.cat([wa(item) for item in act])
    avg_pool_sim = S.cosine_matrix(avg_pool, avg_pool)
    avg_pool_val = torch.cat([wa(item) for item in act_val])
    avg_pool_sim_val = S.cosine_matrix(avg_pool_val, avg_pool_val)
    # Training data: regress Edit on Act + Image. Only the upper triangles
    # are used, since the similarity matrices are symmetric.
    Edit = S.triu(edit_sim).cpu().numpy()
    Image = S.triu(sim_image).cpu().numpy()
    Act = S.triu(avg_pool_sim).cpu().numpy()
    # Validation data
    Edit_val = S.triu(edit_sim_val).cpu().numpy()
    Image_val = S.triu(sim_image_val).cpu().numpy()
    Act_val = S.triu(avg_pool_sim_val).cpu().numpy()
    e_full, e_base, e_mean = partial_r2(Edit, Act, Image,
                                        Edit_val, Act_val, Image_val)
    logging.info("Full, base, mean error: {} {} {}".format(
        e_full, e_base, e_mean))
    r2 = (e_base - e_full) / e_base
    this = {'epoch': None, 'error': e_full, 'baseline': e_base,
            'error_mean': e_mean, 'r2': r2}
    # Unlike weighted_average_RSA, no attention model is trained here: mean
    # pooling is used directly. The input features (MFCCs) are identical for
    # both model conditions, so record the same result under both labels.
    result.append({**this, 'model': 'random', 'layer': 'mfcc'})
    result.append({**this, 'model': 'trained', 'layer': 'mfcc'})
    del act, act_val
    logging.info("Partial R2 on val: {} at epoch {}".format(
        result[-1]['r2'], result[-1]['epoch']))
    for mode in ["trained", "random"]:
        for layer in layers:
            logging.info("Loading activations for {} {}".format(mode, layer))
            data = pickle.load(
                open("{}/global_{}_{}.pkl".format(directory, mode, layer),
                     "rb"))
            logging.info("Training for {} {}".format(mode, layer))
            act = [torch.tensor([item[:, :]]).float().to(device)
                   for item in data[layer]]
            act, act_val = train_test_split(act, test_size=test_size,
                                            random_state=splitseed)
            if standardize:
                logging.info("Standardizing data")
                act, act_val = normalize(act, act_val)
            avg_pool = torch.cat([wa(item) for item in act])
            avg_pool_sim = S.cosine_matrix(avg_pool, avg_pool)
            avg_pool_val = torch.cat([wa(item) for item in act_val])
            avg_pool_sim_val = S.cosine_matrix(avg_pool_val, avg_pool_val)
            Act = S.triu(avg_pool_sim).cpu().numpy()
            Act_val = S.triu(avg_pool_sim_val).cpu().numpy()
            e_full, e_base, e_mean = partial_r2(Edit, Act, Image,
                                                Edit_val, Act_val, Image_val)
            logging.info("Full, base, mean error: {} {} {}".format(
                e_full, e_base, e_mean))
            r2 = (e_base - e_full) / e_base
            this = {'epoch': None, 'error': e_full, 'baseline': e_base,
                    'error_mean': e_mean, 'r2': r2}
            pickle.dump(dict(Edit=Edit, Act=Act, Image=Image,
                             Edit_val=Edit_val, Act_val=Act_val,
                             Image_val=Image_val),
                        open("fufi_{}_{}.pkl".format(mode, layer), "wb"),
                        protocol=4)
            result.append({**this, 'model': mode, 'layer': layer})
            del act, act_val
            logging.info("Partial R2 on val: {} at epoch {}".format(
                result[-1]['r2'], result[-1]['epoch']))
    return result
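# Worked example of the partial R^2 computed above (made-up numbers): it is
# the proportional reduction in validation error when Act is added to a
# regression of Edit that already includes Image. partial_r2 is assumed to
# return (full-model error, base-model error, mean-baseline error).
def _example_partial_r2():
    e_full, e_base = 0.16, 0.25
    r2 = (e_base - e_full) / e_base  # same formula as in the functions above
    assert abs(r2 - 0.36) < 1e-12
    return r2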
def _test_pairwise_symmetric(data):
    from scipy.spatial.distance import euclidean
    M = U.pairwise(euclidean, data, normalize=False, dtype=np.float64)
    assert np.all(M == M.T)
def test_pairwise_diagonal(data):
    from scipy.spatial.distance import euclidean
    assert np.allclose(
        U.pairwise(euclidean, data, normalize=False,
                   dtype=np.float64).diagonal(),
        0.0)
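# Self-contained usage sketch of U.pairwise with a scipy metric (not part of
# the original tests, which rely on a `data` fixture): only the U.pairwise
# signature exercised in the tests above is assumed.
def _example_pairwise_euclidean():
    from scipy.spatial.distance import euclidean
    data = np.random.RandomState(0).randn(5, 3)
    M = U.pairwise(euclidean, data, normalize=False, dtype=np.float64)
    assert M.shape == (5, 5)
    assert np.allclose(M.diagonal(), 0.0)  # zero distance of a point to itself
    assert np.all(M == M.T)                # euclidean distance is symmetric
    return M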
def embed(X, ref, sim, parallel=True):
    # Represent each item in X by its similarity to every item in ref;
    # forward `parallel` instead of dropping it on the floor.
    return U.pairwise(sim, X, ref, parallel=parallel)
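# Hypothetical usage of embed(): each row of the result describes one sample
# in X by its similarity to every reference item, giving a (len(X), len(ref))
# matrix. The dot-product similarity below is only a placeholder.
def _example_embed():
    rng = np.random.RandomState(0)
    X = rng.randn(4, 3)
    ref = rng.randn(2, 3)
    sim = lambda a, b: float(np.dot(a, b))
    E = embed(X, ref, sim, parallel=False)
    assert E.shape == (4, 2)
    return E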
def test_pairwise_ftk(trees, normalize):
    K = Kernel()
    M_naive = U.pairwise(K, trees, parallel=False, normalize=normalize)
    M_ftk = K.pairwise(trees, normalize=normalize)
    assert np.allclose(M_naive, M_ftk)