Example #1
def _test_pairwise_parallel(data):
    from scipy.spatial.distance import euclidean
    M = U.pairwise(euclidean,
                   data,
                   normalize=False,
                   dtype=np.float64,
                   parallel=False)
    L = U.pairwise(euclidean,
                   data,
                   normalize=False,
                   dtype=np.float64,
                   parallel=True)
    assert np.all(M == L)
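For context, U.pairwise is the utility all of these examples exercise. The sketch below is a hypothetical naive re-implementation of its apparent semantics, assuming the signature pairwise(f, data, data2=None, normalize=False, dtype=np.float64, parallel=False) suggested by the calls in these examples; the normalize and parallel paths are omitted.

import numpy as np

def pairwise_reference(f, data, data2=None, dtype=np.float64):
    # Hypothetical reference implementation: evaluate f on every pair of
    # items. When data2 is None the result is a square len(data) x len(data)
    # matrix; otherwise it is rectangular, len(data) x len(data2).
    cols = data if data2 is None else data2
    M = np.zeros((len(data), len(cols)), dtype=dtype)
    for i, a in enumerate(data):
        for j, b in enumerate(cols):
            M[i, j] = f(a, b)
    return M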
Example #2
def weighted_average_RSA(directory='.',
                         layers=[],
                         attention='linear',
                         test_size=1/2,
                         attention_hidden_size=None,
                         standardize=False,
                         epochs=1,
                         device='cpu'):
    from sklearn.model_selection import train_test_split
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    splitseed = random.randint(0, 1024)
    result = []
    logging.info("Loading transcription data")
    data = pickle.load(open("{}/global_input.pkl".format(directory), "rb"))
    trans = data['ipa']
    act = [ torch.tensor([item[:, :]]).float().to(device) for item in data['audio'] ]

    trans, trans_val, act, act_val = train_test_split(trans, act, test_size=test_size, random_state=splitseed)
    if standardize:
        logging.info("Standardizing data")
        act, act_val = normalize(act, act_val)
    logging.info("Computing edit distances")
    edit_sim = torch.tensor(U.pairwise(S.stringsim, trans)).float().to(device)
    edit_sim_val = torch.tensor(U.pairwise(S.stringsim, trans_val)).float().to(device)
    logging.info("Training for input features")
    this = train_wa(edit_sim, edit_sim_val, act, act_val, attention=attention, attention_hidden_size=attention_hidden_size, epochs=epochs, device=device)
    # The input features (mfcc) are identical for the trained and random
    # models, so the same result is recorded under both labels.
    result.append({**this, 'model': 'random', 'layer': 'mfcc'})
    result.append({**this, 'model': 'trained', 'layer': 'mfcc'})
    del act, act_val
    logging.info("Maximum correlation on val: {} at epoch {}".format(result[-1]['cor'], result[-1]['epoch']))
    for mode in ["trained", "random"]:
        for layer in layers:
            logging.info("Loading activations for {} {}".format(mode, layer))
            data = pickle.load(open("{}/global_{}_{}.pkl".format(directory, mode, layer), "rb"))
            logging.info("Training for {} {}".format(mode, layer))
            act = [ torch.tensor([item[:, :]]).float().to(device) for item in data[layer] ]
            act, act_val = train_test_split(act, test_size=test_size, random_state=splitseed)
            if standardize:
                logging.info("Standardizing data")
                act, act_val = normalize(act, act_val)
            this = train_wa(edit_sim, edit_sim_val, act, act_val, attention=attention, attention_hidden_size=attention_hidden_size, epochs=epochs, device=device)
            result.append({**this, 'model': mode, 'layer': layer})
            del act, act_val
            logging.info("Maximum correlation on val: {} at epoch {}".format(result[-1]['cor'], result[-1]['epoch']))
    return result
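A hypothetical invocation of weighted_average_RSA: the file layout (global_input.pkl and global_{mode}_{layer}.pkl under the given directory) follows from the function body, while the directory and layer names here are made up for illustration.

results = weighted_average_RSA(directory='experiments/rsa',       # hypothetical path
                               layers=['conv', 'rnn0', 'rnn1'],   # hypothetical layer names
                               attention='linear',
                               standardize=True,
                               epochs=60,
                               device='cuda')
best = max(results, key=lambda r: r['cor'])  # highest validation correlation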
Example #3
File: kernel.py  Project: gchrupala/ursa
def pairwise(self,
             trees1: Sequence[TreeLike],
             trees2: Optional[Sequence[TreeLike]] = None,
             normalize: bool = False,
             dtype: type = np.float64):
    """Return the tree-kernel values between the sequences trees1 and trees2,
    using the Fast Tree Kernel algorithm of Moschitti (2006).
    """
    nodes1 = [self.nodemap(t) for t in trees1]
    if trees2 is not None:
        nodes2: Optional[List[Dict[tuple, List[TreeLike]]]] = [self.nodemap(t) for t in trees2]
    else:
        nodes2 = None
    # For some reason this doesn't parallelize well: call the sequential
    # version of U.pairwise.
    return U.pairwise(self.ftk, nodes1, data2=nodes2, normalize=normalize,
                      dtype=dtype, parallel=False)
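Passing trees2 yields a rectangular kernel matrix between two distinct sets of trees rather than a square self-similarity matrix. A hedged usage sketch, assuming nltk.Tree objects satisfy the TreeLike type (not confirmed by this snippet):

from nltk import Tree

t1 = Tree.fromstring("(S (NP I) (VP (V saw) (NP it)))")
t2 = Tree.fromstring("(S (NP you) (VP (V saw) (NP it)))")
K = Kernel()
M = K.pairwise([t1], trees2=[t1, t2], normalize=True)  # shape (1, 2)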
Example #4
def weighted_average_RSA_partial(directory='.',
                                 layers=[],
                                 test_size=1 / 2,
                                 standardize=False,
                                 epochs=1,
                                 device='cpu'):
    from sklearn.model_selection import train_test_split
    from platalea.dataset import Flickr8KData
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    splitseed = random.randint(0, 1024)
    result = []
    logging.info("Loading transcription data")
    data = pickle.load(open("{}/global_input.pkl".format(directory), "rb"))
    trans = data['ipa']
    act = [
        torch.tensor([item[:, :]]).float().to(device) for item in data['audio']
    ]
    val = Flickr8KData(root='/roaming/gchrupal/datasets/flickr8k/',
                       split='val')
    image_map = {item['audio_id']: item['image'] for item in val}
    image = np.stack([image_map[item] for item in data['audio_id']])

    trans, trans_val, act, act_val, image, image_val = train_test_split(
        trans, act, image, test_size=test_size, random_state=splitseed)
    if standardize:
        logging.info("Standardizing data")
        act, act_val = normalize(act, act_val)
    logging.info("Computing edit distances")
    edit_sim = torch.tensor(U.pairwise(S.stringsim, trans)).float().to(device)
    edit_sim_val = torch.tensor(U.pairwise(S.stringsim,
                                           trans_val)).float().to(device)
    logging.info("Computing image similarities")
    image = torch.tensor(image).float()
    image_val = torch.tensor(image_val).float()
    sim_image = S.cosine_matrix(image, image)
    sim_image_val = S.cosine_matrix(image_val, image_val)

    logging.info(
        "Computing partial correlation for input features (mean pooling)")
    wa = platalea.attention.MeanPool().to(device)
    avg_pool = torch.cat([wa(item) for item in act])
    avg_pool_sim = S.cosine_matrix(avg_pool, avg_pool)
    avg_pool_val = torch.cat([wa(item) for item in act_val])
    avg_pool_sim_val = S.cosine_matrix(avg_pool_val, avg_pool_val)
    # Training data
    #  Edit ~ Act + Image
    Edit = S.triu(edit_sim).cpu().numpy()
    Image = S.triu(sim_image).cpu().numpy()
    Act = S.triu(avg_pool_sim).cpu().numpy()
    # Val data
    Edit_val = S.triu(edit_sim_val).cpu().numpy()
    Image_val = S.triu(sim_image_val).cpu().numpy()
    Act_val = S.triu(avg_pool_sim_val).cpu().numpy()
    e_full, e_base, e_mean = partial_r2(Edit, Act, Image, Edit_val, Act_val,
                                        Image_val)
    logging.info("Full, base, mean error: {} {} {}".format(
        e_full, e_base, e_mean))
    r2 = (e_base - e_full) / e_base
    this = {
        'epoch': None,
        'error': e_full,
        'baseline': e_base,
        'error_mean': e_mean,
        'r2': r2
    }

    #this = train_wa(edit_sim, edit_sim_val, act, act_val, attention=attention, attention_hidden_size=None, epochs=epochs, device=device)
    result.append({**this, 'model': 'random', 'layer': 'mfcc'})
    result.append({**this, 'model': 'trained', 'layer': 'mfcc'})
    del act, act_val
    logging.info("Partial R2 on val: {} at epoch {}".format(
        result[-1]['r2'], result[-1]['epoch']))
    for mode in ["trained", "random"]:
        for layer in layers:
            logging.info("Loading activations for {} {}".format(mode, layer))
            data = pickle.load(
                open("{}/global_{}_{}.pkl".format(directory, mode, layer),
                     "rb"))
            logging.info("Training for {} {}".format(mode, layer))
            act = [
                torch.tensor([item[:, :]]).float().to(device)
                for item in data[layer]
            ]
            act, act_val = train_test_split(act,
                                            test_size=test_size,
                                            random_state=splitseed)
            if standardize:
                logging.info("Standardizing data")
                act, act_val = normalize(act, act_val)
            avg_pool = torch.cat([wa(item) for item in act])
            avg_pool_sim = S.cosine_matrix(avg_pool, avg_pool)
            avg_pool_val = torch.cat([wa(item) for item in act_val])
            avg_pool_sim_val = S.cosine_matrix(avg_pool_val, avg_pool_val)
            Act = S.triu(avg_pool_sim).cpu().numpy()
            Act_val = S.triu(avg_pool_sim_val).cpu().numpy()
            e_full, e_base, e_mean = partial_r2(Edit, Act, Image, Edit_val,
                                                Act_val, Image_val)
            logging.info("Full, base, mean error: {} {} {}".format(
                e_full, e_base, e_mean))
            r2 = (e_base - e_full) / e_base
            this = {
                'epoch': None,
                'error': e_full,
                'baseline': e_base,
                'error_mean': e_mean,
                'r2': r2
            }
            pickle.dump(dict(Edit=Edit,
                             Act=Act,
                             Image=Image,
                             Edit_val=Edit_val,
                             Act_val=Act_val,
                             Image_val=Image_val),
                        open("fufi_{}_{}.pkl".format(mode, layer), "wb"),
                        protocol=4)
            result.append({**this, 'model': mode, 'layer': layer})
            del act, act_val
            logging.info("Partial R2 on val: {} at epoch {}".format(
                result[-1]['r2'], result[-1]['epoch']))
    return result
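partial_r2 itself is not shown in this example. The following is a hypothetical sketch of what such a function could compute, consistent with how its return values are used above: e_full, e_base and e_mean are validation errors of a full model, an image-only baseline, and a constant mean predictor.

import numpy as np
from sklearn.linear_model import LinearRegression

def partial_r2_sketch(Edit, Act, Image, Edit_val, Act_val, Image_val):
    # Hypothetical re-implementation: predict edit similarities from
    # activation similarities plus image similarities (full model) versus
    # image similarities alone (baseline).
    full = LinearRegression().fit(np.stack([Act, Image], axis=1), Edit)
    base = LinearRegression().fit(Image[:, None], Edit)
    e_full = np.mean((Edit_val - full.predict(np.stack([Act_val, Image_val], axis=1))) ** 2)
    e_base = np.mean((Edit_val - base.predict(Image_val[:, None])) ** 2)
    e_mean = np.mean((Edit_val - Edit.mean()) ** 2)
    return e_full, e_base, e_mean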
Example #5
def _test_pairwise_symmetric(data):
    from scipy.spatial.distance import euclidean
    M = U.pairwise(euclidean, data, normalize=False, dtype=np.float64)
    assert np.all(M == M.T)
Example #6
def test_pairwise_diagonal(data):
    from scipy.spatial.distance import euclidean
    assert np.allclose(
        U.pairwise(euclidean, data, normalize=False,
                   dtype=np.float64).diagonal(), 0.0)
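These two tests presumably receive data from a pytest fixture. A minimal sketch of one possible fixture follows; the actual item type and sizes in the test suite are unknown. Random vectors suffice here because euclidean(a, b) == euclidean(b, a) gives symmetry and euclidean(a, a) == 0 gives the zero diagonal.

import numpy as np
import pytest

@pytest.fixture
def data():
    # Hypothetical fixture: a small collection of random vectors.
    rng = np.random.RandomState(0)
    return [rng.randn(5) for _ in range(10)]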
Example #7
def embed(X, ref, sim, parallel=True):
    # Represent each item in X by its similarities to the reference items.
    return U.pairwise(sim, X, ref, parallel=parallel)
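A hedged usage sketch: embed represents each item in X by its similarities to a fixed reference set ref, producing a len(X) x len(ref) feature matrix. The similarity function below is illustrative only.

import numpy as np

def cosine(a, b):
    # Illustrative similarity for embed.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

X = [np.random.randn(8) for _ in range(100)]
ref = X[:10]                    # reference items
E = embed(X, ref, sim=cosine)   # shape (100, 10)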
Example #8
def test_pairwise_ftk(trees, normalize):
    K = Kernel()
    M_naive = U.pairwise(K, trees, parallel=False, normalize=normalize)
    M_ftk = K.pairwise(trees, normalize=normalize)
    assert np.allclose(M_naive, M_ftk)
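This test checks that Moschitti's fast tree kernel (K.pairwise, which precomputes node maps) produces the same kernel matrix as evaluating the kernel naively on every pair of trees through U.pairwise.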