Example #1
from os.path import join

import numpy as np
import pandas as pd
import spacy
from annoy import AnnoyIndex
from tqdm import tqdm

# Tensor and CMTF are assumed to come from hottbox; build_coupled_matrix
# is a project-local helper that is not shown here.
from hottbox.core import Tensor
from hottbox.algorithms.decomposition import CMTF


def factorize_tensors(tensors, iterations, dimensions, ten_path, meta_data,
                      path):
    # loading additional data required to build coupled matrices
    modifiers = meta_data['mod_index']
    mod2idx = dict(zip(modifiers, range(len(modifiers))))
    aspects = meta_data['asp_index']
    asp2idx = dict(zip(aspects, range(len(aspects))))
    extractions = pd.read_csv(join(path, 'extr_index.csv'))
    # creating the coupled matrices
    nlp = spacy.load('en_core_web_md')
    dim1, dim2, dim3 = tensors[0].shape
    dim1_mat = Tensor(np.zeros((dim1, 0)))  # fake matrix with no columns
    dim2_mat = Tensor(build_coupled_matrix(nlp, modifiers))
    dim3_mat = Tensor(build_coupled_matrix(nlp, aspects))
    # factorizing tensors
    for i, ten in tqdm(enumerate(tensors), total=iterations):
        cmtf = CMTF(random_state=0)
        (factors, _, _, _) = cmtf.decompose(ten,
                                            [dim1_mat, dim2_mat, dim3_mat],
                                            (dimensions, ))
        # getting modifier and aspect embeddings
        mod_embds, asp_embds = factors[1], factors[2]
        # computing and indexing the embedding of each extraction
        t = AnnoyIndex(dimensions, 'angular')
        for j, row in extractions.iterrows():
            m, a = row['modifier'], row['aspect']
            embd = np.multiply(mod_embds[mod2idx[m]], asp_embds[asp2idx[a]])
            t.add_item(j, embd)
        t.build(100)  # 100 trees: more trees give more accurate queries
        t.save(join(ten_path, 'embd_' + str(i) + '.ann'))
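
A minimal sketch of how one of the saved indexes might be queried afterwards; the file name, the dimensionality, and the neighbour count are illustrative, and `extr_index.csv` is the same mapping from Annoy item ids to (modifier, aspect) rows used above.

import pandas as pd
from annoy import AnnoyIndex

dimensions = 50                           # must match the value used above
extractions = pd.read_csv('extr_index.csv')
index = AnnoyIndex(dimensions, 'angular')
index.load('embd_0.ann')                  # index built from the first tensor
# the ten extractions closest (by angular distance) to extraction 0
for j in index.get_nns_by_item(0, 10):
    print(extractions.loc[j, ['modifier', 'aspect']].tolist())
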
Example #2
File: io.py Project: IlyaKisil/inns-2019
    def get_samples(self, objects=(), angle_1=(), angle_2=(), as_tensor=True):
        """ Get representations of images from the ETH-80 dataset.

        Parameters
        ----------
        objects : list[str]
            List of objects of interest.
        angle_1 : list[str]
            List of angles of interest along longitude (from north to south).
        angle_2 : list[str]
            List of angles of interest along latitude (from west to east).
        as_tensor : bool
            If True, return samples as ``Tensor`` objects, numpy arrays otherwise.

        Returns
        -------
        samples : list[Tensor]
            List of tensors with dimensions representing height, width and color respectively.
        labels : np.ndarray
            Array of corresponding labels.
        """
        df = self.meta_data
        if len(angle_1) > 0:
            df = df[df.Angle_1.isin(angle_1)]

        if len(angle_2) > 0:
            df = df[df.Angle_2.isin(angle_2)]

        if len(objects) > 0:
            labels = [self._objects_to_labels[obj] for obj in objects]
            df = df[df.Label.isin(labels)]

        if df.empty:
            raise ValueError(
                "Selected criteria are not present in this dataset. "
                "Most likely, the specified pair(s) of `angle_1` and `angle_2` does not exist. \n"
                "HINT: Use 'available_angle_pairs' and 'available_objects' to see correct options."
            )

        path = df.apply(lambda x: os.path.join(_ETH80_HOME, "original",
                                               "{}.npz".format(x['id'])),
                        axis=1)
        data_as_series = path.apply(
            lambda x: np.load(x)['image'].astype(float))

        labels = df.Label.values

        if as_tensor:
            samples = [
                Tensor(sample, mode_names=["pixel_X", "pixel_Y", "color"])
                for sample in data_as_series.tolist()
            ]
        else:
            samples = data_as_series.tolist()

        return samples, labels
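
A hedged usage sketch: the loader class name `ETH80Dataset` and the angle strings are assumptions made for illustration; the real options can be listed via the `available_objects` and `available_angle_pairs` helpers mentioned in the error hint above.

dataset = ETH80Dataset()  # hypothetical loader exposing get_samples
samples, labels = dataset.get_samples(objects=['apple'],
                                      angle_1=['090'],
                                      angle_2=['000'])
print(len(samples), labels[:5])
print(samples[0].shape)   # (pixel_X, pixel_Y, color), per the mode names above
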
Example #3
# (uses the same imports as Example #1)
def factorize_matrices(matrices, iterations, dimensions, mat_path, path):
    # loading additional data required to build coupled matrices
    ext_ind = pd.read_csv(join(path, 'extr_index.csv'))
    extractions = list(
        ext_ind.apply(lambda x: (x['modifier'] + ' ' + x['aspect']), axis=1))
    # creating the coupled matrices
    nlp = spacy.load('en_core_web_md')
    dim1, dim2 = matrices[0].shape
    dim1_mat = Tensor(np.zeros((dim1, 0)))  # fake matrix with no columns
    dim2_mat = Tensor(build_coupled_matrix(nlp, extractions))
    # factorizing matrices
    for i, mat in tqdm(enumerate(matrices), total=iterations):
        hb_mat = Tensor(mat)
        cmtf = CMTF(random_state=0)
        (factors, _, _, _) = cmtf.decompose(hb_mat, [dim1_mat, dim2_mat],
                                            (dimensions, ))
        # getting extraction embeddings
        ext_embds = factors[1]
        # computing and indexing the embedding of each extraction
        t = AnnoyIndex(dimensions, 'angular')
        for j, embd in enumerate(ext_embds):
            t.add_item(j, embd)
        t.build(100)
        t.save(join(mat_path, 'embd_' + str(i) + '.ann'))
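
Note the design difference from Example #1: there, an extraction embedding is composed as the element-wise product of its modifier and aspect embeddings, whereas here the second mode of the coupled matrix already indexes whole `modifier aspect` strings, so the rows of `factors[1]` go into Annoy directly. The saved indexes can be queried exactly as in the sketch after Example #1, with `extractions[j]` recovering the string for item `j`.
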
Example #4
import numpy as np

from hottbox.core import Tensor  # assumed, as in Example #2


def contractor(x: Tensor, w, modes):
    """Contract a tensor with the STM weight vectors along the given modes.

    Parameters
    ----------
    x : Tensor
        Tensor to be contracted.
    w : list[np.ndarray]
        Weight vectors of the STM, one per entry of ``modes``.
    modes : list[int]
        Modes along which the tensor is contracted.

    Returns
    -------
    x_vec : np.ndarray
        The tensor contracted along all modes except one, as a row vector.
    """

    temp = x.copy()
    # Reshape each weight vector to a 1-row matrix so that the mode-n
    # product collapses the corresponding mode to size 1.
    for weight, mode in zip(w, modes):
        temp.mode_n_product(np.expand_dims(weight, axis=0), mode, inplace=True)
    x_vec = np.expand_dims(temp.data.squeeze(), axis=0)

    return x_vec
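
A worked sketch under the assumption that `Tensor` is hottbox's: contracting a (5, 3, 2) tensor along modes 1 and 2 with all-ones weights leaves mode 0 as a length-5 row vector.

import numpy as np
from hottbox.core import Tensor  # assumed, as above

x = Tensor(np.random.rand(5, 3, 2))
w = [np.ones(3), np.ones(2)]    # one weight vector per contracted mode
x_vec = contractor(x, w, modes=[1, 2])
print(x_vec.shape)              # (1, 5)
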
Example #5
import copy
from typing import Dict, List, Tuple

import numpy as np
from pandas import DataFrame
from pandas._libs.tslibs import timestamps  # source of the Timestamp annotation

from hottbox.core import Tensor  # assumed, as in Example #2


def create_stm_slice(
    d: Dict[str, DataFrame], start_index, slice_width, tensor_shape
) -> Tuple[List[Tensor], np.ndarray, Tensor, np.float64, timestamps.Timestamp]:
    """

    Notes: Assumes 3rd-order tensors for now, and that the order of the keys is known.

    Parameters
    ----------
    d : dict
        The data organized for STM. Each entry of the dict is a DataFrame;
        the labels are in the ``Label`` column.
    start_index : int
    slice_width : int
    tensor_shape : list
        The size of your tensor data.

    Returns
    -------
    xs_train
    y_train
    xs_test
    y_test
    associated_index

    """

    # This dict has two keys, "Price" and "Volume".
    # "Price"  contains a pd.DataFrame with a DatetimeIndex and the cols: VIX Price   Gold Price   SPX Close   Label
    # "Volume" contains a pd.DataFrame with a DatetimeIndex and the cols: VIX Volume  Gold Volume  SPX Volume  Label
    dict_stm = copy.deepcopy(d)

    idcs = dict_stm.keys()
    for idx in idcs:  # Price and Volume
        # cut away the rows of dict_stm["Price"] / dict_stm["Volume"] that do not belong to this slice
        dict_stm[idx] = dict_stm[idx][start_index:start_index + slice_width]

    # initialize a 3rd-order array of shape [slice_width] + tensor_shape[1:] ==
    # [250, 3, 2]
    # slice_as_3d_np_arr holds 250 (days) x 3 (features) in 2 variants (Price/Volume)
    slice_as_3d_np_arr = np.zeros([slice_width] + tensor_shape[1:])

    for i, idx in enumerate(idcs):  # Price and Volume
        slice_as_3d_np_arr[:, :, i] = np.array(dict_stm[idx].drop("Label",
                                                                  axis=1))

    xs_train: List[Tensor] = []
    n_tensors: int = slice_width - tensor_shape[0] + 1  # one sliding window per start position
    y_train: np.ndarray = np.zeros(n_tensors - 1)
    xs_test: Tensor
    y_test: np.float64

    for i in range(n_tensors):
        upper_idx = i + tensor_shape[0]

        # if training data
        if i < n_tensors - 1:
            xs_train.append(Tensor(slice_as_3d_np_arr[i:upper_idx, :, :]))
            y_train[i] = dict_stm["Price"]["Label"].iloc[upper_idx - 1]

        # otherwise it's testing data
        else:
            xs_test = Tensor(slice_as_3d_np_arr[i:upper_idx, :, :])
            y_test = dict_stm["Price"]["Label"].iloc[upper_idx - 1]

    associated_index: timestamps.Timestamp = dict_stm["Price"].iloc[-2].name

    return xs_train, y_train, xs_test, y_test, associated_index
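
A toy sketch of the expected inputs, assuming only what the comments above describe (two DataFrames sharing a DatetimeIndex, three feature columns plus a `Label` column); all sizes are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=30, freq='D')

def toy_frame():
    df = pd.DataFrame(rng.normal(size=(30, 3)),
                      index=idx, columns=['VIX', 'Gold', 'SPX'])
    df['Label'] = rng.integers(0, 2, size=30)
    return df

d = {'Price': toy_frame(), 'Volume': toy_frame()}
xs_train, y_train, xs_test, y_test, ts = create_stm_slice(
    d, start_index=0, slice_width=10, tensor_shape=[3, 3, 2])
# 10 - 3 + 1 = 8 windows: 7 training tensors plus 1 test tensor
print(len(xs_train), y_train.shape, ts)
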
Example #6
# (uses the same imports as Example #5)
def make_data_stm(d, start_index, L, tensor_size, lag):
    """

    Notes: Assumes 3rd-order tensors for now, and that the order of the keys is known.

    Parameters
    ----------
    d : dict
        The data organized for STM. Each entry of the dict is a DataFrame;
        the labels are in the ``Label`` column.
    start_index : int
    L : int
    tensor_size : list
        The size of your tensor data.
    lag : bool
        If True, add a one-step lagged copy of the price data as an extra slice.

    Returns
    -------
    train_data
    train_labels
    test_data
    test_label
    associated_index

    """

    dict_stm = copy.deepcopy(d)
    if not lag:
        keys = dict_stm.keys()
        for key in keys:
            dict_stm[key] = dict_stm[key][start_index:start_index + L]

        data_np = np.zeros([L] + tensor_size[1:])
        for i, key in enumerate(keys):
            data_np[:, :, i] = np.array(dict_stm[key].drop('Label', axis=1))

    else:
        # We are lagging the price here; work on a copy of tensor_size so
        # the caller's list is not mutated by the extra slice.
        tensor_size = list(tensor_size)
        tensor_size[-1] += 1
        keys = ['Price', 'LaggedPrice', 'Volume']
        dict_stm['LaggedPrice'] = dict_stm['Price'].shift(1).dropna()
        dict_stm['Price'] = dict_stm['Price'][1:]
        dict_stm['Volume'] = dict_stm['Volume'][1:]

        for key in keys:
            dict_stm[key] = dict_stm[key][start_index:start_index + L]

        data_np = np.zeros([L] + tensor_size[1:])
        for i, key in enumerate(keys):
            data_np[:, :, i] = np.array(dict_stm[key].drop('Label', axis=1))

    train_data = []
    n_tensors = L - tensor_size[0] + 1
    train_labels = np.zeros(n_tensors - 1)

    for i in range(n_tensors):
        # if training data
        if i < n_tensors - 1:
            train_data.append(Tensor(data_np[i:i + tensor_size[0], :, :]))
            train_labels[i] = dict_stm['Price']['Label'].iloc[i + tensor_size[0] - 1]

        # otherwise it's testing data
        else:
            test_data = Tensor(data_np[i:i + tensor_size[0], :, :])
            test_label = dict_stm['Price']['Label'].iloc[i + tensor_size[0] - 1]

    associated_index = dict_stm['Price'].iloc[-2].name

    return train_data, train_labels, test_data, test_label, associated_index
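
Reusing the toy dict `d` from the sketch after Example #5, the lag branch can be exercised as below; with `lag=True` a one-step-shifted copy of the price frame becomes a third slice, so the tensors gain an extra variant dimension.

train_data, train_labels, test_data, test_label, ts = make_data_stm(
    d, start_index=0, L=10, tensor_size=[3, 3, 2], lag=True)
print(len(train_data), train_labels.shape)   # 7 tensors of shape (3, 3, 3), (7,)
print(test_data.shape, test_label, ts)
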