def factorize_tensors(tensors, iterations, dimensions, ten_path, meta_data, path):
    """Factorize every tensor with CMTF and save extraction embeddings to disk.

    For each tensor, the modifier and aspect factor matrices are taken from
    the decomposition, the embedding of an extraction is computed as the
    element-wise product of its modifier and aspect embeddings, and all
    embeddings are stored in an Annoy index saved as ``embd_<i>.ann``.

    Parameters
    ----------
    tensors : sequence
        Tensors to factorize; the shape of the first one must unpack into
        exactly three dimensions.
    iterations : int
        Only used as the ``total`` of the progress bar.
    dimensions : int
        Passed to ``CMTF.decompose`` as ``(dimensions, )`` and to
        ``AnnoyIndex`` as its first argument.
    ten_path : str
        Directory the ``embd_<i>.ann`` files are written to.
    meta_data : dict
        Must provide the entries ``'mod_index'`` and ``'asp_index'``.
    path : str
        Directory containing ``extr_index.csv`` with the columns
        ``'modifier'`` and ``'aspect'``.
    """
    # Lookup tables: modifier / aspect -> position in its index.
    modifiers = meta_data['mod_index']
    mod2idx = {modifier: pos for pos, modifier in enumerate(modifiers)}
    aspects = meta_data['asp_index']
    asp2idx = {aspect: pos for pos, aspect in enumerate(aspects)}
    extractions = pd.read_csv(join(path, 'extr_index.csv'))

    # Coupled matrices, one per tensor mode.
    nlp = spacy.load('en_core_web_md')
    first_dim, _, _ = tensors[0].shape
    first_mode_mat = Tensor(np.zeros((first_dim, 0)))  # fake matrix with no columns
    modifier_mat = Tensor(build_coupled_matrix(nlp, modifiers))
    aspect_mat = Tensor(build_coupled_matrix(nlp, aspects))

    for i, ten in tqdm(enumerate(tensors), total=iterations):
        cmtf = CMTF(random_state=0)
        coupled = [first_mode_mat, modifier_mat, aspect_mat]
        factors, _, _, _ = cmtf.decompose(ten, coupled, (dimensions, ))

        # Factor 1 holds the modifier embeddings, factor 2 the aspect ones.
        mod_embds = factors[1]
        asp_embds = factors[2]

        # Index the embedding of every extraction under its row label.
        index = AnnoyIndex(dimensions, 'angular')
        for j, row in extractions.iterrows():
            mod_vec = mod_embds[mod2idx[row['modifier']]]
            asp_vec = asp_embds[asp2idx[row['aspect']]]
            index.add_item(j, np.multiply(mod_vec, asp_vec))
        index.build(100)

        index_file = join(ten_path, 'embd_' + str(i) + '.ann')
        index.save(index_file)
def get_samples(self, objects=(), angle_1=(), angle_2=(), as_tensor=True):
    """ Get representations of images from the ETH-80 dataset.

    Parameters
    ----------
    objects : list[str]
        List of objects of interest. Empty means no filtering by object.
    angle_1 : list[str]
        List of angles of interest along longitude (from north to south).
        Empty means no filtering by this angle.
    angle_2 : list[str]
        List of angles of interest along latitude (from west to east).
        Empty means no filtering by this angle.
    as_tensor : bool
        If True, return samples as ``Tensor`` objects, numpy arrays otherwise.

    Returns
    -------
    samples : list[Tensor] or list[np.ndarray]
        List of tensors with dimensions representing height, width and color respectively.
    labels : np.ndarray
        Array of corresponding labels.

    Raises
    ------
    ValueError
        If no entry of the meta data matches the selected criteria.
    KeyError
        If an entry of ``objects`` is not a known object name.
    """
    df = self.meta_data
    if len(angle_1) > 0:
        df = df[df.Angle_1.isin(angle_1)]
    if len(angle_2) > 0:
        df = df[df.Angle_2.isin(angle_2)]
    if len(objects) > 0:
        wanted_labels = [self._objects_to_labels[obj] for obj in objects]
        df = df[df.Label.isin(wanted_labels)]

    if df.empty:
        raise ValueError(
            "Selected criteria are not present in this dataset. "
            "Most likely, the specified pair(s) of `angle_1` and `angle_2` does not exist. \n"
            "HINT: Use 'available_angle_pairs' and 'available_objects' to see correct options."
        )

    images = []
    for sample_id in df["id"]:
        file_path = os.path.join(_ETH80_HOME, "original", "{}.npz".format(sample_id))
        # ``np.load`` keeps the archive's file handle open until it is closed,
        # so close it explicitly; ``astype`` copies the data out beforehand.
        with np.load(file_path) as archive:
            # Builtin ``float`` replaces the ``np.float`` alias, which was
            # removed from NumPy; both mean float64.
            images.append(archive['image'].astype(float))

    labels = df.Label.values
    if as_tensor:
        samples = [
            Tensor(image, mode_names=["pixel_X", "pixel_Y", "color"])
            for image in images
        ]
    else:
        samples = images
    return samples, labels
def factorize_matrices(matrices, iterations, dimensions, mat_path, path):
    """Factorize every matrix with CMTF and save extraction embeddings to disk.

    Each matrix is wrapped in a ``Tensor`` and decomposed together with a
    coupled matrix built from the extraction phrases ("<modifier> <aspect>").
    The rows of the second factor are stored in an Annoy index that is saved
    as ``embd_<i>.ann``.

    Parameters
    ----------
    matrices : sequence
        Matrices to factorize; the shape of the first one must unpack into
        exactly two dimensions.
    iterations : int
        Only used as the ``total`` of the progress bar.
    dimensions : int
        Passed to ``CMTF.decompose`` as ``(dimensions, )`` and to
        ``AnnoyIndex`` as its first argument.
    mat_path : str
        Directory the ``embd_<i>.ann`` files are written to.
    path : str
        Directory containing ``extr_index.csv`` with the columns
        ``'modifier'`` and ``'aspect'``.
    """
    def _phrase(row):
        # One extraction phrase per CSV row.
        return row['modifier'] + ' ' + row['aspect']

    ext_ind = pd.read_csv(join(path, 'extr_index.csv'))
    extractions = list(ext_ind.apply(_phrase, axis=1))

    # Coupled matrices, one per matrix mode.
    nlp = spacy.load('en_core_web_md')
    n_rows, _ = matrices[0].shape
    row_mode_mat = Tensor(np.zeros((n_rows, 0)))  # fake matrix with no columns
    extraction_mat = Tensor(build_coupled_matrix(nlp, extractions))

    for i, mat in tqdm(enumerate(matrices), total=iterations):
        hb_mat = Tensor(mat)
        cmtf = CMTF(random_state=0)
        coupled = [row_mode_mat, extraction_mat]
        factors, _, _, _ = cmtf.decompose(hb_mat, coupled, (dimensions, ))

        # Factor 1 holds one embedding per extraction.
        ext_embds = factors[1]

        index = AnnoyIndex(dimensions, 'angular')
        for j, embd in enumerate(ext_embds):
            index.add_item(j, embd)
        index.build(100)

        index_file = join(mat_path, 'embd_' + str(i) + '.ann')
        index.save(index_file)
def contractor(x: Tensor, w, modes):
    """Contract a tensor against STM weight vectors along the given modes.

    Parameters
    ----------
    x: Tensor object; it is copied, so the caller's tensor is left untouched
    w: weights for STM to be contracted against, one entry per mode
    modes: modes for STM to be contracted against, paired with ``w`` in order

    Returns
    -------
    x_vec = contracted tensor along all modes except for one, squeezed and
        returned with a leading axis of length 1
    """
    contracted = x.copy()
    for weight, mode in zip(w, modes):
        # Each weight vector is applied as a single-row matrix.
        row_matrix = np.expand_dims(weight, axis=0)
        contracted.mode_n_product(row_matrix, mode, inplace=True)
    squeezed = contracted.data.squeeze()
    return np.expand_dims(squeezed, axis=0)
def create_stm_slice(
    d: Dict[str, DataFrame], start_index, slice_width, tensor_shape
) -> Tuple[List[Tensor], np.array, Tensor, np.float64, timestamps.Timestamp]:
    """Cut one slice out of the data and turn it into sliding-window STM samples.

    Notes: Assumes 3rd order tensors for now, and that the order of the
    entries of ``d`` is known, because that order fixes the position of each
    data frame along the third tensor axis.

    Parameters
    ----------
    d: dict, the data organized for STM. Each entry of the dict is a
        dataframe with a "Label" column. The labels are read from the
        "Price" entry. The input is not modified.
        NOTE(review): presumably the keys are "Price" and "Volume" and the
        frames carry a DatetimeIndex - confirm against the caller.
    start_index: int, position of the first row of the slice
    slice_width: int, number of rows in the slice
    tensor_shape: list or tuple, the size of your tensor data. Entry 0 is the
        window length (rows per tensor), the remaining entries are the
        trailing dimensions of the slice array.

    Returns
    -------
    xs_train: list of Tensor, every window of the slice except the last one
    y_train: np.ndarray, label of the last row of each training window
    xs_test: Tensor, the last window of the slice
    y_test: label of the last row of the test window
    associated_index: index label of the second to last row of the sliced
        "Price" frame

    Raises
    ------
    ValueError
        If ``slice_width`` is smaller than ``tensor_shape[0]``.
    """
    window_length = tensor_shape[0]
    n_tensors: int = slice_width - window_length + 1
    if n_tensors < 1:
        raise ValueError(
            "slice_width ({}) must be at least tensor_shape[0] ({})".format(
                slice_width, window_length
            )
        )

    # Cut away the rows that do not belong to this slice. Slicing builds new
    # frames, so the caller's data stays untouched without a deep copy.
    stop_index = start_index + slice_width
    frames = {key: frame[start_index:stop_index] for key, frame in d.items()}

    # 3-D array of shape [slice_width] + tensor_shape[1:]: rows x features x
    # one layer per entry of ``d`` (in dict order).
    slice_as_3d_np_arr = np.zeros([slice_width, *tensor_shape[1:]])
    for i, frame in enumerate(frames.values()):
        slice_as_3d_np_arr[:, :, i] = np.array(frame.drop("Label", axis=1))

    # Labels are addressed by position (``iloc``) so they stay aligned with
    # the positional windows above, whatever index the frame carries.
    labels = frames["Price"]["Label"]

    n_train = n_tensors - 1
    xs_train: List[Tensor] = [
        Tensor(slice_as_3d_np_arr[i:i + window_length, :, :])
        for i in range(n_train)
    ]
    y_train = np.zeros(n_train)
    for i in range(n_train):
        y_train[i] = labels.iloc[i + window_length - 1]

    # The last window of the slice is the test sample.
    xs_test = Tensor(slice_as_3d_np_arr[n_train:n_train + window_length, :, :])
    y_test = labels.iloc[n_train + window_length - 1]

    associated_index: timestamps.Timestamp = frames["Price"].iloc[-2].name
    return xs_train, y_train, xs_test, y_test, associated_index
def make_data_stm(d, start_index, L, tensor_size, lag):
    """Cut one slice out of the data and turn it into sliding-window STM samples.

    Notes: Assumes 3rd order tensors for now, and that the order of the keys
    are known, because that order fixes the position of each data frame along
    the third tensor axis.

    Parameters
    ----------
    d: dict, the data organized for STM. Each entry of the dict is a
        dataframe with a 'Label' column. The labels are read from the
        'Price' entry. The input is not modified.
    start_index: int, position of the first row of the slice
    L: int, number of rows in the slice
    tensor_size: list, the size of your tensor data. Entry 0 is the window
        length (rows per tensor). The list is not modified, even when
        ``lag`` is set.
    lag: bool, if truthy a 'LaggedPrice' layer (the 'Price' frame shifted by
        one row) is added between 'Price' and 'Volume', so the last
        dimension is one larger than ``tensor_size[-1]``.

    Returns
    -------
    train_data: list of Tensor, every window of the slice except the last one
    train_labels: np.ndarray, label of the last row of each training window
    test_data: Tensor, the last window of the slice
    test_label: label of the last row of the test window
    associated_index: index label of the second to last row of the sliced
        'Price' frame

    Raises
    ------
    ValueError
        If ``L`` is smaller than ``tensor_size[0]``.
    """
    # Work on a copy: the lagged layout needs one more layer, and growing the
    # caller's list would enlarge the tensors on every further call.
    shape = list(tensor_size)
    window_length = shape[0]
    n_tensors = L - window_length + 1
    if n_tensors < 1:
        raise ValueError(
            'L ({}) must be at least tensor_size[0] ({})'.format(L, window_length)
        )

    if not lag:
        frames = dict(d)
    else:
        # We are lagging the price here. The first row is dropped everywhere
        # because the shifted frame has no value for it.
        # NOTE(review): ``dropna`` also drops rows with other missing values,
        # which would misalign the layers - confirm the data has none.
        shape[-1] += 1
        price = d['Price']
        frames = {
            'Price': price[1:],
            'LaggedPrice': price.shift(1).dropna(),
            'Volume': d['Volume'][1:],
        }

    # Slicing builds new frames, so no deep copy of the input is needed.
    stop_index = start_index + L
    frames = {key: frame[start_index:stop_index] for key, frame in frames.items()}

    data_np = np.zeros([L] + shape[1:])
    for i, frame in enumerate(frames.values()):
        data_np[:, :, i] = np.array(frame.drop('Label', axis=1))

    # Labels are addressed by position (``iloc``) so they stay aligned with
    # the positional windows above, whatever index the frame carries.
    labels = frames['Price']['Label']

    n_train = n_tensors - 1
    train_data = [
        Tensor(data_np[i:i + window_length, :, :]) for i in range(n_train)
    ]
    train_labels = np.zeros(n_train)
    for i in range(n_train):
        train_labels[i] = labels.iloc[i + window_length - 1]

    # The last window of the slice is the test sample.
    test_data = Tensor(data_np[n_train:n_train + window_length, :, :])
    test_label = labels.iloc[n_train + window_length - 1]

    associated_index = frames['Price'].iloc[-2].name
    return train_data, train_labels, test_data, test_label, associated_index