def enlarge(img, xTimes, yTimes):
    # TODO: not yet debugged
    if xTimes > 1:
        img = np_repeat(img, xTimes, axis=0)
    if yTimes > 1:
        img = np_repeat(img, yTimes, axis=1)
    return img
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This method
    implements the DM model with a projection (input) layer that is either the sum or
    mean of the context vectors, depending on the model's `dm_mean` configuration field.
    See `train_document_dm_concat()` for the DM model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into
    the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim will
    use the optimized version from doc2vec_inner instead.
    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                   model.vocab[w].sample_int > model.random.randint(2**32)]
    doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
    doctag_len = len(doctag_indexes)

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum  # 1 x layer1_size
        if word2_indexes and model.cbow_mean:
            l1 /= (len(word2_indexes) + doctag_len)
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if word2_indexes and not model.cbow_mean:
            neu1e /= (len(word2_indexes) + doctag_len)
        if learn_doctags:
            doctag_vectors[doctag_indexes] += neu1e * \
                np_repeat(doctag_locks[doctag_indexes], model.vector_size).reshape(-1, model.vector_size)
        if learn_words:
            word_vectors[word2_indexes] += neu1e * \
                np_repeat(word_locks[word2_indexes], model.vector_size).reshape(-1, model.vector_size)

    return len(word_vocabs)
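# A minimal, self-contained sketch (plain numpy, hypothetical toy dimensions) of the DM
# projection layer assembled above: the hidden-layer input is the sum -- or, when the
# dm_mean/cbow_mean flag is set, the mean -- of the context word vectors and the doctag
# vector(s). None of the names below are gensim attributes; they only illustrate how
# `l1` is built.
import numpy as np

vector_size = 4
word_vectors = np.random.rand(10, vector_size)    # stand-in for model.syn0
doctag_vectors = np.random.rand(3, vector_size)   # stand-in for docvecs.doctag_syn0

word2_indexes = [2, 5, 7]     # context words inside the (reduced) window
doctag_indexes = [0]          # tags of the current document
dm_mean = True                # stand-in for model.cbow_mean / dm_mean

doctag_sum = np.sum(doctag_vectors[doctag_indexes], axis=0)
l1 = np.sum(word_vectors[word2_indexes], axis=0) + doctag_sum  # 1 x vector_size
if word2_indexes and dm_mean:
    l1 /= (len(word2_indexes) + len(doctag_indexes))           # mean instead of sum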
def fan_regular_pols(np_verts, np_pols,
                     np_distances, np_faces_id,
                     custom_normals,
                     index_offset=0,
                     use_custom_normals=False,
                     output_old_v_id=True,
                     output_old_face_id=True,
                     output_pols_groups=True):

    pols_number = np_pols.shape[0]
    pol_sides = np_pols.shape[1]
    v_pols = np_verts[np_pols]  # shape [num_pols, num_corners, 3]

    if np.any(np_distances != 0):
        if use_custom_normals:
            normals = custom_normals
        else:
            normals = np_faces_normals(v_pols)
        # polygon centroids, pushed along the face normals by the given distances
        average = np.sum(v_pols, axis=1) / pol_sides + normals * np_distances[:, np_newaxis]  # shape [num_pols, 3]
    else:
        average = np.sum(v_pols, axis=1) / pol_sides

    idx_offset = len(np_verts) + index_offset
    new_idx = np_arange(idx_offset, pols_number + idx_offset)
    new_pols = np.zeros([pols_number, pol_sides, 3], dtype=int)
    new_pols[:, :, 0] = np_pols
    new_pols[:, :, 1] = np_roll(np_pols, -1, axis=1)
    new_pols[:, :, 2] = new_idx[:, np_newaxis]

    old_vert_id = np_pols[:, 0].tolist() if output_old_v_id else []

    if output_old_face_id:
        old_face_id = np_repeat(np_faces_id[:, np_newaxis], pol_sides, axis=1).tolist()
    else:
        old_face_id = []

    if output_pols_groups:
        pols_groups = np_repeat(1, len(new_pols) * pol_sides).tolist()
    else:
        pols_groups = []

    return (average.tolist(),
            new_pols.reshape(-1, 3).tolist(),
            old_vert_id,
            old_face_id,
            pols_groups)
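# Simplified illustration (plain numpy, made-up data) of what fan_regular_pols builds:
# each polygon gets one new vertex at its centre (optionally offset along the face
# normal) and is replaced by a fan of triangles (corner_i, corner_i+1, new_centre).
import numpy as np

verts = np.array([[0., 0., 0.], [1., 0., 0.], [1., 1., 0.], [0., 1., 0.]])
pols = np.array([[0, 1, 2, 3]])                   # one quad
centre = verts[pols].sum(axis=1) / pols.shape[1]  # centroid of each polygon -> [[0.5 0.5 0. ]]

new_idx = np.arange(len(verts), len(verts) + len(pols))  # index of each new centre vertex
tris = np.zeros((len(pols), pols.shape[1], 3), dtype=int)
tris[:, :, 0] = pols
tris[:, :, 1] = np.roll(pols, -1, axis=1)
tris[:, :, 2] = new_idx[:, np.newaxis]
# tris.reshape(-1, 3) -> [[0 1 4] [1 2 4] [2 3 4] [3 0 4]]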
def numpy_full_list(array, desired_length):
    '''returns array with the desired length by repeating the last item'''
    if not isinstance(array, ndarray):
        array = np_array(array)

    length_diff = desired_length - array.shape[0]

    if length_diff > 0:
        new_part = np_repeat(array[np_newaxis, -1], length_diff, axis=0)
        return np_concatenate((array, new_part))[:desired_length]

    return array[:desired_length]
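# Hypothetical usage sketch of the padding mechanism above, written with plain numpy
# (the np_* names in these functions are assumed to be the usual `from numpy import ...`
# aliases, e.g. `from numpy import repeat as np_repeat`):
import numpy as np

a = np.array([1.0, 2.0, 3.0])
desired_length = 5
diff = desired_length - a.shape[0]
padded = np.concatenate((a, np.repeat(a[np.newaxis, -1], diff, axis=0)))
# padded -> array([1., 2., 3., 3., 3.])  the last item is repeated to fill the gap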
def numpy_match_long_repeat(list_of_arrays):
    '''match the lengths of numpy arrays by repeating the last item of the shorter ones'''
    out = []
    maxl = 0
    for array in list_of_arrays:
        maxl = max(maxl, array.shape[0])
    for array in list_of_arrays:
        length_diff = maxl - array.shape[0]
        if length_diff > 0:
            new_part = np_repeat(array[np_newaxis, -1], length_diff, axis=0)
            array = np_concatenate((array, new_part))
        out.append(array)
    return out
def numpy_full_list_cycle(array, desired_length):
    '''returns array with the desired length by cycling'''

    length_diff = desired_length - array.shape[0]
    if length_diff > 0:
        if length_diff < array.shape[0]:
            return np_concatenate((array, array[:length_diff]))

        # tile whole copies of the array until the missing part is covered
        if len(array.shape) > 1:
            shape = (ceil(length_diff / array.shape[0]), 1)
        else:
            shape = ceil(length_diff / array.shape[0])
        new_part = np_tile(array, shape)

        return np_concatenate((array, new_part[:length_diff]))

    return array[:desired_length]
def numpy_match_long_cycle(list_of_arrays):
    '''match the lengths of numpy arrays by cycling over the shorter ones'''
    out = []
    maxl = 0
    for array in list_of_arrays:
        maxl = max(maxl, array.shape[0])
    for array in list_of_arrays:
        length_diff = maxl - array.shape[0]
        if length_diff > 0:
            if length_diff < array.shape[0]:
                array = np_concatenate((array, array[:length_diff]))
            else:
                # tile whole copies of the array until the missing part is covered
                if len(array.shape) > 1:
                    shape = (ceil(length_diff / array.shape[0]), 1)
                else:
                    shape = ceil(length_diff / array.shape[0])
                new_part = np_tile(array, shape)
                array = np_concatenate((array, new_part[:length_diff]))
        out.append(array)
    return out
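# Sketch (plain numpy, toy data) of why cycling relies on np.tile rather than np.repeat:
# np.repeat duplicates each row in place, while np.tile appends whole copies of the
# array, which is what "cycling" needs in the two functions above.
import numpy as np

a = np.array([[1, 1], [2, 2]])
print(np.repeat(a, 2, axis=0))   # [[1 1] [1 1] [2 2] [2 2]]  -> runs of the same row
print(np.tile(a, (2, 1)))        # [[1 1] [2 2] [1 1] [2 2]]  -> the array cycled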
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                             learn_doctags=True, learn_words=True, learn_hidden=True,
                             word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document, using a
    concatenation of the context window word vectors (rather than a sum or average).

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into
    the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim will
    use the optimized version from doc2vec_inner instead.
    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                   model.vocab[w].sample_int > model.random.rand() * 2**32]
    doctag_len = len(doctag_indexes)
    if doctag_len != model.dm_tag_count:
        return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

    null_word = model.vocab['\0']
    pre_pad_count = model.window
    post_pad_count = model.window
    padded_document_indexes = (
        (pre_pad_count * [null_word.index])  # pre-padding
        + [word.index for word in word_vocabs if word is not None]  # elide out-of-Vocabulary words
        + (post_pad_count * [null_word.index])  # post-padding
    )

    for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
        word_context_indexes = (
            padded_document_indexes[(pos - pre_pad_count):pos]  # preceding words
            + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
        )
        word_context_len = len(word_context_indexes)
        predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
        # numpy advanced-indexing copies; concatenate, flatten to 1d
        l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
        neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                learn_hidden=learn_hidden, learn_vectors=False)

        # filter by locks and shape for addition to source vectors
        e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
        neu1e_r = (neu1e.reshape(-1, model.vector_size)
                   * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

        if learn_doctags:
            np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
        if learn_words:
            np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

    return len(padded_document_indexes) - pre_pad_count - post_pad_count
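# Minimal sketch (plain numpy, hypothetical toy dimensions) of the concatenated input
# layer built above: the doctag vector(s) and the 2*window context word vectors are
# concatenated and flattened, so `l1` has dm_tag_count*size + 2*window*size elements.
# The names below are stand-ins, not gensim attributes.
import numpy as np

vector_size, window, dm_tag_count = 4, 2, 1
word_vectors = np.random.rand(20, vector_size)
doctag_vectors = np.random.rand(3, vector_size)

doctag_indexes = [0]
word_context_indexes = [3, 5, 7, 9]   # window words before + after the predicted word

l1 = np.concatenate((doctag_vectors[doctag_indexes],
                     word_vectors[word_context_indexes])).ravel()
assert l1.shape == (dm_tag_count * vector_size + 2 * window * vector_size,)  # (20,)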
def distrib_nn_for_cdf(self, ntss_tmp, bool_print: bool = False):
    """
    Computes the two indicators, average and standard deviation of the distances,
    needed to use the CDF of the normal distribution.
    The computation of these indicators is described in
    `Scoring Message Stream Anomalies in Railway Communication Systems,
    L. Foulon et al., 2019, ICDM Workshop
    <https://ieeexplore.ieee.org/abstract/document/8955558>`_.

    :param numpy.ndarray ntss_tmp: reference sequences
    :param boolean bool_print: if True, displays the node stats on the standard output

    :returns:
    :rtype: list(numpy.ndarray, numpy.array)
    """
    start_time = time_time()
    node_list, node_list_leaf, node_leaf_ndarray_mean = self.get_list_nodes_and_barycentre()
    if bool_print:
        print("pretrait node --- %s seconds ---" % (time_time() - start_time))
        stdout.flush()
        print(len(node_list), " nodes whose ", len(node_list_leaf), " leafs in tree")
        stdout.flush()

    nb_leaf = len(node_list_leaf)

    cdf_mean = np_zeros((nb_leaf, len(ntss_tmp)))
    cdf_std = np_zeros(nb_leaf)
    nb_ts_by_node = np_zeros(nb_leaf, dtype=np_uint32)
    centroid_dist = np_square(cdist(node_leaf_ndarray_mean, ntss_tmp))

    for num, node in enumerate(node_list_leaf):
        cdf_std[node.id_numpy_leaf] = np_mean(node.std)
        nb_ts_by_node[node.id_numpy_leaf] = node.get_nb_sequences()

    dist_list = np_array([np_zeros(i) for i in nb_ts_by_node], dtype=object)

    # squared distance between the barycentre and the sequences of the same node
    # TODO: np.vectorize?
    for node_nn in node_list_leaf:
        dist_list[node_nn.id_numpy_leaf] = cdist([node_nn.mean], node_nn.get_sequences())[0]
    dist_list = np_square(dist_list)

    # TODO: np.vectorize?
    for num, node in enumerate(node_list_leaf):
        node_id = node.id_numpy_leaf

        centroid_dist_tmp = centroid_dist[node_id]
        centroid_dist_tmp = centroid_dist_tmp.reshape(centroid_dist_tmp.shape + (1,))
        centroid_dist_tmp = np_repeat(centroid_dist_tmp, nb_ts_by_node[node_id], axis=1)

        cdf_mean_tmp = np_add(centroid_dist_tmp, dist_list[node_id])
        cdf_mean[node_id] = np_sum(cdf_mean_tmp, axis=1)

    del dist_list
    del cdf_mean_tmp
    del centroid_dist_tmp

    cdf_mean = np_divide(cdf_mean.T, nb_ts_by_node)
    cdf_mean = np_sqrt(cdf_mean)

    self.cdf_mean = cdf_mean
    self.cdf_std = cdf_std
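# Worked sketch (plain numpy/scipy, made-up data) of the per-leaf mean indicator computed
# above: for each query, the squared distance query->barycentre is added to each squared
# distance barycentre->sequence of the leaf, averaged over the leaf's sequences, then
# square-rooted. This reproduces the np_repeat/np_add/np_sum/np_divide/np_sqrt arithmetic
# for a single leaf; none of the names below belong to the class.
import numpy as np
from scipy.spatial.distance import cdist

queries = np.random.rand(5, 8)        # stand-in for ntss_tmp
leaf_seqs = np.random.rand(3, 8)      # sequences stored in one leaf
barycentre = leaf_seqs.mean(axis=0, keepdims=True)

centroid_dist = np.square(cdist(barycentre, queries))[0]    # shape (5,): query -> barycentre
inner_dist = np.square(cdist(barycentre, leaf_seqs))[0]     # shape (3,): barycentre -> sequences

cdf_mean_leaf = np.sqrt((centroid_dist[:, np.newaxis] + inner_dist).mean(axis=1))  # shape (5,)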
def inset_regular_pols(np_verts, np_pols,
                       np_distances, np_inset_rate, np_make_inners,
                       np_faces_id, custom_normals, matrices,
                       offset_mode='CENTER', proportional=False,
                       concave_support=True, index_offset=0,
                       use_custom_normals=False,
                       output_old_face_id=True,
                       output_old_v_id=True,
                       output_pols_groups=True):

    pols_number = np_pols.shape[0]
    pol_sides = np_pols.shape[1]
    v_pols = np_verts[np_pols]  # shape [num_pols, num_corners, 3]

    if offset_mode == 'SIDES':
        inner_points = sides_mode_inset(v_pols, np_inset_rate, np_distances,
                                        concave_support, proportional,
                                        use_custom_normals, custom_normals)
    elif offset_mode == 'MATRIX':
        inner_points = matrix_mode_inset(v_pols, matrices,
                                         use_custom_normals, custom_normals)
    else:
        if any(np_distances != 0):
            if use_custom_normals:
                normals = custom_normals
            else:
                normals = np_faces_normals(v_pols)
            average = np.sum(v_pols, axis=1) / pol_sides  # shape [num_pols, 3]
            inner_points = (average[:, np_newaxis, :]
                            + (v_pols - average[:, np_newaxis, :]) * np_inset_rate[:, np_newaxis, np_newaxis]
                            + normals[:, np_newaxis, :] * np_distances[:, np_newaxis, np_newaxis])
        else:
            average = np.sum(v_pols, axis=1) / pol_sides  # shape [num_pols, 3]
            inner_points = (average[:, np_newaxis, :]
                            + (v_pols - average[:, np_newaxis, :]) * np_inset_rate[:, np_newaxis, np_newaxis])

    idx_offset = len(np_verts) + index_offset
    new_v_idx = np_arange(idx_offset, pols_number * pol_sides + idx_offset).reshape(pols_number, pol_sides)

    side_pols = np.zeros([pols_number, pol_sides, 4], dtype=int)
    side_pols[:, :, 0] = np_pols
    side_pols[:, :, 1] = np_roll(np_pols, -1, axis=1)
    side_pols[:, :, 2] = np_roll(new_v_idx, -1, axis=1)
    side_pols[:, :, 3] = new_v_idx
    side_faces = side_pols.reshape(-1, 4)

    new_insets = new_v_idx[np_make_inners]
    if pol_sides == 4:
        new_faces = np_concatenate([side_faces, new_insets]).tolist()
    else:
        new_faces = side_faces.tolist() + new_insets.tolist()

    old_v_id = np_pols.flatten().tolist() if output_old_v_id else []

    if output_old_face_id:
        side_ids = np.repeat(np_faces_id[:, np_newaxis], pol_sides, axis=1)
        inset_ids = np_faces_id[np_make_inners]
        old_face_id = np.concatenate((side_ids.flatten(), inset_ids)).tolist()
    else:
        old_face_id = []

    if output_pols_groups:
        pols_groups = np_repeat([1, 2], [len(side_faces), len(new_insets)]).tolist()
    else:
        pols_groups = []

    return (inner_points.reshape(-1, 3).tolist(),
            new_faces,
            new_insets.tolist(),
            old_v_id,
            old_face_id,
            pols_groups)
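# Simplified illustration (plain numpy, made-up data) of the 'CENTER' inset formula used
# above: each corner is pulled towards the face centre by the inset rate, and the result
# is pushed along the face normal by the distance.
import numpy as np

quad = np.array([[0., 0., 0.], [1., 0., 0.], [1., 1., 0.], [0., 1., 0.]])
normal = np.array([0., 0., 1.])          # face normal of the quad above
inset_rate, distance = 0.5, 0.25

centre = quad.mean(axis=0)
inner = centre + (quad - centre) * inset_rate + normal * distance
# inner -> corners of a half-size quad lifted 0.25 above the original face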
def mock_noise(scale, size):
    """Return `size` copies of the `scale` parameter."""
    return np_repeat(scale, size)