def jaccard_similarity_pairwise(corpus: pd.DataFrame) -> list:
    """
    Returns a list of Jaccard similarity results.

    Returned value:
        * list of results of the form: [((i1, i2), score)]
    """
    print('Computing Jaccard similarity for all pairs')
    pairs = utils.get_pairs(corpus)
    similarities = []
    for pair in pairs:
        i1, row1 = pair[0]
        i2, row2 = pair[1]
        t1 = row1['Treść']
        t2 = row2['Treść']
        score = jaccard_similarity(t1, t2)
        similarities.append(((i1, i2), score))
    similarities = sorted(similarities, key=lambda tup: tup[1], reverse=True)
    print('Done')
    return similarities
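
# The jaccard_similarity helper is not shown in this snippet. A minimal sketch,
# assuming it operates on the whitespace-tokenized token sets of the two texts
# (hypothetical implementation, not the original):
def jaccard_similarity(text1: str, text2: str) -> float:
    """Jaccard index of the two texts' token sets: |A & B| / |A | B|."""
    a, b = set(text1.split()), set(text2.split())
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)
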
def calculate_cosine_similarity_for_pairs(corpus: pd.DataFrame, method: str) -> list:
    print('Computing cosine similarity for all pairs')
    pairs = utils.get_pairs(corpus)
    similarities = []
    for pair in pairs:
        i1, row1 = pair[0]
        i2, row2 = pair[1]
        t1 = row1['Treść']
        t2 = row2['Treść']
        texts = [t1, t2]
        model = Tokenizer()
        model.fit_on_texts(texts)
        rep = model.texts_to_matrix(texts, mode=method)
        similarity = cosine_similarity(rep)
        result = ((i1, i2), similarity[0, 1])
        similarities.append(result)
    similarities = sorted(similarities, key=lambda tup: tup[1], reverse=True)
    print('Done')
    return similarities
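
# utils.get_pairs, used by both functions above, is not shown here. It is assumed
# to yield all unordered pairs of (index, row) tuples from the DataFrame; a minimal
# sketch under that assumption (hypothetical, not the original helper):
import itertools

import pandas as pd


def get_pairs(corpus: pd.DataFrame) -> list:
    """All unordered pairs of (index, row) tuples from the corpus."""
    return list(itertools.combinations(corpus.iterrows(), 2))
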
def hungary(self, dispatch_observ):
    if len(dispatch_observ) == 0:
        return []
    # remap original driver/order ids to contiguous indices
    driver_id_orig2new, order_id_orig2new, driver_id_new2orig, order_id_new2orig = \
        rehash(dispatch_observ)
    costs, row_is_driver = build_graph(dispatch_observ, driver_id_orig2new,
                                       order_id_orig2new)
    n = len(costs)
    m = len(costs[0])

    # lmate is the output buffer for the matching, initialised to -1 and
    # filled by the C routine called through ctypes
    lmate = -np.ones(n, dtype=np.int32)
    lmate = lmate.ctypes.data_as(ctypes.c_void_p)
    dataptr = costs.ctypes.data_as(ctypes.c_void_p)
    self.hung.MaxProfMatching(dataptr, n, m, lmate)

    # read the result buffer back into a Python list
    array_pointer = ctypes.cast(lmate, ctypes.POINTER(ctypes.c_int * n))
    np_arr = np.frombuffer(array_pointer.contents, dtype=np.int32, count=n)
    lmate = np_arr.reshape((n, ))
    lmate = list(lmate)

    # translate the matching back to original driver/order ids
    dispatch_action = get_pairs(lmate, row_is_driver, driver_id_new2orig,
                                order_id_new2orig)
    return dispatch_action
def __init__(self,
             data_source: str,
             pair_file: str,
             transform: transforms,
             preload: bool = False):
    self.data_source = data_source
    self.pair_file = pair_file
    self.transform = transform
    self.pairs, self.issame = utils.get_pairs(pair_file, data_source)
    self.preloaded = False
    if preload:
        print('Preload images')
        self.images = {}
        uniques = np.unique(np.array(self.pairs))
        tbar = tqdm.tqdm(uniques)
        for path in tbar:
            img = Image.open(path)
            self.images[path] = img.copy()
        self.preloaded = True
def set_covariance_matrices(self):
    correlation_pairs = get_pairs(self.correlation_symbols, join_with='-')
    for correlation_pair in tqdm(correlation_pairs, desc='covariance matrices'):
        a1 = correlation_pair[0]
        a2 = correlation_pair[1]
        b1 = correlation_pair[3]
        b2 = correlation_pair[4]

        covariance_workspace = nmt.NmtCovarianceWorkspace()
        covariance_workspace.compute_coupling_coefficients(
            self.fields[a1], self.fields[a2], self.fields[b1], self.fields[b2])

        self.covariance_matrices[correlation_pair] = nmt.gaussian_covariance(
            covariance_workspace,
            0, 0, 0, 0,
            [self.theory_correlations[''.join(sorted([a1, b1]))]],
            [self.theory_correlations[''.join(sorted([a1, b2]))]],
            [self.theory_correlations[''.join(sorted([a2, b1]))]],
            [self.theory_correlations[''.join(sorted([a2, b2]))]],
            wa=self.workspaces[a1 + a2],
            wb=self.workspaces[b1 + b2],
        )

        transpose_corr_symbol = b1 + b2 + '-' + a1 + a2
        self.covariance_matrices[transpose_corr_symbol] = np.transpose(
            self.covariance_matrices[correlation_pair])

        if a1 + a2 == b1 + b2:
            self.correlation_matrices[correlation_pair] = get_correlation_matrix(
                self.covariance_matrices[correlation_pair])
import pytest
import utils
import random
import time
from utils import os_client


@pytest.fixture(scope='session')
def local_salt_client():
    return utils.init_salt_client()


# TODO: fix
# should not be executed on any test run
nodes = utils.get_pairs()
hw_nodes = utils.get_hw_pairs()


@pytest.fixture(scope='session', params=nodes.values(), ids=nodes.keys())
def pair(request):
    return request.param


@pytest.fixture(scope='session', params=hw_nodes.values(), ids=hw_nodes.keys())
def hw_pair(request):
    return request.param


@pytest.fixture(scope='session')
def openstack_clients(local_salt_client):
    nodes_info = local_salt_client.cmd(
        'keystone:server', 'pillar.get',
def __init__(self, config, set_data=False, set_maps=False, set_correlations=False):
    # Data parameters
    self.lss_survey_name = None
    self.lss_mask_name = None
    self.flux_min_cut = 0
    self.nside = 0
    self.z_tail = 0
    self.bias = 0
    self.scale_bias = None

    # Correlation parameters
    self.correlation_symbols = []
    self.l_min = {}
    self.l_max = {}
    self.ells_per_bin = {}
    self.ell_lengths = {}
    self.cosmology_name = None
    self.cosmology_matter_power_spectrum = None
    self.cosmology_params = None

    # MCMC parameters
    self.continue_sampling = False
    self.n_walkers = 0
    self.max_iterations = 0
    self.starting_params = {}

    # Data containers
    self.map_symbols = []
    self.data = {}
    self.base_maps = {}
    self.noise_maps = {}
    self.weight_maps = {}
    self.processed_maps = {}
    self.masks = {}
    self.noise_curves = defaultdict(int)
    self.noise_decoupled = defaultdict(int)

    # Correlation containers
    self.z_arr = []
    self.n_arr = []
    self.theory_correlations = {}
    self.data_correlations = {}
    self.chi_squared = {}
    self.sigmas = {}
    self.fields = {}
    self.workspaces = {}
    self.binnings = {}
    self.covariance_matrices = {}
    self.correlation_matrices = {}
    self.l_arr = None
    self.n_ells = {}

    # MCMC containers
    self.inference_covariance = None
    self.inference_correlation = None
    self.data_vector = None
    self.inverted_covariance = None
    self.p0_walkers = None
    self.emcee_sampler = None
    self.arg_names = None
    self.backend_filename = None
    self.tau_filename = None
    self.mcmc_folder = os.path.join(PROJECT_PATH, 'outputs/MCMC')

    # Pipeline flags
    self.are_maps_ready = False
    self.are_correlations_ready = False
    self.is_sampler_ready = False

    self.config = config
    for key, value in config.items():
        setattr(self, key, value)

    # Generate necessary correlation symbols
    self.map_symbols = list(set(''.join(self.correlation_symbols)))
    self.all_correlation_symbols = get_pairs(self.map_symbols)

    # Create experiment name
    self.arg_names = list(self.starting_params.keys())
    experiment_name_parts = [
        '-'.join(self.correlation_symbols), '_'.join(self.arg_names)
    ]
    if 'experiment_tag' in config and config['experiment_tag'] is not None \
            and len(config['experiment_tag']) > 0:
        experiment_name_parts.append(config['experiment_tag'])
    self.experiment_name = '__'.join(experiment_name_parts)

    # Set maps and correlations
    if set_data:
        self.set_data()
    if set_maps:
        self.set_maps()
    if set_correlations:
        self.set_correlations()
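
# The get_pairs used in this class is not shown. Judging from the call above and
# from the join_with='-' call in set_covariance_matrices, it is assumed to build
# every unordered pair of symbols (including self-pairs such as 'gg') joined into
# one string; a minimal sketch under that assumption (hypothetical, not the original):
from itertools import combinations_with_replacement


def get_pairs(symbols, join_with=''):
    """E.g. ['g', 't'] -> ['gg', 'gt', 'tt']; with join_with='-', 'gg-gt' etc."""
    return [join_with.join(pair)
            for pair in combinations_with_replacement(symbols, 2)]
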
for language in languages:
    n_docs = len(language_doc_dict[language])
    docs = language_doc_dict[language]
    # your code here
    tokens_n = []
    for doc in docs:
        path_to_doc = f'{doc}/{language}.drs.xml'
        tokens = get_tokens(path_to_doc)
        length = len(tokens)
        tokens_n.append(length)
    n_tokens = sum(tokens_n)
    print(f'{language}: num docs: {n_docs}, num tokens: {n_tokens}')

pairs = get_pairs(languages)
print(pairs)
for lang1, lang2 in pairs:
    docs_lang1 = language_doc_dict[lang1]
    docs_lang2 = language_doc_dict[lang2]
    # print(len(docs_lang1))
    # print(len(docs_lang2))
    number_of_docs = len(docs_lang1.intersection(docs_lang2))
    # print(len(docs_in_two))
    print(
        f'Coverage for parallel data in {lang1} and {lang2}: {number_of_docs}')

v1 = input(
    "Please enter the first language for comparison (e.g. en, it, de, nl): ")
v2 = input(
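
# get_pairs(languages) is assumed to return every unordered language pair, which is
# consistent with the `for lang1, lang2 in pairs` unpacking above; a minimal sketch
# under that assumption (hypothetical, not the original helper):
from itertools import combinations


def get_pairs(languages):
    """E.g. ['en', 'it', 'de'] -> [('en', 'it'), ('en', 'de'), ('it', 'de')]."""
    return list(combinations(languages, 2))
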
    X2mask.append([1] * len(x2token))

    X1ids = LongTensor(
        utils.seq_padding(np.array(X1ids), para.bert_maxlen))
    X2ids = LongTensor(
        utils.seq_padding(np.array(X2ids), para.bert_maxlen))
    X1mask = FloatTensor(
        utils.seq_padding(np.array(X1mask), para.bert_maxlen))
    X2mask = FloatTensor(
        utils.seq_padding(np.array(X2mask), para.bert_maxlen))

    # After wrapping the model in DataParallel, the two lines below need to go through .module()
    X1embed = model.bert_embedding([X1ids, X1mask])
    X2embed = model.bert_embedding([X2ids, X2mask])

    T1embed, T2embed = \
        utils.get_pairs([X1embed.cpu().data.numpy(), X2embed.cpu().data.numpy()])
    T1embed = FloatTensor(T1embed)
    T2embed = FloatTensor(T2embed)

    model.zero_grad()
    _, loss = model([X1embed, X2embed, T1embed, T2embed], is_training=True)
    # print('loss: ', loss.sum())
    loss_list.append(loss.sum().cpu().data.numpy())
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print("\nepoch[%d/%d] mean_loss : %0.4f" %
              (epoch + 1, para.epoch, np.mean(loss_list)))
def train(self, N, row, col, T, n_features, n_pairwise_features,
          hidden_layer_sizes, n_iterations, batch_size, n_samples,
          holdout_ratio_valid, learning_rate, root_savedir,
          log_interval=10, no_train_metric=False, seed=None, debug=False):
    """
    Training routine.

    Note about the data: the (row, col) tuples of the ON (i.e., one-valued) entries of the graph
    are to be passed, and they should correspond to the upper triangle of the graph. (Recall we do
    not allow self-links.) Regardless, the code will make a symmetric graph out of all passed
    entries (within the upper triangle or not) and only the upper triangle of the resulting matrix
    will be kept.

    :param N: Number of nodes in the graph.
    :param row: row indices corresponding to the ON entries (in the upper triangle).
    :param col: col indices corresponding to the ON entries (in the upper triangle).
    :param T: Truncation level for the DP.
    :param n_features:
    :param n_pairwise_features:
    :param hidden_layer_sizes:
    :param n_iterations:
    :param batch_size: HALF the minibatch size. In particular, we always add the symmetric entry
        in the graph (i.e., the corresponding entry in the lower triangle) to the minibatch.
    :param n_samples:
    :param holdout_ratio_valid:
    :param learning_rate:
    :param root_savedir:
    :param no_train_metric:
    :param seed:
    :param debug:
    :return:
    """

    self.N = N
    self.T = T
    self.n_features = n_features
    self.n_pairwise_features = n_pairwise_features
    self.hidden_layer_sizes = hidden_layer_sizes

    if not os.path.exists(root_savedir):
        os.makedirs(root_savedir)

    # Data handling.
    X_sp = sp.csr_matrix((np.ones(len(row)), (row, col)), shape=[N, N])
    X_sp = X_sp + X_sp.transpose()
    X_sp = sp.triu(X_sp, k=1)
    row, col = X_sp.nonzero()

    pairs = get_pairs(N, row, col)
    pairs = pairs.astype(int)
    batch_generator = BatchGenerator(pairs, batch_size,
                                     holdout_ratio=holdout_ratio_valid, seed=seed)

    # Construct the TF graph.
    self.construct_graph()

    all_vars = tf.trainable_variables()
    print("\nTrainable variables:")
    pprint([var_.name for var_ in all_vars])

    train_op = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(-self.elbo)

    ### Create q(Z) variational parameters ###
    # before this was uniformly initialized
    # self.qZ_ = np.ones([N, T]) / T
    self.qZ_ = np.random.dirichlet(np.ones(T), size=N)  # (N, T)

    # the following quantity needs to be passed to the TF graph and must be updated
    # after every update to qZ
    sum_qZ_above = np.zeros([N, T - 1])
    for k in range(T - 1):
        sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)

    # Training.
    if not no_train_metric:
        train_elbo = tf.placeholder(dtype=tf.float32, shape=[], name='train_elbo')
        train_elbo_summary = tf.summary.scalar('train_elbo', train_elbo)

        train_ll = tf.placeholder(dtype=tf.float32, shape=[], name='train_ll')
        train_ll_summary = tf.summary.scalar('train_ll', train_ll)

    if holdout_ratio_valid is not None:
        test_ll = tf.placeholder(dtype=tf.float32, shape=[], name='test_ll')
        test_ll_summary = tf.summary.scalar('test_ll', test_ll)

    # Grab all scalar variables, to track in Tensorboard.
    trainable_vars = tf.trainable_variables()
    scalar_summaries = [
        tf.summary.scalar(tensor_.name, tensor_)
        for tensor_ in trainable_vars if len(tensor_.shape) == 0
    ]
    tensor_summaries = [
        tf.summary.histogram(tensor_.name, tensor_)
        for tensor_ in trainable_vars if len(tensor_.shape) > 0
    ]

    root_logdir = os.path.join(root_savedir, "tf_logs")
    writer = tf.summary.FileWriter(root_logdir)

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        init.run()

        if not no_train_metric:
            # add symmetric entries from the lower triangle
            train_data = batch_generator.train
            row = np.concatenate([train_data[:, 0], train_data[:, 1]])
            col = np.concatenate([train_data[:, 1], train_data[:, 0]])
            val = np.concatenate([train_data[:, 2], train_data[:, 2]])
            train_dict = {
                self.row: row,
                self.col: col,
                self.val: val,
                self.batch_scale: 1.0
            }

        if holdout_ratio_valid is not None:
            test_data = batch_generator.test
            row = np.concatenate([test_data[:, 0], test_data[:, 1]])
            col = np.concatenate([test_data[:, 1], test_data[:, 0]])
            val = np.concatenate([test_data[:, 2], test_data[:, 2]])
            test_dict = {
                self.row: row,
                self.col: col,
                self.val: val,
                self.batch_scale: 1.0
            }

        logging.info("Starting training...")

        for iteration in range(n_iterations):

            batch = batch_generator.next_batch()
            batch_dict = {
                self.row: np.concatenate([batch[:, 0], batch[:, 1]]),
                self.col: np.concatenate([batch[:, 1], batch[:, 0]]),
                self.val: np.concatenate([batch[:, 2], batch[:, 2]]),
                self.qZ: self.qZ_,
                self.n_samples: n_samples,
                self.batch_scale: len(pairs) / len(batch),
                self.sum_qZ_above: sum_qZ_above,
            }

            # make a gradient update
            sess.run(train_op, feed_dict=batch_dict)

            # update q(Z) analytically
            self.update_qZ(sess=sess, batch=batch, n_samples=n_samples, debug=debug)

            # This update to sum_qZ_above was previously done at the beginning of the iteration.
            # Here it is updated before logging the intermediate loss functions, and one more
            # time before saving the model, which actually makes more sense. We could also add
            # this computation inside construct_graph; it would be recomputed a few more times,
            # but it would make the code cleaner.
            for k in range(T - 1):
                sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)

            if iteration % log_interval == 0:

                # Add scalar variables to Tensorboard.
                for summ_str in sess.run(scalar_summaries):
                    writer.add_summary(summ_str, iteration)

                # Add tensor variables to Tensorboard.
                for summ_str in sess.run(tensor_summaries):
                    writer.add_summary(summ_str, iteration)

                if not no_train_metric:
                    train_dict.update({
                        self.qZ: self.qZ_,
                        self.sum_qZ_above: sum_qZ_above,
                        self.n_samples: 100
                    })
                    train_ll_, train_elbo_ = sess.run(
                        [self.data_loglikel, self.elbo], feed_dict=train_dict)
                    train_ll_summary_str, train_elbo_summary_str = sess.run(
                        [train_ll_summary, train_elbo_summary],
                        feed_dict={
                            train_ll: train_ll_,
                            train_elbo: train_elbo_
                        })
                    writer.add_summary(train_ll_summary_str, iteration)
                    writer.add_summary(train_elbo_summary_str, iteration)

                if holdout_ratio_valid is not None:
                    test_dict.update({
                        self.qZ: self.qZ_,
                        self.sum_qZ_above: sum_qZ_above,
                        self.n_samples: 100
                    })
                    test_ll_ = sess.run(self.data_loglikel, feed_dict=test_dict)
                    test_ll_summary_str = sess.run(
                        test_ll_summary, feed_dict={test_ll: test_ll_})
                    writer.add_summary(test_ll_summary_str, iteration)

                # Log training overview.
log_str = "%-4d" % iteration if not no_train_metric: log_str += " ELBO: %.4e Train ll: %.4e" % ( train_elbo_, train_ll_) if holdout_ratio_valid is not None: log_str += " Valid ll: %.4e" % test_ll_ logging.info(log_str) # save the model saver.save(sess, os.path.join(root_savedir, "model.ckpt")) # close the file writer writer.close()
def train_epoch(self):
    epoch_loss = 0.
    for sent in self.dataset:
        sent = [w for w in sent.split() if w in self.vocab]
        if len(sent) <= 1:
            continue
        sent = self.sub_sampler.subsample_sent(sent)
        if len(sent) <= 1:
            continue
        pairs = get_pairs(sent, self.c)

        # sentence-batched version
        loss = 0.
        self.optimizer.zero_grad()  # reset gradients accumulated from the previous sentence
        pair_idxs = torch.tensor(
            [[self.vocab[pair[0]][0], self.vocab[pair[1]][0]] for pair in pairs])
        inps = self.vecs[pair_idxs[:, 0]]
        targs = self.vecs[pair_idxs[:, 1]]
        t_scores = torch.bmm(inps, torch.transpose(targs, 1, 2))

        # find indices of the senses used for each word in the sentence
        m2, mi2 = torch.max(t_scores, dim=2)
        senses = torch.argmax(m2, dim=1)

        # This one also "negative samples" the scores produced by non-maximal word sense pairings
        # _, t_max_ind = torch.max(t_scores, dim=1)
        # t_scores *= -1
        # t_scores[np.arange(len(t_scores)), t_max_ind] *= -1
        # t_probs = self.sigmoid(t_scores)

        # This does not penalize other senses
        t_probs = self.sigmoid(
            torch.max(t_scores.view(len(pairs), -1), dim=1)[0])
        loss += -torch.mean(torch.log(t_probs))

        # can I also batch negative samples? :O
        neg_batch_idxs = self.neg_sampler.sample(self.k * len(pairs), words=False)
        ntargs = self.vecs[neg_batch_idxs]
        neg_senses = senses.repeat(self.k)
        neg_inps = inps.repeat(self.k, 1, 1)
        # only use the senses indicated by max score
        neg_inps = neg_inps[np.arange(len(neg_inps)), neg_senses, :]
        n_scores = torch.bmm(neg_inps.view(len(neg_inps), 1, -1),
                             torch.transpose(ntargs, 1, 2))

        # negative sample every sense pairing?
        # n_probs = self.sigmoid(-1.0 * n_scores)

        # only negative sample the max?
        n_probs = self.sigmoid(
            -1.0 * torch.max(n_scores.view(len(n_scores), -1), dim=1)[0])

        # only negative sample the sense that inp was actually used in (the row in which the max occurs)
        # n_probs = self.sigmoid(-1.0 * n_scores)

        loss += -torch.sum(torch.log(n_probs)) / len(n_probs)
        loss.backward()
        self.optimizer.step()
        epoch_loss += float(loss)

        # # non-batched version
        # for pair in pairs:
        #     loss = 0.
        #     self.optimizer.zero_grad()
        #     inp = self.vecs[self.vocab[pair[0]][0]]
        #     targ = self.vecs[self.vocab[pair[1]][0]]
        #     t_scores = torch.mm(inp, targ.t())
        #     t_prob = self.sigmoid(torch.max(t_scores))
        #     loss += -torch.log(t_prob)
        #
        #     # negative sampling
        #     neg_samples = self.neg_sampler.sample(self.k)
        #     for ns in neg_samples:
        #         ntarg = self.vecs[self.vocab[ns][0]]
        #         n_scores = torch.mm(inp, ntarg.t())
        #         n_prob = self.sigmoid(-1.0 * torch.max(n_scores))
        #         loss += -torch.log(n_prob)
        #
        #     loss.backward()
        #     self.optimizer.step()
        #     epoch_loss += float(loss)

    return epoch_loss
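
# get_pairs(sent, self.c) is not shown here. It is assumed to produce skip-gram style
# (centre, context) word pairs within a symmetric window of size c, matching the
# vocabulary lookups above; a minimal sketch under that assumption (hypothetical):
def get_pairs(sent, c):
    """All (centre, context) pairs with |i - j| <= c and i != j."""
    pairs = []
    for i, centre in enumerate(sent):
        for j in range(max(0, i - c), min(len(sent), i + c + 1)):
            if j != i:
                pairs.append((centre, sent[j]))
    return pairs
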