Example #1
def jaccard_similarity_pairwise(corpus: pd.DataFrame) -> list:
    """Returns a list of Jaccard similarity results.

    Return value:
    * list of results in the form:
      [((i1, i2), score)]
    """

    print('Computing Jaccard similarity for all pairs')
    pairs = utils.get_pairs(corpus)

    similarities = []

    for pair in pairs:
        i1, row1 = pair[0]
        i2, row2 = pair[1]

        t1 = row1['Treść']
        t2 = row2['Treść']

        score = jaccard_similarity(t1, t2)

        similarities.append(((i1, i2), score))

    similarities = sorted(similarities, key=lambda tup: tup[1], reverse=True)
    print('Done')

    return similarities
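The snippet above depends on two helpers that are not shown: jaccard_similarity on two texts and utils.get_pairs on the corpus DataFrame. A minimal sketch of what they might look like, assuming whitespace tokenization and that get_pairs enumerates unordered (index, row) pairs (the names and behavior are assumptions, not the original project's code):

from itertools import combinations

import pandas as pd


def jaccard_similarity(text1: str, text2: str) -> float:
    """Jaccard index of the two token sets (intersection over union)."""
    a, b = set(text1.split()), set(text2.split())
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)


def get_pairs(corpus: pd.DataFrame) -> list:
    """All unordered pairs of (index, row) tuples from the corpus."""
    return list(combinations(corpus.iterrows(), 2))

With these stubs, jaccard_similarity_pairwise runs on any DataFrame that has a 'Treść' (content) column.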
Example #2
def calculate_cosine_similarity_for_pairs(corpus: pd.DataFrame,
                                          method: str) -> list:
    print('Computing cosine similarity for all pairs')
    pairs = utils.get_pairs(corpus)

    similarities = []

    for pair in pairs:
        i1, row1 = pair[0]
        i2, row2 = pair[1]

        t1 = row1['Treść']
        t2 = row2['Treść']

        texts = [t1, t2]

        model = Tokenizer()
        model.fit_on_texts(texts)
        rep = model.texts_to_matrix(texts, mode=method)
        similarity = cosine_similarity(rep)

        result = ((i1, i2), similarity[0, 1])

        similarities.append(result)

    similarities = sorted(similarities, key=lambda tup: tup[1], reverse=True)
    print('Done')

    return similarities
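Tokenizer here is presumably keras.preprocessing.text.Tokenizer and cosine_similarity presumably sklearn.metrics.pairwise.cosine_similarity; under that assumption, texts_to_matrix accepts the modes 'binary', 'count', 'tfidf' and 'freq', and cosine_similarity(rep) returns a 2x2 matrix whose off-diagonal entry [0, 1] is the score for the pair. A minimal standalone version of the inner computation:

from keras.preprocessing.text import Tokenizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ['the cat sat on the mat', 'the cat sat on the hat']
model = Tokenizer()
model.fit_on_texts(texts)
rep = model.texts_to_matrix(texts, mode='count')  # also: 'binary', 'tfidf', 'freq'
print(cosine_similarity(rep)[0, 1])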
Example #3
    def hungary(self, dispatch_observ):
        if len(dispatch_observ) == 0:
            return []
        driver_id_orig2new, order_id_orig2new, driver_id_new2orig, order_id_new2orig = rehash(
            dispatch_observ)
        costs, row_is_driver = build_graph(dispatch_observ, driver_id_orig2new,
                                           order_id_orig2new)
        n = len(costs)
        m = len(costs[0])
        lmate = -np.ones(n, dtype=np.int32)
        lmate = lmate.ctypes.data_as(ctypes.c_void_p)
        dataptr = costs.ctypes.data_as(ctypes.c_void_p)
        self.hung.MaxProfMatching(dataptr, n, m, lmate)
        array_pointer = ctypes.cast(lmate, ctypes.POINTER(ctypes.c_int * n))
        np_arr = np.frombuffer(array_pointer.contents, dtype=np.int32, count=n)
        lmate = np_arr.reshape((n, ))
        lmate = list(lmate)
        dispatch_action = get_pairs(lmate, row_is_driver, driver_id_new2orig,
                                    order_id_new2orig)
        return dispatch_action
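In the example above, the cost matrix is handed to a native Hungarian-algorithm solver (self.hung.MaxProfMatching) through raw ctypes pointers: lmate starts as an int32 array of -1s, its buffer address is passed to the solver, and the assignment is read back via ctypes.cast and np.frombuffer, so -1 presumably still marks an unmatched row afterwards. rehash, build_graph and get_pairs are project helpers that translate between the original driver/order ids and the solver's dense indices; their behavior is inferred from the call sites, not shown here.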
Example #4
    def __init__(self,
                 data_source: str,
                 pair_file: str,
                 transform: transforms,
                 preload: bool = False):
        self.data_source = data_source
        self.pair_file = pair_file
        self.transform = transform

        self.pairs, self.issame = utils.get_pairs(pair_file, data_source)

        self.preloaded = False
        if preload:
            print('Preload images')
            self.images = {}
            uniques = np.unique(np.array(self.pairs))
            tbar = tqdm.tqdm(uniques)
            for path in tbar:
                img = Image.open(path)
                self.images[path] = img.copy()
            self.preloaded = True
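Only __init__ is shown; a __getitem__/__len__ that would fit this constructor might look like the sketch below, assuming each entry of self.pairs is a tuple of two image paths and self.issame holds the per-pair label (this is a guess at the interface, not the original code). The img.copy() in the preload loop forces PIL to load the pixel data so the underlying file can be released.

    def __getitem__(self, index):
        path1, path2 = self.pairs[index]
        if self.preloaded:
            img1, img2 = self.images[path1], self.images[path2]
        else:
            img1, img2 = Image.open(path1), Image.open(path2)
        return self.transform(img1), self.transform(img2), self.issame[index]

    def __len__(self):
        return len(self.pairs)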
Example #5
    def set_covariance_matrices(self):
        correlation_pairs = get_pairs(self.correlation_symbols, join_with='-')
        for correlation_pair in tqdm(correlation_pairs,
                                     desc='covariance matrices'):
            # correlation_pair has the form 'xy-zw': characters 0 and 1 are the
            # first correlation's symbols, characters 3 and 4 the second's
            a1 = correlation_pair[0]
            a2 = correlation_pair[1]
            b1 = correlation_pair[3]
            b2 = correlation_pair[4]

            covariance_workspace = nmt.NmtCovarianceWorkspace()
            covariance_workspace.compute_coupling_coefficients(
                self.fields[a1], self.fields[a2], self.fields[b1],
                self.fields[b2])

            self.covariance_matrices[
                correlation_pair] = nmt.gaussian_covariance(
                    covariance_workspace,
                    0,
                    0,
                    0,
                    0,
                    [self.theory_correlations[''.join(sorted([a1, b1]))]],
                    [self.theory_correlations[''.join(sorted([a1, b2]))]],
                    [self.theory_correlations[''.join(sorted([a2, b1]))]],
                    [self.theory_correlations[''.join(sorted([a2, b2]))]],
                    wa=self.workspaces[a1 + a2],
                    wb=self.workspaces[b1 + b2],
                )

            transpose_corr_symbol = b1 + b2 + '-' + a1 + a2
            self.covariance_matrices[transpose_corr_symbol] = np.transpose(
                self.covariance_matrices[correlation_pair])

            if a1 + a2 == b1 + b2:
                self.correlation_matrices[
                    correlation_pair] = get_correlation_matrix(
                        self.covariance_matrices[correlation_pair])
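nmt is presumably pymaster (NaMaster): NmtCovarianceWorkspace.compute_coupling_coefficients takes the four fields entering the covariance block, and gaussian_covariance takes the four spins (all 0 here) followed by the theory spectra C_a1b1, C_a1b2, C_a2b1, C_a2b2 and the two mode-coupling workspaces. The block stored under b1 + b2 + '-' + a1 + a2 uses the symmetry cov(AB, CD) = cov(CD, AB)^T, and the correlation matrix is only derived for the diagonal blocks (a1 + a2 == b1 + b2).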
Example #6
import pytest
import utils
import random
import time
from utils import os_client

@pytest.fixture(scope='session')
def local_salt_client():
    return utils.init_salt_client()


# TODO: fix
# should not be executed on any test run
nodes = utils.get_pairs()
hw_nodes = utils.get_hw_pairs()


@pytest.fixture(scope='session', params=nodes.values(), ids=nodes.keys())
def pair(request):
    return request.param


@pytest.fixture(scope='session', params=hw_nodes.values(), ids=hw_nodes.keys())
def hw_pair(request):
    return request.param


@pytest.fixture(scope='session')
def openstack_clients(local_salt_client):
    nodes_info = local_salt_client.cmd(
        'keystone:server', 'pillar.get',
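pytest evaluates fixture params during collection, so utils.get_pairs() and utils.get_hw_pairs() run whenever this module is imported, regardless of which tests are actually selected; that is what the TODO above them is flagging.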
Example #7
    def __init__(self,
                 config,
                 set_data=False,
                 set_maps=False,
                 set_correlations=False):
        # Data parameters
        self.lss_survey_name = None
        self.lss_mask_name = None
        self.flux_min_cut = 0
        self.nside = 0
        self.z_tail = 0
        self.bias = 0
        self.scale_bias = None

        # Correlation parameters
        self.correlation_symbols = []
        self.l_min = {}
        self.l_max = {}
        self.ells_per_bin = {}
        self.ell_lengths = {}
        self.cosmology_name = None
        self.cosmology_matter_power_spectrum = None
        self.cosmology_params = None

        # MCMC parameters
        self.continue_sampling = False
        self.n_walkers = 0
        self.max_iterations = 0
        self.starting_params = {}

        # Data containers
        self.map_symbols = []
        self.data = {}
        self.base_maps = {}
        self.noise_maps = {}
        self.weight_maps = {}
        self.processed_maps = {}
        self.masks = {}
        self.noise_curves = defaultdict(int)
        self.noise_decoupled = defaultdict(int)

        # Correlation containers
        self.z_arr = []
        self.n_arr = []
        self.theory_correlations = {}
        self.data_correlations = {}
        self.chi_squared = {}
        self.sigmas = {}
        self.fields = {}
        self.workspaces = {}
        self.binnings = {}
        self.covariance_matrices = {}
        self.correlation_matrices = {}
        self.l_arr = None
        self.n_ells = {}

        # MCMC containers
        self.inference_covariance = None
        self.inference_correlation = None
        self.data_vector = None
        self.inverted_covariance = None
        self.p0_walkers = None
        self.emcee_sampler = None
        self.arg_names = None
        self.backend_filename = None
        self.tau_filename = None
        self.mcmc_folder = os.path.join(PROJECT_PATH, 'outputs/MCMC')

        # Pipeline flags
        self.are_maps_ready = False
        self.are_correlations_ready = False
        self.is_sampler_ready = False

        self.config = config
        for key, value in config.items():
            setattr(self, key, value)

        # Generate necessary correlation symbols
        self.map_symbols = list(set(''.join(self.correlation_symbols)))
        self.all_correlation_symbols = get_pairs(self.map_symbols)

        # Create experiment name
        self.arg_names = list(self.starting_params.keys())
        experiment_name_parts = [
            '-'.join(self.correlation_symbols), '_'.join(self.arg_names)
        ]
        experiment_tag = config.get('experiment_tag')
        if experiment_tag is not None and len(experiment_tag) > 0:
            experiment_name_parts.append(experiment_tag)
        self.experiment_name = '__'.join(experiment_name_parts)

        # Set maps and correlations
        if set_data:
            self.set_data()
        if set_maps:
            self.set_maps()
        if set_correlations:
            self.set_correlations()
Example #8
for language in languages:
    n_docs = len(language_doc_dict[language])
    docs = language_doc_dict[language]
    tokens_n = []

    for doc in docs:
        path_to_doc = f'{doc}/{language}.drs.xml'
        tokens = get_tokens(path_to_doc)
        length = len(tokens)
        tokens_n.append(length)
    n_tokens = sum(tokens_n)
    print(f'{language}: num docs: {n_docs}, num tokens: {n_tokens}')

pairs = get_pairs(languages)
print(pairs)

for lang1, lang2 in pairs:
    docs_lang1 = language_doc_dict[lang1]
    docs_lang2 = language_doc_dict[lang2]
    #print(len(docs_lang1))
    #print(len(docs_lang2))
    number_of_docs = len(docs_lang1.intersection(docs_lang2))
    #print(len(docs_in_two))
    print(
        f'Coverage for parallel data in {lang1} and {lang2}: {number_of_docs}')

v1 = input(
    "Please enter the first language for comparison (e.g. en, it, de, nl): ")
v2 = input(
Example #9
                X2mask.append([1] * len(x2token))
            X1ids = LongTensor(
                utils.seq_padding(np.array(X1ids), para.bert_maxlen))
            X2ids = LongTensor(
                utils.seq_padding(np.array(X2ids), para.bert_maxlen))
            X1mask = FloatTensor(
                utils.seq_padding(np.array(X1mask), para.bert_maxlen))
            X2mask = FloatTensor(
                utils.seq_padding(np.array(X2mask), para.bert_maxlen))

            # When the model is wrapped in DataParallel, the two lines below must be called via .module
            X1embed = model.bert_embedding([X1ids, X1mask])
            X2embed = model.bert_embedding([X2ids, X2mask])

            T1embed, T2embed = \
                utils.get_pairs([X1embed.cpu().data.numpy(), X2embed.cpu().data.numpy()])

            T1embed = FloatTensor(T1embed)
            T2embed = FloatTensor(T2embed)

            model.zero_grad()
            _, loss = model([X1embed, X2embed, T1embed, T2embed],
                            is_training=True)
            # print('loss: ', loss.sum())
            loss_list.append(loss.sum().cpu().data.numpy())
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                print("\nepoch[%d/%d] mean_loss : %0.4f" %
                      (epoch + 1, para.epoch, np.mean(loss_list)))
Example #10
    def train(self,
              N,
              row,
              col,
              T,
              n_features,
              n_pairwise_features,
              hidden_layer_sizes,
              n_iterations,
              batch_size,
              n_samples,
              holdout_ratio_valid,
              learning_rate,
              root_savedir,
              log_interval=10,
              no_train_metric=False,
              seed=None,
              debug=False):
        """
        Training routine.

        Note about the data: the (row, col) tuples of the ON (i.e., one-valued) entries of the graph are to be passed,
        and they should correspond to the upper triangle of the graph. (Recall we do not allow self-links.) Regardless,
        the code will make a symmetric graph out of all passed entries (within the upper triangular or not) and only the
        upper triangle of the resulting matrix will be kept.

        :param N: Number of nodes in the graph.
        :param row: row indices corresponding to the ON entries (in the upper triangle).
        :param col: col indices corresponding to the ON entries (in the upper triangle).
        :param T: Truncation level for the DP.
        :param n_features:
        :param hidden_layer_sizes:
        :param n_iterations:
        :param batch_size: HALF the minibatch size. In particular, we will always add the symmetric entry in the graph
            (i.e., the corresponding entry in the lower triangle) in the minibatch.
        :param n_samples:
        :param holdout_ratio_valid:
        :param learning_rate:
        :param root_savedir:
        :param no_train_metric:
        :param seed:
        :param debug:
        :return:
        """
        self.N = N
        self.T = T
        self.n_features = n_features
        self.n_pairwise_features = n_pairwise_features
        self.hidden_layer_sizes = hidden_layer_sizes

        if not os.path.exists(root_savedir):
            os.makedirs(root_savedir)

        # Data handling.
        X_sp = sp.csr_matrix((np.ones(len(row)), (row, col)), shape=[N, N])
        X_sp = X_sp + X_sp.transpose()
        X_sp = sp.triu(X_sp, k=1)
        row, col = X_sp.nonzero()

        pairs = get_pairs(N, row, col)
        pairs = pairs.astype(int)

        batch_generator = BatchGenerator(pairs,
                                         batch_size,
                                         holdout_ratio=holdout_ratio_valid,
                                         seed=seed)

        # Construct the TF graph.
        self.construct_graph()
        all_vars = tf.trainable_variables()
        print("\nTrainable variables:")
        pprint([var_.name for var_ in all_vars])

        train_op = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(-self.elbo)

        ###  Create q(Z) variational parameters  ###

        # before this was uniformly initialized
        # self.qZ_ = np.ones([N, T]) / T
        self.qZ_ = np.random.dirichlet(np.ones(T), size=N)  # (N, T)

        # the following quantity needs to be passed to the TF graph and must be updated after every update to qZ
        sum_qZ_above = np.zeros([N, T - 1])
        for k in range(T - 1):
            sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)
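        # Equivalent vectorized form of the loop above (assuming qZ_ has shape (N, T)):
        # sum_qZ_above = np.cumsum(self.qZ_[:, ::-1], axis=1)[:, ::-1][:, 1:]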

        # Training.
        if not no_train_metric:
            train_elbo = tf.placeholder(dtype=tf.float32,
                                        shape=[],
                                        name='train_elbo')
            train_elbo_summary = tf.summary.scalar('train_elbo', train_elbo)

            train_ll = tf.placeholder(dtype=tf.float32,
                                      shape=[],
                                      name='train_ll')
            train_ll_summary = tf.summary.scalar('train_ll', train_ll)

        if holdout_ratio_valid is not None:
            test_ll = tf.placeholder(dtype=tf.float32,
                                     shape=[],
                                     name='test_ll')
            test_ll_summary = tf.summary.scalar('test_ll', test_ll)

        # Grab all scalar variables, to track in Tensorboard.
        trainable_vars = tf.trainable_variables()
        scalar_summaries = [
            tf.summary.scalar(tensor_.name, tensor_)
            for tensor_ in trainable_vars if len(tensor_.shape) == 0
        ]
        tensor_summaries = [
            tf.summary.histogram(tensor_.name, tensor_)
            for tensor_ in trainable_vars if len(tensor_.shape) > 0
        ]

        root_logdir = os.path.join(root_savedir, "tf_logs")
        writer = tf.summary.FileWriter(root_logdir)

        saver = tf.train.Saver()
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            init.run()

            if not no_train_metric:

                # add symmetric entries from the lower triangle
                train_data = batch_generator.train
                row = np.concatenate([train_data[:, 0], train_data[:, 1]])
                col = np.concatenate([train_data[:, 1], train_data[:, 0]])
                val = np.concatenate([train_data[:, 2], train_data[:, 2]])
                train_dict = {
                    self.row: row,
                    self.col: col,
                    self.val: val,
                    self.batch_scale: 1.0
                }

            if holdout_ratio_valid is not None:
                test_data = batch_generator.test
                row = np.concatenate([test_data[:, 0], test_data[:, 1]])
                col = np.concatenate([test_data[:, 1], test_data[:, 0]])
                val = np.concatenate([test_data[:, 2], test_data[:, 2]])
                test_dict = {
                    self.row: row,
                    self.col: col,
                    self.val: val,
                    self.batch_scale: 1.0
                }

            logging.info("Starting training...")
            for iteration in range(n_iterations):

                batch = batch_generator.next_batch()
                batch_dict = {
                    self.row: np.concatenate([batch[:, 0], batch[:, 1]]),
                    self.col: np.concatenate([batch[:, 1], batch[:, 0]]),
                    self.val: np.concatenate([batch[:, 2], batch[:, 2]]),
                    self.qZ: self.qZ_,
                    self.n_samples: n_samples,
                    self.batch_scale: len(pairs) / len(batch),
                    self.sum_qZ_above: sum_qZ_above,
                }

                # make a gradient update
                sess.run(train_op, feed_dict=batch_dict)

                # update q(Z) analytically (as opposed to the gradient step above)
                self.update_qZ(sess=sess,
                               batch=batch,
                               n_samples=n_samples,
                               debug=debug)

                # sum_qZ_above used to be refreshed at the beginning of the iteration; here it is
                # refreshed after the qZ update, so the intermediate losses logged below (and the
                # final saved model) see values consistent with the current qZ. This computation
                # could also live inside construct_graph(), at the cost of recomputing it more
                # often but with cleaner code.
                for k in range(T - 1):
                    sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)

                if iteration % log_interval == 0:

                    # Add scalar variables to Tensorboard.
                    for summ_str in sess.run(scalar_summaries):
                        writer.add_summary(summ_str, iteration)
                    # Add tensor variables to Tensorboard.
                    for summ_str in sess.run(tensor_summaries):
                        writer.add_summary(summ_str, iteration)

                    if not no_train_metric:
                        train_dict.update({
                            self.qZ: self.qZ_,
                            self.sum_qZ_above: sum_qZ_above,
                            self.n_samples: 100
                        })
                        train_ll_, train_elbo_ = sess.run(
                            [self.data_loglikel, self.elbo],
                            feed_dict=train_dict)
                        train_ll_summary_str, train_elbo_summary_str = sess.run(
                            [train_ll_summary, train_elbo_summary],
                            feed_dict={
                                train_ll: train_ll_,
                                train_elbo: train_elbo_
                            })
                        writer.add_summary(train_ll_summary_str, iteration)
                        writer.add_summary(train_elbo_summary_str, iteration)

                    if holdout_ratio_valid is not None:
                        test_dict.update({
                            self.qZ: self.qZ_,
                            self.sum_qZ_above: sum_qZ_above,
                            self.n_samples: 100
                        })
                        test_ll_ = sess.run(self.data_loglikel,
                                            feed_dict=test_dict)
                        test_ll_summary_str = sess.run(
                            test_ll_summary, feed_dict={test_ll: test_ll_})
                        writer.add_summary(test_ll_summary_str, iteration)

                    # Log training overview.
                    log_str = "%-4d" % iteration
                    if not no_train_metric:
                        log_str += "  ELBO: %.4e  Train ll: %.4e" % (
                            train_elbo_, train_ll_)
                    if holdout_ratio_valid is not None:
                        log_str += "  Valid ll: %.4e" % test_ll_
                    logging.info(log_str)

            # save the model
            saver.save(sess, os.path.join(root_savedir, "model.ckpt"))

        # close the file writer
        writer.close()
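get_pairs(N, row, col) is not shown; judging from how pairs is consumed (columns 0, 1 and 2 feed self.row, self.col and self.val), it presumably returns every upper-triangular (i, j, label) triple with label 1 for ON entries and 0 otherwise. A sketch under that assumption, not the original implementation:

import numpy as np


def get_pairs(N, row, col):
    """All upper-triangle (i, j, label) triples; label is 1 for ON entries."""
    on = set(zip(map(int, row), map(int, col)))
    return np.array([(i, j, 1 if (i, j) in on else 0)
                     for i in range(N) for j in range(i + 1, N)])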
Example #11
    def train_epoch(self):
        epoch_loss = 0.
        for sent in self.dataset:
            sent = [w for w in sent.split() if w in self.vocab]
            if len(sent) <= 1:
                continue
            sent = self.sub_sampler.subsample_sent(sent)
            if len(sent) <= 1:
                continue
            pairs = get_pairs(sent, self.c)

            # sentence batched version
            loss = 0.
            pair_idxs = torch.tensor(
                [[self.vocab[pair[0]][0], self.vocab[pair[1]][0]]
                 for pair in pairs])

            inps = self.vecs[pair_idxs[:, 0]]
            targs = self.vecs[pair_idxs[:, 1]]
            t_scores = torch.bmm(inps, torch.transpose(targs, 1, 2))

            # find indices of the senses used for each word in the sentence
            m2, mi2 = torch.max(t_scores, dim=2)
            senses = torch.argmax(m2, dim=1)

            # This one also "negative samples" the scores produced by non maximal word sense pairings
            # _ , t_max_ind = torch.max(t_scores,dim=1)
            # t_scores *= -1
            # t_scores[np.arange(len(t_scores)),t_max_ind] *= -1
            # t_probs = self.sigmoid(t_scores)

            # This does not penalize other senses
            t_probs = self.sigmoid(
                torch.max(t_scores.view(len(pairs), -1), dim=1)[0])

            loss += -torch.mean(torch.log(t_probs))

            # can I also batch negative samples? :O
            neg_batch_idxs = self.neg_sampler.sample(self.k * len(pairs),
                                                     words=False)
            ntargs = self.vecs[neg_batch_idxs]
            neg_senses = senses.repeat(self.k)
            neg_inps = inps.repeat(self.k, 1, 1)

            # only use the senses indicated by max score
            neg_inps = neg_inps[np.arange(len(neg_inps)), neg_senses, :]

            n_scores = torch.bmm(neg_inps.view(len(neg_inps), 1, -1),
                                 torch.transpose(ntargs, 1, 2))

            #negative sample every sense pairing?
            # n_probs = self.sigmoid(-1.0*n_scores)

            # only negative sample the max?
            n_probs = self.sigmoid(
                -1.0 * torch.max(n_scores.view(len(n_scores), -1), dim=1)[0])

            # only negative sample the sense that inp was actually used in. (the row which max occurs in)
            # n_probs = self.sigmoid(-1.0*n_scores)

            loss += -torch.sum(torch.log(n_probs)) / len(n_probs)
            self.optimizer.zero_grad()  # reset gradients so they do not accumulate across sentences
            loss.backward()
            self.optimizer.step()
            epoch_loss += float(loss)

            # # non batched version
            # for pair in pairs:
            #     loss = 0.
            #     self.optimizer.zero_grad()

            #     inp = self.vecs[self.vocab[pair[0]][0]]
            #     targ = self.vecs[self.vocab[pair[1]][0]]

            #     t_scores = torch.mm(inp, targ.t())
            #     t_prob = self.sigmoid(torch.max(t_scores))
            #     loss += -torch.log(t_prob)

            #     # negative sampling
            #     neg_samples = self.neg_sampler.sample(self.k)
            #     for ns in neg_samples:
            #         ntarg = self.vecs[self.vocab[ns][0]]
            #         n_scores = torch.mm(inp, ntarg.t())
            #         n_prob = self.sigmoid(-1.0*torch.max(n_scores))
            #         loss += -torch.log(n_prob)

            #     loss.backward()
            #     self.optimizer.step()
            #     epoch_loss += float(loss)

        return epoch_loss
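get_pairs(sent, self.c) above is not shown; in this skip-gram setting it most likely yields (center, context) word pairs within a window of c tokens. A minimal sketch under that assumption (not the original implementation):

def get_pairs(sent, c):
    """All (center, context) pairs whose positions differ by at most c."""
    pairs = []
    for i, center in enumerate(sent):
        for j in range(max(0, i - c), min(len(sent), i + c + 1)):
            if j != i:
                pairs.append((center, sent[j]))
    return pairs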