# choose to corrupt one part of the triple
# NOTE(review): fragment — the enclosing training loop and the initial values
# of word_a/word_b/rel_index and word_a_new/word_b_new/rel_index_new lie
# outside this excerpt. A negative example is built by corrupting exactly one
# element of the (word_a, word_b, rel_index) triple.
                    to_mod = data_rng.choice(3)

                    # corrupt with some other part
                    if to_mod == 0:
                        # resample word_a until it differs from the original
                        while word_a_new == word_a:
                            word_a_new = data_rng.choice(relationships.indices_of_words_in_synsets)  #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    elif to_mod == 1:
                        # resample word_b until it differs from the original
                        while word_b_new == word_b:
                            word_b_new = data_rng.choice(relationships.indices_of_words_in_synsets)  #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    elif to_mod == 2:
                        # resample the relationship index until it differs
                        while rel_index_new == rel_index:
                            # rel_index_new = data_rng.randint(N_relationships)
                            rel_index_new = data_rng.randint(relationships.N_relationships)

                    # one model update on (true triple, corrupted triple)
                    augmented_cost, cost = model.update_v(word_a, word_b, rel_index, word_a_new, word_b_new, rel_index_new)
                    if not np.isfinite(cost):
                        # non-finite cost: snapshot the model and drop into an
                        # interactive shell for post-mortem debugging
                        print 'nan detected'
                        save_model('nan_dump.pkl.gz')
                        import IPython
                        IPython.embed()
                    costs.append(cost)
                    augmented_costs.append(augmented_cost)

                    # in-place progress line (carriage return, no newline)
                    if i % print_freq == 0:
                        sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, block_size))
                        sys.stdout.flush()

                # report the mean cost over the block just finished, but only
                # when more than one block is being run
                if args['semantic_blocks_to_run'] > 1:
                    print
                    print  '%i intermediate mean %f' % (block_num, np.mean(costs[-block_size:]))
# 예제 #2 ("Example #2" — separator artifact from the scraped source)
# 0
            # NOTE(review): fragment — the enclosing loop (presumably over k;
            # see model.k below) and the definitions of args, model, data_rng,
            # indices_in_intersection, etc. lie outside this excerpt.
            # Trains on word-pair similarities for a number of blocks.
            this_count = 0
            augmented_costs = []
            costs = []
            for block_num in xrange(args['semantic_blocks_to_run']):
                for i in xrange(args['semantic_block_size']):
                    # rejection-sample a word index that lies in the
                    # vocabulary intersection
                    train_i = -1
                    while train_i not in indices_in_intersection:
                        train_i = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    for j in xrange(args['k_nearest']):
                        # rejection-sample a second in-intersection index
                        train_j = -1
                        while train_j not in indices_in_intersection:
                            train_j = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                        # -inf marks a pair with no similarity score; skip it
                        if word_similarity.word_pairwise_sims[train_i, train_j] == -np.inf:
                            continue
                        sim = word_similarity.word_pairwise_sims[train_i, train_j]
                        # one model update on the (i, j, similarity) triple
                        augmented_cost, cost = model.update_v(train_i, train_j, sim)
                        augmented_costs.append(augmented_cost)
                        costs.append(cost)

                    # in-place progress line (carriage return, no newline)
                    if i % print_freq == 0:
                        sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, args['semantic_block_size']))
                        sys.stdout.flush()

                # report the mean cost over the block just finished, but only
                # when more than one block is being run
                if args['semantic_blocks_to_run'] > 1:
                    print
                    print  '%i intermediate mean %f' % (block_num, np.mean(costs[-args['semantic_block_size']:]))
            print
            # summary statistics over all semantic blocks for this k
            stats_for_k['semantic_mean'] = np.mean(costs)
            stats_for_k['semantic_std'] = np.std(costs)
            print 'semantic mean cost \t%f' % stats_for_k['semantic_mean']
            print 'semantic std cost \t%f' % stats_for_k['semantic_std']
# 예제 #3 ("Example #3" — separator artifact from the scraped source)
# 0
                    # NOTE(review): fragment — begins mid-loop; the
                    # `train_i = -1` initialiser and the enclosing for-loops
                    # lie outside this excerpt. Same logic as the similarity
                    # training chunk earlier in this file, reformatted.
                    # rejection-sample until train_i is in the intersection
                    while train_i not in indices_in_intersection:
                        train_i = sample_cumulative_discrete_distribution(
                            ngram_reader.cumulative_word_frequencies,
                            rng=data_rng)
                    for j in xrange(args['k_nearest']):
                        # rejection-sample a second in-intersection index
                        train_j = -1
                        while train_j not in indices_in_intersection:
                            train_j = sample_cumulative_discrete_distribution(
                                ngram_reader.cumulative_word_frequencies,
                                rng=data_rng)
                        # -inf marks a pair with no similarity score; skip it
                        if word_similarity.word_pairwise_sims[
                                train_i, train_j] == -np.inf:
                            continue
                        sim = word_similarity.word_pairwise_sims[train_i,
                                                                 train_j]
                        # one model update on the (i, j, similarity) triple
                        augmented_cost, cost = model.update_v(
                            train_i, train_j, sim)
                        augmented_costs.append(augmented_cost)
                        costs.append(cost)

                    # in-place progress line (carriage return, no newline)
                    if i % print_freq == 0:
                        sys.stdout.write(
                            '\r k %i: pair : %d / %d' %
                            (model.k, i, args['semantic_block_size']))
                        sys.stdout.flush()

                # report the mean cost over the block just finished, but only
                # when more than one block is being run
                if args['semantic_blocks_to_run'] > 1:
                    print
                    print '%i intermediate mean %f' % (
                        block_num, np.mean(
                            costs[-args['semantic_block_size']:]))
            print
# 예제 #4 ("Example #4" — separator artifact from the scraped source)
# 0
                            # NOTE(review): fragment — begins inside a
                            # resampling while-loop whose header (apparently
                            # `while word_a_new == word_a:` under
                            # `if to_mod == 0:`, matching the chunk earlier in
                            # this file) lies outside this excerpt.
                            word_a_new = data_rng.choice(
                                relationships.indices_of_words_in_synsets
                            )  #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    elif to_mod == 1:
                        # resample word_b until it differs from the original
                        while word_b_new == word_b:
                            word_b_new = data_rng.choice(
                                relationships.indices_of_words_in_synsets
                            )  #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    elif to_mod == 2:
                        # resample the relationship index until it differs
                        while rel_index_new == rel_index:
                            # rel_index_new = data_rng.randint(N_relationships)
                            rel_index_new = data_rng.randint(
                                relationships.N_relationships)

                    # one model update on (true triple, corrupted triple)
                    augmented_cost, cost = model.update_v(
                        word_a, word_b, rel_index, word_a_new, word_b_new,
                        rel_index_new)
                    if not np.isfinite(cost):
                        # non-finite cost: snapshot the model and drop into an
                        # interactive shell for post-mortem debugging
                        print 'nan detected'
                        save_model('nan_dump.pkl.gz')
                        import IPython
                        IPython.embed()
                    costs.append(cost)
                    augmented_costs.append(augmented_cost)

                    # in-place progress line (carriage return, no newline)
                    if i % print_freq == 0:
                        sys.stdout.write('\r k %i: pair : %d / %d' %
                                         (model.k, i, block_size))
                        sys.stdout.flush()

                # excerpt is truncated here — the body of this `if` is missing
                if args['semantic_blocks_to_run'] > 1: