def test_kl_divergence():
    """Exercise pattern_matcher.kl_divergence on toy and on-disk data.

    First checks that two count dictionaries with proportional counts
    have an unsmoothed divergence of exactly 0.0. Then calls the smoothed
    variant on distributions with disjoint keys; that result is not
    asserted. Finally loads the word and relation target type
    distributions from the ``data/`` files, filters both with
    ``filter_type_distribution(n_max=10, min_count=1)`` and prints the
    divergence (``alpha=0.1``) followed by the two filtered distributions.
    """
    import pattern_matcher

    counts_p = {'a': 10, 'b': 20}
    counts_q = {'a': 20, 'b': 40}
    divergence = pattern_matcher.kl_divergence(counts_p, counts_q,
                                               smooth=False)
    assert divergence == 0.0

    # Disjoint key sets with smoothing enabled; the value is only
    # computed, never checked.
    counts_q = {'c': 990}
    divergence = pattern_matcher.kl_divergence(counts_p, counts_q,
                                               smooth=True)

    import data

    relation = ('user.jefft0.default_domain.virus_classification_rank'
                '.virus_classifications_at_this_rank')

    types_by_word = data.read_word_type_distributions(
        'data/word-entity-type-counts_filtered')
    word_types = types_by_word['genre']
    types_by_relation = data.read_relation_target_type_distributions(
        'data/relation-target-type-distributions')
    relation_types = types_by_relation[relation]

    word_types = pattern_matcher.filter_type_distribution(
        word_types, n_max=10, min_count=1)
    relation_types = pattern_matcher.filter_type_distribution(
        relation_types, n_max=10, min_count=1)

    # A parenthesised single argument prints the same text under the
    # Python 2 print statement as the bare form.
    print(pattern_matcher.kl_divergence(word_types, relation_types,
                                        alpha=0.1))
    print(relation_types)
    print(word_types)
def init_from_config():
    """Build a QueryCandidateExtender from the global configuration.

    Takes no arguments: the configuration object is read from
    ``globals.config`` and the SPARQL backend is obtained through
    ``globals.get_sparql_backend``. File locations come from the
    ``QueryCandidateExtender`` and ``Alignment`` config sections; each
    file is loaded with the matching ``data.read_*`` helper (relation
    words and mediated relation words with ``n_top_words=1000``), and the
    mediator index is created via ``MediatorIndexFast.init_from_config``.

    :return: a QueryCandidateExtender constructed from the loaded
        resources.
    """
    cfg = globals.config
    backend = globals.get_sparql_backend(cfg)

    # Look up every extender option before loading anything, in this
    # fixed order, so a missing option surfaces before any file is read.
    extender_section = 'QueryCandidateExtender'
    extender_options = (
        'relation-counts',
        'mediator-names',
        'reverse-relations',
        'relation-expected-types',
        'relation-target-type-distributions',
        'mediator-relations',
        'relation-lemmas',
        'relation-words',
        'mediated-relation-words',
        'word-type-counts',
    )
    files = {}
    for option in extender_options:
        files[option] = cfg.get(extender_section, option)

    type_counts_by_word = data.read_word_type_distributions(
        files['word-type-counts'])

    # Alignment resources: embedding-based synonyms and word derivations.
    alignment_section = 'Alignment'
    embeddings_path = cfg.get(alignment_section, 'word-embeddings')
    derivations_path = cfg.get(alignment_section, 'word-derivations')
    synonyms = WordembeddingSynonyms(embeddings_path)
    derivations = WordDerivations(derivations_path)

    med_relations = data.read_mediator_relations(
        files['mediator-relations'])
    rel_counts = data.read_relation_counts(files['relation-counts'])
    med_names = data.read_mediator_names(files['mediator-names'])
    med_index = MediatorIndexFast.init_from_config()
    rev_relations = data.read_reverse_relations(files['reverse-relations'])
    expected_types = data.read_relation_expected_types(
        files['relation-expected-types'])
    rel_words = data.read_relation_words(files['relation-words'],
                                         n_top_words=1000)
    mediated_rel_words = data.read_mediated_relation_words(
        files['mediated-relation-words'], n_top_words=1000)
    target_type_dists = data.read_relation_target_type_distributions(
        files['relation-target-type-distributions'])
    lemmas = data.read_relation_lemmas(files['relation-lemmas'])

    # Positional order matters: it mirrors the constructor call as
    # originally written.
    return QueryCandidateExtender(
        med_index,
        rel_counts,
        med_names,
        med_relations,
        rev_relations,
        expected_types,
        backend,
        rel_words,
        mediated_rel_words,
        target_type_dists,
        synonyms,
        derivations,
        type_counts_by_word,
        lemmas)