Example #1
def save_merged_ground_truth_data(
    full_selection, filename=util.resource(default_ground_truth_data_file)):
    data_processor = TrainingDataProcessing()
    data_ls = data_processor.read_topic_labels(
        util.resource('labeled-topics-1000-ls.txt'))
    data_dm = data_processor.read_topic_labels(
        util.resource('labeled-topics-1000-dm.txt'))
    data = merge_ground_truth(data_ls, data_dm)
    data_processor.save_topic_labels(data.keys(), data.values(), filename)
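# Usage sketch (not in the original module): builds the full_selection
# argument with depth_based_selection from Example #3 and writes the merged
# ground-truth labels to the default resource file.
full_selection = depth_based_selection()
save_merged_ground_truth_data(full_selection)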
Example #2
def __init__(
        self,
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats')):
    self._wiki_graph = WikipediaGraphIndex(
        subcat_index_file=subcat_index_file,
        supercat_index_file=supercat_index_file)
    self._children = collections.OrderedDict()
    self._parents = collections.OrderedDict()
Example #3
def depth_based_selection(root=default_root, max_depth=default_max_depth):
    relation_cache = CategoryRelationCache(
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats'))
    full_selection = CategorySelection(root,
                                       max_depth=max_depth,
                                       relation_cache=relation_cache)
    full_selection.run()
    return full_selection
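# Usage sketch (not in the original module): the root and depth defaults can
# be overridden. The category URI below is only an illustration, following the
# URI scheme used for ROOT_CATEGORY_MUSIC in Example #11.
computing_selection = depth_based_selection(
    root='http://dbpedia.org/resource/Category:Computing', max_depth=3)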
Example #4
    def __init__(self,
                 acm_concept_file=util.resource("acm-concepts.txt"),
                 acm_rels_file=util.resource("acm-relations.txt"),
                 acm_mapping_file=util.resource("acm-wiki-mapping.txt")):

        self._concepts = self._read_concepts(acm_concept_file)

        acm_ids, wiki_uris = self._read_mapping(acm_mapping_file)
        self._wiki2acm = dict(zip(wiki_uris, acm_ids))
        self._acm2wiki = {v: k for k, v in self._wiki2acm.items()}

        self._children = self._read_relations(acm_rels_file)
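# Illustration (not in the original module): the two mapping dicts built above
# are inverses of each other. The same zip-and-invert idiom, shown standalone
# with hypothetical ACM ids and Wikipedia category URIs:
acm_ids = ['acm-1', 'acm-2']
wiki_uris = ['http://dbpedia.org/resource/Category:Algorithms',
             'http://dbpedia.org/resource/Category:Digital_electronics']
wiki2acm = dict(zip(wiki_uris, acm_ids))
acm2wiki = {v: k for k, v in wiki2acm.items()}
assert acm2wiki['acm-1'] == 'http://dbpedia.org/resource/Category:Algorithms'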
Example #5
def read_ground_truth_data(
        filename=util.resource('labeled-relations-new-1000-dm.txt')):
    data = read_relation_type_labels(filename)

    is_class = topic_type.topic_type_prediction(remember_gt=True).is_class

    def node_to_type_char(title):
        uri = title_to_uri(title, category=True)
        return 'Class' if is_class(uri) else 'Individual'

    data['parent_type'] = data['parent'].apply(node_to_type_char)
    data['child_type'] = data['child'].apply(node_to_type_char)

    # Fix the incorrectly classified nodes.
    data.loc[data['parent'] == 'Computational linguistics',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Twitter', 'child_type'] = 'Individual'
    data.loc[data['child'] == 'Populous', 'child_type'] = 'Individual'
    data.loc[data['child'] == 'Canary Islands', 'child_type'] = 'Individual'
    data.loc[data['parent'] == 'Algorithms and data structures',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Computational statistics',
             'child_type'] = 'Individual'
    data.loc[data['child'] == 'Digital electronics',
             'child_type'] = 'Individual'
    data.loc[data['parent'] == 'Computer storage media',
             'parent_type'] = 'Class'
    data.loc[data['parent'] == 'IEC 61131', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'IEC 61131', 'child_type'] = 'Class'
    data.loc[data['parent'] == 'Digital cameras', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'Sony cameras', 'child_type'] = 'Class'
    data.loc[data['parent'] == 'Computer companies', 'parent_type'] = 'Class'
    data.loc[data['parent'] == 'Mathematics of computing',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1970s in computer science',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1980s in computer science',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == 'Health informatics',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1990s in video gaming',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Anime based on video games',
             'child_type'] = 'Class'
    data.loc[data['child'] == 'Android cameras with optical zoom',
             'child_type'] = 'Class'
    data.loc[data['parent'] == 'Algorithms', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'Algorithms', 'child_type'] = 'Class'

    return data
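# Usage sketch (not in the original module): loads the annotated relations and
# tallies the parent/child type combinations with standard pandas.
data = read_ground_truth_data()
print(data.groupby(['parent_type', 'child_type']).size())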
Example #6
def topic_type_prediction(
        topic_uris=None,
        classes=None,
        ground_truth_file=util.resource('labeled-topic-types-1000-dm.txt'),
        n_folds=10,
        param_grid=_PARAM_GRID,
        tuned_clf=LinearSVC(loss='l1'),
        scoring='f1',
        random_state=0,
        remember_gt=False):
    if ground_truth_file and not topic_uris:
        topic_uris, classes = read_ground_truth(ground_truth_file)
    return TopicTypePrediction(topic_uris,
                               classes,
                               n_folds=n_folds,
                               param_grid=param_grid,
                               tuned_clf=tuned_clf,
                               scoring=scoring,
                               random_state=random_state,
                               remember_gt=remember_gt)
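# Usage sketch (not in the original module), mirroring the call in Example #5;
# title_to_uri comes from dswont.dbpedia as imported in Example #12.
prediction = topic_type_prediction(remember_gt=True)
print(prediction.is_class(title_to_uri('Algorithms', category=True)))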
Example #7
def generate_topic_classifier():
    topic_uris, labels = read_node_type_labels(
        util.resource('labeled-topic-types-1000-dm.txt'))
    classes = [label_to_class(label) for label in labels]
    features = generate_default_features(topic_uris)

    cv_clf = train_cv_clf(topic_uris, classes, features)
    best_clf = cv_clf.best_estimator_

    best_clf.fit(to_features(features, topic_uris), classes)

    ground_truth_class = dict(zip(topic_uris, classes))

    def is_class(uri):
        if uri in ground_truth_class:
            return ground_truth_class[uri]
        else:
            return best_clf.predict(to_features(features, [uri]))[0]

    return is_class
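# Usage sketch (not in the original module): the returned closure maps a
# category URI to its ground-truth class when available, otherwise to the
# classifier's prediction. The category title below is only an illustration;
# title_to_uri comes from dswont.dbpedia as imported in Example #12.
is_class = generate_topic_classifier()
print(is_class(title_to_uri('Digital cameras', category=True)))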
Example #8
def __init__(
        self,
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats')):
    self._subcat_index_file = subcat_index_file
    self._supercat_index_file = supercat_index_file
Example #9
def read_ground_truth_data(
        filename=util.resource(default_ground_truth_data_file)):
    return TrainingDataProcessing().read_topic_labels(filename)
Example #10
def read_ground_truth_topic_labels():
    data_processing = topics.TrainingDataProcessing()
    return data_processing.read_topic_labels(
        util.resource('labeled-topics-music-1000-dm.txt'))
Example #11
import logging
logging.basicConfig(level=logging.WARN)
# Silence the verbose urllib logger.
logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
    logging.WARN)

from dswont import topics
from dswont import util
from dswont import dbpedia

# <codecell>

ROOT_CATEGORY_MUSIC = 'http://dbpedia.org/resource/Category:Music'
DEFAULT_SELECTION_DEPTH = 9
DEFAULT_RELATION_CACHE = topics.CategoryRelationCache(
    subcat_index_file=util.resource('wikipedia/uri-to-subcats-music'),
    supercat_index_file=util.resource('wikipedia/uri-to-supercats-music'))


def music_category_selection(**params):
    updated_params = {
        'root': ROOT_CATEGORY_MUSIC,
        'relation_cache': DEFAULT_RELATION_CACHE
    }
    updated_params.update(params)
    selection = topics.CategorySelection(**updated_params)
    selection.run()
    return selection


def precompute_full_selection(precomputed_data={}):
Example #12
from dswont.dbpedia import title_to_uri, uri_to_title, is_category_uri
from dswont import util
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


WTX_NODE_TYPE_FILE = util.resource('wikitaxonomy/node-types.txt')
WTX_NODE_REL_FILE = util.resource('wikitaxonomy/rel-types.txt')


def nodes_data(file):
    data = pd.read_csv(file, sep=' ', names=['node', 'type'])
    data['node'] = data['node'].str.replace('_', ' ')
    data = data.set_index('node')
    data['is_class'] = data['type'].apply(lambda x: 'class' == x)
    return data


def rel_data(file):
    # Use a raw string for the regex separator and the python engine,
    # which pandas requires for multi-character regex separators.
    data = pd.read_csv(file, sep=r' -> |\s', engine='python',
                       names=['parent', 'child'])
    data['parent'] = data['parent'].str.replace('_', ' ')
    data['child'] = data['child'].str.replace('_', ' ')
    data = data.set_index(['parent', 'child'])
    return data


def to_title(title_or_uri):
    if is_category_uri(title_or_uri):
        return uri_to_title(title_or_uri)
    return title_or_uri  # Assumed fallthrough: plain titles pass through unchanged.
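# Usage sketch (not in the original module): loads the wikitaxonomy node and
# relation files and prints two summary figures with standard pandas.
nodes = nodes_data(WTX_NODE_TYPE_FILE)
rels = rel_data(WTX_NODE_REL_FILE)
print(nodes['is_class'].mean())  # Fraction of nodes labeled 'class'.
print(len(rels))                 # Number of parent -> child relations.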
Example #13
def print_common_suffixes():
    gt_file = util.resource('labeled-topic-types-1000-dm.txt')
    topic_uris, _ = read_ground_truth(gt_file)
    all_suffixes = generate_all_suffices(topic_uris)
    print("Most common suffixes:", generate_common_suffixes(all_suffixes)[:20])