Example #1
import rel_ext
import utils


def test_dataset_featurize_no_vectorize(corpus, kb):
    # With vectorize=False, each featurizer must itself return a feature
    # vector (here, a random 10-dimensional one from utils.randvec).
    dataset = rel_ext.Dataset(corpus, kb)
    kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)

    def featurizer(kbt, corpus):
        return utils.randvec(10)

    dataset.featurize(kbts_by_rel, [featurizer], vectorize=False)
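
These tests assume pytest fixtures named `corpus`, `kb`, `featurizer`, and `vectorize` that are defined outside this excerpt. A minimal conftest.py sketch of how they might be wired up; the data paths mirror the notebook cells below, and every fixture body here is an assumption rather than the original suite's code:

# conftest.py -- sketch only; all fixture bodies are assumptions.
import os

import pytest

import rel_ext

REL_EXT_DATA_HOME = os.path.join('data', 'rel_ext_data')


@pytest.fixture
def corpus():
    return rel_ext.Corpus(os.path.join(REL_EXT_DATA_HOME, 'corpus.tsv.gz'))


@pytest.fixture
def kb():
    return rel_ext.KB(os.path.join(REL_EXT_DATA_HOME, 'kb.tsv.gz'))


@pytest.fixture
def featurizer():
    # Simplest dict-returning featurizer, matching the signature in
    # Example #4. For vectorize=False a featurizer would instead need to
    # return a vector directly (cf. Example #1).
    def fn(kbt, corpus, feature_counter):
        feature_counter['bias'] += 1
        return feature_counter
    return fn


@pytest.fixture(params=[True, False])
def vectorize(request):
    return request.param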
Example #2
import rel_ext
from sklearn.linear_model import LogisticRegression


def test_find_new_relation_instances(corpus, kb, featurizer, vectorize):
    # Smoke test: fit models and surface the top k=10 candidate new triples.
    dataset = rel_ext.Dataset(corpus, kb)
    rel_ext.find_new_relation_instances(
        dataset, [featurizer],
        train_split='train',
        test_split='dev',
        model_factory=lambda: LogisticRegression(solver='liblinear'),
        k=10,
        vectorize=vectorize,
        verbose=False)
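
The `featurizer` passed here must follow the `(kbt, corpus, feature_counter)` convention shown in Example #4 below. A sketch of a bag-of-words featurizer in that style; the `sbj`/`obj` fields of the KB triple, `get_examples_for_entities`, and the `middle` attribute are assumptions about the Corpus interface, not confirmed by this excerpt:

def bag_of_words_featurizer(kbt, corpus, feature_counter):
    # Count the words appearing between the two entity mentions, in both
    # orders. `feature_counter` behaves like a collections.Counter.
    for sbj, obj in ((kbt.sbj, kbt.obj), (kbt.obj, kbt.sbj)):
        for ex in corpus.get_examples_for_entities(sbj, obj):
            for word in ex.middle.split():
                feature_counter[word] += 1
    return feature_counter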
Example #3
import rel_ext


def test_experiment(featurizer, vectorize, corpus, kb):
    # Smoke test: run a tiny end-to-end experiment on 5%/5%/90% splits.
    dataset = rel_ext.Dataset(corpus, kb)
    splits = dataset.build_splits(
        split_names=['tiny_train', 'tiny_dev', 'rest'],
        split_fracs=[0.05, 0.05, 0.90],
        seed=1)
    results = rel_ext.experiment(splits,
                                 train_split='tiny_train',
                                 test_split='tiny_dev',
                                 featurizers=[featurizer],
                                 vectorize=vectorize,
                                 verbose=False)
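
`rel_ext.experiment` trains one model per relation on `train_split` and evaluates it on `test_split`; with `verbose=False` the per-relation scores are simply not printed. The return value is never inspected in this test; the sketch below assumes `results` is a dict exposing the trained per-relation models under a `'models'` key, which this excerpt does not confirm:

# Hypothetical post-hoc inspection; the 'models' key is an assumption.
for rel, model in results.get('models', {}).items():
    print(rel, type(model).__name__)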
Example #4
import rel_ext


def test_dataset_featurize_vectorize(corpus, kb):
    # Default path: featurizers follow the (kbt, corpus, feature_counter)
    # convention and return a feature-count dict.
    dataset = rel_ext.Dataset(corpus, kb)
    kbts_by_rel, _ = dataset.build_dataset(sampling_rate=0.1)
    featurizers = [lambda kbt, corpus, feature_counter: {"bias": 1}]
    dataset.featurize(kbts_by_rel, featurizers)
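
With `vectorize=True` (the default here), featurizers return feature-count dicts and the conversion to vectors happens inside `featurize`. A self-contained sketch of that style of vectorization using scikit-learn's DictVectorizer; whether `rel_ext` uses this exact class internally is an assumption:

from collections import Counter

from sklearn.feature_extraction import DictVectorizer

feature_dicts = [Counter({'bias': 1, 'born': 2}), Counter({'bias': 1})]
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(feature_dicts)
print(X.shape)  # (2, 2): two examples, two distinct feature names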
Example #5
import rel_ext


def test_dataset_build_splits(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    dat = dataset.build_splits(seed=1)
Example #6
import rel_ext


def test_dataset_build_dataset(corpus, kb):
    dataset = rel_ext.Dataset(corpus, kb)
    dat = dataset.build_dataset(include_positive=True, sampling_rate=0.1)
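
Judging from the unpacking in Examples #1 and #4, `build_dataset` returns a pair of mappings keyed by relation name: candidate KB triples and, in parallel, their labels. A quick inspection sketch under that assumption, continuing from the test body above:

kbts_by_rel, labels_by_rel = dataset.build_dataset(
    include_positive=True, sampling_rate=0.1)
for rel in list(kbts_by_rel)[:3]:
    # Parallel lists: candidate triples and their boolean labels.
    print(rel, len(kbts_by_rel[rel]), labels_by_rel[rel][:3])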
Example #7
# In[3]:

import os
import rel_ext

rel_ext_data_home = os.path.join('data', 'rel_ext_data')

# In[4]:

corpus = rel_ext.Corpus(os.path.join(rel_ext_data_home, 'corpus.tsv.gz'))

# In[5]:

kb = rel_ext.KB(os.path.join(rel_ext_data_home, 'kb.tsv.gz'))

# In[6]:

dataset = rel_ext.Dataset(corpus, kb)
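
# A quick look at the class distribution before splitting (sketch;
# `count_examples` is an assumed Dataset method, not shown in this excerpt):

dataset.count_examples()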

# You are not wedded to this set-up for splits. The bake-off will be conducted on a previously unseen test set, so all of the data in `dataset` is fair game:

# In[7]:

splits = dataset.build_splits(split_names=['tiny', 'train', 'dev'],
                              split_fracs=[0.01, 0.79, 0.20],
                              seed=1)

# In[8]:

splits
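
# Each named split is itself a Dataset (sketch; assumes `splits` behaves
# like a dict from split name to Dataset):

for split_name, split in splits.items():
    print(split_name, split)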

# ## Baselines