Пример #1
0
def test_from_entries_and_from_matrix():
    """Exercise AssocSpace.from_entries() under several option combinations.

    from_matrix() has no test of its own; it is covered implicitly here.
    """
    # An empty entry list and a single-entry list are both rejected outright:
    # from_entries() returns None instead of a space.
    for too_few in ([], [(1, 'apple', 'red')]):
        assert AssocSpace.from_entries(too_few, k=1) is None

    # Mostly-default construction, followed by a few simple sanity checks.
    default_space = AssocSpace.from_entries(ENTRIES, k=4)
    eq_(default_space.k, 4)
    eq_(default_space.sigma[0], 1.0)
    apple_red = default_space.assoc_between_two_terms('apple', 'red')
    assert apple_red > 0.5
    red_red = default_space.assoc_between_two_terms('red', 'red')
    assert red_red > 0.999
    lemon_red = default_space.assoc_between_two_terms('lemon', 'red')
    assert lemon_red < 0.2

    # With strip_a0=False we have negative eigenvalues, so an eigenvalue from
    # the middle is lost to make room for a0.
    unstripped = AssocSpace.from_entries(ENTRIES, k=4, strip_a0=False)
    eq_(unstripped.k, 4)
    sigma_ratio = unstripped.sigma[-1] / unstripped.sigma[1]
    assert np.allclose(sigma_ratio, default_space.sigma[-1])
    # Axis 1 of the unstripped space must equal axis 0 of the default space,
    # up to an overall sign flip.
    second_axis = unstripped.u[:, 1]
    leading_axis = default_space.u[:, 0]
    assert (np.allclose(second_axis, leading_axis)
            or np.allclose(second_axis, -leading_axis))

    # Construction with normalize_gm=False still yields k axes.
    unnormalized = AssocSpace.from_entries(ENTRIES, k=4, normalize_gm=False)
    eq_(unnormalized.k, 4)
Пример #2
0
def merge_vector_spaces(subspace_dir, mergers):
    """Run a sequence of pairwise AssocSpace merges, then filter the result.

    Each item of `mergers` is a `(sourceA, sourceB, target)` triple of
    directory names inside `subspace_dir`. Both sources are loaded, merged,
    and the merged space is saved under `target`.

    After the last merge, only the rows whose sum of squared `u` components
    is at least 1e-5 are kept; that filtered space is saved under
    'merged_filtered' and returned.

    Raises ValueError if `mergers` is empty, since there would be no merged
    space to filter.
    """
    merged = None
    for sourceA, sourceB, target in mergers:
        print('Merging: %s + %s -> %s' % (sourceA, sourceB, target))
        spaceA = AssocSpace.load_dir(os.path.join(subspace_dir, sourceA))
        spaceB = AssocSpace.load_dir(os.path.join(subspace_dir, sourceB))

        # On the first step, we want to keep all the axes from merging
        # subparts. Through most of the merging, we want to maintain that
        # number of axes. At the end, we want to go back to the original
        # number of axes.
        #
        # For example, when we are merging 300-dimensional spaces, the
        # intermediate merge results will have 600 dimensions, and the final
        # result will have 300 dimensions again.
        #
        # We don't refer to the number of axes in spaceB in this code, because
        # we're assuming all the sub-parts have equal numbers of axes.
        if target.startswith('part'):
            k = spaceA.k * 2
        elif target == 'merged_complete':
            k = spaceA.k // 2
        else:
            k = spaceA.k

        merged = spaceA.merged_with(spaceB, k=k)
        # Drop the references to the inputs before saving the merged space.
        del spaceA
        del spaceB
        merged.save_dir(os.path.join(subspace_dir, target))

    if merged is None:
        # Without this guard an empty `mergers` fails below with an opaque
        # AttributeError on None.
        raise ValueError('mergers must contain at least one merge step')

    magnitudes = (merged.u ** 2).sum(1)
    good_indices = np.flatnonzero(magnitudes >= 1e-5)
    filtered = merged[good_indices]
    filtered.save_dir(os.path.join(subspace_dir, 'merged_filtered'))
    return filtered
Пример #3
0
def test_from_entries_and_from_matrix():
    """Build spaces with from_entries() and check their basic properties.

    There is no separate from_matrix() test; this one covers it implicitly.
    """
    # No entries at all, or too few of them: from_entries() gives back None.
    no_entries = AssocSpace.from_entries([], k=1)
    assert no_entries is None
    one_entry = AssocSpace.from_entries([(1, 'apple', 'red')], k=1)
    assert one_entry is None

    # Mostly-default parameters.
    baseline = AssocSpace.from_entries(ENTRIES, k=4)
    eq_(baseline.k, 4)
    eq_(baseline.sigma[0], 1.0)
    similarity = baseline.assoc_between_two_terms
    assert similarity('apple', 'red') > 0.5
    assert similarity('red', 'red') > 0.999
    assert similarity('lemon', 'red') < 0.2

    # strip_a0=False: in this case there are negative eigenvalues, so one
    # eigenvalue from the middle is dropped to make room for a0.
    with_a0 = AssocSpace.from_entries(ENTRIES, k=4, strip_a0=False)
    eq_(with_a0.k, 4)
    rescaled_last_sigma = with_a0.sigma[-1] / with_a0.sigma[1]
    assert np.allclose(rescaled_last_sigma, baseline.sigma[-1])
    # The axes agree up to sign.
    same_sign = np.allclose(with_a0.u[:, 1], baseline.u[:, 0])
    assert same_sign or np.allclose(with_a0.u[:, 1], -baseline.u[:, 0])

    # normalize_gm=False
    raw_gm = AssocSpace.from_entries(ENTRIES, k=4, normalize_gm=False)
    eq_(raw_gm.k, 4)
Пример #4
0
def build_assoc_space(input_file, output_dir):
    """Build an AssocSpace from a tab-separated association file and save it.

    `input_file` is read as UTF-8. Every line must contain exactly three
    tab-separated fields: left concept, right concept, and a numeric value.
    Lines where either concept is rejected by concept_is_bad() are ignored.

    Associations are kept only when both concepts pass
    concept_is_frequent_enough() and the two concepts differ. Each
    frequent-enough concept additionally gets a self-link of weight 1 and,
    if its negation is also frequent enough, a link of weight -1 to it.

    The resulting space (150 axes, offset_weight=4e-5) is saved to
    `output_dir`.
    """
    print('loading')
    counts = defaultdict(int)
    triples = []

    # The with-block closes the input file even if a line fails to parse.
    with codecs.open(input_file, encoding='utf-8') as infile:
        for line in infile:
            left, right, value = line.strip().split('\t')
            if not concept_is_bad(left) and not concept_is_bad(right):
                value = float(value)
                triples.append((value, left, right))
                counts[left] += 1
                counts[right] += 1

    print('filtering entries')
    sparse = SparseEntryStorage()
    for (value, left, right) in triples:
        if (concept_is_frequent_enough(left, counts)
                and concept_is_frequent_enough(right, counts)
                and left != right):
            sparse.add_entry((value, left, right))
    # The raw triples can be large; release them before the decomposition.
    del triples

    # Add links from a concept to itself, and negative links to its opposite
    # if it's there
    for concept in counts:
        if concept_is_frequent_enough(concept, counts):
            sparse.add_entry((1., concept, concept))
            negation = negate_concept(concept)
            if concept_is_frequent_enough(negation, counts):
                sparse.add_entry((-1., concept, negation))

    print('making assoc space')
    space = AssocSpace.from_sparse_storage(sparse, 150, offset_weight=4e-5)

    print('saving')
    space.save_dir(output_dir)
Пример #5
0
def test_vectorizing_and_similar_terms():
    """Check vector_from_terms() and the similar-term queries on a k=3 space."""
    space = AssocSpace.from_entries(ENTRIES, k=3)
    apple_row = space.row_named('apple')
    banana_row = space.row_named('banana')
    # 'not a term' is included on purpose: the residual check below expects
    # the vector to be built from apple and banana only.
    combined = space.vector_from_terms(
        [('apple', 5), ('banana', 22), ('not a term', 17)])

    # A term is (approximately) perfectly associated with itself.
    self_assoc = space.assoc_between_two_terms('apple', 'apple')
    assert abs(self_assoc - 1.0) < 1e-3

    # 'apple' and 'banana' are at least 10% less similar to each other than
    # to themselves.
    cross_assoc = space.assoc_between_two_terms('apple', 'banana')
    assert cross_assoc < 0.9

    # The vector should be a linear combination of apple and banana. Project
    # out the apple direction, then the part of banana orthogonal to apple;
    # nothing should remain.
    unit_apple = normalize(apple_row)
    banana_orth = normalize(
        banana_row - unit_apple * unit_apple.dot(banana_row))
    leftover = combined - unit_apple * unit_apple.dot(combined)
    leftover = leftover - banana_orth * banana_orth.dot(leftover)
    assert norm(leftover) < 1e-3

    # Similar-term lookup: scores come back in descending order.
    labels, scores = zip(*space.terms_similar_to_vector(combined))
    eq_(list(scores), sorted(scores, reverse=True))

    # most_similar_to_vector() agrees with the head of the full ranking.
    best = space.most_similar_to_vector(combined)
    eq_(best[0], labels[0])
    eq_(best[1], scores[0])

    # Expected relative ranking: banana, then apple, then green and celery.
    position = labels.index
    assert position('banana') < position('apple')
    assert position('apple') < position('green')
    assert position('apple') < position('celery')
Пример #6
0
def test_vectorizing_and_similar_terms():
    """Vectorize weighted terms and verify the similar-term rankings."""
    # Vectorizing weighted terms
    term_weights = [('apple', 5), ('banana', 22), ('not a term', 17)]
    assoc_space = AssocSpace.from_entries(ENTRIES, k=3)
    vec_apple = assoc_space.row_named('apple')
    vec_banana = assoc_space.row_named('banana')
    mixture = assoc_space.vector_from_terms(term_weights)

    # 'apple' associated with itself scores approximately 1
    apple_self = assoc_space.assoc_between_two_terms('apple', 'apple') - 1.0
    assert abs(apple_self) < 1e-3

    # 'apple' vs 'banana' must score at least 10% below self-similarity
    assert assoc_space.assoc_between_two_terms('apple', 'banana') < 0.9

    # The mixture lies in the span of apple and banana: after removing its
    # components along an orthonormal basis of that span, nothing is left.
    basis_apple = normalize(vec_apple)
    banana_along_apple = basis_apple * basis_apple.dot(vec_banana)
    basis_banana = normalize(vec_banana - banana_along_apple)
    remainder = mixture - basis_apple * basis_apple.dot(mixture)
    remainder -= basis_banana * basis_banana.dot(remainder)
    assert norm(remainder) < 1e-3

    # Finding similar terms
    ranked = assoc_space.terms_similar_to_vector(mixture)
    labels, scores = zip(*ranked)
    descending = sorted(scores, reverse=True)
    eq_(list(scores), descending)

    top_hit = assoc_space.most_similar_to_vector(mixture)
    eq_(top_hit[0], labels[0])
    eq_(top_hit[1], scores[0])

    # banana outranks apple, which in turn outranks green and celery
    assert labels.index('banana') < labels.index('apple')
    for lower_ranked in ('green', 'celery'):
        assert labels.index('apple') < labels.index(lower_ranked)
Пример #7
0
def build_assoc_space(input_file, output_dir):
    """Build an AssocSpace from a tab-separated association file and save it.

    `input_file` is read as UTF-8. The first three tab-separated fields of
    every line are taken as left concept, right concept, and numeric value;
    any further fields are ignored. Lines where either concept is rejected
    by concept_is_bad() are skipped.

    Associations are kept only when both concepts pass
    concept_is_frequent_enough() and the two concepts differ. Each
    frequent-enough concept additionally gets a self-link of weight 1 and,
    if its negation is also frequent enough, a link of weight -1 to it.

    The resulting space (k=300, offset_weight=1e-4) is saved to `output_dir`.
    """
    print('loading')
    counts = defaultdict(int)
    triples = []

    # The with-block closes the input file even if a line fails to parse.
    with codecs.open(input_file, encoding='utf-8') as infile:
        for line in infile:
            left, right, value = line.strip().split('\t')[:3]
            if not concept_is_bad(left) and not concept_is_bad(right):
                value = float(value)
                triples.append((value, left, right))
                counts[left] += 1
                counts[right] += 1

    print('filtering entries')
    sparse = SparseEntryStorage()
    for (value, left, right) in triples:
        if (concept_is_frequent_enough(left, counts)
                and concept_is_frequent_enough(right, counts)
                and left != right):
            sparse.add_entry((value, left, right))
    # The raw triples can be large; release them before the decomposition.
    del triples

    # Add links from a concept to itself, and negative links to its opposite
    # if it's there
    for concept in counts:
        if concept_is_frequent_enough(concept, counts):
            sparse.add_entry((1., concept, concept))
            negation = negate_concept(concept)
            if concept_is_frequent_enough(negation, counts):
                sparse.add_entry((-1., concept, negation))

    print('making assoc space')
    space = AssocSpace.from_sparse_storage(sparse, k=300, offset_weight=1e-4)

    print('saving')
    space.save_dir(output_dir)
def optimizeAllAndInferConceptsModelTwo(assocDir):
    """Build the "psl2" model, minimize its objective and write the results.

    The AssocSpace stored in `assocDir` is loaded and passed to
    createObjective() together with the target-word dictionaries filled in
    by loadTargetWordsFromAllImages(). After optimization the model is
    written to 'out2.lp' and 'out2.sol', and the solution is printed to the
    file named sys.argv[1] + sys.argv[2] + "_inferred.txt".
    """
    # Load the association space.
    assocSpace = AssocSpace.load_dir(assocDir)

    # Targets and image-indices dictionaries, filled in by the loader below.
    targetsToImageIndicesAndWeights = {}  # target-word -> [(index, weight_i), ...]
    targetsToCentralities = {}  # target-word -> centrality-score
    loadTargetWordsFromAllImages(targetsToCentralities,
                                 targetsToImageIndicesAndWeights)

    # Model
    m = Model("psl2")
    variables = set()
    targets = {}
    loadDecisionVariablesForTargets(m, targets, variables,
                                    targetsToImageIndicesAndWeights)

    ## TODO: populate the rules
    objective = LinExpr()
    objective = createObjective(m, targets, variables, objective, assocSpace,
                                targetsToCentralities,
                                targetsToImageIndicesAndWeights)

    m.update()
    m.setObjective(objective)

    # The objective is to minimize the costs
    m.modelSense = GRB.MINIMIZE

    # Update model to integrate new variables
    m.update()
    m.optimize()
    m.write('out2.lp')
    m.write('out2.sol')

    # The with-block makes sure the solution file is flushed and closed.
    with open(sys.argv[1] + sys.argv[2] + "_inferred.txt", "w") as outputFile:
        printSolution(m, targets, outputFile)
Пример #9
0
def test_truncation():
    """truncated_to(2) keeps the first two axes of a k=3 space."""
    full = AssocSpace.from_entries(ENTRIES, k=3)
    short = full.truncated_to(2)

    # u and sigma are cut down to their first two axes; labels are untouched.
    assert np.allclose(short.u, full.u[:, :2])
    assert np.allclose(short.sigma, full.sigma[:2])
    eq_(short.labels, full.labels)

    # The first row of the truncated assoc matrix has norm just below 1.
    first_row_norm = norm(short.assoc[0])
    assert 0.999 < first_row_norm < 1.0
Пример #10
0
def test_truncation():
    """Truncating a space to two axes leaves the leading axes unchanged."""
    original = AssocSpace.from_entries(ENTRIES, k=3)
    two_axes = original.truncated_to(2)

    # Compare the truncated arrays against slices of the original ones.
    expected_u = original.u[:, :2]
    assert np.allclose(two_axes.u, expected_u)
    expected_sigma = original.sigma[:2]
    assert np.allclose(two_axes.sigma, expected_sigma)

    # The label set is carried over as is.
    eq_(two_axes.labels, original.labels)

    # Row 0 of the truncated assoc matrix is almost, but not quite, unit length.
    assert 0.999 < norm(two_axes.assoc[0]) < 1.0
Пример #11
0
def test_strip_a0():
    """When stripping a0, AssocSpace uses axes [1,k] instead of [0,k-1]."""
    unstripped = AssocSpace.from_entries(entries, 3, strip_a0=False)
    stripped = AssocSpace.from_entries(entries, 3, strip_a0=True)

    # Both are built with k=3; axis 1 of the unstripped space must match
    # axis 0 of the stripped one, ignoring sign.
    eq_(unstripped.u.shape[1], 3)
    unstripped_axis = np.abs(unstripped.u[:, 1])
    assert np.allclose(unstripped_axis, np.abs(stripped.u[:, 0]))

    # The ratio between consecutive sigma values is preserved.
    unstripped_ratio = unstripped.sigma[1] / unstripped.sigma[2]
    stripped_ratio = stripped.sigma[0] / stripped.sigma[1]
    assert np.allclose(unstripped_ratio, stripped_ratio)

    # Dropping the first axis of a default-built space must satisfy the same
    # two relations.
    dropped = AssocSpace.from_entries(entries, 3).with_first_axis_dropped()
    assert np.allclose(np.abs(unstripped.u[:, 1]), np.abs(dropped.u[:, 0]))
    dropped_ratio = dropped.sigma[0] / dropped.sigma[1]
    assert np.allclose(unstripped.sigma[1] / unstripped.sigma[2],
                       dropped_ratio)
Пример #12
0
def load_assoc():
    """
    Load the association matrix. Requires the open source Python package
    'assoc_space'.

    The loaded space is cached in the module-level `commonsense_assoc`; as
    long as that value is truthy it is returned without touching the
    directory named by `ASSOC_DIR` again.
    """
    global commonsense_assoc
    if not commonsense_assoc:
        commonsense_assoc = AssocSpace.load_dir(ASSOC_DIR)
    return commonsense_assoc
Пример #13
0
def load_assoc():
    """
    Load the association matrix. Requires the open source Python package
    'assoc_space'.

    Returns the AssocSpace loaded from `ASSOC_DIR`. The result is stored in
    the module-level `commonsense_assoc` and reused by later calls while
    that value is truthy.
    """
    global commonsense_assoc
    if commonsense_assoc:
        # Already loaded by an earlier call.
        return commonsense_assoc
    commonsense_assoc = AssocSpace.load_dir(ASSOC_DIR)
    return commonsense_assoc
Пример #14
0
def test_merging():
    """Spot-check that AssocSpace.merged_with() gives reasonable results.

    The actual math of merging is tested separately in test_eigenmath.
    """
    # Two small spaces to merge.
    first = AssocSpace.from_entries(ENTRIES, k=4)
    second = AssocSpace.from_entries(MORE_ENTRIES, k=4)

    # Without an explicit k, the merged space has 8 axes.
    eq_(first.merged_with(second).k, 8)

    # With an explicit k, that many axes are kept.
    merged = first.merged_with(second, k=4)
    eq_(merged.k, 4)

    # Check the resulting label order.
    eq_(' '.join(merged.labels),
        'apple red green celery orange banana yellow lemon blue tasty ferret')
    assert merged.assoc_between_two_terms('ferret', 'yellow') > 0.5

    # The merged apple/red association lies strictly between the values from
    # the two inputs.
    assert (second.assoc_between_two_terms('apple', 'red')
            < merged.assoc_between_two_terms('apple', 'red')
            < first.assoc_between_two_terms('apple', 'red'))
Пример #15
0
def test_merging():
    """Spot-verify that AssocSpace uses the merging math reasonably.

    The math itself is covered separately in test_eigenmath.
    """
    # Build the two input spaces and merge them without limiting k.
    space_a = AssocSpace.from_entries(ENTRIES, k=4)
    space_b = AssocSpace.from_entries(MORE_ENTRIES, k=4)
    unlimited = space_a.merged_with(space_b)
    eq_(unlimited.k, 8)

    # Merge again, this time asking for 4 axes.
    combined = space_a.merged_with(space_b, k=4)
    eq_(combined.k, 4)

    expected_labels = (
        'apple red green celery orange banana yellow lemon blue tasty ferret')
    eq_(' '.join(combined.labels), expected_labels)
    ferret_yellow = combined.assoc_between_two_terms('ferret', 'yellow')
    assert ferret_yellow > 0.5

    # apple/red in the merged space sits strictly between the two inputs.
    in_b = space_b.assoc_between_two_terms('apple', 'red')
    in_combined = combined.assoc_between_two_terms('apple', 'red')
    assert in_b < in_combined
    assert in_combined < space_a.assoc_between_two_terms('apple', 'red')
Пример #16
0
    def load(self):
        """Load the AssocSpace stored at `self.path` into `self.assoc`.

        Nothing happens when `self.assoc` is already set. Raises
        MissingAssocSpace when the `assoc_space` package cannot be imported,
        or when loading raises ZeroDivisionError.
        """
        if self.assoc is None:
            try:
                # Imported lazily so the package is only needed on first use.
                from assoc_space import AssocSpace
                self.assoc = AssocSpace.load_dir(self.path)
            except ImportError:
                message = "The assoc_space package is not installed."
                raise MissingAssocSpace(message)
            except ZeroDivisionError:
                message = ("The space of term associations could not "
                           "be loaded.")
                raise MissingAssocSpace(message)
Пример #17
0
def test_assoc_constructor():
    """Construct AssocSpace directly: valid input, bad sigmas, assoc hint."""
    # A nice, normal AssocSpace: two labelled rows, three axes.
    u_matrix = np.asarray([[0, 1, 0.6], [1, 0, 0.8]])
    sigma_vec = np.asarray([0.5, 0.3, 0.2])
    label_set = LabelSet(['A', 'B'])
    space = AssocSpace(u_matrix, sigma_vec, label_set)
    eq_(space.k, 3)
    # The assoc matrix is not stored on the instance at construction time.
    assert 'assoc' not in vars(space)

    # Each of these sigma vectors must be rejected with ValueError. (The
    # second one has two entries while u has three columns.)
    rejected_sigmas = (
        [0.0, -0.2, -0.4],
        [0.6, 0.4],
        [0.6, 0.7, 0.2],
    )
    for bad_sigma in rejected_sigmas:
        with assert_raises(ValueError):
            AssocSpace(u_matrix, np.asarray(bad_sigma), label_set)

    # A precomputed assoc matrix can be handed to the constructor as a hint;
    # rows looked up through it match the originals.
    hint = space.assoc.copy()
    hinted = AssocSpace(u_matrix, sigma_vec, label_set, assoc=hint)
    assert np.allclose(hinted.row_named('A'), space.row_named('A'))
Пример #18
0
    def load(self):
        """Populate `self.assoc` from `self.path` unless it is already set.

        Raises MissingAssocSpace if importing `assoc_space` fails, or if
        loading the directory raises ZeroDivisionError.
        """
        already_loaded = self.assoc is not None
        if already_loaded:
            return

        try:
            # Deferred import: the dependency is only required at load time.
            from assoc_space import AssocSpace
            loaded_space = AssocSpace.load_dir(self.path)
            self.assoc = loaded_space
        except ImportError:
            raise MissingAssocSpace("The assoc_space package is not installed.")
        except ZeroDivisionError:
            raise MissingAssocSpace(
                "The space of term associations could not be loaded.")
Пример #19
0
def test_assoc_constructor():
    """Check the AssocSpace constructor's validation and assoc hinting."""
    # Make a nice, normal AssocSpace
    axes = np.asarray([[0, 1, 0.6], [1, 0, 0.8]])
    weights = np.asarray([0.5, 0.3, 0.2])
    names = LabelSet(['A', 'B'])
    normal = AssocSpace(axes, weights, names)
    eq_(normal.k, 3)
    # Nothing named 'assoc' is stored on a freshly built instance.
    has_assoc_attribute = 'assoc' in normal.__dict__
    assert not has_assoc_attribute

    # Error conditions: every one of these sigma arrays raises ValueError.
    sigma_a = np.asarray([0.0, -0.2, -0.4])
    with assert_raises(ValueError):
        AssocSpace(axes, sigma_a, names)
    sigma_b = np.asarray([0.6, 0.4])
    with assert_raises(ValueError):
        AssocSpace(axes, sigma_b, names)
    sigma_c = np.asarray([0.6, 0.7, 0.2])
    with assert_raises(ValueError):
        AssocSpace(axes, sigma_c, names)

    # Assoc hinting: pass a copy of the computed assoc matrix to a new
    # instance and compare a row against the original space.
    precomputed = normal.assoc.copy()
    from_hint = AssocSpace(axes, weights, names, assoc=precomputed)
    row_from_hint = from_hint.row_named('A')
    assert np.allclose(row_from_hint, normal.row_named('A'))
Пример #20
0
def test_filter():
    """Filtering a space keeps k and the surviving labels, and redecomposes."""
    unfiltered = AssocSpace.from_entries(ENTRIES, k=5)
    kept = unfiltered.filter(_filter)

    # Simple properties of the filtered space
    eq_(kept.k, 5)
    eq_(' '.join(kept.labels), 'red green celery banana lemon')

    # Redecomposition happened: column 1 of u has unit norm.
    column_norm = norm(kept.u[:, 1])
    assert np.allclose(column_norm, 1.0)

    # Redecomposition can be kind of weird, but this result is intuitive:
    # filtered red/banana lies strictly between the unfiltered red/banana
    # and yellow/banana values.
    red_banana_before = unfiltered.assoc_between_two_terms('red', 'banana')
    red_banana_after = kept.assoc_between_two_terms('red', 'banana')
    assert red_banana_before < red_banana_after
    assert red_banana_after < unfiltered.assoc_between_two_terms('yellow',
                                                                 'banana')
Пример #21
0
def test_filter():
    """Build an assoc space, filter it, and check the filtered result."""
    source_space = AssocSpace.from_entries(ENTRIES, k=5)
    filtered_space = source_space.filter(_filter)

    # The filtered space still has 5 axes and only the surviving labels.
    eq_(filtered_space.k, 5)
    surviving = ' '.join(filtered_space.labels)
    eq_(surviving, 'red green celery banana lemon')

    # Check that redecomposition happened
    assert np.allclose(norm(filtered_space.u[:, 1]), 1.0)

    # Redecomposition can be kind of weird, but this result is intuitive
    assert (source_space.assoc_between_two_terms('red', 'banana')
            < filtered_space.assoc_between_two_terms('red', 'banana')
            < source_space.assoc_between_two_terms('yellow', 'banana'))
def run():
    """Build a small k=4 AssocSpace from fixed entries and save it as test data."""
    # (weight, concept, concept) triples
    weighted_links = [
        (4, '/c/en/apple', '/c/en/red'),
        (1, '/c/en/apple', '/c/en/green'),
        (3, '/c/en/apple', '/c/en/orange'),
        (3, '/c/en/banana', '/c/en/orange'),
        (1, '/c/en/banana', '/c/en/yellow'),
        (0.5, '/c/en/lemon', '/c/en/yellow'),
        (1.5, '/c/en/orange', '/c/en/lemon'),
        (0.1, '/c/en/apple', '/c/en/lemon'),
        (0.2, '/c/en/banana', '/c/en/lemon'),
        (0.5, '/c/en/ideas', '/c/en/colorless'),
        (0.5, '/c/en/ideas', '/c/en/green'),
        (1, '/c/en/example', '/c/en/green'),
    ]
    output_dir = '../conceptnet5/support_data/testdata/input/assoc_space'
    AssocSpace.from_entries(weighted_links, k=4).save_dir(output_dir)
from assoc_space import AssocSpace
import sys
import threading
import math


def computeNormalizedValue(value, maxV, minV, addOne=False):
    """Rescale `value` relative to the range [minV, maxV].

    Returns (value - minV) / (maxV - minV). When `addOne` is true, 1 is added
    to both the numerator and the denominator before dividing.
    """
    numerator = value - minV
    denominator = maxV - minV
    if addOne:
        numerator = numerator + 1
        denominator = denominator + 1
    return numerator / denominator


if len(sys.argv) < 4:
    print "python conceptnetAssocSpace.py <seedsfile> <targetfile> <AssocSpaceDirectory>"
    sys.exit()
assocSpace = AssocSpace.load_dir(sys.argv[3])
words = []
minSimilarity = -0.358846
maxSimilarity = 0.999747
minCentrality = -0.00188222
maxCentrality = 0.00324597
with open(sys.argv[1], "r") as f:
    i = 0
    for line in f:
        if line.startswith("##"):
            continue
        words = line.split("\t")
        word1 = "/c/en/" + words[0].strip()
        with open(sys.argv[2], "r") as f2:
            for line in f2:
                if line.startswith("##"):
Пример #24
0
def test_pickle_round_trip():
    """An AssocSpace survives a round-trip to pickle format and back."""
    original = AssocSpace.from_entries(entries, 3)
    # Serialize and immediately deserialize; the copy must compare equal.
    restored = pickle.loads(pickle.dumps(original))
    eq_(original, restored)
Пример #25
0
def test_dir_round_trip():
    """An AssocSpace survives a round-trip through save_dir() and load_dir()."""
    import os
    import shutil
    import tempfile

    assoc = AssocSpace.from_entries(ENTRIES, k=3)

    # Use a private scratch directory instead of a fixed, shared /tmp path,
    # so that runs cannot collide and nothing is left behind afterwards.
    scratch = tempfile.mkdtemp()
    try:
        target = os.path.join(scratch, 'assoc_test')
        assoc.save_dir(target)
        assoc2 = AssocSpace.load_dir(target)
        eq_(assoc, assoc2)
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
Пример #26
0
def test_dir_round_trip():
    """An AssocSpace survives a round-trip through save_dir() and load_dir()."""
    import os
    import shutil
    import tempfile

    assoc = AssocSpace.from_entries(entries, 3)

    # Use a private scratch directory instead of a fixed, shared /tmp path,
    # so that runs cannot collide and nothing is left behind afterwards.
    scratch = tempfile.mkdtemp()
    try:
        target = os.path.join(scratch, 'assoc_test')
        assoc.save_dir(target)
        assoc2 = AssocSpace.load_dir(target)
        eq_(assoc, assoc2)
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
Пример #27
0
def test_pickle_round_trip():
    """An AssocSpace survives a round-trip to pickle format and back."""
    before = AssocSpace.from_entries(ENTRIES, k=3)
    # Dump to a pickle payload, load it back, and compare with the original.
    payload = pickle.dumps(before)
    after = pickle.loads(payload)
    eq_(before, after)
Пример #28
0
def test_association_calculations():
    """Self-association is about 1; apple/banana association is below 0.9."""
    space = AssocSpace.from_entries(entries, 3)
    apple_with_itself = space.assoc_between_two_terms('apple', 'apple')
    assert abs(apple_with_itself - 1.0) < 1e-3
    apple_with_banana = space.assoc_between_two_terms('apple', 'banana')
    assert apple_with_banana < 0.9
Пример #29
0
def main(dir):
    """Load the AssocSpace stored in directory `dir` and pass it to test()."""
    test(AssocSpace.load_dir(dir))
Пример #30
0
def main(dir):
    """Run test() on the AssocSpace loaded from directory `dir`."""
    loaded_space = AssocSpace.load_dir(dir)
    test(loaded_space)
Пример #31
0
    if not os.path.isfile(sortedIndicesFileName):
        sim = assocSpace.assoc.dot(assocSpace.row_named("/c/en/" + word))
        indices = np.argsort(sim)[::-1]
        np.savez_compressed(sortedIndicesFileName, indices[:1000])
        sim_first1k = np.array([sim[index] for index in indices[:1000]])
        np.savez_compressed(simFileName, sim_first1k)

    sim = np.load(simFileName)
    indices = np.load(sortedIndicesFileName)
    data = []
    for index in indices:
        if len(data) == limit:
            break
        if filterEnglishWords(names[index]):
            data.append((names[index], sim[index]))
    return data


# Lower/upper bounds for similarity and centrality values.
# NOTE(review): presumably used to normalize scores elsewhere in this file;
# the code that consumes them is not visible here — confirm.
minSimilarity = -1
maxSimilarity = 1
minCentrality = -0.00188222
maxCentrality = 0.00324597
# Load the association space once at import time from a path relative to the
# working directory.
assocDir = "../conceptnet5/data/assoc/assoc-space-5.4"
assocSpace = AssocSpace.load_dir(assocDir)
# Labels of the space; indexed by position as names[index] in the code above.
names = assocSpace.labels
# Load the binary word2vec vectors, also at import time.
word2vec_model = models.word2vec.Word2Vec.load_word2vec_format(
    '../../../DATASETS/GoogleNews-vectors-negative300.bin', binary=True)
# NOTE(review): per gensim's documentation, init_sims(replace=True)
# precomputes normalized vectors and discards the originals — confirm
# against the gensim version in use.
word2vec_model.init_sims(replace=True)
# Negative codes; judging by their names they flag words that are too rare
# or missing from the corpus — TODO confirm against the code that uses them.
TOO_RARE_WORD_CODE = -3
NOT_FOUND_IN_CORPUS_CODE = -2