def build_assoc_space(input_file, output_dir):
    """
    Build an AssocSpace from a file of weighted concept pairs and save it.

    `input_file` is read as UTF-8 text. Each line must contain exactly three
    tab-separated fields: a left concept, a right concept, and a value that
    is parsed as a float. A line with any other number of fields raises
    ValueError from the tuple unpacking.

    The build happens in these steps:

    1. Load every pair in which neither concept is rejected by
       `concept_is_bad`, counting how many kept pairs each concept
       appears in.
    2. Keep only the pairs whose two concepts are distinct and both pass
       `concept_is_frequent_enough`.
    3. For every concept that passes `concept_is_frequent_enough`, add a
       link of weight 1 to itself and, if its negation (as produced by
       `negate_concept`) also passes, a link of weight -1 to that negation.
    4. Build the space with `AssocSpace.from_sparse_storage` and write it
       to `output_dir` with `save_dir`.

    Progress messages are printed to standard output. Returns None.
    """
    print('loading')
    counts = defaultdict(int)
    triples = []

    # Open the file in a `with` block so the handle is closed as soon as
    # loading is done (or if a malformed line raises), instead of staying
    # open until the garbage collector gets to it.
    with codecs.open(input_file, encoding='utf-8') as infile:
        for line in infile:
            left, right, value = line.strip().split('\t')
            if not concept_is_bad(left) and not concept_is_bad(right):
                value = float(value)
                triples.append((value, left, right))
                counts[left] += 1
                counts[right] += 1

    print('filtering entries')
    sparse = SparseEntryStorage()
    for (value, left, right) in triples:
        if (
            concept_is_frequent_enough(left, counts)
            and concept_is_frequent_enough(right, counts)
            and left != right
        ):
            sparse.add_entry((value, left, right))

    # The raw triples are no longer needed; drop them before the next large
    # allocation.
    del triples

    # Add links from a concept to itself, and negative links to its opposite
    # if it's there.
    #
    # Iterate over a snapshot of the keys: `counts` is a defaultdict, and the
    # negation passed to concept_is_frequent_enough may not be a key yet.
    # NOTE(review): if that helper indexes `counts[...]` directly, the lookup
    # inserts the key, which would raise "dictionary changed size during
    # iteration" when looping over `counts` itself -- confirm against the
    # helper's implementation.
    for concept in list(counts):
        if concept_is_frequent_enough(concept, counts):
            sparse.add_entry((1., concept, concept))
            negation = negate_concept(concept)
            if concept_is_frequent_enough(negation, counts):
                sparse.add_entry((-1., concept, negation))

    print('making assoc space')
    # NOTE(review): 150 is the second positional argument of
    # from_sparse_storage; presumably the number of dimensions to keep --
    # confirm against AssocSpace.
    space = AssocSpace.from_sparse_storage(sparse, 150, offset_weight=4e-5)

    print('saving')
    space.save_dir(output_dir)
def test_sparse_storage():
    """
    Exercise SparseEntryStorage when empty and after loading ENTRIES.
    """
    storage = SparseEntryStorage()

    # An empty storage must still yield a (0 x 0) matrix and no labels
    # rather than raising.
    empty_matrix, empty_labels = storage.get_matrix_and_labels()
    eq_(len(empty_labels), 0)
    eq_(empty_matrix.shape, (0, 0))

    # Load the shared fixture entries and inspect the result.
    storage.add_entries(ENTRIES)
    matrix, labels = storage.get_matrix_and_labels()

    # Labels are compared as a single space-joined string.
    joined_labels = ' '.join(labels)
    eq_(joined_labels, 'apple red green celery orange banana yellow lemon')

    # Spot-check individual cells as ((row, column), expected value).
    expected_cells = (
        ((0, 1), 4),
        ((6, 5), 1),
        ((4, 2), 0),
    )
    for (row, col), expected in expected_cells:
        eq_(matrix[row, col], expected)