def single_test_propagate():
    """
    Exercise propagate() on the module-level test fixtures and check its
    output terms, its preservation of the input embedding, and the
    averaging rule used for newly-added non-English terms.
    """
    # Call the code under test.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # The propagated terms should be the terms from the combined index,
    # starting with the terms of the frame and going up to the last new
    # term from the graph that is not in English.
    expected_len = len(FRAME) + len(NEW_NON_ENGLISH_TERMS)
    assert len(propagated) == expected_len, \
        'Incorrect number {} (should be {}) of propagated terms.'.format(
            len(propagated), expected_len
        )
    # The propagated index is a prefix of COMBINED_INDEX, so zip (which
    # stops at the shorter sequence) compares exactly the right pairs.
    for output_term, input_term in zip(propagated.index, COMBINED_INDEX):
        assert output_term == input_term, \
            'Propagated output terms do not agree with the input terms.'

    # The original embedding should not be altered.
    assert_allclose(
        propagated.values[: len(FRAME), :],
        FRAME.values,
        err_msg='Propagation changed an input embedding vector.',
    )

    # Terms not from the original embedding should be assigned the
    # average of the vectors of their neighbors of lesser rank, if all
    # of those neighbors are either from the original embedding or non-
    # English.
    for term in NEW_NON_ENGLISH_TERMS:
        neighbor_count = 0
        # Named vector_sum to avoid shadowing the builtin 'sum'.
        vector_sum = np.zeros((EMBEDDING_DIM,), dtype=np.float32)
        for other_term in COMBINED_INDEX:
            if (term, other_term) in EDGE_SET and RANKS[other_term] < RANKS[term]:
                if other_term in NEW_ENGLISH_TERMS:
                    # A new English neighbor of lesser rank disqualifies
                    # this term from the simple-average check; skip the
                    # for-loop's else clause via break.
                    break
                neighbor_count += 1
                vector_sum = np.add(vector_sum, propagated.loc[other_term])
        else:
            # Only reached when no lesser-rank neighbor was new-and-English.
            assert_allclose(
                propagated.loc[term],
                vector_sum / neighbor_count,
                err_msg='Incorrect propagated vector for term {}'.format(term),
            )
def single_test_propagate():
    """
    Run propagate() over the test-fixture graph and embedding, then verify
    the output index, the unchanged input vectors, and the neighbor-average
    rule for new non-English terms.
    """
    # Call the code under test.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # The propagated terms should be the terms from the combined index,
    # starting with the terms of the frame and going up to the last new
    # term from the graph that is not in English.
    n_expected = len(FRAME) + len(NEW_NON_ENGLISH_TERMS)
    assert len(propagated) == n_expected, \
        'Incorrect number {} (should be {}) of propagated terms.'.format(
            len(propagated), n_expected
        )
    # propagated.index is a prefix of COMBINED_INDEX; zip compares exactly
    # the positions the original index loop compared.
    for got_term, want_term in zip(propagated.index, COMBINED_INDEX):
        assert got_term == want_term, \
            'Propagated output terms do not agree with the input terms.'

    # The original embedding should not be altered.
    assert_allclose(
        propagated.values[: len(FRAME), :],
        FRAME.values,
        err_msg='Propagation changed an input embedding vector.',
    )

    # Terms not from the original embedding should be assigned the
    # average of the vectors of their neighbors of lesser rank, if all
    # of those neighbors are either from the original embedding or non-
    # English.
    for term in NEW_NON_ENGLISH_TERMS:
        count = 0
        # 'total' instead of 'sum' so the builtin is not shadowed.
        total = np.zeros((EMBEDDING_DIM,), dtype=np.float32)
        for other_term in COMBINED_INDEX:
            if (
                (term, other_term) in EDGE_SET
                and RANKS[other_term] < RANKS[term]
            ):
                if other_term in NEW_ENGLISH_TERMS:
                    # Disqualified: a lesser-rank neighbor is new English.
                    break
                count += 1
                total = np.add(total, propagated.loc[other_term])
        else:
            # for/else: runs only when the loop finished without break.
            assert_allclose(
                propagated.loc[term],
                total / count,
                err_msg='Incorrect propagated vector for term {}'.format(term),
            )
def single_test_sharded_propagate():
    """
    Run sharded_propagate() over the test data in 2 shards and check that
    the written shards match the corresponding column slices of the
    unsharded propagate() output.
    """
    # We patch several collaborators with mocks: builtins.open so the
    # assoc edge file read yields the test graph; load_hdf so the input
    # embedding is the test frame; save_hdf with a collector mock we can
    # query afterwards for the written shards; and make_adjacency_matrix
    # so this test does not depend on that function being correct.
    nshards = 2
    shard_collector = Mock(return_value=None)  # save_hdf returns None
    with patch(
        'builtins.open', return_value=io.StringIO(ASSOC_FILE_CONTENTS)
    ), patch(
        'conceptnet5.vectors.propagate.make_adjacency_matrix',
        return_value=(ADJACENCY_MATRIX, COMBINED_INDEX, len(NEW_ENGLISH_TERMS)),
    ), patch(
        'conceptnet5.vectors.propagate.load_hdf', return_value=FRAME
    ), patch(
        'conceptnet5.vectors.propagate.save_hdf', shard_collector
    ):
        sharded_propagate(
            'ignored_assoc_file',
            'ignored_embedding_file',
            'shard_filename_root',
            nshards=nshards,
        )

    # Run unsharded propagation for comparison.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # Check that two shard files were written, to the correct filenames.
    shard_arg = 0  # the shard dataframe is save_hdf's 1st positional arg
    fname_arg = 1  # the filename is save_hdf's 2nd positional arg
    n_written = len(shard_collector.call_args_list)
    assert n_written == nshards, (
        'Incorrect number {} (should be {}) of shards written.'.format(
            n_written, nshards
        )
    )
    for shard_index in range(nshards):
        # Filename positional argument of the (shard_index)-th save_hdf call.
        written_name = extract_positional_arg(
            shard_collector, shard_index, fname_arg
        )
        expected_name = 'shard_filename_root.shard{}'.format(shard_index)
        assert written_name == expected_name, (
            'Shard {} written to incorrect file name {}.'.format(
                shard_index, written_name
            )
        )

    # The shards should agree with the appropriate pieces of the unsharded
    # output.
    shard_width = EMBEDDING_DIM // nshards
    for shard_index in range(nshards):
        # Shard dataframe positional argument of the (shard_index)-th call.
        shard = extract_positional_arg(shard_collector, shard_index, shard_arg)
        start_dim = shard_index * shard_width
        end_dim = start_dim + shard_width
        assert len(shard.index) == len(propagated.index), (
            'Shard {} has incorrect length {} (should be {}).'.format(
                shard_index, len(shard.index), len(propagated.index)
            )
        )
        for shard_term, ref_term in zip(shard.index, propagated.index):
            assert shard_term == ref_term, (
                'Shard {} has term {} where reference has {}.'.format(
                    shard_index, shard_term, ref_term
                )
            )
        assert_allclose(
            shard.values,
            propagated.values[:, start_dim:end_dim],
            err_msg='Shard {} has incorrect propagated vectors.'.format(
                shard_index
            ),
        )
def single_test_sharded_propagate():
    """
    Exercise sharded_propagate() with 2 shards and verify each written
    shard equals the matching column slice of the unsharded propagate()
    result.
    """
    # Mock out the collaborators: builtins.open supplies the test graph
    # as the assoc edge file; load_hdf supplies the test frame as the
    # input embedding; save_hdf is replaced by a collector mock so we can
    # retrieve the output shards; make_adjacency_matrix returns known-good
    # test data so failures of that function cannot affect this test.
    nshards = 2
    shard_collector = Mock(return_value=None)  # save_hdf returns None
    open_patch = patch(
        'builtins.open', return_value=io.StringIO(ASSOC_FILE_CONTENTS)
    )
    adjacency_patch = patch(
        'conceptnet5.vectors.propagate.make_adjacency_matrix',
        return_value=(ADJACENCY_MATRIX, COMBINED_INDEX, len(NEW_ENGLISH_TERMS)),
    )
    load_patch = patch(
        'conceptnet5.vectors.propagate.load_hdf', return_value=FRAME
    )
    save_patch = patch(
        'conceptnet5.vectors.propagate.save_hdf', shard_collector
    )
    with open_patch, adjacency_patch, load_patch, save_patch:
        sharded_propagate(
            'ignored_assoc_file',
            'ignored_embedding_file',
            'shard_filename_root',
            nshards=nshards,
        )

    # Run unsharded propagation for comparison.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # Check that two shard files were written, to the correct filenames.
    shard_arg = 0  # position of the shard dataframe among save_hdf's args
    fname_arg = 1  # position of the filename among save_hdf's args
    call_count = len(shard_collector.call_args_list)
    assert call_count == nshards, (
        'Incorrect number {} (should be {}) of shards written.'.format(
            call_count, nshards
        )
    )
    for j in range(nshards):
        # Filename argument of the j-th call to the save_hdf mock.
        fname = extract_positional_arg(shard_collector, j, fname_arg)
        assert fname == 'shard_filename_root.shard{}'.format(j), (
            'Shard {} written to incorrect file name {}.'.format(j, fname)
        )

    # The shards should agree with the appropriate pieces of the unsharded
    # output.
    dims_per_shard = EMBEDDING_DIM // nshards
    for j in range(nshards):
        # Shard dataframe argument of the j-th call to the save_hdf mock.
        shard = extract_positional_arg(shard_collector, j, shard_arg)
        first_dim = j * dims_per_shard
        last_dim = first_dim + dims_per_shard
        assert len(shard.index) == len(propagated.index), (
            'Shard {} has incorrect length {} (should be {}).'.format(
                j, len(shard.index), len(propagated.index)
            )
        )
        for shard_term, ref_term in zip(shard.index, propagated.index):
            assert shard_term == ref_term, (
                'Shard {} has term {} where reference has {}.'.format(
                    j, shard_term, ref_term
                )
            )
        assert_allclose(
            shard.values,
            propagated.values[:, first_dim:last_dim],
            err_msg='Shard {} has incorrect propagated vectors.'.format(j),
        )