def plot4():
    """Benchmark pybatch vs. npbatch as the number of test points M grows.

    For each M, builds a toy dataset with N=100 training points and d=100
    dimensions, averages three timed runs of each batch implementation,
    and plots both runtime curves on log-log axes.
    """
    Mlist = [1, 2, 5, 10, 20, 50, 100, 200, 500]

    def _avg_runtime(batch_fn, M):
        # Average of three single-execution timings for one dataset size.
        U, X, Y = data.toy(M, 100, 100)
        return mean(timeit.repeat(lambda: batch_fn(U, X, Y),
                                  number=1, repeat=3))

    pydistance_list = [_avg_runtime(pybatch, M) for M in Mlist]
    npdistance_list = [_avg_runtime(npbatch, M) for M in Mlist]

    plt.figure(figsize=(5, 3))
    plt.plot(Mlist, pydistance_list, '-o', color='red', label='pybatch')
    plt.plot(Mlist, npdistance_list, '-*', color='green', label='npbatch')
    plt.xscale('log')
    plt.yscale('log')
    # BUG FIX: the swept parameter is M (number of test points), not d.
    plt.xlabel('M')
    plt.ylabel('time')
    # BUG FIX: curve labels were set but never rendered.
    plt.legend()
    plt.grid(True)
    plt.show()
def plot3():
    """Compare pynearest vs. npnearest (both with npdistance) as N grows.

    For each number of training points N, builds a toy dataset with M=100
    test points and d=100 dimensions, averages three timed runs of pybatch
    with each nearest-neighbor implementation, and plots both curves on
    log-log axes.
    """
    Nlist = [1, 2, 5, 10, 20, 50, 100, 200, 500]

    def _avg_runtime(N, **kwargs):
        # Average of three single-execution timings of pybatch for one N.
        U, X, Y = data.toy(100, N, 100)
        return mean(timeit.repeat(lambda: pybatch(U, X, Y, **kwargs),
                                  number=1, repeat=3))

    # Default nearest=pynearest, but with the numpy distance function.
    pynearest_times = [_avg_runtime(N, distance=npdistance) for N in Nlist]
    # Fully numpy-backed nearest-neighbor search.
    npnearest_times = [_avg_runtime(N, nearest=npnearest, distance=npdistance)
                       for N in Nlist]

    plt.figure(figsize=(5, 3))
    plt.plot(Nlist, pynearest_times, '-o', color='red',
             label='pynearest with npdistance')
    plt.plot(Nlist, npnearest_times, '-*', color='green',
             label='npnearest with npdistance')
    plt.xscale('log')
    plt.yscale('log')
    # BUG FIX: the swept parameter is N (number of training points), not d.
    plt.xlabel('N')
    plt.ylabel('time')
    # BUG FIX: curve labels were set but never rendered.
    plt.legend()
    plt.grid(True)
    plt.show()
def plot():
    """Plot the runtime of pybatch as a function of the dimensionality d.

    Uses a toy dataset with 100 test and 100 training points; each
    dimensionality is timed three times and the average is plotted on
    log-log axes.
    """
    # Dimensionalities to benchmark.
    dlist = [1, 2, 5, 10, 20, 50, 100, 200, 500]

    tlist = []
    for dim in dlist:
        U, X, Y = data.toy(100, 100, dim)
        # Three single-execution timings; keep the mean.
        runs = timeit.repeat(lambda: pybatch(U, X, Y), number=1, repeat=3)
        tlist.append(mean(runs))

    # Render the timing curve.
    fig = plt.figure(figsize=(5, 3))
    plt.plot(dlist, tlist, '-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('d')
    plt.ylabel('time')
    plt.grid(True)
    plt.show()
if __name__ == "__main__":
    # BUG FIX: numpy.testing is used below but numpy was never imported.
    import numpy
    import data
    from access import access
    from access_jit import access as access_jit
    from access_tab import access as access_tab
    from datetime import datetime

    print("-" * 30 + "confirm agreement" + "-" * 30)

    # Toy dataset, flows NOT filtered: all three implementations must agree
    # (the tabular one is used as the reference).
    filter_flows = False
    a1 = access(data.toy(), filter_flow=filter_flows)
    j1 = access_jit(data.toy(), filter_flow=filter_flows)
    t1 = access_tab(data.toy(), filter_flow=filter_flows)
    numpy.testing.assert_array_equal(t1.accessibility, a1.accessibility)
    numpy.testing.assert_array_equal(t1.accessibility, j1.accessibility)
    print("passed toy tabular unfiltered")

    # Toy dataset, flows filtered.
    filter_flows = True
    a1f = access(data.toy(), filter_flow=filter_flows)
    j1f = access_jit(data.toy(), filter_flow=filter_flows)
    t1f = access_tab(data.toy(), filter_flow=filter_flows)
    numpy.testing.assert_array_equal(t1f.accessibility, a1f.accessibility)
    numpy.testing.assert_array_equal(t1f.accessibility, j1f.accessibility)
    # BUG FIX: message previously duplicated the "unfiltered" one above.
    print("passed toy tabular filtered")

    # Larger synthetic flow dataset (20 hubs), unfiltered.
    filter_flows = False
    a2 = access(data.flows(n_hubs=20), filter_flow=filter_flows)
    j2 = access_jit(data.flows(n_hubs=20), filter_flow=filter_flows)
    t2 = access_tab(data.flows(n_hubs=20), filter_flow=filter_flows)
#vectorized_get_listOfNearestLabels_py = np.vectorize( get_listOfNearestLabels_py ) #listOfLabels = vectorized_get_listOfNearestLabels_py( listOfUnlabeledAttrs , listOfAttrs , listOfLabels , get_labelNearest=get_labelNearest_np , get_distance=get_distance_np ) return listOfLabels import data # unlabeledAttrs formerly called U # labeledAttrs formerly called X # labels formerly called Y numberOfAttrsPerEntity = 50 numberOfLabeledEntities = 100 numberOfUnlabeledEntities = 20 unlabeledAttrs , listOfLabeledAttrs , listOfLabels = data.toy( numberOfUnlabeledEntities , numberOfLabeledEntities , numberOfAttrsPerEntity ) #print( get_listOfNearestLabels_py( unlabeledAttrs , labeledAttrs , labels ) ) # # #labels_by_pydistance = get_listOfNearestLabels_py( unlabeledAttrs , labeledAttrs , labels , get_labelNearest=get_labelNearest_py , get_distance=get_distance_py ) #labels_by_npdistance = get_listOfNearestLabels_py( unlabeledAttrs , labeledAttrs , labels , get_labelNearest=get_labelNearest_py , get_distance=get_distance_np ) #print 'py' + str( labels_by_pydistance ) #print 'np' + str( labels_by_npdistance ) #if labels_by_pydistance == labels_by_npdistance: # print 'py distance and np distance give same result.' #labels_by_pynearest = get_listOfNearestLabels_py( unlabeledAttrs , listOfLabeledAttrs , listOfLabels , get_labelNearest=get_labelNearest_py , get_distance=get_distance_np ) #labels_by_npnearest = get_listOfNearestLabels_py( unlabeledAttrs , listOfLabeledAttrs , listOfLabels , get_labelNearest=get_labelNearest_np , get_distance=get_distance_np )
            # NOTE(review): indentation reconstructed from a collapsed source
            # line — confirm loop nesting against the original file.
            if step:
                # Interactive trace: show each competitor's contribution to A_{o,d}.
                print(
                    f"\t\t\t A_{oid},{did} gets {this_alternative_from_destination.mass_destination.item()}*{this_alternative_from_destination.distance.item()} from {alternative.destination}"
                )
            # `ak` is this alternative's contribution, computed above this chunk.
            accessibility += ak
        # One accumulated accessibility value per (origin, destination) flow.
        accessibilities.append(accessibility)
    # Attach the results as a new column of the flows frame.
    return flows.assign(accessibility=accessibilities)


# --- interactive self-check script (module level) --------------------------
wants_step = input("Do you want to step through the iterations? [Y/n]")
# NOTE(review): the prompt suggests Y is the default, yet an empty answer
# disables stepping — confirm intended behavior.
if wants_step.lower().startswith("y"):
    step = True
else:
    step = False

a1 = access_slow(data.toy(), step=step)
print(data.toy())
# Expected accessibilities for the three-node toy problem (nodes a, b, c).
numpy.testing.assert_array_equal(
    a1.accessibility,
    [
        0,  # A_aa is always zero
        30 * 20,  # A_ab is mass of c times distance from b to c
        25 * 20,  # A_ac is mass of b times distance from c to b
        30 * 10,  # A_ba is mass of c times distance from a to c
        0,  # A_bb is always zero
        60 * 10,  # A_bc is mass of a times the distance from c to a
        25 * 2,  # A_ca is mass of b times distance from a to b
        60 * 2,  # A_cb is mass of a times distance from b to a
        0,  # A_cc is always zero
    ],
# there is no flow from the origin to the competitor # the origin is its own competitor # the origin is its own destination .eval(wdist_specification) # now, grouping by flow o -> d lets us compute the sum of wdist, # which has already zeroed out competitors with no flow from origin o .groupby(["origin", "destination"]) .wdist.sum() # cleaning this up and merging it back into the data frame: .reset_index() .rename(columns=dict(wdist="accessibility")) ) if __name__ == "__main__": import data a1 = access(data.toy()) numpy.testing.assert_array_equal( a1.accessibility, [ 0, # A_aa is always zero 30 * 20, # A_ab is mass of c times distance from b to c 25 * 20, # A_ac is mass of b times distance from c to b 30 * 10, # A_ba is mass of c times distance from a to c 0, # A_bb is always zero 60 * 10, # A_bc is mass of a times the distance from c to a 25 * 2, # A_ca is mass of b times distance from a to b 60 * 2, # A_cb is mass of a times distance from b to a 0, # A_cc is always zero ], )
def pybatch(U, X, Y, nearest=pynearest, distance=pydistance):
    """Label every unlabeled point in U by nearest-neighbor search over (X, Y).

    Applies `nearest` (parameterized by `distance`) to each row of U and
    returns the predicted labels as a list, one per test point.
    """
    predicted = []
    for u in U:
        predicted.append(nearest(u, X, Y, distance=distance))
    return predicted


# Again, such function uses by default the Python nearest neighbor search
# (with a specified distance function). However, we can also specify a more
# optimized nearest neighbor function, for example, based on `numpy`.
# Finally, one could consider an alternative function to `pybatch` that would
# use `numpy` from the beginning to the end. The implementation of such more
# optimized functions, and the testing of their correct behavior and higher
# performance will be the object of this exercise sheet.

# ## Testing and correctness
#
# As a starting point, the code below tests the output of the nearest
# neighbor algorithm for some toy dataset with fixed parameters. In
# particular, the function `data.toy(M,N,d)` generates a problem with `M`
# unlabeled test points stored in a matrix `U` of size `(M x d)`, then `N`
# labeled training points stored in a matrix `X` of size `(N x d)` and the
# output label is stored in a vector `Y` of size `N` composed of zeros and
# ones encoding the two possible classes. The variable `d` denotes the number
# of dimensions of each point. The toy dataset is pseudo-random, that is, for
# fixed parameters, it produces a random-looking dataset, but every time the
# method is called with the same parameters, the dataset is the same. The
# pseudo-randomness property will be useful to verify that each nearest
# neighbor implementation performs the same overall computation. Please check
# the `data.py` file within the exercise folder for the implementation details.

# In[5]:

import data

U, X, Y = data.toy(20, 100, 50)
print(pybatch(U, X, Y))


# In particular, the output of this function will help us to verify that the
# more optimized `numpy`-based versions of nearest neighbor are still valid.

# ## Plotting and performance
#
# We now describe how to build a plot that relates a certain parameter of the dataset (e.g.
# the number of input dimensions `d`) to the time required for the
# computation. We first initialize the basic plotting environment.

# In[6]:

import matplotlib
from matplotlib import pyplot as plt

# BUG FIX: get_ipython() only exists inside an IPython/Jupyter session; the
# exported script crashed with NameError under plain `python`. Guard the
# notebook magic so the module is importable either way.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # running as a plain script; the inline backend is not needed