def calculate_distances(self):
    """Annotate each trial site/location with its distance from the patient.

    The patient zipcode is taken from the 'va' entry of ``self.from_source``
    when present, else the 'cms' entry; when neither exists the method
    returns without doing anything.  For every trial in ``self.trials`` a
    "distance" key is added to each site dict (computed by ``distance()``)
    whenever both the patient and the site can be geocoded; sites that
    cannot be resolved are skipped with a debug log entry.
    """
    db = Zipcode()
    # Prefer the VA record's zipcode, fall back to CMS; 5-digit prefix only.
    if self.from_source.get('va'):
        patzip = self.from_source['va'].zipcode[:5]
    elif self.from_source.get('cms'):
        patzip = self.from_source['cms'].zipcode[:5]
    else:
        return  # no patient record available -- nothing to compute
    pat_latlong = db.zip2geo(patzip)  # handled below if lookup yields None
    logging.debug(f"Zipcode {patzip}, pat_latlong: {pat_latlong}")
    logging.debug(f"Checking distances for {len(self.trials)} trials")
    for trial in self.trials:
        # --- sites with org_* keys ---
        if trial.sites is None:
            logging.debug(f"Site list empty for trial {trial.id}")
        else:
            logging.debug(f"Trial {trial.id} has {len(trial.sites)} sites")
            for site in trial.sites:
                # 0 is the sentinel for "no coordinates on record".
                coordinates = site.get("org_coordinates", 0)
                logging.debug(f"Coordinates: {coordinates}")
                if coordinates == 0:
                    # No coordinates: geocode the site's postal code instead.
                    zipcode = site.get('org_postal_code', '')
                    zipcode = '' if zipcode is None else zipcode
                    site_latlong = db.zip2geo(zipcode[:5])
                    logging.debug(
                        f"site lat-long (from zip): {site_latlong}")
                else:
                    site_latlong = (coordinates["lat"], coordinates["lon"])
                    logging.debug(
                        f"site lat-long (from coords): {site_latlong}")
                if (site_latlong is None) or (pat_latlong is None):
                    logging.debug(
                        f"no distance for site {site['org_name']} at trial={trial.id}"
                    )
                else:
                    site["distance"] = distance(pat_latlong, site_latlong)
                    logging.debug(
                        f"Distance={site['distance']} for Trial={trial.id}"
                    )
        # --- locations with Location* keys ---
        # NOTE(review): key names look like ClinicalTrials.gov fields -- confirm.
        if trial.locations is None:
            logging.debug(f"Location list empty for trial {trial.id}")
        else:
            logging.debug(
                f"Trial {trial.id} has {len(trial.locations)} locations")
            for site in trial.locations:
                site_latlong = db.zip2geo(
                    site.get("LocationZip", "00000")[:5])
                logging.debug(f"site lat-long (from zip): {site_latlong}")
                if (site_latlong is None) or (pat_latlong is None):
                    logging.debug(
                        f"no distance for site {site.get('LocationFacility', 'unknown')} at trial={trial.id}"
                    )
                else:
                    site["distance"] = distance(pat_latlong, site_latlong)
                    logging.debug(
                        f"Distance={site['distance']} for Trial={trial.id}"
                    )
def find_similarity_of_points_in_radius(closest_vantage_pt, ts1, radius):
    """
    Collect every timeseries stored within ``radius`` of the given vantage
    point (plus the vantage point itself) and score each one against the
    timeseries of interest.

    closest_vantage_pt: number of the vantage point being considered
    ts1: timeseries of interest
    radius: radius of circle to consider

    Returns: list of [distance, timeseries id] pairs
    """
    # Pull candidate ids out of this vantage point's database.
    db = BinarySearchDatabase.connect("VantagePointDatabases/" +
                                      str(closest_vantage_pt) + ".dbdb")
    candidates = db.get_nodes_less_than(radius)
    candidates.append(str(closest_vantage_pt))  # include the vantage pt itself
    db.close()

    # Score each candidate light curve against the given light curve.
    scored = []
    for cand in candidates:
        with open("GeneratedTimeseries/Timeseries" + str(cand), "rb") as f:
            other = pickle.load(f)
        score = distances.distance(distances.stand(ts1, ts1.mean(), ts1.std()),
                                   distances.stand(other, other.mean(),
                                                   other.std()),
                                   mult=1)
        scored.append([score, "Timeseries" + str(cand)])
    return scored
def sanity_check(filename, n):
    """
    Brute-force check of the vantage point method: score the given
    timeseries against all 1000 generated timeseries and return the n
    closest matches.

    Returns: list of n most similiar filenames
    """
    with open(filename, "rb") as f:
        target = pickle.load(f)

    scored = []
    for idx in range(1000):
        with open("GeneratedTimeseries/Timeseries" + str(idx), "rb") as f:
            candidate = pickle.load(f)
        score = distances.distance(
            distances.stand(target, target.mean(), target.std()),
            distances.stand(candidate, candidate.mean(), candidate.std()),
            mult=1)
        scored.append([score, "Timeseries" + str(idx)])

    scored.sort(key=lambda pair: pair[0])
    # Entry 0 is the query itself (distance 0), so return ranks 1..n.
    return [scored[rank][1] for rank in range(1, n + 1)]
def __init__(self, lambdaa=1, rail=False):
    """Build the VDC/dealer transport graph.

    lambdaa: exponent applied to VDC-to-VDC edge weights
    rail: flag stored on the instance (not used while building the graph)
    """
    self.rail = rail
    self.locdict = readData()

    # Partition locations into VDCs and dealers.
    self.vdcDict = {
        name: loc
        for name, loc in self.locdict.items() if loc.isVDC()
    }
    self.dealerDict = {
        name: loc
        for name, loc in self.locdict.items() if not loc.isVDC()
    }

    self.G = nx.Graph()

    # Fully connect the VDCs, weighting each edge by distance**lambdaa.
    self.G.add_nodes_from(self.vdcDict.keys())
    for name1, vdc1 in self.vdcDict.items():
        for name2, vdc2 in self.vdcDict.items():
            self.G.add_edge(name1, name2,
                            weight=distance(vdc1, vdc2)**lambdaa)

    # Precompute shortest paths and lengths between every pair of VDCs.
    self.vdcPaths = dict(nx.all_pairs_dijkstra_path(self.G))
    self.vdcPathLengths = dict(nx.all_pairs_dijkstra_path_length(self.G))

    # Add dealers and attach each one to its nearest VDC.
    self.G.add_nodes_from(self.dealerDict.keys())
    for dealer in self.dealerDict.values():
        nearestvdc = min(self.vdcDict.values(),
                         key=lambda v: distance(dealer, v))
        # Record the pairing on both objects, then link them in the graph.
        dealer.setVDC(nearestvdc)
        nearestvdc.addDealer(dealer)
        self.G.add_edge(dealer.getName(),
                        nearestvdc.getName(),
                        weight=distance(dealer, nearestvdc))
def _test(inf):
    # Smoke-test helper (Python 2 print-statement syntax): parse the given
    # DTF input, and for every Line segment print an annotation at its
    # midpoint giving heading and length in miles, then the total mileage.
    lines = dtf_parse(inf)
    t = 0  # running total of miles across all Line segments
    for l in lines:
        if isinstance(l, Line):
            m = distances.miles(distances.distance(l.start, l.end))
            t += m
            # Trailing comma suppresses the newline after each annotation.
            print dtf_text(distances.midpoint(l.start, l.end), "Head %d for %0.2f miles" % (distances.heading(l.start, l.end), m)),
    print "\nTotal of %f miles\n" % t
def find_most_similiar(filename, n, vantage_pts):
    """
    Find the n timeseries most similiar to the timeseries of interest
    (filename) using the supplied vantage points.

    filename: timeseries of interest
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers

    Returns: list of n most similiar filenames
    """
    # Load the timeseries of interest.
    with open(filename, "rb") as f:
        query = pickle.load(f)

    # Rank the vantage points by their distance to the query.
    ranked_vps = []
    for vp in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(vp), "rb") as f:
            vp_series = pickle.load(f)
        score = distances.distance(
            distances.stand(query, query.mean(), query.std()),
            distances.stand(vp_series, vp_series.mean(), vp_series.std()),
            mult=1)
        ranked_vps.append([score, vp])
    ranked_vps.sort(key=lambda pair: pair[0])

    # Search a circle of radius 2*d around each of the n closest vantage
    # points, deduplicating candidates as they arrive.
    candidates = []
    for rank in range(n):
        score, vp = ranked_vps[rank]
        for cand in find_similarity_of_points_in_radius(vp, query, 2 * score):
            if cand not in candidates:
                candidates.append(cand)
    candidates.sort(key=lambda pair: pair[0])

    # Entry 0 is the query itself, so return ranks 1..n.
    return [candidates[rank][1] for rank in range(1, n + 1)]
from csvreader import *
import networkx as nx
from distances import distance
import matplotlib.pyplot as plt

# Small hand-built demo dataset instead of loading everything:
#locdict = readData()
locdict = {}
locdict["Los Angeles"] = Location("Los Angeles", 34.05, -118.25)
locdict["New York"] = Location("New York", 40.7128, -74.0060)
locdict["London"] = Location("London", 51.5074, -0.1278)
locdict["Tokyo"] = Location("Tokyo", 35.6895, 139.6917)

# Build a complete graph over the cities, weighted by pairwise distance.
G = nx.Graph()
G.add_nodes_from(locdict.keys())
for loc1, place1 in locdict.items():
    for loc2, place2 in locdict.items():
        G.add_edge(loc1, loc2, weight=distance(place1, place2))

print(G.edges.data('weight'))
'''
plt.subplot(121)
nx.draw(G, with_labels=True, font_weight='bold')
plt.show()
'''
def pick_vantage_points(arg):
    """
    Pick vantage points and produce a database for each one.

    Each database stores (key, value) pairs where:
    key = distance from timeseries to vantage point (kernel coefficient)
    value = id of timeseries (0-999)

    arg: either an argv-style list (parsed for --n, default 20) or the
         number of vantage points itself

    returns: list of vantage points (integers from 0-999)
    """
    try:
        parser = argparse.ArgumentParser(description="vantage points")
        parser.add_argument('--n',
                            help='number of vantage points',
                            type=int,
                            default=20)
        args = parser.parse_args(arg)
        num = args.n
    except (TypeError, SystemExit):
        # arg was not an argv-style list (e.g. a bare int): use it as the
        # count directly.  argparse signals bad input via SystemExit, and
        # iterating a non-list raises TypeError -- catch only those rather
        # than a bare except that would also swallow KeyboardInterrupt.
        num = arg

    # Start from an empty database directory (ignore_errors covers the
    # "directory does not exist yet" case the old bare except handled).
    shutil.rmtree('VantagePointDatabases', ignore_errors=True)
    os.mkdir('VantagePointDatabases')

    vantage_pts = random.sample(range(0, 1000), num)
    for vantage_point in vantage_pts:
        dbpath = "VantagePointDatabases/" + str(vantage_point) + ".dbdb"
        # Remove any stale database file so connect() starts fresh.
        try:
            os.remove(dbpath)
        except FileNotFoundError:
            pass
        db1 = BinarySearchDatabase.connect(dbpath)

        with open("GeneratedTimeseries/Timeseries" + str(vantage_point),
                  "rb") as f:
            ts2 = pickle.load(f)

        # Score every other timeseries against this vantage point.
        for i in range(1000):
            if i != vantage_point:
                with open("GeneratedTimeseries/Timeseries" + str(i),
                          "rb") as f:
                    ts1 = pickle.load(f)
                dist = distances.distance(
                    distances.stand(ts1, ts1.mean(), ts1.std()),
                    distances.stand(ts2, ts2.mean(), ts2.std()),
                    mult=1)
                db1.set(dist, str(i))
        db1.commit()
        db1.close()

    # Record which vantage points were chosen (with closes the file even
    # if a write fails, unlike the previous open/close pair).
    with open('VantagePointDatabases/vp', 'w') as f:
        for i in vantage_pts:
            f.write(str(i) + "\n")

    return vantage_pts
def test_distance(self):
    """Two identical standardised series are at distance 0 from each other."""
    series_a = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    series_b = ts(times=[0, 1, 2, 4, 5, 6], values=[3, 4, 5, 6, 7, 8])
    std_a = distances.stand(series_a, series_a.mean(), series_a.std())
    std_b = distances.stand(series_b, series_b.mean(), series_b.std())
    assert distances.distance(std_a, std_b) == 0
def setVDC(self, vdc):
    """Assign this dealer's VDC and cache the distance to it."""
    self.vdcDist = distance(self, vdc)
    self.vdc = vdc
def __init__(self, name, lat, lon, vdc=None):
    """A Location optionally pre-assigned to a VDC.

    When vdc is given, the distance to it is cached on the instance;
    otherwise vdcDist stays unset until setVDC() is called.
    """
    Location.__init__(self, name, lat, lon)
    self.vdc = vdc
    if vdc is None:
        return
    self.vdcDist = distance(self, vdc)
def find_most_similiar(filename, n, vantage_pts, isfile=True, dbtype='bstree'):
    """
    Finds n most similiar time series to the time series of interest
    (filename) by using the supplied vantage points

    filename: timeseries of interest (a path when isfile=True, otherwise a
              timeseries object passed in directly)
    n: number of similiar timeseries to return (n must be between 1 and 20)
    vantage_pts: a list of the vantage point numbers
    isfile: when True, load filename via pickle; when False, treat it as an
            already-loaded timeseries and type-check it
    dbtype: database backend identifier forwarded to the radius search

    Returns: list of n most similiar [distance, id] entries, or one of the
             string error codes 'ERROR INDEX', 'ERROR TYPE',
             'ERROR NUMBER | <count>'
    """
    file_names = []

    #load the given file
    if isfile:
        try:
            with open(filename, "rb") as f:
                ts1 = pickle.load(f)
        # NOTE(review): bare except also swallows KeyboardInterrupt etc.;
        # consider narrowing to (OSError, pickle.UnpicklingError).
        except:
            print(
                'Requested %s cannot be found in database, returning ERROR INDEX'
                % filename)
            return 'ERROR INDEX'
    else:
        ts1 = filename

        ## check data type
        if not isinstance(ts1, ts):
            print(
                'Requested %s is not a TimeSeries instance, returning ERROR TYPE'
                % filename)
            return 'ERROR TYPE'

    #find the most similiar vantage point = d
    vantage_pts_dist = []
    for i in vantage_pts:
        with open("GeneratedTimeseries/Timeseries" + str(i), "rb") as f:
            ts2 = pickle.load(f)

        ## interpolate the timeseries in the database to have the same times
        ## as the client input timeseries
        ts2 = interpolate_to_match_input(ts2, ts1)
        dist = distances.distance(distances.stand(ts1, ts1.mean(), ts1.std()),
                                  distances.stand(ts2, ts2.mean(), ts2.std()),
                                  mult=1)
        vantage_pts_dist.append([dist, i])

    # Reject out-of-range n before indexing into the ranked list below.
    if n > len(vantage_pts_dist) or n < 1:
        print('More neighbours than vantage requested.')
        return 'ERROR NUMBER | {}'.format(len(vantage_pts_dist))

    vantage_pts_dist.sort(key=lambda x: x[0])

    # Search a circle of radius 2*d around each of the n closest vantage
    # points, deduplicating candidate [distance, id] pairs.
    all_pts_to_check = []
    for i in range(n):
        closest_vantage_pt = vantage_pts_dist[i][1]
        radius = 2 * vantage_pts_dist[i][0]
        pts_in_radius = find_similarity_of_points_in_radius(
            closest_vantage_pt, ts1, radius, dbtype)
        for j in pts_in_radius:
            if j not in all_pts_to_check:
                all_pts_to_check.append(j)

    all_pts_to_check.sort(key=lambda x: x[0])

    # NOTE(review): the comment below says the given timeseries is ignored,
    # but range(0, n) includes index 0, which is the closest match (usually
    # the query itself) -- confirm whether range(1, n + 1) was intended.
    for i in range(0, n):  #ignore given timeseries
        file_names.append(all_pts_to_check[i])

    return file_names
from location import *
from distances import distance
import pandas as pd
import numpy as np
from math import sin, cos, radians, sqrt, asin

# Two-city smoke test for distance().
locdict = {}
locdict["Los Angeles"] = Location("Los Angeles", 34.05, -118.25)
locdict["New York"] = Location("New York", 40.7128, -74.0060)

for city in ("Los Angeles", "New York"):
    print(locdict[city])

# Scale the LA - NY distance by 1/1.2, as in the original check.
print(distance(locdict["Los Angeles"], locdict["New York"]) / 1.2)