def main():
    print("Gill Bates vs Beff Jezos")
    print("Who's going to win?")

    # subtitle_file = input("Subtitles SRT: ")
    # transcript_file = input("Google Protobuf transcript: ")
    # amazon_file = input("Amazon JSON transcript: ")
    subtitle_file = "500_days.srt"
    transcript_file = "500_days_transcribed_audio.pb"
    amazon_file = "500_days_amazon.json"

    # load the data
    transcript_collection = load_proto_file(transcript_file)
    subs = parse_subs(subtitle_file)
    amazon_text, amazon_timings = amazon_transcript(amazon_file)
    google_timings = get_time_transcript(transcript_collection.audiobits)

    # get text data ready for processing
    subs_align = get_text_ready(subs, False)
    tran_google = get_text_ready(transcript_collection.audiobits, False)
    tran_amazon = get_text_ready(amazon_text, True)

    print("Starting the alignment...")
    dist, cost, acc, path = fastdtw(np.array(subs_align), np.array(tran_google), edit_distance)
    print("Finished warping ONE")
    dist2, cost2, acc2, path2 = fastdtw(np.array(subs_align), np.array(tran_amazon), edit_distance)
    print("Finished warping TWO. Starting aligning...")

    # find the path from 0,0 to the end to get the timings
    google_path_pairs = find_timings_for_words(path)
    amazon_path_pairs = find_timings_for_words(path2)
    output_timings("google", google_path_pairs, subs_align, tran_google, google_timings)
    output_timings("amazon", amazon_path_pairs, subs_align, tran_amazon, amazon_timings)
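# main() above relies on project helpers (load_proto_file, parse_subs,
# get_text_ready, edit_distance, ...). As an illustration only, the
# edit_distance ground metric passed to fastdtw could be a standard
# word-level Levenshtein distance; this sketch is an assumption, not the
# project's actual helper.
def edit_distance(w1, w2):
    # classic dynamic-programming Levenshtein distance between two tokens
    m, n = len(w1), len(w2)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if w1[i - 1] == w2[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[m][n]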
def cluster(self, res, n_cluster, by_feature, method='kmeans', normalized=False, **kwargs):
    stations = list(set(res.index.get_level_values(0)))
    series, names = [], []

    def transform(x):
        return x / np.max(np.abs(x)) if normalized else x

    def feature(data):
        if by_feature == "Combined":
            return data[["Checkout", "Return"]].values.flatten()
        else:
            return data[by_feature]

    # assemble one feature series per usable station
    for st in stations:
        data = res.loc[st]
        if len(data) < 24:
            data = self.clean_record(data)
        if res.loc[st].Factor.iloc[0] < 30 or np.isnan(feature(data)).any() \
                or (feature(data) == 0).all():
            continue
        series.append(transform(feature(data)))
        names.append(st)

    if method == 'kmeans':
        km = KMeans(init='k-means++', n_clusters=n_cluster)
        labels = km.fit_predict(np.array(series))
    elif method == 'agglomerative':
        # pairwise DTW distances feed the agglomerative clustering
        n = len(series)
        simmat = np.zeros((n, n))
        for i in range(n):
            simmat[i, i] = 0
            for j in range(i + 1, n):
                simmat[i, j], _, _, _ = fastdtw(
                    series[i], series[j], dist=lambda x, y: np.abs(x - y))
                simmat[j, i] = simmat[i, j]
        labels = agg_cluster(simmat, n_cluster, linkage=kwargs.get('linkage', 'average'))

    path_name = '{}_{}_{}'.format(method, self.bikesystem.city, n_cluster)
    path_name = path_name + "_normalized" if normalized else path_name
    base_dir = os.path.join('clusters', path_name)
    if os.path.exists(base_dir):
        rmtree(base_dir)
    from_dir = os.path.join('temporal_deltas', '{}_Hourly'.format(self.bikesystem.city))
    for i in range(n_cluster):
        path_name = os.path.join(base_dir, str(i))
        os.makedirs(path_name)
    # sort each station's hourly-delta plot into its cluster's folder
    for name, label in zip(names, labels):
        file_name = fname(name) + ".pdf"
        copyfile(os.path.join(from_dir, file_name),
                 os.path.join(base_dir, str(label), file_name))
    return names, labels
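# The agglomerative branch above calls a project-specific agg_cluster helper.
# A minimal stand-in (an assumption, not the original implementation) could
# run scikit-learn's AgglomerativeClustering on the precomputed DTW matrix:
from sklearn.cluster import AgglomerativeClustering

def agg_cluster(simmat, n_cluster, linkage='average'):
    # metric='precomputed' (named `affinity` in scikit-learn < 1.2) lets the
    # model consume the pairwise DTW distances directly
    model = AgglomerativeClustering(n_clusters=n_cluster,
                                    metric='precomputed',
                                    linkage=linkage)
    return model.fit_predict(simmat)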
def predict(self, x_test):
    pred = []
    err = []
    for id_test in range(len(x_test)):
        result = np.zeros(len(self.y))
        for id_train in range(len(self.X)):
            try:
                # fastdtw returns a tuple; keep only the distance (first element)
                min_dist = fastdtw(self.X[id_train], x_test[id_test], self.dist)[0]
                result[id_train] = min_dist
            except Exception as e:
                print(self.y[id_train])
                print(self.X[id_train])
                print(x_test[id_test])
                print(e)
        if self.normalize:
            result = self.norm(result)
        res_indx = result.argsort()[:self.neighbours]
        pred.append((self.y[res_indx], result[res_indx]))
    return pred
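# Hedged usage sketch for predict() above. The surrounding class is not shown,
# so this scaffolding is hypothetical: predict() only needs X, y, dist,
# neighbours, normalize and norm on `self`, and assumes a fastdtw whose first
# return value is the distance, as in the snippets throughout this file.
import numpy as np
from types import SimpleNamespace
from dtw import fastdtw

clf = SimpleNamespace(
    X=[np.arange(10.0), np.arange(10.0)[::-1]],  # two training series
    y=np.array([0, 1]),                          # their labels
    dist=lambda a, b: abs(a - b),                # ground metric for fastdtw
    neighbours=1,                                # keep only the nearest match
    normalize=False,
    norm=None,                                   # unused when normalize=False
)
print(predict(clf, [np.arange(10.0) + 0.5]))     # nearest label and its distance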
            np.maximum(x, softphoc_query)) / len(qword)
        for x in resized_preds
    ])
elif similarity_type == 4:  # dtw
    resized_preds = [
        cv2.resize(x, (target_size[0], target_size[1]))
        for x in softphoc_proposal_preds
    ]  ## for axes oriented boxes
    # resized_preds = [warped_logits]  ## for warped non axes oriented boxes
    # chars = [char2int[x] for x in qword]
    # similarities = -np.mean([fastdtw(softphoc_query[:, :, x].transpose(), resized_preds[0][:, :, x].transpose(), 'euclidean')[0] for x in chars])
    similarities = -np.mean([
        fastdtw(softphoc_query[:, :, x].transpose(),
                resized_preds[0][:, :, x].transpose(),
                'euclidean')[0]
        for x in range(38)
    ])
else:
    raise Exception('Unknown similarity type')

sorted_idx = np.argsort(similarities)
if take_always_argmax:
    idx = sorted_idx[-1]
else:
    # idx = sorted_idx[-query_dict[qword]]
    idx = sorted_idx[np.maximum(-query_dict[qword], -len(sorted_idx))]
            np.maximum(x, softphoc_query)) / len(qword)
        for x in resized_preds
    ])
elif similarity_type == 4:  # dtw
    similarities = []
    im = np.copy(img)
    for line in houghTransform_proposals:
        cv2.line(im, (line[0], line[1]), (line[2], line[3]), (0, 255, 0), 2)
        pred_points_array = get_points_lines(line, probabilities_np)
        # resized_preds = [(cv2.resize(x, (target_size[0]))) for x in pred_points_array]
        # similarities = -np.mean([fastdtw(softphoc_query[:, :, x].transpose(), resized_preds[0][:, :, x].transpose(), 'euclidean')[0] for x in range(38)])
        dist, cost, acc, path = fastdtw(
            np.squeeze(softphoc_query), pred_points_array, 'euclidean')
        similarities.append(-dist)
    plt.imshow(im)
    plt.title(qword)
    plt.show()
else:
    raise Exception('Unknown similarity type')

sorted_idx = np.argsort(similarities)
if take_always_argmax:
    idx = sorted_idx[-1]
else:
    # idx = sorted_idx[-query_dict[qword]]
def dtw_metric(x, y):
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    dist, cost, acc, path = fastdtw(x, y, dist=lambda x, y: norm(x - y, ord=1))
    return dist
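# Minimal usage sketch for dtw_metric above, assuming `fastdtw` is the
# 4-tuple-returning variant from the `dtw` module used in these snippets
# and `norm` is numpy.linalg.norm:
import numpy as np
from numpy.linalg import norm
from dtw import fastdtw

a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([1.0, 2.0, 2.5, 3.5, 4.0])
print(dtw_metric(a, b))  # scalar DTW distance under the L1 ground metric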
priceLow = sortedSmoothedSample[0][1]
for j in range(arcLength):
    normalizedSmoothedSample[j][1] = (
        smoothedSample[j][1] - priceLow) / priceRange * circleRadius
midpoint = int(arcLength / 2)
diff = abs(normalizedSmoothedSample[midpoint][1] - template[midpoint][1])
for j in range(arcLength):
    normalizedSmoothedSample[j][1] = normalizedSmoothedSample[j][1] - diff
    normalizedSmoothedSample[j][0] = j
# this fastdtw variant returns the accumulated cost matrix; its bottom-right
# entry is the total alignment cost between sample and template
acc = fastdtw(normalizedSmoothedSample, template)
similarity_distance = acc[arcLength - 1][arcLength - 1]
sim_list.append(similarity_distance)
if similarity_distance < similarity_threshold * arcLength:
    left = smoothedSample[0][1]
    right = smoothedSample[-1][1]
    slope = abs(left - right) / min(left, right)
    if slope <= horizon_threshold:
        dateIndex.append(i)
        # record the pattern found under current time span
        startday = i
        endday = i + arcLength - 1
def dtw_worker(i, t_row, df, df_test):
    # print 'Worker:', i
    test_list = []
    neighbours = []
    tlat = []
    tlong = []
    t_trajectories = ast.literal_eval(t_row[0])
    # Collect in list all the trajectories for this trip
    for j in range(0, len(t_trajectories)):
        test_list.append([float(t_trajectories[j][1]), float(t_trajectories[j][2])])
        tlong.append(float(t_trajectories[j][1]))
        tlat.append(float(t_trajectories[j][2]))
    # Iterate over all trips in tripsClean
    start = time.time()
    for k, row in df.iterrows():
        clean_list = []
        trajectories = ast.literal_eval(row[2])
        for l in range(0, len(trajectories)):
            clean_list.append([float(trajectories[l][1]), float(trajectories[l][2])])
        # Compute DTW for these trips using Haversine as distance metric
        dist, cost, acc, path = dtw.fastdtw(
            test_list, clean_list,
            dist=lambda c1, c2: prep.haversine_dist(c1[0], c1[1], c2[0], c2[1]))
        # dist, path = fdtw.fastdtw(test_list, clean_list, dist=lambda c1, c2: prep.haversine_dist(c1[0], c1[1], c2[0], c2[1]))
        neighbours.append([int(row[0]), acc[-1][-1]])
    end = time.time()
    neighbours = np.asarray(neighbours)
    neighbours = neighbours[neighbours[:, 1].argsort()][:5]
    # print neighbours
    gmap = gmplot.GoogleMapPlotter(tlat[0], tlong[0], 10,
                                   'AIzaSyDf6Dk2_fg0p8XaEhQdFVCXg-AMlm54dAs')
    gmap.plot(tlat, tlong, 'green', edge_width=5)
    gmap.draw('Maps/dtwMaps/testTrip' + str(i + 1) + '/test-' + str(i + 1) + '.html')
    # print "Test Trip ", i, "\n"
    filename = 'Maps/dtwMaps/testTrip' + str(i + 1) + '/data' + str(i + 1) + '.txt'
    open(filename, 'w').close()
    f = open(filename, "a+")
    for n in range(0, 5):
        for g, grow in df[df['tripId'] == neighbours[n][0]].iterrows():
            gtrajectory = ast.literal_eval(grow[2])
            longlist = []
            latlist = []
            for j in range(0, len(gtrajectory)):
                longlist.append(float(gtrajectory[j][1]))
                latlist.append(float(gtrajectory[j][2]))
            gmap = gmplot.GoogleMapPlotter(latlist[0], longlist[0], 10,
                                           'AIzaSyDf6Dk2_fg0p8XaEhQdFVCXg-AMlm54dAs')
            gmap.plot(latlist, longlist, 'green', edge_width=5)
            gmap.draw('Maps/dtwMaps/testTrip' + str(i + 1) + '/neighbour' +
                      str(n + 1) + '-' + str(grow[1]) + '.html')
            f.write("Neighbor %d \nJP_ID: %s \nDTW: %8.5f\n" %
                    (n, grow[1], float(neighbours[n][1])))
    f.write("dt: %8.5f\n\n" % float(end - start))
    f.close()
import dtw
import numpy as np
import matplotlib.pyplot as plt
# from numpy.linalg import norm

x = np.array(list(range(15, 20))).reshape(-1, 1)
y = np.array(list(range(5, 10))).reshape(-1, 1)

dist, cost, acc, path = dtw.fastdtw(x, y, dist=lambda x, y: abs(x - y))
print(dist)

plt.imshow(acc.T, origin='lower', cmap=plt.cm.gray, interpolation='nearest')
plt.plot(path[0], path[1], 'w')
plt.xlim((-0.5, acc.shape[0] - 0.5))
plt.ylim((-0.5, acc.shape[1] - 0.5))
plt.plot(list(range(len(x))), x)
plt.plot(y, list(range(len(y))))
plt.show()

# This DTW distance seems to change with the length of the time series! What can be done about that?
# After experimenting, though, it turns out that while it does change, it still converges.
def dtw_dist(self, other):
    x = np.array(self.coords)
    y = np.array(other.coords)
    # distance, path = fastdtw(x, y, radius=20, dist=lambda x, y: abs(x - y))
    distance, C, D1, path = fastdtw(x, y, dist=lambda x, y: abs(x - y))
    return distance
    key=lambda sortedPeriod: sortedPeriod[1])
periodRange = sortedPeriod[lenPeriod - 1][1] - sortedPeriod[0][1]
for i in range(lenPeriod):
    curPeriod[i][1] = (curPeriod[i][1] - sortedPeriod[0][1]) \
        / periodRange * (lenPeriod / 2.0)
# print(periodRange)
if startDate == 10 and endDate == 41:
    print(lenPeriod / 2.0)
midpoint = int(lenPeriod / 2)
diff = abs(curPeriod[midpoint][1] - curTemplate[midpoint][1])
for i in range(lenPeriod):
    curPeriod[i][1] = curPeriod[i][1] - diff
    curPeriod[i][0] = i
# fastdtw here returns the accumulated cost matrix; the corner entry is the
# total alignment cost
acc = fastdtw(curPeriod, curTemplate)
curSimilarityDist = acc[lenPeriod - 1][lenPeriod - 1]
if curSimilarityDist <= SIM_THRESHOLD * lenPeriod:
    left = smoothedData[startDate][1]
    right = smoothedData[endDate][1]
    slope = abs(left - right) / min(left, right)
    # if match:
    #     leftBound = endDate + 1
    #     dateInc = TIMESPAN_MIN
    # save the arc
    if slope <= CIRCLE_HORIZONTAL_THRESHOLD:
        distList.append(curSimilarityDist / lenPeriod)
        dateInc = TIMESPAN_MIN
        leftBound = endDate + 1
        pricePatternList.append([startDate, endDate])
        break
import math

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from dtw import fastdtw

from arc import arc_circle_gen
from template import template_gen

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

tem1 = arc_circle_gen(32, 16)
tem2 = template_gen('circle', 32, 16, 1.0)
print(tem1)
print(tem2)

tem1rev = []
for i in range(30):
    tem1rev.append([tem1[i][0], 15 - tem1[i][1]])

print(fastdtw(tem1, tem2)[29][29])
print(fastdtw(tem1rev, tem2)[29][29])