def exec_ppjoin():
    def parse_query(query):
        # Expected query format: "-sim <ratio> -dist <km> <free text>".
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)
        if m:
            sim = float(m.group(1))
            dist = int(m.group(2))
            text = m.group(3)
            # Return (similarity, distance, text) to match the
            # (theta, epsilon, text) unpacking below.
            return sim, dist, text
        else:
            return None, None, query

    query = request.args.get('q')  # query from search string comes here
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    if query:
        theta, epsilon, text = parse_query(query)
        if not theta:
            # No -sim/-dist options in the query: fall back to defaults.
            theta = 0.1
            epsilon = 100
        if text:
            res = stTextSearch(df, text, theta)
        else:
            res = ppj_c(df, theta, epsilon)
    else:
        # No query at all: run the distance-constrained join with defaults.
        theta = 0.5
        epsilon = 100
        res = ppj_c(df, theta, epsilon)
    return res
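# Illustrative only (not part of the original module): a standalone check of the
# "-sim <ratio> -dist <km> <text>" query format accepted by parse_query above,
# using nothing beyond the same regex and the standard library.
#
#   >>> import re
#   >>> m = re.match(r'-sim (0\.\d+) -dist (\d+)(.*)', '-sim 0.8 -dist 5 coffee shop')
#   >>> float(m.group(1)), int(m.group(2)), m.group(3).strip()
#   (0.8, 5, 'coffee shop')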
def ppj_c_grouping(df, theta, epsilon):
    pairs = {}
    print "Grid construction"
    grid_dict, grid_shape = construct_grid(df, epsilon)
    grid_cols = grid_shape[1]
    print "Grid: done"
    for cell in grid_dict:
        neighbor_cells = find_neighbors(cell, grid_cols)
        for neighbor_cell in neighbor_cells:
            # Join the objects of this cell with those of each neighboring cell.
            df_cells = df.loc[grid_dict[neighbor_cell] + grid_dict[cell]]
            inverted_file_cells = get_inverted_file(df_cells)
            term_index = {t: [] for t in inverted_file_cells.keys()}
            group_dict = group_objects(df_cells, theta)
            for ppref in group_dict:
                group = group_dict[ppref]
                id_x = group[0]
                overlap_x = Counter()
                text_x = df.loc[id_x].text
                lat_x = df.loc[id_x].lat
                lng_x = df.loc[id_x].lng
                if len(text_x) == 0:
                    continue
                # PPJoin indexing prefix: |x| - ceil(2 * theta * |x| / (1 + theta)) + 1
                index_pref_len = len(text_x) - int(ceil(2 * theta * len(text_x) / (theta + 1))) + 1
                for pos_x in range(len(ppref)):
                    t = text_x[pos_x]
                    for (id_y, pos_y) in term_index[t]:
                        lat_y = df.loc[id_y].lat
                        lng_y = df.loc[id_y].lng
                        text_y = df.loc[id_y].text
                        # Length filter and spatial (epsilon) distance filter.
                        if (len(text_y) < theta * len(text_x)) or \
                                (vincenty((lat_x, lng_x), (lat_y, lng_y)).km > epsilon):
                            continue
                        elif pos_filter(df, id_x, id_y, pos_x, pos_y, theta) \
                                and suf_filter(df, id_x, id_y, pos_x, pos_y, theta):
                            overlap_x[id_y] += 1
                        else:
                            # Mark the pair as pruned so it can never pass verification.
                            overlap_x[id_y] = -10000
                    if pos_x <= index_pref_len:
                        term_index[t].append((id_x, pos_x))
                pairs = verify(df, pairs, id_x, overlap_x, theta)
    return resultJSON(df, pairs)
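# Sketch (illustrative, not part of the original module): the two PPJoin prefix lengths
# for a Jaccard threshold theta. The helper names are hypothetical; index_prefix_len
# reproduces the index_pref_len expression used in ppj_c_grouping above, and both reuse
# the ceil already imported by this module.
def probe_prefix_len(size, theta):
    # Probing prefix: |x| - ceil(theta * |x|) + 1 tokens, the number that must be
    # scanned so that any pair reaching the threshold shares a prefix token.
    return size - int(ceil(theta * size)) + 1


def index_prefix_len(size, theta):
    # Indexing prefix: |x| - ceil(2 * theta * |x| / (theta + 1)) + 1 tokens, the same
    # expression assigned to index_pref_len in ppj_c_grouping above.
    return size - int(ceil(2 * theta * size / (theta + 1))) + 1

# e.g. probe_prefix_len(10, 0.8) == 3 and index_prefix_len(10, 0.8) == 2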
            obj = df.loc[id_]
            json_ = {
                "id": id_,
                "long": str(obj.lng),
                "lat": str(obj.lat),
                "text": obj.raw_text
            }
            cell.append(json_)
        result.append(cell)
    return json.dumps(result)


# -----------------------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    theta = 0.8

    # Plain PPJoin over the whole dataset.
    start_time = time.time()
    pairs = ppjoin(df, inverted_file, theta)
    print "Time elapsed:", time.time() - start_time
    print pairs[0]
    print 'Total: ', len(pairs)
    for pair in pairs:
        id1 = pair[0]["id"]
        id2 = pair[1]["id"]
        print jaccard_similarity(df.loc[id1].text, df.loc[id2].text)

    # PPJoin with prefix grouping, on the same data and threshold.
    group_dict = group_objects(df, theta)
    start_time = time.time()
    pairs = ppjoin_group(df, inverted_file, theta, group_dict)
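    # The grouped run reuses the same theta, so the two elapsed times compare the plain
    # and grouped variants on the same dataset; group_objects presumably buckets records
    # by probing prefix, as the ppref keys iterated in ppj_c_grouping suggest.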