def exec_ppjoin():
    """Answer a search request read from the ``q`` request argument.

    A query of the form ``-sim 0.<digits> -dist <digits>[ text]`` supplies the
    similarity threshold (``theta``) and the distance bound (``epsilon``).
    When free text remains after the options, ``stTextSearch`` is run on it;
    otherwise ``ppj_c`` is run with the two thresholds.  A missing query
    runs ``ppj_c`` with ``theta = 0.5`` and ``epsilon = 100``.

    Returns:
        Whatever ``stTextSearch`` or ``ppj_c`` returns for the request.
    """
    def parse_query(query):
        """Split ``query`` into ``(sim, dist, text)``.

        Returns ``(None, None, query)`` when the query does not start with
        the ``-sim ... -dist ...`` options.
        """
        # Raw string so that ``\.`` and ``\d`` reach the regex engine verbatim.
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)

        if m is None:
            return None, None, query

        sim = float(m.group(1))
        dist = int(m.group(2))
        # Drop the separator whitespace so that a blank remainder is treated
        # as "no text" by the caller.
        text = m.group(3).strip()
        # Order matches the caller's (theta, epsilon, text) unpacking.
        return sim, dist, text

    query = request.args.get('q')  # query from search string comes here

    df = prepare_data('data/miami1000.pkl')

    if not query:
        return ppj_c(df, 0.5, 100)

    theta, epsilon, text = parse_query(query)

    # Unparsed options (None) or a zero similarity fall back to the defaults.
    if not theta:
        theta = 0.1
        epsilon = 100

    if text:
        return stTextSearch(df, text, theta)
    return ppj_c(df, theta, epsilon)
def exec_ppjoin():
    """Answer a search request read from the ``q`` request argument.

    A query of the form ``-sim 0.<digits> -dist <digits>[ text]`` supplies the
    similarity threshold (``theta``) and the distance bound (``epsilon``).
    When free text remains after the options, ``stTextSearch`` is run on it;
    otherwise ``ppj_c`` is run with the two thresholds.  A missing query
    runs ``ppj_c`` with ``theta = 0.5`` and ``epsilon = 100``.

    Returns:
        Whatever ``stTextSearch`` or ``ppj_c`` returns for the request.
    """
    def parse_query(query):
        """Split ``query`` into ``(sim, dist, text)``.

        Returns ``(None, None, query)`` when the query does not start with
        the ``-sim ... -dist ...`` options.
        """
        # Raw string so that ``\.`` and ``\d`` reach the regex engine verbatim.
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)

        if m is None:
            return None, None, query

        sim = float(m.group(1))
        dist = int(m.group(2))
        # Drop the separator whitespace so that a blank remainder is treated
        # as "no text" by the caller.
        text = m.group(3).strip()
        # Order matches the caller's (theta, epsilon, text) unpacking.
        return sim, dist, text

    query = request.args.get('q')   # query from search string comes here

    df = prepare_data('data/miami1000.pkl')

    if not query:
        return ppj_c(df, 0.5, 100)

    theta, epsilon, text = parse_query(query)

    # Unparsed options (None) or a zero similarity fall back to the defaults.
    if not theta:
        theta = 0.1
        epsilon = 100

    if text:
        return stTextSearch(df, text, theta)
    return ppj_c(df, theta, epsilon)
def ppj_c_grouping(df, theta, epsilon):
    """Join objects cell by cell over a grid, one representative per group.

    A grid is built with ``construct_grid(df, epsilon)``.  For every cell and
    each of its neighbours, the objects of both cells are grouped with
    ``group_objects`` and the first member of each group is probed against a
    per-term index of previously indexed representatives.  Candidates are
    discarded when their text is shorter than ``theta * len(text_x)`` or when
    they lie more than ``epsilon`` km away; the remaining ones are scored
    through ``pos_filter`` / ``suf_filter`` and handed to ``verify``.

    Args:
        df: frame indexed by object id, read through its ``text``, ``lat``
            and ``lng`` fields.
        theta: similarity threshold.
        epsilon: distance bound, compared against a distance in km.

    Returns:
        ``resultJSON(df, pairs)`` for the accumulated pairs.
    """
    print("Grid contruction")
    grid_dict, grid_shape = construct_grid(df, epsilon)
    grid_cols = grid_shape[1]
    print("Grid: done")

    pairs = {}

    for cell in grid_dict:
        for neighbor_cell in find_neighbors(cell, grid_cols):
            member_ids = grid_dict[neighbor_cell] + grid_dict[cell]
            df_cells = df.loc[member_ids]

            # One empty posting list per term occurring in the two cells.
            cell_terms = get_inverted_file(df_cells)
            term_index = {term: [] for term in cell_terms.keys()}

            group_dict = group_objects(df_cells, theta)

            for ppref, group in group_dict.items():
                # Only the first member of a group is probed.
                id_x = group[0]
                overlap_x = Counter()
                row_x = df.loc[id_x]
                text_x = row_x.text
                lat_x = row_x.lat
                lng_x = row_x.lng

                size_x = len(text_x)
                if size_x == 0:
                    continue

                index_pref_len = size_x - int(
                    ceil(2 * theta * size_x / (theta + 1))) + 1

                for pos_x in range(len(ppref)):
                    t = text_x[pos_x]
                    postings = term_index[t]

                    for id_y, pos_y in postings:
                        row_y = df.loc[id_y]
                        lat_y = row_y.lat
                        lng_y = row_y.lng
                        text_y = row_y.text

                        # Size filter first; the distance is only computed
                        # for candidates that pass it.
                        if len(text_y) < theta * size_x:
                            continue
                        apart_km = vincenty(
                            (lat_x, lng_x), (lat_y, lng_y)).km
                        if apart_km > epsilon:
                            continue

                        keeps_pos = pos_filter(
                            df, id_x, id_y, pos_x, pos_y, theta)
                        keeps_suf = suf_filter(
                            df, id_x, id_y, pos_x, pos_y, theta)
                        if keeps_pos & keeps_suf:
                            overlap_x[id_y] += 1
                        else:
                            # Large negative score marks a pruned candidate.
                            overlap_x[id_y] = -10000

                    if pos_x <= index_pref_len:
                        postings.append((id_x, pos_x))

                pairs = verify(df, pairs, id_x, overlap_x, theta)

    return resultJSON(df, pairs)
def ppj_c_grouping(df, theta, epsilon):
    """Join objects cell by cell over a grid, one representative per group.

    A grid is built with ``construct_grid(df, epsilon)``.  For every cell and
    each of its neighbours, the objects of both cells are grouped with
    ``group_objects`` and the first member of each group is probed against a
    per-term index of previously indexed representatives.  Candidates are
    discarded when their text is shorter than ``theta * len(text_x)`` or when
    they lie more than ``epsilon`` km away; the remaining ones are scored
    through ``pos_filter`` / ``suf_filter`` and handed to ``verify``.

    Args:
        df: frame indexed by object id, read through its ``text``, ``lat``
            and ``lng`` fields.
        theta: similarity threshold.
        epsilon: distance bound, compared against a distance in km.

    Returns:
        ``resultJSON(df, pairs)`` for the accumulated pairs.
    """
    print("Grid contruction")
    grid_dict, grid_shape = construct_grid(df, epsilon)
    grid_cols = grid_shape[1]
    print("Grid: done")

    pairs = {}

    for cell in grid_dict:
        for neighbor_cell in find_neighbors(cell, grid_cols):
            member_ids = grid_dict[neighbor_cell] + grid_dict[cell]
            df_cells = df.loc[member_ids]

            # One empty posting list per term occurring in the two cells.
            cell_terms = get_inverted_file(df_cells)
            term_index = {term: [] for term in cell_terms.keys()}

            group_dict = group_objects(df_cells, theta)

            for ppref, group in group_dict.items():
                # Only the first member of a group is probed.
                id_x = group[0]
                overlap_x = Counter()
                row_x = df.loc[id_x]
                text_x = row_x.text
                lat_x = row_x.lat
                lng_x = row_x.lng

                size_x = len(text_x)
                if size_x == 0:
                    continue

                index_pref_len = size_x - int(
                    ceil(2 * theta * size_x / (theta + 1))) + 1

                for pos_x in range(len(ppref)):
                    t = text_x[pos_x]
                    postings = term_index[t]

                    for id_y, pos_y in postings:
                        row_y = df.loc[id_y]
                        lat_y = row_y.lat
                        lng_y = row_y.lng
                        text_y = row_y.text

                        # Size filter first; the distance is only computed
                        # for candidates that pass it.
                        if len(text_y) < theta * size_x:
                            continue
                        apart_km = vincenty(
                            (lat_x, lng_x), (lat_y, lng_y)).km
                        if apart_km > epsilon:
                            continue

                        keeps_pos = pos_filter(
                            df, id_x, id_y, pos_x, pos_y, theta)
                        keeps_suf = suf_filter(
                            df, id_x, id_y, pos_x, pos_y, theta)
                        if keeps_pos & keeps_suf:
                            overlap_x[id_y] += 1
                        else:
                            # Large negative score marks a pruned candidate.
                            overlap_x[id_y] = -10000

                    if pos_x <= index_pref_len:
                        postings.append((id_x, pos_x))

                pairs = verify(df, pairs, id_x, overlap_x, theta)

    return resultJSON(df, pairs)
            json_ = {
                "id": id_,
                "long": str(obj.lat),
                "lat": str(obj.lng),
                "text": obj.raw_text
            }
            cell.append(json_)
        result.append(cell)
    return json.dumps(result)


# -----------------------------------------------------------------------------------------------------------------------

# Script entry point: time ppjoin on the bundled data set, print the pairs'
# Jaccard similarities, then run the grouped variant on the same data.
if __name__ == "__main__":
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    theta = 0.8  # similarity threshold passed to both joins below

    # Time the plain join.
    start_time = time.time()
    pairs = ppjoin(df, inverted_file, theta)
    print "Time elapsed:", time.time() - start_time
    # NOTE(review): assumes the join found at least one pair; pairs[0]
    # presumably fails on an empty result -- confirm against ppjoin.
    print pairs[0]
    print 'Total: ', len(pairs)
    # Each pair holds two records carrying an "id" key; print the Jaccard
    # similarity of the two objects' texts.
    for pair in pairs:
        id1 = pair[0]["id"]
        id2 = pair[1]["id"]
        print jaccard_similarity(df.loc[id1].text, df.loc[id2].text)

    # Grouped variant: grouping happens before the timer is restarted, so it
    # is excluded from the measured interval.
    group_dict = group_objects(df, theta)
    start_time = time.time()
    pairs = ppjoin_group(df, inverted_file, theta, group_dict)
            obj = df.loc[id_]
            json_ = {
                "id": id_,
                "long": str(obj.lat),
                "lat": str(obj.lng),
                "text": obj.raw_text
            }
            cell.append(json_)
        result.append(cell)
    return json.dumps(result)

# -----------------------------------------------------------------------------------------------------------------------

# Script entry point: time ppjoin on the bundled data set, print the pairs'
# Jaccard similarities, then run the grouped variant on the same data.
if __name__ == "__main__":
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    theta = 0.8  # similarity threshold passed to both joins below

    # Time the plain join.
    start_time = time.time()
    pairs = ppjoin(df, inverted_file, theta)
    print "Time elapsed:", time.time() - start_time
    # NOTE(review): assumes the join found at least one pair; pairs[0]
    # presumably fails on an empty result -- confirm against ppjoin.
    print pairs[0]
    print 'Total: ', len(pairs)
    # Each pair holds two records carrying an "id" key; print the Jaccard
    # similarity of the two objects' texts.
    for pair in pairs:
        id1 = pair[0]["id"]
        id2 = pair[1]["id"]
        print jaccard_similarity(df.loc[id1].text, df.loc[id2].text)

    # Grouped variant: grouping happens before the timer is restarted, so it
    # is excluded from the measured interval.
    group_dict = group_objects(df, theta)
    start_time = time.time()
    pairs = ppjoin_group(df, inverted_file, theta, group_dict)