Example #1
def create_idr(session_id, iteration, geom):
    with app.app_context():
        # Persist the IDR first, before computing its profile.
        idr = IDR(session_id=session_id,
                  geom=geom, iteration=iteration)
        db.session.add(idr)
        db.session.commit()

        # Build the profile from the points inside this IDR, then commit again.
        idr.profile = create_profile(
            session_id, lambda d: get_points_id_in_idr(d, idr))
        db.session.add(idr)
        db.session.commit()
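
For illustration, a hypothetical call (the session id and WKT geometry literal are made up, and assume the geom column accepts WKT):

# Hypothetical usage of create_idr; all argument values are illustrative.
create_idr('session-abc', 1,
           'POLYGON((-46.7 -23.6, -46.6 -23.6, -46.6 -23.5, -46.7 -23.6))')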
Example #2
def create_polygon(session_id, iteration, geom):
    with app.app_context():
        # Persist the polygon first, before computing its profile.
        polygon = Polygon(session_id=session_id,
                          geom=geom, iteration=iteration)
        db.session.add(polygon)
        db.session.commit()

        # Build the profile from the points inside this polygon, then commit again.
        polygon.profile = create_profile(
            session_id, lambda d: get_points_id_in_polygon(d, polygon))
        db.session.add(polygon)
        db.session.commit()
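
Examples #1 and #2 differ only in the model class (IDR vs. Polygon) and the point-membership helper. A minimal sketch of how the shared pattern could be factored out, assuming the same app, db, and create_profile objects; create_region and its parameter names are hypothetical:

def create_region(session_id, iteration, geom, model_cls, points_in_region):
    # Hypothetical refactor, not part of the original codebase.
    with app.app_context():
        # Commit first so the region row is persisted before its profile
        # is computed.
        region = model_cls(session_id=session_id,
                           geom=geom, iteration=iteration)
        db.session.add(region)
        db.session.commit()

        region.profile = create_profile(
            session_id, lambda d: points_in_region(d, region))
        db.session.add(region)
        db.session.commit()
        return region

With this helper, create_idr(sid, it, geom) becomes create_region(sid, it, geom, IDR, get_points_id_in_idr), and likewise for polygons.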
Example #3
def index_dataset_from_hdf(dataset_id):
    # Precompute the pairwise similarity/distance relation for a dataset
    # stored in HDF5, then write it back to the store under 'relation'.
    start = time()
    with app.app_context():
        dataset = Dataset.query.get(dataset_id)
        hdf_path = path_to_hdf(dataset)
        tmp_hdf_path = '{}.tmp'.format(hdf_path)

        store = pd.HDFStore(hdf_path)
        df = store.select('data')

        # Group the dataset's attribute names by their declared type.
        datetime_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.datetime
        ]
        number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.number
        ]
        text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.text
        ]
        cat_number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_number
        ]
        cat_text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_text
        ]

        # Positions of each column group within an itertuples() row: index 0
        # is the row label, 1 is latitude, 2 is longitude, so the typed
        # columns start at position 3, in the order they are selected below.
        datetime_columns_limits = (3, 3 + len(datetime_columns))
        number_columns_limits = (
            datetime_columns_limits[1],
            datetime_columns_limits[1] + len(number_columns))
        text_columns_limits = (
            number_columns_limits[1],
            number_columns_limits[1] + len(text_columns))
        cat_number_columns_limits = (
            text_columns_limits[1],
            text_columns_limits[1] + len(cat_number_columns))
        cat_text_columns_limits = (
            cat_number_columns_limits[1],
            cat_number_columns_limits[1] + len(cat_text_columns))

        # Keep only the coordinate columns followed by the typed attribute
        # columns, in the order assumed by the slice limits above.
        df = df.loc[:, [
            dataset.latitude_attr, dataset.longitude_attr,
            *datetime_columns, *number_columns, *text_columns,
            *cat_number_columns, *cat_text_columns
        ]]
        greatest_distance = 0
        greatest_similarity = 0

        x, x_chunk, next_chunk = 1, 1, CHUNKSIZE
        n_rows = store.get_storer('data').nrows
        ds = []
        tmp_store = pd.HDFStore(tmp_hdf_path)
        for row_a in df.itertuples():
            if DEBUG:
                logging.info('{}/{}'.format(row_a[0], n_rows))

            # Expand each datetime value into (hour, minute, weekday) features.
            a_datetimes = list(
                chain.from_iterable(
                    [d.hour, d.minute, d.weekday()]
                    for d in row_a[datetime_columns_limits[0]:
                                   datetime_columns_limits[1]])
            ) if datetime_columns_limits[1] > datetime_columns_limits[0] else []
            a_numbers = row_a[
                number_columns_limits[0]:number_columns_limits[1]]
            a_texts = row_a[text_columns_limits[0]:text_columns_limits[1]]
            a_cat_numbers = row_a[
                cat_number_columns_limits[0]:cat_number_columns_limits[1]]
            a_cat_texts = row_a[
                cat_text_columns_limits[0]:cat_text_columns_limits[1]]

            # Pair row_a only with later rows so each pair is visited once.
            for row_b in df.iloc[x:].itertuples():
                b_datetimes = list(
                    chain.from_iterable(
                        [d.hour, d.minute, d.weekday()]
                        for d in row_b[datetime_columns_limits[0]:
                                       datetime_columns_limits[1]])
                ) if datetime_columns_limits[1] > datetime_columns_limits[0] else []
                b_numbers = row_b[
                    number_columns_limits[0]:number_columns_limits[1]]
                b_texts = row_b[text_columns_limits[0]:text_columns_limits[1]]
                b_cat_numbers = row_b[
                    cat_number_columns_limits[0]:cat_number_columns_limits[1]]
                b_cat_texts = row_b[
                    cat_text_columns_limits[0]:cat_text_columns_limits[1]]

                distance = haversine_distance(row_a[1], row_a[2], row_b[1],
                                              row_b[2])

                # Weighted sum of per-type similarities; the numeric and
                # categorical-number components get double weight.
                i = cosine_similarity(a_datetimes, b_datetimes) * 1
                ii = cosine_similarity(a_numbers, b_numbers) * 2
                iii = fuzz_similarity(a_texts, b_texts) * 1
                iv = cosine_similarity(a_cat_numbers, b_cat_numbers) * 2
                v = jaccard_similarity(a_cat_texts, b_cat_texts) * 1
                similarity = i + ii + iii + iv + v

                ds.append((row_a[0], row_b[0], similarity, distance))

                # Track the maxima for the normalization pass below.
                if distance > greatest_distance:
                    greatest_distance = distance
                if similarity > greatest_similarity:
                    greatest_similarity = similarity

            x += 1
            x_chunk += n_rows - x
            if x_chunk > next_chunk:
                # Flush the accumulated pairs to the temporary store once a
                # chunk's worth of comparisons has been made.
                tmp_store.append(
                    'relation/pure',
                    pd.DataFrame(ds,
                                 columns=['id_a', 'id_b', 'similarity',
                                          'distance']))
                next_chunk += CHUNKSIZE
                ds = []

        # Flush any remaining pairs.
        if ds:
            tmp_store.append(
                'relation/pure',
                pd.DataFrame(ds,
                             columns=['id_a', 'id_b', 'similarity',
                                      'distance']))

        # Normalize similarity and distance to [0, 1] and write the final
        # relation into the dataset's main store, chunk by chunk.
        for dfr in tmp_store.select('relation/pure', chunksize=CHUNKSIZE):
            dfr = dfr.assign(
                similarity=lambda x: x.similarity / greatest_similarity,
                distance=lambda x: x.distance / greatest_distance)
            store.append('relation', dfr, data_columns=True)

        tmp_store.close()
        store.close()

        os.remove(tmp_hdf_path)

        dataset.indexed_at = datetime.datetime.now(datetime.timezone.utc)
        db.session.add(dataset)
        db.session.commit()
        if DEBUG:
            logging.info('[PROCESSING HDF] {} seconds'.format(time() - start))
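
The weighted score above relies on three helpers defined elsewhere in the project (cosine_similarity, fuzz_similarity, jaccard_similarity). The stand-ins below are minimal sketches under the usual definitions, not the project's actual implementations:

import math

# Plausible stand-ins for the similarity helpers used above; the project's
# own implementations may differ (fuzz_similarity in particular likely wraps
# a fuzzy string-matching library such as fuzzywuzzy).

def cosine_similarity(a, b):
    # Cosine of the angle between two numeric vectors; 0 for empty or
    # zero-norm input.
    if not a or not b:
        return 0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0
    return dot / (norm_a * norm_b)

def jaccard_similarity(a, b):
    # Set overlap: |A & B| / |A | B|.
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        return 0
    return len(set_a & set_b) / len(set_a | set_b)

def fuzz_similarity(a, b):
    # Naive positional string comparison; a real implementation would use
    # token-based fuzzy matching instead of exact equality.
    if not a or not b:
        return 0
    return sum(str(x) == str(y) for x, y in zip(a, b)) / max(len(a), len(b))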
Example #4
def index_dataset_from_sql(dataset_id):
    # Same pairwise indexing as Example #3, but reading the dataset from its
    # SQL table and writing the relation to a companion '<table>-rel' table.
    start = time()
    with app.app_context():
        engine = create_engine(SQLALCHEMY_DATABASE_URI)
        dataset = Dataset.query.get(dataset_id)
        table_name = 'datasets.' + dataset.filename.rsplit('.', 1)[0]
        table_rel_name = '{}-rel'.format(table_name)

        df = pd.read_sql_table(table_name, engine, index_col='geoguide_id')

        # Group the dataset's attribute names by their declared type.
        datetime_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.datetime
        ]
        number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.number
        ]
        text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.text
        ]
        cat_number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_number
        ]
        cat_text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_text
        ]

        # Positions of each column group within an itertuples() row: index 0
        # is the row label, 1 is latitude, 2 is longitude, so the typed
        # columns start at position 3, in the order they are selected below.
        datetime_columns_limits = (3, 3 + len(datetime_columns))
        number_columns_limits = (
            datetime_columns_limits[1],
            datetime_columns_limits[1] + len(number_columns))
        text_columns_limits = (
            number_columns_limits[1],
            number_columns_limits[1] + len(text_columns))
        cat_number_columns_limits = (
            text_columns_limits[1],
            text_columns_limits[1] + len(cat_number_columns))
        cat_text_columns_limits = (
            cat_number_columns_limits[1],
            cat_number_columns_limits[1] + len(cat_text_columns))

        # Keep only the coordinate columns followed by the typed attribute
        # columns, in the order assumed by the slice limits above.
        df = df.loc[:, [
            dataset.latitude_attr, dataset.longitude_attr,
            *datetime_columns, *number_columns, *text_columns,
            *cat_number_columns, *cat_text_columns
        ]]
        greatest_distance = 0
        greatest_similarity = 0

        x, x_chunk, next_chunk = 1, 1, CHUNKSIZE
        n_rows = df.shape[0]
        ds = []
        for row_a in df.itertuples():
            if DEBUG:
                logging.info('{}/{}'.format(x, n_rows))

            # Expand each datetime value into (hour, minute, weekday) features.
            a_datetimes = list(
                chain.from_iterable(
                    [d.hour, d.minute, d.weekday()]
                    for d in row_a[datetime_columns_limits[0]:
                                   datetime_columns_limits[1]])
            ) if datetime_columns_limits[1] > datetime_columns_limits[0] else []
            a_numbers = row_a[
                number_columns_limits[0]:number_columns_limits[1]]
            a_texts = row_a[text_columns_limits[0]:text_columns_limits[1]]
            a_cat_numbers = row_a[
                cat_number_columns_limits[0]:cat_number_columns_limits[1]]
            a_cat_texts = row_a[
                cat_text_columns_limits[0]:cat_text_columns_limits[1]]

            # Pair row_a only with later rows so each pair is visited once.
            for row_b in df.iloc[x:].itertuples():
                b_datetimes = list(
                    chain.from_iterable(
                        [d.hour, d.minute, d.weekday()]
                        for d in row_b[datetime_columns_limits[0]:
                                       datetime_columns_limits[1]])
                ) if datetime_columns_limits[1] > datetime_columns_limits[0] else []
                b_numbers = row_b[
                    number_columns_limits[0]:number_columns_limits[1]]
                b_texts = row_b[text_columns_limits[0]:text_columns_limits[1]]
                b_cat_numbers = row_b[
                    cat_number_columns_limits[0]:cat_number_columns_limits[1]]
                b_cat_texts = row_b[
                    cat_text_columns_limits[0]:cat_text_columns_limits[1]]

                distance = haversine_distance(row_a[1], row_a[2], row_b[1],
                                              row_b[2])

                # Weighted sum of per-type similarities; the numeric and
                # categorical-number components get double weight.
                i = cosine_similarity(a_datetimes, b_datetimes) * 1
                ii = cosine_similarity(a_numbers, b_numbers) * 2
                iii = fuzz_similarity(a_texts, b_texts) * 1
                iv = cosine_similarity(a_cat_numbers, b_cat_numbers) * 2
                v = jaccard_similarity(a_cat_texts, b_cat_texts) * 1
                similarity = i + ii + iii + iv + v

                ds.append((row_a[0], row_b[0], similarity, distance))

                # Track the maxima for the normalization step below.
                if distance > greatest_distance:
                    greatest_distance = distance
                if similarity > greatest_similarity:
                    greatest_similarity = similarity

            x += 1
            x_chunk += n_rows - x
            if x_chunk > next_chunk:
                # Flush the accumulated pairs to the relation table once a
                # chunk's worth of comparisons has been made.
                pd.DataFrame(ds,
                             columns=['id_a', 'id_b', 'similarity',
                                      'distance']).to_sql(table_rel_name,
                                                          engine,
                                                          if_exists='append')
                next_chunk += CHUNKSIZE
                ds = []

        # Flush any remaining pairs.
        if ds:
            pd.DataFrame(ds,
                         columns=['id_a', 'id_b', 'similarity',
                                  'distance']).to_sql(table_rel_name,
                                                      engine,
                                                      if_exists='append')

        # Normalize similarity and distance to [0, 1] in place.
        engine.execute('''
        update "{}"
        set (similarity, distance) = (similarity/{}, distance/{})
        '''.format(table_rel_name, greatest_similarity, greatest_distance))

        dataset.indexed_at = datetime.datetime.now(datetime.timezone.utc)
        db.session.add(dataset)
        db.session.commit()

        if DEBUG:
            logging.info('[PROCESSING SQL] {} seconds'.format(time() - start))
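
The closing UPDATE formats the computed maxima directly into the SQL string. A minimal variant using bound parameters instead, assuming SQLAlchemy 1.x (where engine.execute accepts a text() clause with keyword parameters); only the table name still needs interpolation, since identifiers cannot be bound:

from sqlalchemy import text

# Hypothetical variant of the normalization step with bound parameters.
engine.execute(
    text('update "{}" set (similarity, distance) = '
         '(similarity / :max_sim, distance / :max_dist)'.format(table_rel_name)),
    max_sim=greatest_similarity,
    max_dist=greatest_distance)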