def create_idr(session_id, iteration, geom):
    """Create and persist an IDR row, then compute and store its profile.

    Args:
        session_id: owning session identifier stored on the IDR row.
        iteration: iteration counter stored on the IDR row.
        geom: geometry payload stored on the IDR row.
    """
    with app.app_context():
        idr = IDR(session_id=session_id, geom=geom, iteration=iteration)
        db.session.add(idr)
        # Commit first — presumably so the row exists (with a primary key)
        # before get_points_id_in_idr resolves points against it; TODO confirm.
        db.session.commit()
        idr.profile = create_profile(
            session_id, lambda d: get_points_id_in_idr(d, idr))
        # `idr` is already persistent in this session, so assigning .profile
        # marks it dirty and a plain commit persists the update; the second
        # session.add() in the original was a redundant no-op and is dropped.
        db.session.commit()
def create_polygon(session_id, iteration, geom):
    """Create and persist a Polygon row, then compute and store its profile.

    Args:
        session_id: owning session identifier stored on the Polygon row.
        iteration: iteration counter stored on the Polygon row.
        geom: geometry payload stored on the Polygon row.
    """
    with app.app_context():
        polygon = Polygon(session_id=session_id, geom=geom, iteration=iteration)
        db.session.add(polygon)
        # Commit first — presumably so the row exists (with a primary key)
        # before get_points_id_in_polygon resolves points against it;
        # TODO confirm.
        db.session.commit()
        polygon.profile = create_profile(
            session_id, lambda d: get_points_id_in_polygon(d, polygon))
        # `polygon` is already persistent, so assigning .profile marks it
        # dirty and a plain commit persists the update; the second
        # session.add() in the original was a redundant no-op and is dropped.
        db.session.commit()
def index_dataset_from_hdf(dataset_id):
    # Pairwise-index a dataset stored in an HDF5 file: for every unordered
    # pair of rows, compute a geographic distance and a weighted attribute
    # similarity, then write the normalized pairs to the 'relation' key of
    # the same store. NOTE(review): O(n^2) in the number of rows.
    start = time()
    with app.app_context():
        dataset = Dataset.query.get(dataset_id)
        hdf_path = path_to_hdf(dataset)
        # Raw (un-normalized) pairs are staged in a temporary sibling file
        # and only copied into the main store after normalization.
        tmp_hdf_path = '{}.tmp'.format(hdf_path)
        # NOTE(review): neither store is closed if an exception is raised
        # before the explicit close() calls below — consider try/finally.
        store = pd.HDFStore(hdf_path)
        df = store.select('data')
        # Bucket the dataset's attributes by declared type; each bucket is
        # compared with its own similarity measure in the inner loop.
        datetime_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.datetime
        ]
        number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.number
        ]
        text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.text
        ]
        cat_number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_number
        ]
        cat_text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_text
        ]
        # Each *_limits tuple is a (start, stop) slice into an itertuples()
        # row. Rows are (index, latitude, longitude, <attributes...>) given
        # the column selection below, so attribute slices begin at 3.
        datetime_columns_limits = 3, 3 + len(datetime_columns)
        number_columns_limits = datetime_columns_limits[
            1], datetime_columns_limits[1] + len(number_columns)
        text_columns_limits = number_columns_limits[
            1], number_columns_limits[1] + len(text_columns)
        cat_number_columns_limits = text_columns_limits[
            1], text_columns_limits[1] + len(cat_number_columns)
        cat_text_columns_limits = cat_number_columns_limits[
            1], cat_number_columns_limits[1] + len(cat_text_columns)
        # Reorder/restrict columns so the slice limits above line up.
        df = df.loc[:, [
            dataset.latitude_attr, dataset.longitude_attr,
            *datetime_columns, *number_columns, *text_columns,
            *cat_number_columns, *cat_text_columns
        ]]
        greatest_distance = 0
        greatest_similarity = 0
        # x: start offset of the inner loop (skips already-paired rows);
        # x_chunk: running count of pairs produced; next_chunk: threshold
        # at which the buffered pairs are flushed to the temporary store.
        x, x_chunk, next_chunk = 1, 1, CHUNKSIZE
        n_rows = store.get_storer('data').nrows
        ds = []
        tmp_store = pd.HDFStore(tmp_hdf_path)
        for row_a in df.itertuples():
            if DEBUG:
                logging.info('{}/{}'.format(row_a[0], n_rows))
            # Expand each datetime into (hour, minute, weekday) components
            # so it can be compared via cosine similarity.
            a_datetimes = list(
                chain.from_iterable([
                    [d.hour, d.minute, d.weekday()]
                    for d in row_a[
                        datetime_columns_limits[0]:datetime_columns_limits[1]]
                ])) if datetime_columns_limits[1] > datetime_columns_limits[
                    0] else []
            a_numbers = row_a[
                number_columns_limits[0]:number_columns_limits[1]]
            a_texts = row_a[text_columns_limits[0]:text_columns_limits[1]]
            a_cat_numbers = row_a[
                cat_number_columns_limits[0]:cat_number_columns_limits[1]]
            a_cat_texts = row_a[
                cat_text_columns_limits[0]:cat_text_columns_limits[1]]
            # Only pair with rows after row_a — each unordered pair once.
            for row_b in df.iloc[x:].itertuples():
                b_datetimes = list(
                    chain.from_iterable([
                        [d.hour, d.minute, d.weekday()]
                        for d in row_b[datetime_columns_limits[0]:
                                       datetime_columns_limits[1]]
                    ])) if datetime_columns_limits[
                        1] > datetime_columns_limits[0] else []
                b_numbers = row_b[
                    number_columns_limits[0]:number_columns_limits[1]]
                b_texts = row_b[text_columns_limits[0]:text_columns_limits[1]]
                b_cat_numbers = row_b[
                    cat_number_columns_limits[0]:cat_number_columns_limits[1]]
                b_cat_texts = row_b[
                    cat_text_columns_limits[0]:cat_text_columns_limits[1]]
                # Positions 1 and 2 are latitude and longitude (see df.loc).
                distance = haversine_distance(row_a[1], row_a[2], row_b[1],
                                              row_b[2])
                # Weighted similarity: numeric buckets count double (* 2),
                # the others single (* 1).
                i = cosine_similarity(a_datetimes, b_datetimes) * 1
                ii = cosine_similarity(a_numbers, b_numbers) * 2
                iii = fuzz_similarity(a_texts, b_texts) * 1
                iv = cosine_similarity(a_cat_numbers, b_cat_numbers) * 2
                v = jaccard_similarity(a_cat_texts, b_cat_texts) * 1
                similarity = i + ii + iii + iv + v
                ds.append((row_a[0], row_b[0], similarity, distance))
                # Track maxima for the normalization pass below.
                if distance > greatest_distance:
                    greatest_distance = distance
                if similarity > greatest_similarity:
                    greatest_similarity = similarity
            x += 1
            # The inner loop just produced (n_rows - x) pairs.
            x_chunk += n_rows - x
            if x_chunk > next_chunk:
                # Flush the buffered pairs to the staging store.
                tmp_store.append(
                    'relation/pure',
                    pd.DataFrame(
                        ds,
                        columns=['id_a', 'id_b', 'similarity', 'distance']))
                next_chunk += CHUNKSIZE
                ds = []
        if ds:
            # Flush whatever remained after the last full chunk.
            tmp_store.append(
                'relation/pure',
                pd.DataFrame(
                    ds, columns=['id_a', 'id_b', 'similarity', 'distance']))
        # Second pass: normalize similarity/distance to [0, 1] by the
        # observed maxima and append into the main store.
        # NOTE(review): if the dataset has fewer than 2 rows both maxima
        # stay 0 and these divisions would raise — confirm callers never
        # index such datasets.
        for dfr in tmp_store.select('relation/pure', chunksize=CHUNKSIZE):
            # NOTE(review): the lambda parameter `x` (the chunk DataFrame)
            # shadows the outer loop counter `x` — confusing but harmless.
            dfr = dfr.assign(
                similarity=lambda x: x.similarity / greatest_similarity,
                distance=lambda x: x.distance / greatest_distance)
            store.append('relation', dfr, data_columns=True)
        tmp_store.close()
        store.close()
        os.remove(tmp_hdf_path)
        # Record completion time (timezone-aware UTC).
        dataset.indexed_at = datetime.datetime.now(datetime.timezone.utc)
        db.session.add(dataset)
        db.session.commit()
        if DEBUG:
            logging.info('[PROCESSING HDF] {} seconds'.format(time() - start))
def index_dataset_from_sql(dataset_id):
    # Pairwise-index a dataset stored in a SQL table: same algorithm as
    # index_dataset_from_hdf (O(n^2) pairs, weighted similarity + haversine
    # distance), but reading from / writing to the database instead of HDF5.
    start = time()
    with app.app_context():
        engine = create_engine(SQLALCHEMY_DATABASE_URI)
        dataset = Dataset.query.get(dataset_id)
        # Source table name is derived from the uploaded filename; pairs go
        # into a sibling "<table>-rel" table.
        table_name = 'datasets.' + dataset.filename.rsplit('.', 1)[0]
        table_rel_name = '{}-rel'.format(table_name)
        df = pd.read_sql_table(table_name, engine, index_col='geoguide_id')
        # Bucket the dataset's attributes by declared type; each bucket is
        # compared with its own similarity measure in the inner loop.
        datetime_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.datetime
        ]
        number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.number
        ]
        text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.text
        ]
        cat_number_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_number
        ]
        cat_text_columns = [
            attr.description for attr in dataset.attributes
            if attr.type == AttributeType.categorical_text
        ]
        # Each *_limits tuple is a (start, stop) slice into an itertuples()
        # row. Rows are (index, latitude, longitude, <attributes...>) given
        # the column selection below, so attribute slices begin at 3.
        datetime_columns_limits = 3, 3 + len(datetime_columns)
        number_columns_limits = datetime_columns_limits[
            1], datetime_columns_limits[1] + len(number_columns)
        text_columns_limits = number_columns_limits[
            1], number_columns_limits[1] + len(text_columns)
        cat_number_columns_limits = text_columns_limits[
            1], text_columns_limits[1] + len(cat_number_columns)
        cat_text_columns_limits = cat_number_columns_limits[
            1], cat_number_columns_limits[1] + len(cat_text_columns)
        # Reorder/restrict columns so the slice limits above line up.
        df = df.loc[:, [
            dataset.latitude_attr, dataset.longitude_attr,
            *datetime_columns, *number_columns, *text_columns,
            *cat_number_columns, *cat_text_columns
        ]]
        greatest_distance = 0
        greatest_similarity = 0
        # x: start offset of the inner loop (skips already-paired rows);
        # x_chunk: running count of pairs produced; next_chunk: threshold
        # at which the buffered pairs are flushed to the relation table.
        x, x_chunk, next_chunk = 1, 1, CHUNKSIZE
        n_rows = df.shape[0]
        ds = []
        for row_a in df.itertuples():
            if DEBUG:
                logging.info('{}/{}'.format(x, n_rows))
            # Expand each datetime into (hour, minute, weekday) components
            # so it can be compared via cosine similarity.
            a_datetimes = list(
                chain.from_iterable([
                    [d.hour, d.minute, d.weekday()]
                    for d in row_a[
                        datetime_columns_limits[0]:datetime_columns_limits[1]]
                ])) if datetime_columns_limits[1] > datetime_columns_limits[
                    0] else []
            a_numbers = row_a[
                number_columns_limits[0]:number_columns_limits[1]]
            a_texts = row_a[text_columns_limits[0]:text_columns_limits[1]]
            a_cat_numbers = row_a[
                cat_number_columns_limits[0]:cat_number_columns_limits[1]]
            a_cat_texts = row_a[
                cat_text_columns_limits[0]:cat_text_columns_limits[1]]
            # Only pair with rows after row_a — each unordered pair once.
            for row_b in df.iloc[x:].itertuples():
                b_datetimes = list(
                    chain.from_iterable([
                        [d.hour, d.minute, d.weekday()]
                        for d in row_b[datetime_columns_limits[0]:
                                       datetime_columns_limits[1]]
                    ])) if datetime_columns_limits[
                        1] > datetime_columns_limits[0] else []
                b_numbers = row_b[
                    number_columns_limits[0]:number_columns_limits[1]]
                b_texts = row_b[text_columns_limits[0]:text_columns_limits[1]]
                b_cat_numbers = row_b[
                    cat_number_columns_limits[0]:cat_number_columns_limits[1]]
                b_cat_texts = row_b[
                    cat_text_columns_limits[0]:cat_text_columns_limits[1]]
                # Positions 1 and 2 are latitude and longitude (see df.loc).
                distance = haversine_distance(row_a[1], row_a[2], row_b[1],
                                              row_b[2])
                # Weighted similarity: numeric buckets count double (* 2),
                # the others single (* 1).
                i = cosine_similarity(a_datetimes, b_datetimes) * 1
                ii = cosine_similarity(a_numbers, b_numbers) * 2
                iii = fuzz_similarity(a_texts, b_texts) * 1
                iv = cosine_similarity(a_cat_numbers, b_cat_numbers) * 2
                v = jaccard_similarity(a_cat_texts, b_cat_texts) * 1
                similarity = i + ii + iii + iv + v
                ds.append((row_a[0], row_b[0], similarity, distance))
                # Track maxima for the normalization UPDATE below.
                if distance > greatest_distance:
                    greatest_distance = distance
                if similarity > greatest_similarity:
                    greatest_similarity = similarity
            x += 1
            # The inner loop just produced (n_rows - x) pairs.
            x_chunk += n_rows - x
            if x_chunk > next_chunk:
                # Flush the buffered pairs to the relation table.
                pd.DataFrame(
                    ds,
                    columns=['id_a', 'id_b', 'similarity', 'distance']
                ).to_sql(table_rel_name, engine, if_exists='append')
                next_chunk += CHUNKSIZE
                ds = []
        if ds:
            # Flush whatever remained after the last full chunk.
            pd.DataFrame(
                ds,
                columns=['id_a', 'id_b', 'similarity', 'distance']
            ).to_sql(table_rel_name, engine, if_exists='append')
        # Normalize similarity/distance to [0, 1] by the observed maxima.
        # NOTE(review): SQL built with str.format — the interpolated values
        # are internally computed floats and the table name is derived from
        # the dataset filename, not direct user input, but parameterized SQL
        # would still be safer. Also, if the dataset has fewer than 2 rows
        # both maxima stay 0 and this divides by zero — confirm callers
        # never index such datasets.
        engine.execute('''
            update "{}" set (similarity, distance) = (similarity/{}, distance/{})
            '''.format(table_rel_name, greatest_similarity,
                       greatest_distance))
        # Record completion time (timezone-aware UTC).
        dataset.indexed_at = datetime.datetime.now(datetime.timezone.utc)
        db.session.add(dataset)
        db.session.commit()
        if DEBUG:
            logging.info('[PROCESSING SQL] {} seconds'.format(time() - start))