def tweets_to_xls(outxls, searchword=None, searchGeom=None, srs=None,
                  lng='pt', NTW=1000, twType='mixed', Key=None):
    """
    Search for Tweets and Export them to XLS
    """
    
    from gasp.to import obj_to_tbl
    
    data = tweets_to_df(
        keyword=searchword, inGeom=searchGeom, epsg=srs,
        LANG=lng, NTWEETS=NTW, tweetType=twType, apiKey=Key
    )
    
    try:
        if not data:
            # No tweets were found
            return 0
    except:
        # data is a DataFrame; its truth value is ambiguous,
        # so assume the search returned results
        pass
    
    obj_to_tbl(data, outxls, sheetsName='twitter')
    
    return outxls
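
# Usage sketch for tweets_to_xls (hypothetical values; assumes 'Key' is an
# API key object accepted by tweets_to_df):
#
#     tweets_to_xls(
#         '/tmp/tweets.xlsx', searchword='cheias', srs=4326,
#         lng='pt', NTW=500, twType='recent', Key=my_twitter_key
#     )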
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with all tables in a folder
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """
    
    # Modules
    import os
    import pandas
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    
    # Get table format
    tableType = os.path.splitext(mainTable)[1]
    
    tableDf = tbl_to_obj(mainTable)
    
    for table in joinTables:
        xlsDf = tbl_to_obj(table)
        
        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == \
            mainIdField else joinTables[table]["JOIN_FIELD"]
        
        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField : join_field}, inplace=True)
        
        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)
        
        tableDf = tableDf.merge(
            xlsDf, how='outer', left_on=mainIdField, right_on=join_field
        )
        
        tableDf.fillna(0, inplace=True)
        tableDf[mainIdField].replace(0, tableDf[join_field], inplace=True)
        
        tableDf.drop(join_field, axis=1, inplace=True)
    
    obj_to_tbl(tableDf, outTable)
    
    return outTable
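
# Usage sketch for join_tables_in_table (hypothetical paths and fields;
# one join table shown, more entries follow the same shape):
#
#     join_tables_in_table(
#         r'D:\main.xlsx', 'paragem',
#         {r'D:\dia6.xlsx' : {
#             "JOIN_FIELD"   : 'paragem',
#             "COLS_TO_JOIN" : {'n_validacao' : 'dia_6'}
#         }},
#         r'D:\joined.xlsx'
#     )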
def count_entity_periods_with_certain_duration(db, PERIOD_INTERVAL, PGTABLE,
                                               TIME_FIELD, ENTITY_FIELD,
                                               OUT_TABLE, filterWhere=None):
    """
    Count rows in a pgtable for a given period of X minutes for each
    interest entity
    
    PERIOD_INTERVAL = "01:00:00"
    """
    
    import pandas
    from gasp.pyt.tm import day_to_intervals2
    from gasp.pyt.df.joins import combine_dfs
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    
    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)
    
    # For each interval/period, count the number of rows by entity
    counting = []
    for _int in INTERVALS:
        Q = (
            "SELECT {entityCol}, COUNT({entityCol}) AS {cntCol} "
            "FROM {table} WHERE "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
            "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
            "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr} "
            "GROUP BY {entityCol}"
        ).format(
            cntCol="s{}_e{}".format(_int[0][:5], _int[1][:5]).replace(":", "_"),
            table=PGTABLE, timeCol=TIME_FIELD, entityCol=ENTITY_FIELD,
            minLower=_int[0], minUpper=_int[1],
            whr="" if not filterWhere else " AND ({}) ".format(filterWhere)
        )
        
        count = q_to_obj(db, Q, db_api='psql')
        
        counting.append(count)
    
    mainDf = combine_dfs(counting[0], counting[1:], ENTITY_FIELD)
    
    obj_to_tbl(mainDf, OUT_TABLE)
    
    return OUT_TABLE
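
# Usage sketch (hypothetical database and columns; counts rows per entity
# for every 1-hour period of the day):
#
#     count_entity_periods_with_certain_duration(
#         'mydb', "01:00:00", 'validations', 'time_col', 'stop_id',
#         '/tmp/counts_by_entity.xlsx', filterWhere="route_id=12"
#     )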
def count_by_periods_with_certain_duration(conParam, PERIOD_INTERVAL, pgtable,
                                           TIME_FIELD, outTable,
                                           filterWhere=None):
    """
    Count rows in a pgtable by periods of X minutes
    
    PERIOD_INTERVAL = "01:00:00"
    """
    
    import pandas
    from gasp import day_to_intervals2
    from gasp.fm.sql import query_to_df
    from gasp.to import obj_to_tbl
    
    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)
    
    # For each interval/period, count the number of rows
    counting = None
    for _int_ in INTERVALS:
        QUERY = (
            "SELECT COUNT(*) AS count FROM {table} WHERE "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
            "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
            "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr}"
        ).format(
            table=pgtable, timeCol=TIME_FIELD,
            minLower=_int_[0], minUpper=_int_[1],
            whr="" if not filterWhere else " AND ({})".format(filterWhere)
        )
        
        count = query_to_df(conParam, QUERY, db_api='psql')
        
        # Index each count row by the period it refers to
        count.rename(index={0 : "{}-{}".format(
            _int_[0][:5], _int_[1][:5]
        )}, inplace=True)
        
        if not isinstance(counting, pandas.DataFrame):
            counting = count.copy()
        else:
            counting = counting.append(count, ignore_index=False)
    
    obj_to_tbl(counting, outTable)
    
    return outTable
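
# Usage sketch (hypothetical connection parameters and columns; one count
# per 1-hour period, indexed by period label):
#
#     count_by_periods_with_certain_duration(
#         con_psql, "01:00:00", 'validations', 'time_col',
#         '/tmp/counts_by_period.xlsx'
#     )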
def field_sum_two_tables(tableOne, tableTwo, joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0  |  10
    1  |  11
    2  |  13
    3  |  10
    
    Table 2:
    id | field
    0  |  10
    1  |   9
    2  |  17
    4  |  15
    
    Create the new table
    id | field
    0  |  20
    1  |  20
    2  |  30
    3  |  10
    4  |  15
    """
    
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from gasp.mng.joins import sum_field_of_two_tables
    
    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)
    
    # Do it!
    outDf = sum_field_of_two_tables(
        df_one, joinFieldOne, df_two, joinFieldTwo, field_to_sum
    )
    
    obj_to_tbl(outDf, outTable)
    
    return outTable
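
# Usage sketch for field_sum_two_tables (hypothetical files; both tables
# share an 'id' column and a 'field' column to be summed, as in the
# docstring example):
#
#     field_sum_two_tables(
#         '/tmp/t1.xlsx', '/tmp/t2.xlsx', 'id', 'id', 'field',
#         '/tmp/summed.xlsx'
#     )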
def show_duplicates_in_xls(db_name, table, pkCols, outFile, tableIsQuery=None):
    """
    Find duplicates and write these objects in a table
    """
    
    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    
    pkCols = obj_to_lst(pkCols)
    
    if not pkCols:
        raise ValueError("pkCols value is not valid")
    
    if not tableIsQuery:
        q = (
            "SELECT {t}.* FROM {t} INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta FROM {t} "
                "GROUP BY {cls}"
            ") AS foo ON {rel} "
            "WHERE conta > 1"
        ).format(
            t=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "{t}.{c} = foo.{c}".format(t=table, c=col) for col in pkCols
            ])
        )
    else:
        q = (
            "SELECT foo.* FROM ({q_}) AS foo INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta "
                "FROM ({q_}) AS foo2 GROUP BY {cls}"
            ") AS jt ON {rel} "
            "WHERE conta > 1"
        ).format(
            q_=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "foo.{c} = jt.{c}".format(c=x) for x in pkCols
            ])
        )
    
    data = q_to_obj(db_name, q, db_api='psql')
    
    obj_to_tbl(data, outFile)
    
    return outFile
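
# Usage sketch for show_duplicates_in_xls (hypothetical database/table;
# rows sharing the same pkCols values more than once are exported):
#
#     show_duplicates_in_xls(
#         'mydb', 'stops', ['stop_code', 'route_id'], '/tmp/dups.xlsx'
#     )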
def record_time_consumed(timeData, outXls):
    """
    Record the time consumed by an OSM2LULC procedure version
    in an excel table
    """
    
    import pandas
    from gasp.to import obj_to_tbl
    
    # Produce main table - Time consumed by rule
    main = [{
        'rule' : timeData[i][0], 'time' : timeData[i][1]
    } for i in range(len(timeData.keys())) if timeData[i]]
    
    # Produce detailed table - Time consumed inside rules
    timeInsideRule = []
    timeDataKeys = sorted(timeData.keys())
    
    for i in timeDataKeys:
        if not timeData[i]:
            continue
        
        if len(timeData[i]) == 2:
            timeInsideRule.append({
                'rule' : timeData[i][0], 'task' : timeData[i][0],
                'time' : timeData[i][1]
            })
        
        elif len(timeData[i]) == 3:
            taskKeys = sorted(timeData[i][2].keys())
            for task in taskKeys:
                if not timeData[i][2][task]:
                    continue
                
                timeInsideRule.append({
                    'rule' : timeData[i][0],
                    'task' : timeData[i][2][task][0],
                    'time' : timeData[i][2][task][1]
                })
        
        else:
            print('timeData object with key {} is not valid'.format(i))
    
    # Export tables to excel
    dfs = [pandas.DataFrame(main), pandas.DataFrame(timeInsideRule)]
    
    return obj_to_tbl(dfs, outXls, sheetsName=['general', 'detailed'])
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation
    """
    
    import pandas as pd
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report
    
    data = tbl_to_obj(tblFile)
    
    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)
    
    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)
    
    conf_mat = confusion_matrix(data[refCol], data[clsCol])
    
    mxt = pd.DataFrame(
        conf_mat, columns=ref_id[refCol].values, index=ref_id[refCol].values
    )
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index' : 'confusion_mxt'}, inplace=True)
    
    # Get classification report
    report = classification_report(
        data[refCol], data[clsCol],
        target_names=ref_id[refCol], output_dict=True
    )
    
    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']
    
    cls_eval = {k : report[k] for k in report if k not in global_keys}
    glb_eval = {k : report[k] for k in report if k in global_keys}
    
    if 'accuracy' in glb_eval:
        glb_eval['accuracy'] = {
            'f1-score' : glb_eval['accuracy'], 'precision' : 0,
            'recall' : 0, 'support' : 0
        }
    
    cls_eval = pd.DataFrame(cls_eval).T
    glb_eval = pd.DataFrame(glb_eval).T
    
    return obj_to_tbl(
        [glb_eval, cls_eval, mxt], outMxt,
        sheetsName=['global', 'report', 'matrix']
    )
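
# Usage sketch for model_conf_matrix (hypothetical file/columns; 'ref'
# holds the reference classes and 'cls' the predicted classes):
#
#     model_conf_matrix('/tmp/classified.xlsx', 'ref', 'cls', '/tmp/eval.xlsx')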
def merge_xls_in_folder(tbl_folder, out_table):
    """
    Get all excel tables in a folder and make one table of them
    """
    
    import pandas
    from gasp.pyt.oss import lst_ff
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    
    tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx'])
    
    dfs = [tbl_to_obj(table) for table in tables]
    
    result = pandas.concat(dfs)
    
    out_table = obj_to_tbl(result, out_table)
    
    return out_table
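
# Usage sketch for merge_xls_in_folder (hypothetical folder; all .xls/.xlsx
# tables found there are concatenated into a single sheet):
#
#     merge_xls_in_folder('/tmp/tables', '/tmp/all_tables.xlsx')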
def meanday_of_periods_by_entity(psqldb, pgtable, DAY_FIELD, HOUR_FIELD,
                                 MINUTES_FIELD, ENTITY_FIELD, OUTPUT_FILE,
                                 PERIODS=None, PERIODS_INTERVAL=None,
                                 EXCLUDE_DAYS=None, workspace_day_tables=None):
    """
    For every day in a pgtable, count the number of rows by periods of X
    minutes for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    """
    
    import os
    import pandas
    from gasp.pyt.tm import day_to_intervals
    from gasp.pyt.df.joins import combine_dfs
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    from gasp.sql.q.count import count_by_period_entity
    
    if not PERIODS and not PERIODS_INTERVAL:
        raise ValueError(
            ("Please give value to PERIODS or PERIODS_INTERVAL. "
             "If PERIODS and PERIODS_INTERVAL, PERIODS will have priority."))
    
    # Get intervals
    INTERVALS = day_to_intervals(PERIODS_INTERVAL) if not PERIODS else PERIODS
    
    # Get unique values
    VALUES = q_to_obj(
        psqldb, "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=DAY_FIELD, t=pgtable
        ))[DAY_FIELD].tolist()
    
    DAYS_ARRAY = []
    INTERVAL_COLUMNS = []
    
    def get_day_table(day):
        print('Starting: ' + day)
        
        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                print('Ending: ' + day)
                return 0
        
        COUNTING = []
        for __int in INTERVALS:
            start, end = __int
            
            COUNT_FIELD = 'p{}h{}_{}h{}'.format(
                str(start[0]), str(start[1]), str(end[0]), str(end[1])
            )
            
            if COUNT_FIELD not in INTERVAL_COLUMNS:
                INTERVAL_COLUMNS.append(COUNT_FIELD)
            
            countTbl = count_by_period_entity(
                psqldb, start, end, pgtable, DAY_FIELD, day,
                HOUR_FIELD, MINUTES_FIELD, ENTITY_FIELD
            )
            
            COUNTING.append(countTbl)
        
        main_table = COUNTING[0]
        for i in range(1, len(COUNTING)):
            main_table = combine_dfs(main_table, COUNTING[i], ENTITY_FIELD)
        
        if workspace_day_tables:
            obj_to_tbl(main_table, os.path.join(
                workspace_day_tables, 'ti_{}.xlsx'.format(day)
            ))
        
        return main_table
    
    for day in VALUES:
        t = get_day_table(day[0])
        
        if type(t) == int:
            continue
        else:
            DAYS_ARRAY.append(t)
        
        print('Ending: ' + day[0])
    
    # Join all daily tables and sum the counts of each period
    main_table = DAYS_ARRAY[0]
    for i in range(1, len(DAYS_ARRAY)):
        join_field = 'id_entity'
        renameDict = {col : 'join_' + col for col in INTERVAL_COLUMNS}
        renameDict.update({ENTITY_FIELD : join_field})
        
        DAYS_ARRAY[i].rename(columns=renameDict, inplace=True)
        
        main_table = main_table.merge(
            DAYS_ARRAY[i], how='outer',
            left_on=ENTITY_FIELD, right_on=join_field
        )
        
        main_table.fillna(0, inplace=True)
        main_table[ENTITY_FIELD].replace(
            0, main_table[join_field], inplace=True
        )
        main_table.drop(join_field, axis=1, inplace=True)
        
        for k in INTERVAL_COLUMNS:
            main_table[k] = main_table[k] + main_table[renameDict[k]]
            main_table.drop(renameDict[k], axis=1, inplace=True)
    
    # Mean by period: divide each period sum by the number of days
    for col in INTERVAL_COLUMNS:
        main_table[col] = main_table[col] / len(DAYS_ARRAY)
    
    obj_to_tbl(main_table, OUTPUT_FILE)
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a
    specific data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from gasp.pyt.oss import fprop
    from gasp.fm import tbl_to_obj
    from gasp.pyt.txtcls import txt_to_num_representation
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import cross_val_score
    from gasp.to import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """ Test Models """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    entries = []
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy'])
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
    
    a = sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2)
    
    fig = b.get_figure()
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
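
# Usage sketch for model_selection (hypothetical file/columns; produces a
# mean-accuracy table plus a boxplot PNG next to the output table):
#
#     model_selection(
#         '/tmp/train.xlsx', 'class_col', 'text_col',
#         '/tmp/model_comparison.xlsx', lang='portuguese', CV=5
#     )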
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """
    
    import os
    from gasp.to import obj_to_tbl
    from gasp.gt.attr import sel_by_attr
    from gasp.sql.fm import q_to_obj
    from gasp.pyt.df.split import df_split
    from gasp.pyt.oss import fprop
    from gasp.gt.toshp.osm import osm_to_gpkg
    
    OSM_TAG_MAP = {
        "DB"        : os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'
        ),
        "OSM_FEAT"  : "osm_features",
        "KEY_COL"   : "key",
        "VALUE_COL" : "value",
        "GEOM_COL"  : "geom"
    }
    
    WORKSPACE = os.path.dirname(OUT_TBL)
    
    sqdb = osm_to_gpkg(OSM_FILE, os.path.join(
        WORKSPACE, fprop(OSM_FILE, 'fn') + '.gpkg'
    ))
    
    # Get Features we are considering
    ourOSMFeatures = q_to_obj(OSM_TAG_MAP["DB"], (
        "SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
        "FROM {tbl}"
    ).format(
        key=OSM_TAG_MAP["KEY_COL"], value=OSM_TAG_MAP["VALUE_COL"],
        geom=OSM_TAG_MAP["GEOM_COL"], tbl=OSM_TAG_MAP["OSM_FEAT"]
    ), db_api='sqlite')
    
    # Get Features in File
    TABLES_TAGS = {
        'points' : ['highway', 'man_made', 'building'],
        'lines'  : ['highway', 'waterway', 'aerialway', 'barrier',
                    'man_made', 'railway'],
        'multipolygons' : [
            'aeroway', 'amenity', 'barrier', 'building', 'craft',
            'historic', 'land_area', 'landuse', 'leisure', 'man_made',
            'military', 'natural', 'office', 'place', 'shop',
            'sport', 'tourism', 'waterway', 'power', 'railway',
            'healthcare', 'highway'
        ]
    }
    
    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c,
            geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]
    
    fileOSMFeatures = q_to_obj(sqdb, (
        "SELECT key, value, geom FROM ({}) AS foo "
        "GROUP BY key, value, geom"
    ).format(" UNION ALL ".join(Qs)), db_api='sqlite')
    
    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures, how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"]
    )
    
    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')
    
    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']
    
    newTags["value"] = newTags.value.str.replace("'", "''")
    
    newTags["whr"] = newTags.key + "='" + newTags.value + "'"
    
    # Export tags not being used to new shapefile
    def to_regular_str(row):
        # Kept from the Python 2 version of this method, where the WHERE
        # string had to be re-encoded; here it is a plain copy
        san_str = row.whr
        row["whr_san"] = san_str
        
        return row
    
    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']
        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']
        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']
        
        if filterDf.shape[0] > 500:
            dfs = df_split(filterDf, 500, nrows=True)
        else:
            dfs = [filterDf]
        
        # Query template; formatted for each chunk of rows
        Q = "SELECT * FROM {} WHERE {}"
        
        i = 1
        for df in dfs:
            fn = t + '.shp' if len(dfs) == 1 else '{}_{}.shp'.format(t, str(i))
            
            try:
                shp = sel_by_attr(
                    sqdb, Q.format(t, df.whr.str.cat(sep=" OR ")),
                    os.path.join(WORKSPACE, fn), api_gis='ogr'
                )
            except:
                __df = df.apply(lambda x: to_regular_str(x), axis=1)
                
                shp = sel_by_attr(
                    sqdb, Q.format(t, __df.whr_san.str.cat(sep=" OR ")),
                    os.path.join(WORKSPACE, fn)
                )
            
            i += 1
    
    # Export OUT_TBL with tags not being used
    newTags.drop(['key_y', 'value_y', 'geom_y', 'isnew', 'whr'],
                 axis=1, inplace=True)
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)
    
    return OUT_TBL
def text_prediction(trainData, classData, trainRefCol, trainClsCol,
                    clsDataCol, outfile, method='NaiveBayes', lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from gasp.fm import tbl_to_obj
    from gasp.pyt.txtcls import txt_to_num_representation
    from gasp.to import obj_to_tbl
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame \
        else trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame \
        else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """ Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
        
        count_vect = CountVectorizer()
        
        X_train_counts = count_vect.fit_transform(x_train)
        
        tfidf_transformer = TfidfTransformer()
        
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
        
        """ Predict """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
        
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        import numpy
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """ Train model """
        model = LinearSVC()
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        
        # Get features
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(
                n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs')),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
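
# Usage sketch for text_prediction (hypothetical files/columns; the class
# table gains a 'classification' column with the predicted class):
#
#     text_prediction(
#         '/tmp/train.xlsx', '/tmp/to_classify.xlsx',
#         'class_col', 'text_col', 'text_col',
#         '/tmp/classified.xlsx', method='LinearSupportVectorMachine'
#     )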
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english',
                     N=2, refSheet=None):
    """
    Get words correlated with some text class
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from gasp.to import obj_to_tbl
    from gasp.fm import tbl_to_obj
    from gasp.pyt.txtcls import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    # Get a ID for Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easily relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """ Text to numbers """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True)
    
    labels = trainDf.ref_id
    
    """ Get most correlated words """
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        feat_names = np.array(tfidf.get_feature_names())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams  = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        
        cols_d = [ref_name] + unigrams + bigrams
        
        corr_words.append(cols_d)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    
    dfCorrWords = pd.DataFrame(corr_words, columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
def run_query_for_values_in_col(conParam, query, table_interest_col,
                                interest_col, outworkspace):
    """
    Execute a query for each value in one column
    
    In each iteration, the values may participate in the query.
    
    Export the several tables to excel
    
    Example:
    ID_PERCURSO | PARAGEM | DIA         | GEOM
    0           | 255     | '2018-01-01 | xxxx
    0           | 255     | '2018-01-01 | xxxx
    0           | 254     | '2018-01-01 | xxxx
    0           | 254     | '2018-01-01 | xxxx
    0           | 255     | '2018-01-02 | xxxx
    0           | 255     | '2018-01-02 | xxxx
    0           | 254     | '2018-01-02 | xxxx
    0           | 254     | '2018-01-02 | xxxx
    
    For a query as:
    SELECT ID_PERCURSO, PARAGEM, GEOM, DIA, COUNT(PARAGEM) AS conta
    FROM table WHERE DIA={} GROUP BY PARAGEM, GEOM, DIA;
    
    This method will generate two tables:
    
    First table:
    ID_PERCURSO | PARAGEM | DIA         | GEOM | conta
    0           | 255     | '2018-01-01 | xxxx | 2
    0           | 254     | '2018-01-01 | xxxx | 2
    
    Second table:
    ID_PERCURSO | PARAGEM | DIA         | GEOM | conta
    0           | 255     | '2018-01-02 | xxxx | 2
    0           | 254     | '2018-01-02 | xxxx | 2
    
    {} will be replaced by every value in the interest_col, which will
    be iterated one by one
    """
    
    import os
    from gasp.fm.sql import query_to_df
    from gasp.sql.mng.fld import get_columns_type
    from gasp.to import obj_to_tbl
    
    fields_types = get_columns_type(conParam, table_interest_col)
    
    # Get unique values
    VALUES = query_to_df(
        conParam,
        "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=interest_col, t=table_interest_col
        ), db_api='psql'
    )[interest_col].tolist()
    
    # Apply query for every value in VALUES
    # Write data in excel
    for value in VALUES:
        # Quote the value if the interest column holds text
        data = query_to_df(conParam, query.format(
            str(value[0]) if fields_types[interest_col] != str \
            else "'{}'".format(str(value[0]))
        ), db_api='psql')
        
        obj_to_tbl(data, os.path.join(
            outworkspace,
            '{}_{}.xlsx'.format(table_interest_col, str(value[0]))
        ))
def binary_eval(refTbl, refId, refCol, tstTbl, tstId, outTbl=None,
                tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that in tstTbl
    there are only positives
    
    The reference table must have positives and negatives;
    the test table may have only positives.
    """
    
    import math
    import numpy as np
    import pandas
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    
    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(refTbl, fields=[
        refId, refCol
    ]) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(tstTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]
    
    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId : 'tst_fid__'}
        
        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'
        
        tst_df.rename(columns=colRename, inplace=True)
        
        tstId = 'tst_fid__'
        
        if refCol == tstCol:
            tstCol = 'tst_col__'
    
    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)
    
    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)
        
        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)
    
    # Get VP, VN, FP, FN
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP', np.where(
            (df[refCol] == 0) & (df[tstCol] == 0), 'VN', np.where(
                (df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP'
            )
        )
    )
    
    # Summary table
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()
    conf_tbl.reset_index(inplace=True)
    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]
    
    # Get some evaluation measures
    dConf = {}
    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']
    
    l = ['VP', 'VN', 'FP', 'FN']
    for i in l:
        if i not in dConf:
            dConf[i] = 0
    
    """
    Error rate
    
    Error rate (ERR) is calculated as the number of all incorrect
    predictions divided by the total number of the dataset.
    The best error rate is 0.0, whereas the worst is 1.0.
    """
    ERR = (dConf['FP'] + dConf['FN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Accuracy
    
    Accuracy (ACC) is calculated as the number of all correct predictions
    divided by the total number of the dataset.
    The best accuracy is 1.0, whereas the worst is 0.0.
    It can also be calculated by 1 - ERR.
    """
    ACC = (dConf['VP'] + dConf['VN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct positive
    predictions divided by the total number of positives. It is also
    called recall (REC) or true positive rate (TPR).
    The best sensitivity is 1.0, whereas the worst is 0.0.
    """
    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except:
        SN = -99
    
    """
    Specificity (True negative rate)
    
    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is also
    called true negative rate (TNR).
    The best specificity is 1.0, whereas the worst is 0.0.
    """
    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    
    """
    Precision (Positive predictive value)
    
    Precision (PREC) is calculated as the number of correct positive
    predictions divided by the total number of positive predictions.
    It is also called positive predictive value (PPV).
    The best precision is 1.0, whereas the worst is 0.0.
    """
    PREC = dConf["VP"] / (dConf["VP"] + dConf['FP'])
    
    """
    False positive rate
    
    False positive rate (FPR) is calculated as the number of incorrect
    positive predictions divided by the total number of negatives.
    The best false positive rate is 0.0, whereas the worst is 1.0.
    It can also be calculated as 1 - specificity.
    """
    FPR = dConf['FP'] / (dConf['VN'] + dConf['FP'])
    
    """
    Matthews correlation coefficient
    
    Matthews correlation coefficient (MCC) is a correlation coefficient
    calculated using all four values in the confusion matrix.
    """
    try:
        MCC = (dConf['VP'] * dConf['VN'] - dConf['FP'] * dConf['FN']) / (
            math.sqrt(
                (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) *
                (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN'])
            ))
    except:
        MCC = -99
    
    """
    F-score
    
    F-score is a harmonic mean of precision and recall.
    """
    F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN)
    F_1  = (2 * PREC * SN) / (PREC + SN)
    F_2  = (5 * PREC * SN) / (4 * PREC + SN)
    
    evalMeasures = pandas.DataFrame([
        ['Error rate', ERR],
        ['Accuracy', ACC],
        ['Sensitivity', SN],
        ['Specificity', SP],
        ['Precision', PREC],
        ['False positive rate', FPR],
        ['Matthews correlation coefficient', MCC],
        ['F-score 0.5', F0_5],
        ['F-score 1', F_1],
        ['F-score 2', F_2]
    ], columns=['eval_mesure', 'value'])
    
    if outTbl:
        return obj_to_tbl(
            [conf_tbl, evalMeasures, df], outTbl,
            sheetsName=['matrix', 'eval_mesures', 'tbl']
        )
    else:
        return conf_tbl, evalMeasures, df
def count_by_groupcols_and_periods(conParam, pgtable, COLUMNS_TO_GROUP,
                                   HOUR_FIELD, MINUTES_FIELD,
                                   COUNT_FIELD_NAME, OUTPUT_FILE,
                                   PERIOD_INTERVAL=None, PERIODS=None):
    """
    Count rows in a pgtable by periods of X minutes grouping by
    columns values
    """
    
    from gasp import day_to_intervals
    from gasp.fm.sql import query_to_df
    from gasp.to import obj_to_tbl
    
    if not PERIODS and not PERIOD_INTERVAL:
        raise ValueError((
            "Please give value to PERIODS or PERIOD_INTERVAL. "
            "If PERIODS and PERIOD_INTERVAL, PERIODS will have priority."
        ))
    
    INTERVALS = day_to_intervals(PERIOD_INTERVAL) if not PERIODS else PERIODS
    
    i = 0
    for interval in INTERVALS:
        start, end = interval
        
        INTERVAL_STR = '{}h{}-{}h{}'.format(start[0], start[1], end[0], end[1])
        
        if start[0] == end[0]:
            # Period inside the same hour
            QUERY = (
                "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                "WHERE {hourF}={hour} AND "
                "{minF} >= {minLower} AND {minF} < {minUpper} "
                "GROUP BY {cols}"
            ).format(
                table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                hourF=HOUR_FIELD, hour=str(start[0]),
                minF=MINUTES_FIELD, minLower=str(start[1]),
                minUpper=str(end[1])
            )
        
        else:
            if end[0] - start[0] == 1:
                # Period crosses one hour boundary
                QUERY = (
                    "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                    "WHERE ({hourF}={hourLower} AND {minF}>={minLower}) OR "
                    "({hourF}={hourUpper} AND {minF} < {minUpper}) "
                    "GROUP BY {cols}"
                ).format(
                    table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                    col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                    hourF=HOUR_FIELD, hourLower=str(start[0]),
                    hourUpper=str(end[0]), minF=MINUTES_FIELD,
                    minLower=str(start[1]), minUpper=str(end[1])
                )
            
            else:
                # Period spans one or more full hours in the middle
                mHours = [start[0] + n for n in range(1, end[0] - start[0])]
                
                QUERY = (
                    "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                    "WHERE ({hourF}={hourLower} AND {minF}>={minLower}) OR "
                    "{mean_hours_exp} OR "
                    "({hourF}={hourUpper} AND {minF} < {minUpper}) "
                    "GROUP BY {cols}"
                ).format(
                    table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                    col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                    hourF=HOUR_FIELD, hourLower=str(start[0]),
                    hourUpper=str(end[0]), minF=MINUTES_FIELD,
                    minLower=str(start[1]), minUpper=str(end[1]),
                    mean_hours_exp=" OR ".join([
                        "({}={} AND {} >= 0)".format(
                            HOUR_FIELD, h, MINUTES_FIELD
                        ) for h in mHours
                    ])
                )
        
        countTbl = query_to_df(conParam, QUERY, db_api='psql')
        
        countTbl[HOUR_FIELD] = INTERVAL_STR
        
        if not i:
            table = countTbl
            i += 1
        else:
            table = table.append(countTbl, ignore_index=True)
    
    obj_to_tbl(table, OUTPUT_FILE)
def meanrowsday_of_periods_by_entity(psql_con, pgtable, dayField, hourField,
                                     minutesField, secondField, entityField,
                                     PERIODS, outFile, filterData=None,
                                     numberDays=None):
    """
    Evolution of meanday_of_periods_by_entity:
    For every day in a pgtable, count the number of rows by periods
    of X minutes for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    
    This method uses SQL and TimeInterval columns.
    
    PERIODS = [('07:30:00', '09:30:00'), ('07:30:00', '09:30:00')]
    
    It is not complete because the output table does not have a column
    for each period
    """
    
    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    
    def get_case(PTUPLE, PFIELD):
        return (
            "CASE "
                "WHEN TO_TIMESTAMP("
                    "COALESCE(CAST({h} AS text), '') || ':' || "
                    "COALESCE(CAST({m} AS text), '') || ':' || "
                    "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") >= TO_TIMESTAMP('{tLower}', 'HH24:MI:SS') AND "
                "TO_TIMESTAMP("
                    "COALESCE(CAST({h} AS text), '') || ':' || "
                    "COALESCE(CAST({m} AS text), '') || ':' || "
                    "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") < TO_TIMESTAMP('{tUpper}', 'HH24:MI:SS') "
                "THEN 1 ELSE 0 "
            "END AS {fld}"
        ).format(
            h=hourField, m=minutesField, s=secondField,
            tLower=PTUPLE[0], tUpper=PTUPLE[1], fld=PFIELD
        )
    
    entityField = obj_to_lst(entityField)
    
    periodsCols = [
        "p{ha}h{ma}_{hb}h{mb}".format(
            ha=p[0].split(':')[0], ma=p[0].split(':')[1],
            hb=p[1].split(':')[0], mb=p[1].split(':')[1]
        ) for p in PERIODS
    ]
    
    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else (
        "SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS dayt"
    ).format(
        dayF=dayField, t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData)
    )
    
    # Get mean rows of all days by entity and period
    q = (
        "SELECT {entityF}, {meanSq}, nday FROM ("
            "SELECT {entityF}, {dayF}, {sumSeq} FROM ("
                "SELECT {entityF}, {dayF}, {caseSt} FROM {t} {whr}"
            ") AS foo "
            "WHERE {whrSq} "
            "GROUP BY {entityF}, {dayF}"
        ") AS foo2, ({getND}) AS fooday "
        "GROUP BY {entityF}, nday"
    ).format(
        entityF=", ".join(entityField),
        meanSq=", ".join([
            "(SUM({f}) / nday) AS {f}".format(f=p) for p in periodsCols
        ]),
        dayF=dayField,
        sumSeq=", ".join([
            "SUM({f}) AS {f}".format(f=p) for p in periodsCols
        ]),
        caseSt=", ".join([
            get_case(PERIODS[x], periodsCols[x]) for x in range(len(PERIODS))
        ]),
        t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData),
        whrSq=" OR ".join(["{}=1".format(p) for p in periodsCols]),
        getND=ndaysQ
    )
    
    data = q_to_obj(psql_con, q, db_api='psql')
    
    obj_to_tbl(data, outFile)
    
    return outFile
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """
    
    import os
    from gasp.anls.exct import sel_by_attr
    from gasp.fm.sql import query_to_df
    from gasp.oss import get_filename
    from gasp.osm2lulc.utils import osm_to_sqdb
    from gasp.to import obj_to_tbl
    
    OSM_TAG_MAP = {
        "DB"        : os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'
        ),
        "OSM_FEAT"  : "osm_features",
        "KEY_COL"   : "key",
        "VALUE_COL" : "value",
        "GEOM_COL"  : "geom"
    }
    
    WORKSPACE = os.path.dirname(OUT_TBL)
    
    sqdb = osm_to_sqdb(OSM_FILE, os.path.join(
        WORKSPACE, get_filename(OSM_FILE) + '.sqlite'
    ))
    
    # Get Features we are considering
    ourOSMFeatures = query_to_df(OSM_TAG_MAP["DB"], (
        "SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
        "FROM {tbl}"
    ).format(
        key=OSM_TAG_MAP["KEY_COL"], value=OSM_TAG_MAP["VALUE_COL"],
        geom=OSM_TAG_MAP["GEOM_COL"], tbl=OSM_TAG_MAP["OSM_FEAT"]
    ), db_api='sqlite')
    
    # Get Features in File
    TABLES_TAGS = {
        'points' : ['highway', 'man_made', 'building'],
        'lines'  : ['highway', 'waterway', 'aerialway', 'barrier',
                    'man_made', 'railway'],
        'multipolygons' : [
            'aeroway', 'amenity', 'barrier', 'building', 'craft',
            'historic', 'land_area', 'landuse', 'leisure', 'man_made',
            'military', 'natural', 'office', 'place', 'shop',
            'sport', 'tourism', 'waterway', 'power', 'railway',
            'healthcare', 'highway'
        ]
    }
    
    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c,
            geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]
    
    fileOSMFeatures = query_to_df(sqdb, (
        "SELECT key, value, geom FROM ({}) AS foo "
        "GROUP BY key, value, geom"
    ).format(" UNION ALL ".join(Qs)), db_api='sqlite')
    
    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures, how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"]
    )
    
    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')
    
    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']
    
    newTags["value"] = newTags.value.str.replace("'", "''")
    
    newTags["whr"] = newTags.key.str.encode('utf-8').astype(str) + "='" + \
        newTags.value.str.encode('utf-8').astype(str) + "'"
    
    # Export OUT_TBL with tags not being used
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)
    
    # Export tags not being used to new shapefile
    # NOTE: Python 2 only - relies on the unicode built-in
    def to_regular_str(row):
        from gasp import unicode_to_str
        
        san_str = unicode_to_str(row.whr)
        row["whr_san"] = san_str
        
        return row
    
    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']
        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']
        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']
        
        Q = unicode("SELECT * FROM {} WHERE {}", 'utf-8').format(
            unicode(t, 'utf-8'), filterDf.whr.str.cat(sep=" OR ")
        )
        
        try:
            shp = sel_by_attr(
                sqdb, Q, os.path.join(WORKSPACE, t + '.shp'), api_gis='ogr'
            )
        except:
            __filterDf = filterDf.apply(lambda x: to_regular_str(x), axis=1)
            
            _Q = "SELECT * FROM {} WHERE {}".format(
                t, __filterDf.whr_san.str.cat(sep=" OR "))
            
            shp = sel_by_attr(sqdb, _Q, os.path.join(WORKSPACE, t + '.shp'))
    
    return OUT_TBL
def ID_rows_with_temporal_proximity_by_entities(conParam, table, entity_field,
                                                day_field, hour_field,
                                                hour_decimal_field,
                                                time_tolerance, outXlsPath):
    """
    Retrieve rows from one pgtable with some temporal proximity
    
    Table structure should be
    entity | day        | hour | hour_decimal
      0    | 2018-01-02 |  5   |    5,10
      0    | 2018-01-03 |  4   |    4,15
      0    | 2018-01-02 |  5   |    5,12
      0    | 2018-01-02 |  5   |    5,8
      1    | 2018-01-02 |  4   |    4,10
      1    | 2018-01-02 |  5   |    5,12
      1    | 2018-01-02 |  4   |    4,20
      1    | 2018-01-02 |  4   |    4,12
      1    | 2018-01-02 |  4   |    4,6
    
    For a time_tolerance of 5 minutes, the output table will have the
    rows with a temporal difference inside/below that time tolerance
    
    entity_field could be more than one field
    
    This method only identifies if one entity, for one day, has rows
    very close to each other in terms of time.
    
    Not a good strategy for large tables. For large tables, SQL based
    methods are needed
    """
    
    import pandas
    from gasp import goToList
    from gasp.fm.sql import query_to_df
    from gasp.sql.mng.fld import get_columns_type
    from gasp.to import obj_to_tbl
    
    entity_field = goToList(entity_field)
    COLS = entity_field + [day_field, hour_field]
    COLS_TYPE = get_columns_type(conParam, table)
    
    # TIME TOLERANCE IN HOURS
    TIME_TOLERANCE = time_tolerance / 60.0
    
    def thereIsRowsSameTimeInt(row):
        whr = []
        for c in COLS:
            if COLS_TYPE[c] == str:
                whr.append("{}='{}'".format(c, row[c]))
            else:
                whr.append("{}={}".format(c, row[c]))
        
        hourRows = query_to_df(
            conParam,
            "SELECT {} FROM {} WHERE {}".format(
                hour_decimal_field, table, " AND ".join(whr)
            ), db_api='psql'
        )[hour_decimal_field].tolist()
        
        # Check if any pair of rows is closer than the time tolerance
        is_close = False
        for i in range(len(hourRows)):
            for e in range(i + 1, len(hourRows)):
                if abs(hourRows[i][0] - hourRows[e][0]) < TIME_TOLERANCE:
                    is_close = True
                    break
            
            if is_close:
                break
        
        row['time_difference'] = 1 if is_close else 0
        
        return row
    
    # Count entity occurrences for one day and hour
    countsByEntityTime = query_to_df(conParam, (
        "SELECT {scols}, conta FROM "
        "(SELECT {scols}, COUNT({ent}) AS conta FROM {tbl} "
        "GROUP BY {scols}) AS foo WHERE conta > 1"
    ).format(
        scols=', '.join(COLS), ent=entity_field[0], tbl=table
    ), db_api='psql')
    
    # For each row in the last count, when count is > 1,
    # check the time difference between rows for one day and hour
    countsByEntityTime = countsByEntityTime.apply(
        lambda x: thereIsRowsSameTimeInt(x), axis=1
    )
    
    obj_to_tbl(countsByEntityTime, outXlsPath)
    
    return outXlsPath
    df = df[~df.b_refid.isnull()]
    
    if fn == 'ovl_union':
        df['areav'] = df.geometry.area
        df = pd.DataFrame({
            'areav' : df.groupby(['a_FID'])['areav'].agg('sum')
        }).reset_index()
    
    fish_df = fish_df.merge(df, how='left', left_on='fid', right_on='a_FID')
    
    if fn != 'ovl_union':
        fish_df[fn] = fish_df.areav * 100 / fish_df.area
    else:
        fish_df['overlay'] = fish_df.areav * 100 / fish_df.area
    
    fish_df.drop(['areav', 'a_FID'], axis=1, inplace=True)
    
    # Save file
    df_to_shp(fish_df, os.path.join(results, os.path.basename(fishp)))

# Write List of Fishnet
from gasp.to import obj_to_tbl

obj_to_tbl(df_fnet, os.path.join(results, 'fishnet_list.xlsx'))
def meanrowsday_by_entity(psqldb, pgtable, dayField, entityField, out_file,
                          filterData=None, newMeanField=None, numberDays=None):
    """
    For every day in a pgtable, count the number of rows for each
    interest entity.
    
    At the end, calculate the mean of rows between every day for
    each entity.
    
    Day field must be of type text
    
    Difference in relation to meandays_by_entity: this one uses only
    SQL and PGSQL and not Pandas.
    
    if numberDays=None, the number of days used will be based on the
    days included in the data. If you want the mean for 5 days, but
    there are no data for one of these days, with numberDays=None the
    mean will be only for 4 days.
    """
    
    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    
    entityField = obj_to_lst(entityField)
    mean_field = "mean_rows" if not newMeanField else newMeanField
    
    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else (
        "SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS fooday"
    ).format(
        whr="" if not filterData else "WHERE {} ".format(filterData),
        dayF=dayField, t=pgtable
    )
    
    # Get mean rows of all days by entity
    q = (
        "SELECT {entityF}, (SUM(conta) / nday) AS {mF} "
        "FROM ("
            "SELECT {entityF}, {dayF}, COUNT({cnt}) AS conta "
            "FROM {t} {whr}"
            "GROUP BY {entityF}, {dayF}"
        ") AS foo, ({getD}) AS foo2 "
        "GROUP BY {entityF}, nday"
    ).format(
        entityF=", ".join(entityField), dayF=dayField, mF=mean_field,
        cnt=entityField[0], t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData),
        getD=ndaysQ
    )
    
    data = q_to_obj(psqldb, q, db_api='psql')
    
    obj_to_tbl(data, out_file)
    
    return out_file
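
# Usage sketch for meanrowsday_by_entity (hypothetical connection/table;
# mean daily rows per stop, over the days present in the data):
#
#     meanrowsday_by_entity(
#         con_psql, 'validations', 'day', 'stop_id',
#         '/tmp/mean_by_stop.xlsx', filterData="route_id=12"
#     )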
def meandays_by_entity(db, pgtable, DAY_FIELD, ENTITY_FIELD,
                       COUNT_FIELD_NAME, OUTPUT_FILE, EXCLUDE_DAYS=None):
    """
    For every day in a pgtable, count the number of rows for each
    interest entity.
    
    At the end, calculate the mean of rows between every day for
    each entity.
    
    Day field must be of type text
    """
    
    import os
    import pandas
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    
    # Get days
    VALUES = q_to_obj(
        db, "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=DAY_FIELD, t=pgtable
        ), db_api='psql'
    )[DAY_FIELD].tolist()
    
    # For every day, Group rows by entities
    tableArray = []
    for day in VALUES:
        if EXCLUDE_DAYS:
            if day[0] in EXCLUDE_DAYS:
                continue
        
        QUERY = (
            "SELECT {col}, COUNT({col}) AS {countname} FROM {table} "
            "WHERE {dayF}='{d}' GROUP BY {col}"
        ).format(
            col=ENTITY_FIELD, countname=COUNT_FIELD_NAME,
            table=pgtable, dayF=DAY_FIELD, d=day[0]
        )
        
        countTbl = q_to_obj(db, QUERY, db_api='psql')
        
        tableArray.append(countTbl)
    
    # Get mean for all entities
    main_table = tableArray[0]
    TMP_COUNT_FIELD_NAME = 'join_' + COUNT_FIELD_NAME
    TMP_JOIN_FIELD = 'id_entity'
    
    for i in range(1, len(tableArray)):
        tableArray[i].rename(columns={
            COUNT_FIELD_NAME : TMP_COUNT_FIELD_NAME,
            ENTITY_FIELD     : TMP_JOIN_FIELD
        }, inplace=True)
        
        main_table = main_table.merge(
            tableArray[i], how='outer',
            left_on=ENTITY_FIELD, right_on=TMP_JOIN_FIELD
        )
        
        main_table.fillna(0, inplace=True)
        main_table[ENTITY_FIELD].replace(
            0, main_table[TMP_JOIN_FIELD], inplace=True
        )
        
        main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] + \
            main_table[TMP_COUNT_FIELD_NAME]
        
        main_table.drop([TMP_COUNT_FIELD_NAME, TMP_JOIN_FIELD],
                        axis=1, inplace=True)
    
    main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] / len(
        tableArray)
    
    obj_to_tbl(main_table, OUTPUT_FILE)
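
# Usage sketch for meandays_by_entity (hypothetical database/fields):
#
#     meandays_by_entity(
#         'mydb', 'validations', 'day', 'stop_id', 'n_rows',
#         '/tmp/mean_days.xlsx', EXCLUDE_DAYS=['2018-01-01']
#     )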
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Table to Matrix
    
    Table as:
    
    FID | col_a | col_b | geom
     1  |   A   |   A   | ....
     2  |   A   |   B   | ....
     3  |   A   |   A   | ....
     4  |   A   |   C   | ....
     5  |   A   |   B   | ....
     6  |   B   |   A   | ....
     7  |   B   |   A   | ....
     8  |   B   |   B   | ....
     9  |   B   |   B   | ....
    10  |   C   |   A   | ....
    11  |   C   |   B   | ....
    11  |   C   |   D   | ....
    
    To:
    
    class | A | B | C | D
      A   |   |   |   |
      B   |   |   |   |
      C   |   |   |   |
      D   |   |   |   |
    
    col_a = rows
    col_b = cols
    
    api options:
    * pandas;
    * psql;
    """
    
    if not db:
        import pandas as pd
        import numpy as np
        from gasp.gt.fmshp import shp_to_obj
        from gasp.to import obj_to_tbl
        
        # Open data
        df = shp_to_obj(inShp)
        
        # Remove rows with NULL values in col_a or col_b
        df = df[pd.notnull(df[col_a])]
        df = df[pd.notnull(df[col_b])]
        
        # Get Area in square kilometres
        df['realarea'] = df.geometry.area / 1000000
        
        # Get rows and Cols
        rows = df[col_a].unique()
        cols = df[col_b].unique()
        refval = list(np.sort(np.unique(np.append(rows, cols))))
        
        # Produce matrix
        outDf = []
        for row in refval:
            newCols = [row]
            for col in refval:
                newDf = df[(df[col_a] == row) & (df[col_b] == col)]
                
                if not newDf.shape[0]:
                    newCols.append(0)
                else:
                    area = newDf.realarea.sum()
                    newCols.append(area)
            
            outDf.append(newCols)
        
        outcols = ['class'] + refval
        outDf = pd.DataFrame(outDf, columns=outcols)
        
        if with_metrics:
            from gasp.pyt.dtcls.eval import get_measures_for_mtx
            
            out_df = get_measures_for_mtx(outDf, 'class')
            
            return obj_to_tbl(out_df, outXls)
        
        # Export to Excel
        return obj_to_tbl(outDf, outXls)
    
    else:
        from gasp.pyt.oss import fprop
        from gasp.sql.db import create_db
        from gasp.sql.i import db_exists
        from gasp.gql.to import shp_to_psql
        from gasp.gql.tomtx import tbl_to_area_mtx
        from gasp.to import db_to_tbl
        
        # Create database if not exists
        is_db = db_exists(db)
        
        if not is_db:
            create_db(db, api='psql')
        
        # Add data to database
        tbl = shp_to_psql(db, inShp, api='shp2pgsql')
        
        # Create matrix
        mtx = tbl_to_area_mtx(db, tbl, col_a, col_b, fprop(outXls, 'fn'))
        
        # Export result
        return db_to_tbl(db, mtx, outXls, sheetsNames='matrix')
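
# Usage sketch for tbl_to_areamtx (hypothetical shapefile produced by a
# union overlay; col_a/col_b hold the two classifications to cross; with
# db=None the pandas path is used):
#
#     tbl_to_areamtx(
#         '/tmp/union.shp', 'lulc_a', 'lulc_b', '/tmp/area_matrix.xlsx',
#         db=None, with_metrics=True
#     )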
def dist_matrix_using_shp(originsShp, destinationsShp, originsEpsg,
                          destinationsEpsg, outTable, transMode=None):
    """
    Create a distance matrix using shapes and Google Maps API
    """
    
    import time
    import pandas
    from threading import Thread
    from gasp.mng.split import split_df, split_df_inN
    from gasp.mng.prj import project
    from gasp.prop.feat import get_geom_type
    from gasp.mng.gen import merge_df
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from gasp.web.glg import get_keys
    from gasp.web.glg.distmx import dist_matrix
    
    # Origins and Destinations to GeoDataframe
    originsDf = tbl_to_obj(originsShp)
    destnatDf = tbl_to_obj(destinationsShp)
    
    # Check Geometries type - shapes should be of type point
    originsGeom = get_geom_type(originsDf, gisApi='pandas')
    destGeom = get_geom_type(destnatDf, gisApi='pandas')
    if (originsGeom != 'Point' and originsGeom != 'MultiPoint') or \
            (destGeom != 'Point' and destGeom != 'MultiPoint'):
        raise ValueError('All input geometries must be of type point')
    
    # Re-project GeoDataframes if needed
    originsDf = originsDf if originsEpsg == 4326 else \
        project(originsDf, None, 4326, gisApi='pandas')
    destnatDf = destnatDf if destinationsEpsg == 4326 else \
        project(destnatDf, None, 4326, gisApi='pandas')
    
    # Geom to Field as str
    originsDf["geom"] = originsDf["geometry"].y.astype(str) + "," + \
        originsDf["geometry"].x.astype(str)
    destnatDf["geom"] = destnatDf["geometry"].y.astype(str) + "," + \
        destnatDf["geometry"].x.astype(str)
    
    originsDf["old_fid"] = originsDf.index
    destnatDf["old_fid"] = destnatDf.index
    
    # Split destinations DataFrame into DataFrames with 10 rows
    lst_destinos = split_df(destnatDf, 10)
    
    # Get Keys
    KEYS = get_keys()
    lst_keys = KEYS["key"].tolist()
    
    origensByKey = split_df_inN(originsDf, KEYS.shape[0])
    
    if len(origensByKey) == len(lst_keys) + 1:
        origensByKey[-2] = origensByKey[-2].append(origensByKey[-1])
        del origensByKey[-1]
    
    # Produce matrix for each origins in origensByKey
    results = []
    
    def get_matrix(origins, key):
        subOrigins = split_df(origins, 10)
        
        for df in subOrigins:
            for __df in lst_destinos:
                matrix = dist_matrix(
                    str(df.geom.str.cat(sep="|")),
                    str(__df.geom.str.cat(sep="|")),
                    df.shape[0], __df.shape[0],
                    transport_mode=transMode, useKey=str(key)
                )
                
                matrix = pandas.DataFrame(matrix)
                matrix = pandas.concat([
                    matrix.drop(["elements"], axis=1),
                    matrix["elements"].apply(pandas.Series)
                ], axis=1)
                
                originsFID = df.old_fid.tolist()
                destinaFID = __df.old_fid.tolist()
                
                mm = []
                for i in range(len(originsFID)):
                    for e in range(len(destinaFID)):
                        ll = [originsFID[i], destinaFID[e], matrix.iloc[i, e]]
                        mm.append(ll)
                
                Fmatrix = pandas.DataFrame(
                    mm, columns=["fid_origin", "fid_destin", "cost"])
                
                results.append(Fmatrix)
                
                time.sleep(5)
    
    # Create threads
    thrds = []
    i = 1
    for df in origensByKey:
        thrds.append(Thread(
            name="tk{}".format(str(i)), target=get_matrix,
            args=(df, lst_keys[i - 1])
        ))
        i += 1
    
    # Start all threads
    for thr in thrds:
        thr.start()
    
    # Wait for all threads to finish
    for thr in thrds:
        thr.join()
    
    # Join all dataframes
    RESULT = merge_df(results, ignIndex=False)
    
    # sanitizeDataCols is expected to be available in this module
    RESULT = sanitizeDataCols(RESULT, "cost")
    
    RESULT = RESULT.merge(
        originsDf, how='inner',
        left_on=["fid_origin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in originsDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "origin_geom"}, inplace=True)
    
    RESULT = RESULT.merge(
        destnatDf, how='inner',
        left_on=["fid_destin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in destnatDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "destin_geom"}, inplace=True)
    
    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)
    
    return obj_to_tbl(RESULT, outTable)
def cost_od(shpOrigins, shpDestinations, epsgOrigins, epsgDestinations,
            table_result, mode='foot-walking'):
    """
    Matrix OD Service Implementation
    """
    
    import pandas
    from threading import Thread
    from gasp.fm.api.orouteserv import get_keys
    from gasp.fm.api.orouteserv import matrix_od
    from gasp.fm import tbl_to_obj
    from gasp.mng.split import split_df_inN
    from gasp.fm.geom import pointxy_to_cols
    from gasp.mng.prj import project
    from gasp.mng.gen import merge_df
    from gasp.prop.feat import get_geom_type
    from gasp.to import obj_to_tbl
    
    origensDf = tbl_to_obj(shpOrigins)
    destinoDf = tbl_to_obj(shpDestinations)
    
    # Check if SHPs are points
    inGeomType = get_geom_type(origensDf, geomCol="geometry", gisApi='pandas')
    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')
    
    inGeomType = get_geom_type(destinoDf, geomCol="geometry", gisApi='pandas')
    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')
    
    # Re-project if needed
    if epsgOrigins != 4326:
        origensDf = project(origensDf, None, 4326, gisApi='pandas')
    
    if epsgDestinations != 4326:
        destinoDf = project(destinoDf, None, 4326, gisApi='pandas')
    
    origensDf = pointxy_to_cols(
        origensDf, geomCol="geometry", colX="longitude", colY="latitude"
    )
    destinoDf = pointxy_to_cols(
        destinoDf, geomCol="geometry", colX="longitude", colY="latitude"
    )
    
    origensDf["location"] = origensDf.longitude.astype(str) + "," + \
        origensDf.latitude.astype(str)
    destinoDf["location"] = destinoDf.longitude.astype(str) + "," + \
        destinoDf.latitude.astype(str)
    
    origensDf["old_fid"] = origensDf.index
    destinoDf["old_fid"] = destinoDf.index
    
    # Get Keys
    KEYS = get_keys()
    
    origensByKey = split_df_inN(origensDf, KEYS.shape[0])
    
    lst_keys = KEYS["key"].tolist()
    
    # Produce matrix
    results = []
    
    def get_matrix(origins, key):
        origins.reset_index(inplace=True)
        origins["rqst_idx"] = origins.index.astype(str)
        
        destinations = destinoDf.copy()
        
        strSource = origins.location.str.cat(sep="|")
        idxSource = origins.rqst_idx.str.cat(sep=",")
        
        destinations["rqst_idx"] = destinations.old_fid + origins.shape[0]
        destinations["rqst_idx"] = destinations.rqst_idx.astype(str)
        strDestin = destinations.location.str.cat(sep="|")
        idxDestin = destinations.rqst_idx.str.cat(sep=",")
        
        rslt = matrix_od(
            strSource + "|" + strDestin,
            idxSources=idxSource, idxDestinations=idxDestin,
            useKey=key, modeTransportation=mode
        )
        
        rslt = pandas.DataFrame(rslt["durations"])
        
        originsFID = origins.old_fid.tolist()
        destinaFID = destinations.old_fid.tolist()
        
        mm = []
        for lnh in range(len(originsFID)):
            for col in range(len(destinaFID)):
                ll = [originsFID[lnh], destinaFID[col], rslt.iloc[lnh, col]]
                mm.append(ll)
        
        matrix = pandas.DataFrame(
            mm, columns=["fid_origin", "fid_destin", "cost"])
        
        results.append(matrix)
    
    # Create threads
    thrds = []
    i = 1
    for df in origensByKey:
        thrds.append(Thread(
            name="tk{}".format(str(i)), target=get_matrix,
            args=(df, lst_keys[i - 1])
        ))
        i += 1
    
    # Start all threads
    for thr in thrds:
        thr.start()
    
    # Wait for all threads to finish
    for thr in thrds:
        thr.join()
    
    # Join all dataframes
    RESULT = merge_df(results, ignIndex=False)
    
    RESULT = RESULT.merge(
        origensDf, how='inner',
        left_on=["fid_origin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in origensDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "origin_geom"}, inplace=True)
    
    RESULT = RESULT.merge(
        destinoDf, how='inner',
        left_on=["fid_destin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in destinoDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "destin_geom"}, inplace=True)
    
    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)
    
    return obj_to_tbl(RESULT, table_result)
def dist_matrix_by_shp(oShp, dShp, oEpsg, dEpsg, result, transMode=None):
    """
    Create distance matrix using shapes and Google Maps API
    
    - Uses my first API_KEY
    """
    
    import time
    import pandas
    from gasp.fm import tbl_to_obj
    from gasp.mng.split import split_df
    from gasp.mng.prj import project
    from gasp.mng.fld.df import listval_to_newcols
    from gasp.prop.feat import get_geom_type
    from gasp.mng.gen import merge_df
    from gasp.web.glg.distmx import dist_matrix
    from gasp.to import obj_to_tbl
    from gasp.to.obj import df_to_list
    from gasp.oss import get_filename
    
    # Origins and Destinations to GeoDataframe
    originsDf = tbl_to_obj(oShp)
    destnatDf = tbl_to_obj(dShp)
    
    # Check Geometries type - shapes should be of type point
    originsGeom = get_geom_type(originsDf, gisApi='pandas')
    destGeom = get_geom_type(destnatDf, gisApi='pandas')
    if (originsGeom != 'Point' and originsGeom != 'MultiPoint') or \
            (destGeom != 'Point' and destGeom != 'MultiPoint'):
        raise ValueError('All input geometries must be of type point')
    
    # Re-project GeoDataframes if needed
    originsDf = originsDf if oEpsg == 4326 else \
        project(originsDf, None, 4326, gisApi='pandas')
    destnatDf = destnatDf if dEpsg == 4326 else \
        project(destnatDf, None, 4326, gisApi='pandas')
    
    # Geom to Field as str
    originsDf["geom"] = originsDf["geometry"].y.astype(str) + "," + \
        originsDf["geometry"].x.astype(str)
    destnatDf["geom"] = destnatDf["geometry"].y.astype(str) + "," + \
        destnatDf["geometry"].x.astype(str)
    
    originsDf["old_fid"] = originsDf.index
    destnatDf["old_fid"] = destnatDf.index
    
    # Split Origins; destinations are iterated one by one
    lstOrigins = split_df(originsDf, 95)
    for odf in lstOrigins:
        odf.reset_index(inplace=True)
    
    lstDestinations = df_to_list(destnatDf)
    
    RESULTS = []
    for destino in lstDestinations:
        for oDf in lstOrigins:
            matrix = dist_matrix(
                str(oDf.geom.str.cat(sep="|")), str(destino["geom"]),
                oDf.shape[0], 1, transport_mode=transMode,
                useKey='AIzaSyAmyPmqtxD20urqtpCpn4ER74a6J4N403k'
            )
            
            matrix = pandas.DataFrame(matrix)
            matrix = listval_to_newcols(matrix, "elements")
            
            matrix = matrix.merge(
                oDf, how='inner', left_index=True, right_index=True
            )
            
            matrix.rename(columns={
                'old_fid' : "fid_origin", 0 : "cost"
            }, inplace=True)
            
            matrix["fid_destin"] = destino['old_fid']
            
            RESULTS.append(matrix)
            
            time.sleep(5)
    
    # Join all dataframes
    RESULT = merge_df(RESULTS, ignIndex=False)
    
    # sanitizeDataCols is expected to be available in this module
    RESULT = sanitizeDataCols(RESULT, "cost")
    
    RESULT.drop([
        x for x in originsDf.columns.values
        if x != "geometry" and x != "old_fid"
    ], axis=1, inplace=True)
    RESULT.rename(columns={"geometry" : "origin_geom"}, inplace=True)
    
    RESULT = RESULT.merge(
        destnatDf, how='inner',
        left_on=["fid_destin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in destnatDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "destin_geom"}, inplace=True)
    
    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)
    
    obj_to_tbl(RESULT, result, sheetsName=get_filename(result))
    
    return result