def load_file(filename):
    if not os.path.isfile(filename):
        raise FileNotFoundError(filename)

    errors = mdb.read_table(filename, 'ErrLog')
    if len(errors) > 0:
        print('Errors reported in file %s:' % filename)
        print(errors)

    messages = mdb.read_table(filename, 'Messages')
    messages = messages[messages.MsgID == 'w01']
    if len(messages) > 0:
        messages.Message = messages.Message.replace(
            regex='Plate Scanned. Please check the connection on positions:\\s*',
            value='Connection problem: ')
        print('Connection problems reported in file %s:' % filename)
        print(messages.drop(['MsgID', 'MsgOrder'], axis=1))

    ttimes = mdb.read_table(filename, 'TTimes')
    ttimes = ttimes[['TimePoint', 'TestTime']]
    ttimes.columns = ['tp', 'dt']
    ttimes['dt'] = pd.to_datetime(ttimes['dt'])

    org = mdb.read_table(filename, 'Org10K').drop('TestOrder', axis=1)
    assert org.shape[0] > 0, '%s contains no data!' % filename
    org['Row'] = org['Row'].map(ord) - 65
    org = (org.rename({'TimePoint': 'tp', 'Row': 'row'}, axis=1)
              .set_index(['tp', 'row'])
              .sort_index())
    n_cols = org.shape[1]
    org.columns = pd.MultiIndex.from_product([['org'], range(n_cols)], names=[None, 'col'])
    org = org.stack('col').reset_index()
    org['org'] = org['org'].astype(float)
    org['welln'] = org['row'] * (org['col'].max() + 1) + org['col']
    org.drop(['row', 'col'], axis=1, inplace=True)
    org = org.merge(ttimes)
    org = org[['tp', 'welln', 'dt', 'org']]
    return org
def main(argv):
    if len(argv) < 3:
        print("Syntax: gridlabd mdb_info MDBFILE COMMAND [ARGS ...]", file=sys.stderr)
        exit(1)
    MDBFILE = argv[1]
    COMMAND = argv[2]
    if COMMAND == "tables":
        if len(argv) > 3:
            REGEX = argv[3]
        else:
            REGEX = ".*"
        for TABLE in mdb.list_tables(MDBFILE):
            if re.match(REGEX, TABLE):
                print(TABLE)
    elif COMMAND == "export":
        if len(argv) < 4:
            error("missing export table name", 1)
        REGEX = argv[3]
        for TABLE in mdb.list_tables(MDBFILE):
            if re.match(REGEX, TABLE):
                DF = mdb.read_table(MDBFILE, TABLE)
                DF.to_csv(TABLE + ".csv")
    else:
        error(f"command '{COMMAND}' is not valid", 2)
def __init__(self, filename):
    super().__init__()
    self.df_objects = mdb.read_table(filename, "RoboClip", converters_from_schema=False)
    self.df_objects.rename({"GRUPPO": "GROUP"}, axis=1, inplace=True)
    self.load_from_df(self.df_objects)
def get_character_from_access(idn):
    query = '`Character Number` == %s'
    df = mdb.read_table("Character.accdb", "Characters")
    character = df.query(query % idn)
    if character.empty:
        return None
    return character
def get_mdb_data(self, payload):
    '''
    Returns a list of Entity object(s) (str, dataframe) from a successful extraction

    Arguments:
    payload -- payload object (str, binary)
    '''
    # TODO: find a way to directly pass byteio to the reading utility without writing to disk
    # Write the payload bytes to disk
    open(payload.filename, 'wb').write(payload.data.getvalue())

    # Get the database schema
    mdb_schema = pandas_access.read_schema(payload.filename)

    # Get attributes that are of integer type
    integer_attributes = self.get_attributes_by_data_type(mdb_schema, 'Long Integer')

    # Declare entity list
    entity_list = []

    # Iterate through each table in the database
    for tbl in pandas_access.list_tables(payload.filename):
        # Issue: the default pandas integer type is not nullable - null values in an
        # integer column cause a read error.
        # Workaround: read integers as Int64 (the nullable integer type in pandas).
        dtype_input = {attribute: 'Int64' for attribute in integer_attributes[tbl]}
        df = pandas_access.read_table(payload.filename, tbl, dtype=dtype_input)
        entity_list.append(Entity(tbl, df))

    return entity_list
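The nullable-integer workaround in this snippet also works as a standalone pattern. Below is a minimal sketch under the same assumptions the surrounding snippets rely on: read_schema() returns a {table: {column: Access type}} mapping, and columns reported as 'Long Integer' are read with pandas' nullable Int64 dtype so NULLs do not break the read. The example.mdb file name is hypothetical.

import pandas_access as mdb

MDB_FILE = "example.mdb"  # hypothetical database file

# Schema parsed from `mdb-schema`: {table: {column: Access data type}}
schema = mdb.read_schema(MDB_FILE)

frames = {}
for table, columns in schema.items():
    # Map every Access "Long Integer" column to pandas' nullable Int64 dtype,
    # so NULL values in integer columns do not raise a conversion error.
    dtypes = {col: "Int64" for col, typ in columns.items() if typ == "Long Integer"}
    frames[table] = mdb.read_table(MDB_FILE, table, dtype=dtypes)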
def __init__(self, filename):
    super().__init__()
    self.df_targets = mdb.read_table(
        filename, "RoboClip", converters_from_schema=False
    )
    self.df_targets.rename({"GRUPPO": "GROUP"}, axis=1, inplace=True)
    self.df_targets["GROUP"] = self.df_targets["GROUP"].fillna("UNLABELED")
    self.load_from_df(self.df_targets)
def main(self):
    # Read the database in chunks of chunk_size rows
    for chunk in mdb.read_table(self.db, self.table, chunksize=self.chunk_size):
        self.data.append(chunk)
    pd.concat(self.data).to_csv(self.output, index=False)

    # Print execution time
    self.execution_time()
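The same chunked read can also be streamed straight to disk instead of being concatenated in memory, which keeps peak memory flat for very large tables. A minimal sketch with hypothetical big.mdb / BigTable / big_table.csv names; read_table() forwards keyword arguments such as chunksize to the underlying pandas CSV reader, so iteration yields one DataFrame per chunk.

import pandas_access as mdb

DB_FILE = "big.mdb"        # hypothetical database file
TABLE = "BigTable"         # hypothetical table name
OUTPUT = "big_table.csv"   # hypothetical output path

# Append each chunk to the output file; write the header only once.
for i, chunk in enumerate(mdb.read_table(DB_FILE, TABLE, chunksize=10000)):
    chunk.to_csv(OUTPUT, mode="w" if i == 0 else "a", header=(i == 0), index=False)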
def access_to_csv(get_in, get_out, table):
    df = mdb.read_table(get_in, table)
    df.columns = [meta.clean_metadata(x.lower()) for x in df.columns]
    df.to_csv(get_out + '_temp_' + table + '.csv', date_format=date_format,
              encoding='utf-8', index=False, header=True, sep=';')
    txt_to_csv(get_out + '_temp_' + table + '.csv',
               get_out + '_' + table + '.csv', ';')
def ParseSynergiDatabase(self):
    """Use Pandas Access to convert the MDB tables to Pandas DataFrames."""
    print("Opening Synergi database - ", self.paths["Synergi File"])
    table_list = mdb.list_tables(self.paths["Synergi File"])

    table_list_warehouse = []
    if "warehouse" in self.paths:
        print("Opening warehouse database - ", self.paths["warehouse"])
        table_list_warehouse = mdb.list_tables(self.paths["warehouse"])

    for table in table_list:
        self.SynergiDictionary[table] = mdb.read_table(
            self.paths["Synergi File"], table)

    for table in table_list_warehouse:
        self.SynergiDictionary[table] = mdb.read_table(
            self.paths["warehouse"], table)
def load_mdb(self, file):
    name, ext = os.path.splitext(file)
    if name == '2007' or name == '2009':
        table = mdb.list_tables(self.base_path + file)[1]
    else:
        table = mdb.list_tables(self.base_path + file)[0]
    df = mdb.read_table(self.base_path + file, table)
    # drop the extra columns present only in the 2012 file
    if file == '2012.accdb':
        df = df.drop(columns=['MV_CIRCUNT', 'MV_TIPO', 'C_MEDICO'])
    return df
def get_mdb_data(self, payload):
    '''
    Returns a dict mapping table name (str) to DataFrame from a successful extraction

    Arguments:
    payload -- payload object (str, binary)
    '''
    # TODO: find a way to directly pass byteio to the reading utility without writing to disk
    try:
        # Write the payload bytes to disk
        open(payload.filename, 'wb').write(payload.data.getvalue())

        # Get the database schema
        mdb_schema = pandas_access.read_schema(payload.filename)

        # Get attributes that are of integer type
        integer_attributes = self.get_attributes_by_data_type(
            mdb_schema, 'Long Integer')

        # Declare entity dict
        entity_dict = dict()

        # Get the list of tables in the database
        table_list = pandas_access.list_tables(payload.filename)

        # Update progress bar job count
        self.job_count += len(table_list)
        self.pbar.total = self.job_count

        # Iterate through each table in the database
        for tbl in table_list:
            self.logger.debug('Extracting table: \'%s\' from file: %s...',
                              tbl, payload.filename)

            # Issue: the default pandas integer type is not nullable - null values in an
            # integer column cause a read error.
            # Workaround: read integers as Int64 (the nullable integer type in pandas).
            dtype_input = {
                attribute: 'Int64'
                for attribute in integer_attributes[tbl]
            }
            df = pandas_access.read_table(payload.filename, tbl, dtype=dtype_input)
            entity_dict.update({tbl: df})

            # update progress bar
            self.pbar.update()

        return entity_dict
    finally:
        self.logger.debug('Removing intermediate file: %s...', payload.filename)
        utils.silentremove(payload.filename)
def from_SQLT(args):
    sql_table = args.sql_table
    if sql_table is None or sql_table == "":
        return "MUST SUPPLY --sql_table"
    sql_type = args.sql_kind.lower()
    if sql_type == "psql":
        return sql_type + " Coming Soon"
    elif sql_type == "msql":
        return sql_type + " Coming Soon"
    elif sql_type == "sqll":
        return sql_type + " Coming Soon"
    elif sql_type == "mdb":
        sql_file = args.sql_file
        if sql_file is None or sql_file == "":
            return "MUST SUPPLY --sql_file"
        try:
            tables = mdb.list_tables(sql_file)
            if sql_table not in tables:
                return "TABLE NOT FOUND. Available Tables:\n\t" + "\n\t".join(tables)
            df = mdb.read_table(sql_file, sql_table)
        except Exception:
            traceback.print_exc()
            return "ERROR"
    else:
        return "UNSUPPORTED SQL. Available Kinds: PSQL (PostgreSQL), MSQL (MySQL), MDB (Access DB), SQLL (SQLite)"
    return df
import pandas_access as mdb
import pandas as pd

db_filename = 'isear_databank.mdb'

# List the tables and export each one to CSV.
for tbl in mdb.list_tables(db_filename):
    df = mdb.read_table(db_filename, tbl)
    df.to_csv(tbl + '.csv')

# Read the SPSS version of the databank and export it as well.
db = pd.read_spss('ISEAR SPSS Databank.sav')
db.to_csv('isear.csv')
def main(): program_name = 'qsomysql' program_version = '1.2' qrz = QRZ(cfg='./qrz.ini') config = configparser.ConfigParser() config.read(['qsoviz.ini', 'qrz.ini']) use_n3fjp_mdb = config['general'].getboolean('use_n3fjp_mdb') reset_mysql_db = config['general'].getboolean('reset_mysql_db') db_user = config['mysql']['db_user'] db_password = config['mysql']['db_password'] db_name = config['mysql']['db_name'] logging.basicConfig(filename='/home/qsoviz/qsomysql.log', level=logging.INFO, format='%(asctime)s %(message)s') logging.info('%s %s: Run started', program_name, program_version) states_fp = open('states.json', 'r') states_coords = json.load(states_fp) states_fp.close() mysql_cnx = mysql.connector.connect(user=db_user, password=db_password, database=db_name) mysql_cursor = mysql_cnx.cursor() if reset_mysql_db: mysql_cursor.execute("DELETE FROM aarc_fd") mysql_cnx.commit() add_qso = ( "INSERT INTO aarc_fd " "(datetime_on, callsign, n3fjp_modecontest, band, state, arrl_sect, country, " + "n3fjp_initials, operator, class, computer_name, latitude, longitude, geohash) " "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)") query_qso = "SELECT datetime_on, callsign FROM aarc_fd WHERE datetime_on = %s AND callsign = %s" logfile_filename = newest_file('/home/qsoviz/logfiles') records_added = 0 if use_n3fjp_mdb: fields = [ 'fldCall', 'fldModeContest', 'fldBand', 'fldState', 'fldSection', 'fldCountryWorked', 'fldInitials', 'fldOperator', 'fldClass', 'fldComputerName' ] df = mdb.read_table(logfile_filename, 'tblContacts') for idx, row in df.iterrows(): fldDateStr = row.loc['fldDateStr'] fldTimeOnStr = row.loc['fldTimeOnStr'] year, month, day = fldDateStr.split('/') hour, minute, second = fldTimeOnStr.split(':') a_datetime_on = datetime(int(year), int(month), int(day), hour=int(hour), minute=int(minute), second=int(second), tzinfo=timezone.utc) a_callsign = row.loc['fldCall'] query_qso_data = (a_datetime_on, a_callsign) mysql_cursor.execute(query_qso, query_qso_data) result = mysql_cursor.fetchall() if len(result) == 0: add_qso_data = [a_datetime_on] for field in fields: temp = row.loc[field] if temp != temp: temp = '' temp = temp.upper() add_qso_data.append(temp) a_latitude, a_longitude, a_geohash = geo_info( qrz, add_qso_data[1], add_qso_data[4], states_coords) add_qso_data.append(a_latitude) add_qso_data.append(a_longitude) add_qso_data.append(a_geohash) add_qso_data = tuple(add_qso_data) mysql_cursor.execute(add_qso, add_qso_data) mysql_cnx.commit() records_added = records_added + 1 else: adif_fp = open(logfile_filename, 'r') qsos = ADIReader(adif_fp) for qso in qsos: a_datetime_on = qso['datetime_on'] fldCall = qso['call'].upper() query_qso_data = (a_datetime_on, fldCall) result = mysql_cursor(query_qso, query_qso_data) if not result: fldModeContest = qso.get('mode', 'NONE').upper() fldBand = qso.get('band', 'NONE').upper() fldState = qso.get('state', 'NONE').upper() fldSection = qso.get('arrl_sect', 'NONE').upper() fldCountryWorked = qso.get('country', 'NONE').upper() fldInitials = qso.get('app_n3fjp_initials', 'NONE').upper() fldOperator = qso.get('operator', 'NONE').upper() fldClass = qso.get('class', 'NONE').upper() fldComputerName = qso.get('n3fjp_stationid', 'NONE').upper() a_latitude, a_longitude, a_geohash = geo_info( qrz, fldCall, fldState, states_coords) add_qso_data = (a_datetime_on, fldCall, fldModeContest, fldBand, fldState, fldSection, fldCountryWorked, fldInitials, fldOperator, fldClass, fldComputerName, a_latitude, a_longitude, a_geohash) 
mysql_cursor.execute(add_qso, add_qso_data) mysql_cnx.commit() records_added = records_added + 1 if not use_n3fjp_mdb: adif_fp.close() mysql_cursor.close() mysql_cnx.close() logging.info('%s %s: records_added = %s', program_name, program_version, records_added) logging.info('%s %s: Run ended', program_name, program_version)
UPDATE_DB = False
UPDATE_DB = True
SAVEDATA = False

db_filename = "bjerg2003.mdb"
BOOKS_DB = "books.sqlite"
f_isbn = "books_isbn.csv"
f_noisbn = "books_no_isbn.csv"

# Listing the tables.
for tbl in mdb.list_tables(db_filename):
    print(tbl)

# merge the two tables
df1 = mdb.read_table(db_filename, "Udgave", dtype={"Sideantal": "string"})
df2 = mdb.read_table(db_filename, "Titel")
df = pd.merge(left=df1, right=df2, left_on="Titel", right_on="Titel")

# Convert missing data (NA or NaN) to empty strings.
# Not for Land, as we use books with NaN to indicate wrong placement.
df["Forfatter"] = df["Forfatter"].fillna("")
df["Sideantal"] = df["Sideantal"].fillna("")

data_isbn = []
data_noisbn = []
data_not_online = []
data_oquery = []
data_gquery = []
data_dbkkquery = []
                    help='Prometea Dataset file')
args = parser.parse_args()

# connect to MongoDB prometea database
print('STEP 01: Connect to Prometea MongoDB database')
print('---------------------------------------------')
client = MongoClient(port=27017)
db = client.prometeadb
print()

# load the MS Access prometea dataset
print('STEP 02: Load Prometea MS Access dataset')
print('----------------------------------------')
print('Loading Prometea Prescripciones Dataset from ' + args.dataset)
dataset = mdb.read_table(args.dataset, 'prescripciones')
print()

# Iterate over parent datasets
print('STEP 03: Start Merge Prescripciones Dataset')
print('--------------------------------------')

# insert child pacientes dataset
pbar = tqdm(total=dataset.shape[0], desc='Prescripciones Dataset')
num = 0
for index, rows in dataset.iterrows():
    db.Pacientes.update_one({
        'centro': rows.centro,
        'nhc': rows.nhc
    }, {
while attempts < 3:
    try:
        response = urlopen(
            'http://static.data.gov.hk/td/routes-and-fares/ROUTE_GMB.mdb',
            timeout=10)
        content = response.read()
        with open(expanduser('~/Desktop/ROUTE_GMB.mdb'), 'wb') as f:
            f.write(content)
        break
    except URLError as e:
        if attempts == 2:
            exit(1)
        attempts += 1

df = mdb.read_table(expanduser('~/Desktop/ROUTE_GMB.mdb'),
                    mdb.list_tables(expanduser('~/Desktop/ROUTE_GMB.mdb'))[0])
#df.drop_duplicates(subset=['ROUTE_ID', 'ROUTE_SEQ', 'STOP_SEQ'], keep='last', inplace=True)
#df.fillna(-1, inplace=True)

conn = pymssql.connect(server=r'192.168.9.75\sql2012',
                       user='******',
                       password='******',
                       database='DWHRMS_DEMO')
last = len(df) - 1

sql = 'DELETE FROM [t_MobMinibusFares]'
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()

for i, r in df.iterrows():
    if i % 1000 == 0:
        sql = f'INSERT INTO [t_MobMinibusFares] ([Route ID], [District], [Route Name], [Source zh-hk], [Source zh-cn], [Source en-us], [Dest zh-hk], [Dest zh-cn], [Dest en-us], [Fare]) \n'
    else:
# (c) Copyright IBM Corporation 2020.
# LICENSE: Apache License 2.0 (Apache-2.0)
# http://www.apache.org/licenses/LICENSE-2.0

# Preliminary processing code, used to create the csv files. Requires special installation to open `.mdb` files
try:
    import pandas_access as mdb  # pandas wrapper for `mdbtools`; mdbtools must be installed separately with homebrew (on mac)

    mdb_file = 'isear/isear_databank.mdb'
    df_full = mdb.read_table(mdb_file, 'DATA')
    df = df_full[['Field1', 'SIT']]
    df = df.rename(columns={"Field1": "label", "SIT": "text"})
    df['text'] = df['text'].apply(lambda x: x.replace('á\n', ''))
    df.to_csv('isear/isear_data.csv', index=False)
except Exception as e:
    print(
        '\n********************************************************************************************************\n'
        '****** Loading the ISEAR dataset requires special dependencies.                                   *********\n'
        '****** On Mac/Linux, install https://github.com/mdbtools/mdbtools and `pip install pandas_access`, *****\n'
        '****** and then rerun the main script.                                                             *****\n'
        '********************************************************************************************************\n'
    )
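Because pandas_access shells out to the mdbtools command-line programs, importing it successfully does not guarantee that reads will work. A small pre-flight check along these lines (standard library only) can give a clearer error than the broad except above; the binary names are the ones mdbtools normally installs.

import shutil

def mdbtools_available():
    """Return True if the mdbtools binaries pandas_access invokes are on PATH."""
    return all(shutil.which(tool) for tool in ("mdb-tables", "mdb-export", "mdb-schema"))

if not mdbtools_available():
    print("mdbtools does not appear to be installed; see https://github.com/mdbtools/mdbtools")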
                    help='Prometea Dataset file')
args = parser.parse_args()

# connect to MongoDB prometea database
print('STEP 01: Connect to Prometea MongoDB database')
print('---------------------------------------------')
client = MongoClient(port=27017)
db = client.prometeadb
print()

# load the MS Access prometea dataset
print('STEP 02: Load Prometea MS Access dataset')
print('----------------------------------------')
print('Loading Prometea DGP 5 Años Dataset from ' + args.dataset)
dataset = mdb.read_table(args.dataset, 'DGP_5Anios')
print()

# Iterate over parent datasets
print('STEP 03: Start Merge DGP 5 Años Dataset')
print('--------------------------------------')

# insert child pacientes dataset
pbar = tqdm(total=dataset.shape[0], desc='DGP 5 Años Dataset')
num = 0
for index, rows in dataset.iterrows():
    db.Paciente.update_one({
        'centro': rows.centro,
        'nhc': rows.nhc
    }, {
import pandas_access as mdb
import json

db_filename = 'l2.mdb'

# Listing the tables.
#for tbl in mdb.list_tables(db_filename):
#    print(tbl)

# Read a small table.
df = mdb.read_table(db_filename, "drops").sort_values(by=['npc_id', 'item_id']).to_dict()

result = {}
current = None
drops = []
spoil = []
for k, v in df['npc_id'].items():
    if current != v:
        if current:
            #print(sorted(drops, key=lambda x: int(x[0])))
            result[current] = {"drop": drops, "spoil": spoil}
        print("New mob: %s " % v)
        current = v
        drops = []
        spoil = []
    print(v)
    #print(k)
    #print("Item id: {} percentage: {} is_sweep: {} min: {} max: {}".format(v, drops_list['percentage'][k], drops_list['sweep'][k], drops_list['min'][k], drops_list['max'][k]))
    data = [df['item_id'][k], df['min'][k], df['max'][k], df['percentage'][k] / 100.0]
    if df['sweep'][k] == "1":
def phabconvert(infile, login_info='test-test-test-test-test'): login_info = login_info.strip().split("-") login = str(login_info[0]) agency = str(login_info[1]) owner = str(login_info[2]) year = str(login_info[3]) project = str(login_info[4]) errorLog("READ IN TABLES REQUIRED TO RUN QUERY:") sample_entry = mdb.read_table(infile, "Sample_Entry", dtype={'s_Generation': str}) errorLog(sample_entry) event_lookup = mdb.read_table(infile, "EventLookUp") protocol_lookup = mdb.read_table(infile, "ProtocolLookUp") station_lookup = mdb.read_table(infile, "StationLookUp", dtype={'s_Generation': str}) agency_lookup = mdb.read_table(infile, "AgencyLookUp", dtype={'s_Generation': str}) project_lookup = mdb.read_table(infile, "ProjectLookUp", dtype={'s_Generation': str}) qa_lookup = mdb.read_table(infile, "QALookUp", dtype={'s_Generation': str}) resqual_lookup = mdb.read_table(infile, "ResQualLookUp", dtype={'s_Generation': str}) stationdetail_lookup = mdb.read_table(infile, "StationDetailLookUp", dtype={'s_Generation': str}) location_entry = mdb.read_table(infile, "Location_Entry", dtype={'s_Generation': str}) location_lookup = mdb.read_table(infile, "LocationLookUp", dtype={'s_Generation': str}) parent_project_lookup = mdb.read_table(infile, "ParentProjectLookUp", dtype={'s_Generation': str}) collectionmethod_lookup = mdb.read_table(infile, "CollectionMethodLookUp", dtype={'s_Generation': str}) constituent_lookup = mdb.read_table(infile, "ConstituentLookUp", dtype={'s_Generation': str}) matrix_lookup = mdb.read_table(infile, "MatrixLookUp", dtype={'s_Generation': str}) method_lookup = mdb.read_table(infile, "MethodLookUp", dtype={'s_Generation': str}) analyte_lookup = mdb.read_table(infile, "AnalyteLookUp", dtype={'s_Generation': str}) unit_lookup = mdb.read_table(infile, "UnitLookUp", dtype={'s_Generation': str}) fraction_lookup = mdb.read_table(infile, "FractionLookUp", dtype={'s_Generation': str}) collectiondevice_lookup = mdb.read_table(infile, "CollectionDeviceLookUp", dtype={'s_Generation': str}) compliance_lookup = mdb.read_table(infile, "ComplianceLookUp", dtype={'s_Generation': str}) batchverification_lookup = mdb.read_table(infile, "BatchVerificationLookUp", dtype={'s_Generation': str}) ##### field specific tables fieldcollection_entry = mdb.read_table(infile, "FieldCollection_Entry", dtype={'s_Generation': str}) fieldresult_entry = mdb.read_table(infile, "FieldResult_Entry", dtype={'s_Generation': str}) #### habitat specific tables habitatcollection_entry = mdb.read_table(infile, "HabitatCollection_Entry", dtype={'s_Generation': str}) habitatresult_entry = mdb.read_table(infile, "HabitatResult_Entry", dtype={'s_Generation': str}) errorLog("STARTING MERGING TOGETHER FIELDS - IE. 
RUNNING QUERY") ### pull together sample and location data - used by both field and habitat queries sample = pd.merge(sample_entry[[ 'AgencyCode', 'EventCode', 'ProjectCode', 'ProtocolCode', 'StationCode', 'SampleDate', 'SampleComments', 'SampleRowID' ]], agency_lookup[['AgencyCode', 'AgencyName']], on='AgencyCode', how='left') sample = pd.merge(sample, event_lookup[['EventCode', 'EventName']], on='EventCode', how='left') sample = pd.merge( sample, project_lookup[['ParentProjectCode', 'ProjectCode', 'ProjectName']], on='ProjectCode', how='left') sample = pd.merge(sample, protocol_lookup[['ProtocolCode', 'ProtocolName']], on='ProtocolCode', how='left') station = pd.merge(station_lookup[[ 'StationCode', 'StationName', 'EcoregionLevel3Code', 'HydrologicUnit', 'County', 'LocalWatershed', 'UpstreamArea' ]], stationdetail_lookup[[ 'StationCode', 'TargetLatitude', 'TargetLongitude', 'Datum' ]], on='StationCode', how='left') sample = pd.merge(sample, station[[ 'StationCode', 'StationName', 'EcoregionLevel3Code', 'HydrologicUnit', 'County', 'LocalWatershed', 'UpstreamArea', 'TargetLatitude', 'TargetLongitude', 'Datum' ]], on='StationCode', how='left') location = pd.merge(location_entry[[ 'SampleRowID', 'LocationRowID', 'LocationCode', 'GeometryShape' ]], location_lookup[['LocationCode', 'LocationName']], on='LocationCode', how='left') sample = pd.merge(sample, location[[ 'SampleRowID', 'LocationCode', 'LocationName', 'LocationRowID', 'GeometryShape' ]], on='SampleRowID', how='left') # Robert 20 Feb 2019 # Here we add the ParentProjectName to the sample dataframe sample = pd.merge( sample, parent_project_lookup[['ParentProjectCode', 'ParentProjectName']], on="ParentProjectCode", how='left') #### pull together constituent entries - - used by both field and habitat queries constituent = pd.merge(constituent_lookup[[ 'ConstituentRowID', 'AnalyteCode', 'FractionCode', 'MatrixCode', 'MethodCode', 'UnitCode' ]], fraction_lookup[['FractionCode', 'FractionName']], on='FractionCode', how='left') constituent = pd.merge(constituent[[ 'ConstituentRowID', 'AnalyteCode', 'FractionCode', 'FractionName', 'MatrixCode', 'MethodCode', 'UnitCode' ]], analyte_lookup[['AnalyteCode', 'AnalyteName']], on='AnalyteCode', how='left') constituent = pd.merge(constituent[[ 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MethodCode', 'UnitCode' ]], matrix_lookup[['MatrixCode', 'MatrixName']], on='MatrixCode', how='left') constituent = pd.merge(constituent[[ 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'UnitCode' ]], method_lookup[['MethodCode', 'MethodName']], on='MethodCode', how='left') constituent = pd.merge(constituent[[ 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode' ]], unit_lookup[['UnitCode', 'UnitName']], on='UnitCode', how='left') ##### FIELD SPECIFIC CODE #### pull together field collection entry fieldcollection = pd.merge(fieldcollection_entry[[ 'FieldCollectionRowID', 'LocationRowID', 'CollectionMethodCode', 'CollectionTime', 'Replicate', 'CollectionDepth', 'UnitCollectionDepth', 'FieldCollectionComments' ]], collectionmethod_lookup[[ 'CollectionMethodCode', 'CollectionMethodName' ]], on='CollectionMethodCode', how='left') #### pull together field result entries fieldresult = pd.merge(fieldresult_entry[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 
'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'CollectionDeviceCode', 'ComplianceCode', 'QACode', 'ResQualCode', 'ExportData' ]], batchverification_lookup[[ 'BatchVerificationCode', 'BatchVerificationDescr' ]], on='BatchVerificationCode', how='left') fieldresult = pd.merge(fieldresult[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'ComplianceCode', 'QACode', 'ResQualCode', 'ExportData' ]], collectiondevice_lookup[[ 'CollectionDeviceCode', 'CollectionDeviceDescr' ]], on='CollectionDeviceCode', how='left') fieldresult = pd.merge(fieldresult[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'QACode', 'ResQualCode', 'ExportData' ]], compliance_lookup[[ 'ComplianceCode', 'ComplianceName' ]], on='ComplianceCode', how='left') fieldresult = pd.merge(fieldresult[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'ResQualCode', 'ExportData' ]], qa_lookup[['QACode', 'QAName', 'QADescr']], on='QACode', how='left') fieldresult = pd.merge(fieldresult[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ExportData' ]], resqual_lookup[['ResQualCode', 'ResQualName']], on='ResQualCode', how='left') # combine fieldresult and constituent fieldresult = pd.merge(fieldresult[[ 'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ExportData' ]], constituent[[ 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName' ]], on='ConstituentRowID', how='left') field = pd.merge(fieldresult[[ 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName', 'ExportData' ]], fieldcollection[[ 'FieldCollectionRowID', 'LocationRowID', 'CollectionMethodCode', 'CollectionMethodName', 'CollectionTime', 'Replicate', 'CollectionDepth', 'UnitCollectionDepth', 'FieldCollectionComments' ]], on='FieldCollectionRowID') field_query = pd.merge( field[[ 'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments', 'BatchVerificationCode', 'BatchVerificationDescr', 
'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName', 'FieldCollectionRowID', 'LocationRowID', 'CollectionMethodCode', 'CollectionMethodName', 'CollectionTime', 'Replicate', 'CollectionDepth', 'UnitCollectionDepth', 'FieldCollectionComments' ]], sample[[ 'LocationRowID', 'AgencyCode', 'AgencyName', 'EventCode', 'EventName', 'ProjectCode', 'ProjectName', 'ProtocolCode', 'ProtocolName', 'StationCode', 'StationName', 'EcoregionLevel3Code', 'HydrologicUnit', 'County', 'LocalWatershed', 'UpstreamArea', 'TargetLatitude', 'TargetLongitude', 'Datum', 'SampleDate', 'SampleComments', 'SampleRowID', 'LocationCode', 'LocationName', 'GeometryShape' ]], on='LocationRowID') # lowercase all fieldname field_query.columns = [x.lower() for x in field_query.columns] errorLog("created fields -- datetime:") ### calculated fields ### month, year, ecoregionlayer, ecoregionlevel, ecoregioncode, rwqcb # month from sampledate # year from sampledate # make sampledate type object into datetime field_query['sampledate'] = pd.to_datetime(field_query['sampledate']) field_query["month"] = field_query.sampledate.dt.month field_query["year"] = field_query.sampledate.dt.year errorLog("--- end created fields") # ecoregionlayer = empty why? field_query['ecoregionlayer'] = '' # ecoregionlevel = 3 (habitat) or 33 (field) field_query['ecoregionlevel'] = 33 # ecoregioncode = stationlookup.ecoregionlevel3code field_query.rename(columns={'ecoregionlevel3code': 'ecoregioncode'}, inplace=True) field_query.rename(columns={ 'agencycode': 'sampleagencycode', 'agencyname': 'sampleagencyname' }, inplace=True) # rwqcb = empty why? field_query['rwqcb'] = '' ### IMPORTANT THE FIELDS BELOW NEED TO BE FILLED IN #### #analytewfraction #analytewfractionwunit #analytewfractionwmatrixwunit #result_textraw #resultraw # f-h = field or habitat field_query['f_h'] = 'f' #submittingagency = login_agency field_query['submittingagency'] = '' #databasefilepath = '' field_query['databasefilepath'] = '' #dateloaded = submissiondate? 
errorLog("start Timestamp") field_query['dateloaded'] = pd.Timestamp(datetime.datetime(2017, 1, 1)) errorLog("end Timestamp") ##dataloadedby field_query['dataloadedby'] = 'checker' ##cleaned field_query['cleaned'] = 1 ##qaed field_query['qaed'] = 1 ##metricscalculated field_query['metricscalculated'] = 1 ##deactivate field_query['deactivate'] = 1 ##projectcode field_query['projectcode'] = '' #loadidnum = submissionid field_query['loadidnum'] = -88 field_query['rownum'] = -88 field_query['project_code'] = project field_query['login_email'] = login field_query['login_agency'] = agency field_query['login_owner'] = owner field_query['login_year'] = year field_query['login_project'] = project # drop temp columns # field_query.drop(['constituentrowid','fieldresultrowid', 'fieldcollectionrowid','fieldresultcomments','qaname','fieldcollectioncomments'],axis=1,inplace=True) # Rename the columns to what they will be when it gets loaded into tbl_phab field_query.rename(columns={ 'fieldcollectionrowid': 'collectionrowid', 'fieldresultcomments': 'resultcomments', 'fieldcollectioncomments': 'collectioncomments' }, inplace=True) ##### END FIELD SPECIFIC CODE ##### HABITAT SPECIFIC CODE #### pull together field collection entry habitatcollection = pd.merge(habitatcollection_entry[[ 'HabitatCollectionRowID', 'LocationRowID', 'CollectionMethodCode', 'CollectionTime', 'Replicate', 'HabitatCollectionComments' ]], collectionmethod_lookup[[ 'CollectionMethodCode', 'CollectionMethodName' ]], on='CollectionMethodCode', how='left') #### pull together field result entries habitatresult = pd.merge(habitatresult_entry[[ 'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'ComplianceCode', 'QACode', 'ResQualCode', 'ExportData' ]], collectiondevice_lookup[[ 'CollectionDeviceCode', 'CollectionDeviceDescr' ]], on='CollectionDeviceCode', how='left') habitatresult = pd.merge( habitatresult[[ 'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'QACode', 'ResQualCode', 'ExportData' ]], compliance_lookup[['ComplianceCode', 'ComplianceName']], on='ComplianceCode', how='left') habitatresult = pd.merge(habitatresult[[ 'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'ResQualCode', 'ExportData' ]], qa_lookup[['QACode', 'QAName']], on='QACode', how='left') habitatresult = pd.merge(habitatresult[[ 'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ExportData' ]], resqual_lookup[['ResQualCode', 'ResQualName']], on='ResQualCode', how='left') # combine fieldresult and constituent habitatresult = pd.merge(habitatresult[[ 'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ExportData' ]], constituent[[ 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName' 
]], on='ConstituentRowID', how='left') habitat = pd.merge(habitatresult[[ 'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName', 'ExportData' ]], habitatcollection[[ 'HabitatCollectionRowID', 'LocationRowID', 'CollectionMethodCode', 'CollectionMethodName', 'CollectionTime', 'Replicate', 'HabitatCollectionComments' ]], on='HabitatCollectionRowID') habitat_query = pd.merge( habitat[[ 'HabitatResultRowID', 'HabitatCollectionRowID', 'LocationRowID', 'VariableResult', 'Result', 'HabitatResultComments', 'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName', 'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode', 'UnitName', 'CollectionMethodCode', 'CollectionMethodName', 'CollectionTime', 'Replicate', 'HabitatCollectionComments', 'ExportData' ]], sample[[ 'LocationRowID', 'AgencyCode', 'AgencyName', 'EventCode', 'EventName', 'ProjectCode', 'ProjectName', 'ProtocolCode', 'ProtocolName', 'StationCode', 'StationName', 'EcoregionLevel3Code', 'HydrologicUnit', 'County', 'LocalWatershed', 'UpstreamArea', 'TargetLatitude', 'TargetLongitude', 'Datum', 'SampleDate', 'SampleComments', 'SampleRowID', 'LocationCode', 'LocationName', 'GeometryShape' ]], on='LocationRowID') # lowercase all fieldname habitat_query.columns = [x.lower() for x in habitat_query.columns] ### calculated fields ### month, year, ecoregionlayer, ecoregionlevel, ecoregioncode, rwqcb # month from sampledate # year from sampledate # make sampledate type object into datetime habitat_query['sampledate'] = pd.to_datetime(habitat_query['sampledate']) habitat_query["month"] = habitat_query.sampledate.dt.month habitat_query["year"] = habitat_query.sampledate.dt.year # ecoregionlayer = empty why? habitat_query['ecoregionlayer'] = '' # ecoregionlevel = 3 (habitat) or 33 (field) habitat_query['ecoregionlevel'] = 3 # ecoregioncode = stationlookup.ecoregionlevel3code habitat_query.rename(columns={'ecoregionlevel3code': 'ecoregioncode'}, inplace=True) habitat_query.rename(columns={ 'agencycode': 'sampleagencycode', 'agencyname': 'sampleagencyname' }, inplace=True) # rwqcb = empty why? habitat_query['rwqcb'] = '' #analytewfraction #analytewfractionwunit #analytewfractionwmatrixwunit #result_textraw #resultraw # f-h = field or habitat habitat_query['f_h'] = 'h' #submittingagency = login_agency habitat_query['submittingagency'] = '' #databasefilepath = '' habitat_query['databasefilepath'] = '' #dateloaded = submissiondate? 
habitat_query['dateloaded'] = pd.Timestamp(datetime.datetime(2017, 1, 1)) ##dataloadedby habitat_query['dataloadedby'] = 'checker' ##cleaned habitat_query['cleaned'] = 1 ##qaed habitat_query['qaed'] = 1 ##metricscalculated habitat_query['metricscalculated'] = 1 ##deactivate habitat_query['deactivate'] = 1 ##projectcode habitat_query['projectcode'] = '' #loadidnum = submissionid habitat_query['loadidnum'] = -88 habitat_query['rownum'] = -88 habitat_query['project_code'] = project habitat_query['login_email'] = login habitat_query['login_agency'] = agency habitat_query['login_owner'] = owner habitat_query['login_year'] = year habitat_query['login_project'] = project # drop temp columns # habitat_query.drop(['constituentrowid','habitatresultrowid', 'habitatcollectionrowid','habitatresultcomments','qaname','habitatcollectioncomments'],axis=1,inplace=True) # Rename columns to what they will be when it gets put into tbl_phab habitat_query.rename(columns={ 'habitatcollectionrowid': 'collectionrowid', 'habitatresultcomments': 'resultcomments', 'habitatcollectioncomments': 'collectioncomments' }, inplace=True) ##### END HABITAT SPECIFIC CODE #return field_query, habitat_query # For some reason there was a duplicated column in the field query dataframe, and it caused the script to break at the concatenation part. # Here is the workaround, to drop duplicated columns field_query = field_query.loc[:, ~field_query.columns.duplicated()] ## CONCATENATE FIELD AND HABITAT QUERIES rawdata = pd.concat([field_query, habitat_query]) # drop unnecessary columns rawdata.drop([ 'constituentrowid', 'fieldresultrowid', 'habitatresultrowid', 'qaname' ], axis=1, inplace=True) ## FILL IN MISSING FIELDS # create engine connection eng = create_engine('postgresql://*****:*****@192.168.1.17:5432/smc') # get list of tbl_phab columns metadata = MetaData() tbl_phab = Table('tbl_phab', metadata, autoload=True, autoload_with=eng) tbl_phab_cols = [x.name for x in tbl_phab.columns] tbl_phab_cols = [ x for x in tbl_phab_cols if x not in [ 'objectid', 'globalid', 'created_user', 'created_date', 'last_edited_user', 'last_edited_date' ] ] # get list of missing columns missing_cols = [c for c in tbl_phab_cols if c not in rawdata.columns] # fill in missing columns with default values. (Needed to load into raw phab data table) for x in missing_cols: if isinstance(tbl_phab.columns[x].type, VARCHAR): rawdata[x] = '' if isinstance(tbl_phab.columns[x].type, INTEGER) | isinstance( tbl_phab.columns[x].type, NUMERIC) | isinstance( tbl_phab.columns[x].type, SMALLINT): rawdata[x] = -88 if isinstance(tbl_phab.columns[x].type, TIMESTAMP): rawdata[x] = datetime.date(1950, 01, 01) phabtable = pd.read_sql("SELECT * FROM tbl_phab LIMIT 1", eng) phabcolumnsmatched = len(phabtable.columns) - 1 phabmetricstable = pd.read_sql("SELECT * FROM tmp_phabmetrics LIMIT 1", eng) phabmetricscolumnsmatched = len(phabmetricstable.columns) - 1 return rawdata
# try:
#     if element == i.encode("utf-8"):
#         return True
# except:
#     print(i, db.index(i))
# return False

print("Loading First DataSet ")
df1 = pd.read_excel('PersianWords.xlsx', sheet_name='Sheet1')
persian_words = set(df1['*Total farsi Word (Moin+openoffice.fa+wikipedia)'])
print("Finish Loading First DataSet")
# Load the first dataset into memory

print("Loading Second DataSet")
df2 = mdb.read_table("FLEXICON.mdb", "Entries")
affixes = mdb.read_table("FLEXICON.mdb", "Affixes")
# words = list(df2["WrittenForm"])
flexicon_words = set(df2["WrittenForm"])
# Load the second dataset into memory
print("Finish Loading Second DataSet")

search_word = u'کیارش'
answer_list = set()
gen = all_perms(search_word)
print("start executing ...")
while 1:
    try:
        next_perm = next(gen)
        if search(next_perm, flexicon_words) or search(next_perm,
parser.add_argument('-d', '--dataset', type=str, help='Prometea Dataset file')
args = parser.parse_args()

# connect to MongoDB prometea database
print('STEP 01: Connect to Prometea MongoDB database')
print('---------------------------------------------')
client = MongoClient(port=27017)
db = client.prometeadb
print()

# load the MS Access prometea dataset
print('STEP 02: Load Prometea MS Access dataset')
print('----------------------------------------')
print('Loading Prometea DGP Ultimos Dataset from ' + args.dataset)
dataset = mdb.read_table(args.dataset, 'DGP_Ultimo')
print()

# Iterate over parent datasets
print('STEP 03: Start Merge DGP Ultimos Dataset')
print('--------------------------------------')

# insert child pacientes dataset
pbar = tqdm(total=dataset.shape[0], desc='DGP Ultimos Dataset')
num = 0
for index, rows in dataset.iterrows():
    db.Pacientes.update_one({'centro': rows.centro, 'nhc': rows.nhc},
                            {'$push': {'dgps': {'enfermedad': 'epoc',
                                                'dgp': rows.dgp,
                                                'dgpTxt': rows.dgpTxt,
def convert(input_name, output_name=None, options={}): """Convert MDB table to a GLM object list """ debug(f"input_name = '{input_name}'") debug(f"output_name = '{output_name}'") debug(f"options = '{options}'") # options if "class" in options.keys(): global CLASS CLASS = options.pop("class") if "modules" in options.keys(): global MODULES MODULES = options.pop("modules").split(",") if "chunksize" in options.keys(): global CHUNKSIZE CHUNKSIZE = int(options.pop("chunksize")) if "columns" in options.keys(): global COLUMNS columns = options.pop("columns").split(",") COLUMNS = {} for column in columns: spec = column.split(":") if len(spec) != 2: error(f"columns spec '{spec}'' is invalid", 1) COLUMNS[spec[0]] = spec[1] if "dtypes" in options.keys(): global DTYPES dtypes = options.pop("dtypes").split(",") DTYPES = {} for dtype in dtypes: spec = dtype.split(":") if len(spec) != 2: error(f"dtype spec '{spec}'' is invalid", 1) DTYPES[spec[0]] = spec[1] if "table" in options.keys(): global TABLE TABLE = options.pop("table") if options: error(f"option '{list(options.keys())[0]}' not valid", 1) # check mdb name if not os.path.exists(input_name): error(f"'{input_name}' not found", 1) debug(f"file '{input_name}' ok") # check schema schema = mdb.read_schema(input_name) if TABLE not in schema.keys(): error(f"table '{TABLE}' not found") debug(f"table '{TABLE}' ok") # check for class if not CLASS and "class" not in schema.keys(): error("class must be specified if not found in data", 1) # load data try: DB = mdb.read_table(input_name, TABLE, chunksize=CHUNKSIZE) except Exception as err: error(f"{input_name} open failed ({err})", 1) objects = [] for chunk in DB: objects.append(chunk) # save data if not output_name: output_name = TABLE.lower() + ".glm" try: GLM = open(output_name, "w") except Exception as err: error(f"{output_name} open failed ({err})", 1) GLM.write(f"// generated by '{' '.join(sys.argv)}'\n") for module in MODULES: GLM.write(f'module {module};\n') # process records classes = [] modules = MODULES for obj in objects: # write module (first time only) if "module" in obj.columns: module = obj["module"] else: module = None if module and module not in modules: modules.append(module) GLM.write(f'module {module};\n') # write class (first time only) if "class" in obj.columns: classname = obj["class"] elif CLASS: classname = CLASS else: error(f"no class specified (obj={obj})", 2) if classname not in classes and not module: classes.append(classname) GLM.write(f'class {classname} {{\n') for item, dtype in zip(obj.columns, obj.dtypes): if item in COLUMNS.keys(): item = COLUMNS[item] if item in [ "id", "class", "groupid", "name", "next", "parent", "child_count", "rank", "valid_to_", "schedule_skew", "latitude", "longitude", "in_svc", "out_svc", "rng_state", "heartbeat", "guid", "flags" ]: continue if item in DTYPES.keys(): dtype = DTYPES[item] elif str(dtype) not in DATATYPES.keys(): dtype = "object" else: dtype = DATATYPES[str(dtype)] GLM.write(f'\t{dtype} {item};\n') GLM.write('}\n') # write object for num, values in obj.to_dict('index').items(): GLM.write(f'object {classname} {{\n') for key, value in values.items(): if key in COLUMNS.keys(): GLM.write(f'\t{COLUMNS[key]} "{value}";\n') else: GLM.write(f'\t{key} "{value}";\n') GLM.write('}\n')
# Does what it says on the tin.
# Note that it takes a while to read all the mdb thingies.
# To run pandas_access successfully, install mdbtools through whatever you need.
# -Matt
import pandas
import numpy as np
import pandas_access

DATASET = "./FDOC_January_2017.mdb"

if __name__ == "__main__":
    table_list = []
    for tablename in pandas_access.list_tables(DATASET):
        print(tablename)
        table_list.append(tablename)

    tables_dict = {}
    for table in table_list[0:2]:  # doing more makes it take forever
        tables_dict[table] = pandas_access.read_table(DATASET, table)
while attempts < 3:
    try:
        response = urlopen(
            'http://static.data.gov.hk/td/routes-and-fares/RSTOP_BUS.mdb',
            timeout=10)
        content = response.read()
        with open(expanduser('~/Desktop/RSTOP_BUS.mdb'), 'wb') as f:
            f.write(content)
        break
    except URLError as e:
        if attempts == 2:
            exit(1)
        attempts += 1

df = mdb.read_table(expanduser('~/Desktop/RSTOP_BUS.mdb'),
                    mdb.list_tables(expanduser('~/Desktop/RSTOP_BUS.mdb'))[0])
df.drop_duplicates(subset=['ROUTE_ID', 'ROUTE_SEQ', 'STOP_SEQ'], keep='last', inplace=True)
#df.fillna(-1, inplace=True)

conn = pymssql.connect(server='192.168.13.60',
                       user='******',
                       password='******',
                       database='DWHRMS_DEMO')
last = len(df) - 1

sql = 'DELETE FROM [t_MobBusStops]'
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()

folder = expanduser('~/Desktop/rstops')
def convert(input_name, output_name = None, options = {}): """Convert MDB table to a GLM object list """ debug(f"input_name = '{input_name}'") debug(f"output_name = '{output_name}'") debug(f"options = '{options}'") global PROPERTIES # options if "chunksize" in options.keys(): global CHUNKSIZE CHUNKSIZE = int(options.pop("chunksize")) debug(f"chunksize = {CHUNKSIZE}") if "table" in options.keys(): global TABLE TABLE = options.pop("table") debug(f"table = {TABLE}") if "index" in options.keys(): global INDEXCOLS INDEXCOLS = options.pop("index").split(",") debug(f"index = {INDEXCOLS}") if "parent" in options.keys(): global PARENTCOL PARENTCOL = options.pop("parent") debug(f"parent = {PARENTCOL}") if "values" in options.keys(): global VALUECOLS VALUECOLS = options.pop("values").split(",") PROPERTIES = VALUECOLS debug(f"values = {VALUECOLS}") if "csvfile" in options.keys(): global CSVFILE CSVFILE = options.pop("csvfile") debug(f"csvfile = {CSVFILE}") if "scale" in options.keys(): global SCALE SCALE = options.pop("scale").split(",") try: SCALE = list(map(lambda x:float(x),SCALE)) except Exception as err: error(f"scale is invalid ({err})",1) debug(f"scale = {SCALE}") if "round" in options.keys(): global ROUND ROUND = options.pop("round)").split(",") try: ROUND = list(map(lambda x:int(x),ROUND)) except Exception as err: error(f"round is invalid ({err}),1)") debug(f"round = {ROUND}") if "properties" in options.keys(): PROPERTIES = options.pop("properties").split(",") if len(PROPERTIES) != len(VALUECOLS): error("incorrect number of properties",1) debug(f"properties = {PROPERTIES}") if options: error(f"option '{list(options.keys())[0]}' not valid",1) # check mdb name if not os.path.exists(input_name): error(f"'{input_name}' not found",1) debug(f"file '{input_name}' ok") # check schema schema = mdb.read_schema(input_name) if TABLE not in schema.keys(): error(f"table '{TABLE}' not found") debug(f"table '{TABLE}' ok") # load data try: DB = mdb.read_table(input_name,TABLE,chunksize=CHUNKSIZE) except Exception as err: error(f"{input_name} open failed ({err})",1) data = [] index_col = [PARENTCOL] index_col.extend(INDEXCOLS) usecols = VALUECOLS.copy() usecols.extend(index_col) for chunk in DB: data.append(pandas.read_csv(StringIO(chunk.to_csv()), index_col = index_col, usecols = usecols, parse_dates = [INDEXCOLS[0]], infer_datetime_format = False, # date_parser = lambda x:[datetime.strptime(y,DATEFORMAT) for y in x], ).dropna()) debug(f"*** Block {len(data)} ***\n {data[-1]}") break; data = pandas.concat(data).sort_index() data.index.names = ["parent","date","hour"] data["datetime"] = data.index.get_level_values(1) + pandas.TimedeltaIndex(data.index.get_level_values(2),unit="h") data.reset_index(inplace=True) data.set_index(["parent","datetime"],inplace=True) data.drop(["date","hour"],inplace=True,axis=1) for n, name in enumerate(VALUECOLS): if type(SCALE) == float: data[name] *= SCALE elif n > len(SCALE): data[name] *= SCALE[-1] else: data[name] *= SCALE[n] if type(ROUND) == int: data[name] = data[name].round(ROUND) elif n > len(SCALE): data[name] = data[name].round(ROUND[-1]) else: data[name] = data[name].round(ROUND[n]) # write player object if not output_name: output_name = TABLE.lower() + ".glm" try: GLM = open(output_name,"w") except Exception as err: error(f"{output_name} open failed ({err})",1) GLM.write(f'''// generated by '{' '.join(sys.argv)}' #define {TABLE}_STARTTIME={data.index.get_level_values(1).min()} #define {TABLE}_STOPTIME={data.index.get_level_values(1).max()} #define {TABLE}_PLAYERS={" 
".join(data.index.get_level_values(0).unique())} module tape; ''') for parent in data.index.get_level_values(0).unique(): GLM.write(f'''object tape.player {{ parent "{parent}"; file "{TABLE}_{parent}.csv"; property "{','.join(PROPERTIES)}"; }} ''') data.loc[parent].to_csv(f"{TABLE}_{parent}.csv",header=False,index=True)
parser.add_argument('-d', '--dataset', type=str, help='Prometea Dataset file')
args = parser.parse_args()

# connect to MongoDB prometea database
print('STEP 01: Connect to Prometea MongoDB database')
print('---------------------------------------------')
client = MongoClient(port=27017)
db = client.prometeadb
print()

# load the MS Access prometea dataset
print('STEP 02: Load Prometea MS Access dataset')
print('----------------------------------------')
print('Loading Prometea Pacientes Dataset from ' + args.dataset)
datasetPaciente = mdb.read_table(args.dataset, 'Pacientes')
print()

# Iterate over parent datasets
print('STEP 03: Start Update Pacientes Dataset')
print('--------------------------------------')

# insert child pacientes dataset
pbar = tqdm(total=datasetPaciente.shape[0], desc='Pacientes Dataset')
num = 0
for index, rows in datasetPaciente.iterrows():
    # get the first paciente matching this centro and nhc
    paciente = db.Pacientes.find_one({'centro': rows.centro, 'nhc': rows.nhc})
    # if the paciente does not exist, insert the new one
    if paciente is None:
import pandas_access as mdb

db_filename = inputfile

# Listing the tables.
for tbl in mdb.list_tables(db_filename):
    print(tbl)

# import all the necessary libraries
import csv
import pyodbc
import pandas as pd
import numpy as np

dfcl = mdb.read_table(db_filename, table_name="CRASH_LEVEL")
dfcl.to_csv(outputcl, index=False)

dfvl = mdb.read_table(db_filename, table_name="VEHICLE_LEVEL")
# NOTE: this writes the vehicle-level table to outputcl as well, overwriting the
# crash-level CSV above; it presumably should use a separate output path.
dfvl.to_csv(outputcl, index=False)

dfpl = mdb.read_table(db_filename, table_name="PERSON_LEVEL")
dfpl.to_csv(outputpl, index=False)
import pandas
import numpy as np
import geopandas
from simpledbf import Dbf5
import pandas_access as mdb

# Parcel Shape file
prcl_shp = geopandas.GeoDataFrame.from_file(
    'data/open-data-extracted/prcl_shape/prcl.shp')

# Join Parcel Info
Prcl = mdb.read_table("data/open-data-extracted/prcl/prcl.mdb", "Prcl")
parcels = pandas.merge(prcl_shp, Prcl, left_on='HANDLE', right_on='Handle')
parcels.loc[:, 'Nbrhd'] = parcels.Nbrhd.astype(int)

# Join in Neighborhood names
neighborhoods = geopandas.read_file(
    'data/open-data-extracted/nbrhds_wards/BND_Nhd88_cw.shp')
parcels = pandas.merge(parcels,
                       neighborhoods[['NHD_NAME', 'NHD_NUM']],
                       left_on='Nbrhd', right_on='NHD_NUM')

# Join more parcel info; this one has nice address names to join to CSB data
par = Dbf5('data/open-data-extracted/par/par.dbf', codec='cp1250').to_dataframe()
parcels = pandas.merge(parcels, par, on='HANDLE')
parcels.loc[:, 'parcel_address'] = parcels.SITEADDR.apply(