Example #1
File: rtca.py  Project: michalkahle/rtca
def load_file(filename):
    if not os.path.isfile(filename):
        raise FileNotFoundError(filename)
    errors = mdb.read_table(filename, 'ErrLog')
    if len(errors) > 0:
        print('Errors reported in file %s:' % filename)
        print(errors)
    messages = mdb.read_table(filename, 'Messages')
    messages = messages[messages.MsgID == 'w01']
    if len(messages) > 0:
        messages.Message = messages.Message.replace(
            regex='Plate Scanned. Please check the connection on positions:\\s*',
            value='Connection problem: ')
        print('Connection problems reported in file %s:' % filename)
        print(messages.drop(['MsgID', 'MsgOrder'], axis=1))
    ttimes = mdb.read_table(filename, 'TTimes')
    ttimes = ttimes[['TimePoint', 'TestTime']]
    ttimes.columns = ['tp', 'dt']
    ttimes['dt'] = pd.to_datetime(ttimes['dt'])
    org = mdb.read_table(filename, 'Org10K').drop('TestOrder', axis=1)
    assert org.shape[0] > 0, '%s contains no data!' % filename
    org['Row'] = org['Row'].map(ord) - 65
    org = org.rename({'TimePoint':'tp', 'Row':'row'}, axis=1).set_index(['tp', 'row']).sort_index()
    n_cols = org.shape[1]
    org.columns = pd.MultiIndex.from_product([['org'], range(n_cols)], names=[None, 'col'])
    org = org.stack('col').reset_index()
    org['org'] = org['org'].astype(float)
    org['welln'] = org['row'] * (org['col'].max() + 1) + org['col']
    org.drop(['row', 'col'], axis=1, inplace=True)
    org = org.merge(ttimes)
    org = org[['tp', 'welln', 'dt', 'org']]
    return org
Example #2
def main(argv):

    if len(argv) < 3:
        print("Syntax: gridlabd mdb_info MDBFILE COMMAND [ARGS ...]",
              file=sys.stderr)
        exit(1)

    MDBFILE = argv[1]
    COMMAND = argv[2]

    if COMMAND == "tables":

        if len(argv) > 3:
            REGEX = argv[3]
        else:
            REGEX = ".*"
        for TABLE in mdb.list_tables(MDBFILE):
            if re.match(REGEX, TABLE):
                print(TABLE)

    elif COMMAND == "export":

        if len(argv) < 4:
            error("missing export table name", 1)
        REGEX = argv[3]
        for TABLE in mdb.list_tables(MDBFILE):
            if re.match(REGEX, TABLE):
                DF = mdb.read_table(MDBFILE, TABLE)
                DF.to_csv(TABLE + ".csv")

    else:
        error(f"commmand '{COMMAND}' is not valid", 2)
Example #3
 def __init__(self, filename):
     super().__init__()
     self.df_objects = mdb.read_table(filename,
                                      "RoboClip",
                                      converters_from_schema=False)
     self.df_objects.rename({"GRUPPO": "GROUP"}, axis=1, inplace=True)
     self.load_from_df(self.df_objects)
Example #4
def get_character_from_access(idn):
    query = '`Character Number` == %s'
    df = mdb.read_table("Character.accdb", "Characters")
    character = df.query(query % idn)
    if character.empty:
        return None
    return character
Example #5
    def get_mdb_data(self, payload):
        '''
        Returns list of Entity object(s) (str,dataframe) from a successful extraction

        Arguments:
        payload -- payload object (str,binary)
        '''
        # TODO: find a way to directly pass byteio to reading utility without writing to disk
        # Write to bytes to disk
        open(payload.filename, 'wb').write(payload.data.getvalue())

        # Get database schema
        mdb_schema = pandas_access.read_schema(payload.filename)
        # Get attributes that are of integer type
        integer_attributes = self.get_attributes_by_data_type(mdb_schema,'Long Integer')

        # Declare entity list
        entity_list = []
        # Iterate through each table in database
        for tbl in pandas_access.list_tables(payload.filename):
                # Issue: Default pandas integer type is not nullable - null values in integer column causes read error
                # Workaround: Read integer as Int64 (pandas nullable integer type in pandas)
                dtype_input = {attribute:'Int64' for attribute in integer_attributes[tbl]}
                df = pandas_access.read_table(payload.filename, tbl, dtype = dtype_input)
                entity_list.append(Entity(tbl,df))
        return entity_list
Example #6
 def __init__(self, filename):
     super().__init__()
     self.df_targets = mdb.read_table(
         filename, "RoboClip", converters_from_schema=False
     )
     self.df_targets.rename({"GRUPPO": "GROUP"}, axis=1, inplace=True)
     self.df_targets["GROUP"] = self.df_targets["GROUP"].fillna("UNLABELED")
     self.load_from_df(self.df_targets)
Example #7
	def main(self):
		# Read database by groups of 10000
		for chunk in mdb.read_table(self.db, self.table, chunksize=self.chunk_size):
			self.data.append(chunk)

		pd.concat(self.data).to_csv(self.output, index=False)
		# Print execution time
		self.execution_time()
Example #8
def access_to_csv(get_in, get_out, table):
    df = mdb.read_table(get_in, table)
    df.columns = [meta.clean_metadata(x.lower()) for x in df.columns]
    df.to_csv(get_out + '_temp_' + table + '.csv',
              date_format=date_format,
              encoding='utf-8',
              index=False,
              header=True,
              sep=';')
    txt_to_csv(get_out + '_temp_' + table + '.csv',
               get_out + '_' + table + '.csv', ';')
Example #9
    def ParseSynergiDatabase(self):
        """
        Use Pandas Access to convert the MDB tables to Pandas DataFrames.
        """
        print("Opening synergie database - ", self.paths["Synergi File"])
        table_list = mdb.list_tables(self.paths["Synergi File"])

        table_list_warehouse = []
        if "warehouse" in self.paths:
            print("Opening warehouse database - ", self.paths["warehouse"])
            table_list_warehouse = mdb.list_tables(self.paths["warehouse"])

        for table in table_list:
            self.SynergiDictionary[table] = mdb.read_table(
                self.paths["Synergi File"], table)

        for table in table_list_warehouse:
            self.SynergiDictionary[table] = mdb.read_table(
                self.paths["warehouse"], table)
        return
Example #10
    def load_mdb(self, file):
        name, ext = os.path.splitext(file)
        if name == '2007' or name == '2009':
            table = mdb.list_tables(self.base_path + file)[1]
        else:
            table = mdb.list_tables(self.base_path + file)[0]

        df = mdb.read_table(self.base_path + file, table)

        # extra rows at 2012
        if file == '2012.accdb':
            df = df.drop(columns=['MV_CIRCUNT', 'MV_TIPO', 'C_MEDICO'])

        return df
Example #11
    def get_mdb_data(self, payload):
        '''
        Returns list of Entity object(s) (str,dataframe) from a successful extraction

        Arguments:
        payload -- payload object (str,binary)
        '''
        # TODO: find a way to directly pass byteio to reading utility without writing to disk
        try:
            # Write to bytes to disk
            open(payload.filename, 'wb').write(payload.data.getvalue())

            # Get database schema
            mdb_schema = pandas_access.read_schema(payload.filename)
            # Get attributes that are of integer type
            integer_attributes = self.get_attributes_by_data_type(
                mdb_schema, 'Long Integer')

            # Declare entity dict
            entity_dict = dict()

            # Get list of table from database
            table_list = pandas_access.list_tables(payload.filename)

            # Update progress bar job count
            self.job_count += len(table_list)
            self.pbar.total = self.job_count

            # Iterate through each table in database
            for tbl in table_list:
                self.logger.debug('Extracting table: \'%s\' from file: %s...',
                                  tbl, payload.filename)
                # Issue: Default pandas integer type is not nullable - null values in integer column causes read error
                # Workaround: Read integer as Int64 (pandas nullable integer type in pandas)
                dtype_input = {
                    attribute: 'Int64'
                    for attribute in integer_attributes[tbl]
                }
                df = pandas_access.read_table(payload.filename,
                                              tbl,
                                              dtype=dtype_input)
                entity_dict.update({tbl: df})
                # update progress bar
                self.pbar.update()
            return entity_dict
        finally:
            self.logger.debug('Removing intermediate file: %s...',
                              payload.filename)
            utils.silentremove(payload.filename)
Example #12
def from_SQLT(args):
    sql_table = args.sql_table

    if sql_table is None or sql_table == "":
        return "MUST SUPPLY --sql_table"

    sql_type = args.sql_kind.lower()

    if sql_type == "psql":
        return sql_type + " Coming Soon"
    elif sql_type == "msql":
        return sql_type + " Coming Soon"
    elif sql_type == "sqll":
        return sql_type + " Coming Soon"
    elif sql_type == "mdb":
        sql_file = args.sql_file

        if sql_file is None or sql_file == "":
            return "MUST SUPPLY --sql_file"

        try:
            tables = mdb.list_tables(sql_file)

            if from_table not in tables:
                return "TABLE NOT FOUND. Available Tables:\n\t" + "\n\t".join(
                    tables)

            df = mdb.read_table(sql_file, from_table)
        except:
            traceback.print_exc()
            return "ERROR"

    else:
        return "UNSUPPORTED SQL. Available Kinds: PSQL (PostGreSQL), MSQL (MySQL), MDB (Access DB), SQLL (SQLite)"

    return df
Example #13
import pandas_access as mdb
import pandas as pd

db_filename = 'isear_databank.mdb'

# Export every table to CSV.

for tbl in mdb.list_tables(db_filename):
    df = mdb.read_table(db_filename, tbl)
    df.to_csv(tbl + '.csv')
# Convert the SPSS version of the databank to CSV as well.

db = pd.read_spss('ISEAR SPSS Databank.sav')
db.to_csv('isear.csv')
Example #14
def main():

    program_name = 'qsomysql'
    program_version = '1.2'

    qrz = QRZ(cfg='./qrz.ini')

    config = configparser.ConfigParser()
    config.read(['qsoviz.ini', 'qrz.ini'])
    use_n3fjp_mdb = config['general'].getboolean('use_n3fjp_mdb')
    reset_mysql_db = config['general'].getboolean('reset_mysql_db')
    db_user = config['mysql']['db_user']
    db_password = config['mysql']['db_password']
    db_name = config['mysql']['db_name']

    logging.basicConfig(filename='/home/qsoviz/qsomysql.log',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s')
    logging.info('%s %s: Run started', program_name, program_version)

    states_fp = open('states.json', 'r')
    states_coords = json.load(states_fp)
    states_fp.close()

    mysql_cnx = mysql.connector.connect(user=db_user,
                                        password=db_password,
                                        database=db_name)
    mysql_cursor = mysql_cnx.cursor()
    if reset_mysql_db:
        mysql_cursor.execute("DELETE FROM aarc_fd")
    mysql_cnx.commit()
    add_qso = (
        "INSERT INTO aarc_fd "
        "(datetime_on, callsign, n3fjp_modecontest, band, state, arrl_sect, country, "
        +
        "n3fjp_initials, operator, class, computer_name, latitude, longitude, geohash) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    query_qso = "SELECT datetime_on, callsign FROM aarc_fd WHERE datetime_on = %s AND callsign = %s"
    logfile_filename = newest_file('/home/qsoviz/logfiles')

    records_added = 0
    if use_n3fjp_mdb:
        fields = [
            'fldCall', 'fldModeContest', 'fldBand', 'fldState', 'fldSection',
            'fldCountryWorked', 'fldInitials', 'fldOperator', 'fldClass',
            'fldComputerName'
        ]
        df = mdb.read_table(logfile_filename, 'tblContacts')
        for idx, row in df.iterrows():
            fldDateStr = row.loc['fldDateStr']
            fldTimeOnStr = row.loc['fldTimeOnStr']
            year, month, day = fldDateStr.split('/')
            hour, minute, second = fldTimeOnStr.split(':')
            a_datetime_on = datetime(int(year),
                                     int(month),
                                     int(day),
                                     hour=int(hour),
                                     minute=int(minute),
                                     second=int(second),
                                     tzinfo=timezone.utc)
            a_callsign = row.loc['fldCall']
            query_qso_data = (a_datetime_on, a_callsign)
            mysql_cursor.execute(query_qso, query_qso_data)
            result = mysql_cursor.fetchall()
            if len(result) == 0:
                add_qso_data = [a_datetime_on]
                for field in fields:
                    temp = row.loc[field]
                    if temp != temp:
                        temp = ''
                    temp = temp.upper()
                    add_qso_data.append(temp)
                a_latitude, a_longitude, a_geohash = geo_info(
                    qrz, add_qso_data[1], add_qso_data[4], states_coords)
                add_qso_data.append(a_latitude)
                add_qso_data.append(a_longitude)
                add_qso_data.append(a_geohash)
                add_qso_data = tuple(add_qso_data)
                mysql_cursor.execute(add_qso, add_qso_data)
                mysql_cnx.commit()
                records_added = records_added + 1
    else:
        adif_fp = open(logfile_filename, 'r')
        qsos = ADIReader(adif_fp)
        for qso in qsos:
            a_datetime_on = qso['datetime_on']
            fldCall = qso['call'].upper()
            query_qso_data = (a_datetime_on, fldCall)
            mysql_cursor.execute(query_qso, query_qso_data)
            result = mysql_cursor.fetchall()
            if not result:
                fldModeContest = qso.get('mode', 'NONE').upper()
                fldBand = qso.get('band', 'NONE').upper()
                fldState = qso.get('state', 'NONE').upper()
                fldSection = qso.get('arrl_sect', 'NONE').upper()
                fldCountryWorked = qso.get('country', 'NONE').upper()
                fldInitials = qso.get('app_n3fjp_initials', 'NONE').upper()
                fldOperator = qso.get('operator', 'NONE').upper()
                fldClass = qso.get('class', 'NONE').upper()
                fldComputerName = qso.get('n3fjp_stationid', 'NONE').upper()
                a_latitude, a_longitude, a_geohash = geo_info(
                    qrz, fldCall, fldState, states_coords)
                add_qso_data = (a_datetime_on, fldCall, fldModeContest,
                                fldBand, fldState, fldSection,
                                fldCountryWorked, fldInitials, fldOperator,
                                fldClass, fldComputerName, a_latitude,
                                a_longitude, a_geohash)
                mysql_cursor.execute(add_qso, add_qso_data)
                mysql_cnx.commit()
                records_added = records_added + 1

    if not use_n3fjp_mdb:
        adif_fp.close()
    mysql_cursor.close()
    mysql_cnx.close()
    logging.info('%s %s: records_added = %s', program_name, program_version,
                 records_added)
    logging.info('%s %s: Run ended', program_name, program_version)
Example #15
UPDATE_DB = False
UPDATE_DB = True
SAVEDATA = False

db_filename = "bjerg2003.mdb"
BOOKS_DB = "books.sqlite"
f_isbn = "books_isbn.csv"
f_noisbn = "books_no_isbn.csv"

# Listing the tables.
for tbl in mdb.list_tables(db_filename):
    print(tbl)

# merge the two tables
df1 = mdb.read_table(db_filename, "Udgave", dtype={"Sideantal": "string"})
df2 = mdb.read_table(db_filename, "Titel")
df = pd.merge(left=df1, right=df2, left_on="Titel", right_on="Titel")

# Convert missing values (NA/NaN) to empty strings
# Not for Land, as we use books with NaN to indicate wrong placement
df["Forfatter"] = df["Forfatter"].fillna("")
df["Sideantal"] = df["Sideantal"].fillna("")

data_isbn = []
data_noisbn = []
data_not_online = []

data_oquery = []
data_gquery = []
data_dbkkquery = []
Example #16
                        help='Prometea Dataset file')

    args = parser.parse_args()

    # connect to MongoDB prometea database
    print('STEP 01: Connect to Prometea MongoDB database')
    print('---------------------------------------------')
    client = MongoClient(port=27017)
    db = client.prometeadb
    print()

    # load to MS Access prometea database
    print('STEP 02: Load Prometea MS Access dataset')
    print('----------------------------------------')
    print('Loading Prometea Prescripciones Dataset from ' + args.dataset)
    dataset = mdb.read_table(args.dataset, 'prescripciones')
    print()

    # Iterate over parent datasets
    print('STEP 03: Start Merge Prescripciones Dataset')
    print('--------------------------------------')

    # insert child pacientes dataset
    pbar = tqdm(total=dataset.shape[0], desc='Prescripciones Dataset')
    num = 0

    for index, rows in dataset.iterrows():
        db.Pacientes.update_one({
            'centro': rows.centro,
            'nhc': rows.nhc
        }, {
Example #17
while attempts < 3:
    try:
        response = urlopen(
            'http://static.data.gov.hk/td/routes-and-fares/ROUTE_GMB.mdb',
            timeout=10)
        content = response.read()
        with open(expanduser('~/Desktop/ROUTE_GMB.mdb'), 'wb') as f:
            f.write(content)
        break
    except URLError as e:
        if attempts == 2:
            exit(1)
        attempts += 1

df = mdb.read_table(expanduser('~/Desktop/ROUTE_GMB.mdb'),
                    mdb.list_tables(expanduser('~/Desktop/ROUTE_GMB.mdb'))[0])
#df.drop_duplicates(subset=['ROUTE_ID', 'ROUTE_SEQ', 'STOP_SEQ'], keep='last', inplace=True)
#df.fillna(-1, inplace=True)
conn = pymssql.connect(server=r'192.168.9.75\sql2012',
                       user='******',
                       password='******',
                       database='DWHRMS_DEMO')
last = len(df) - 1
sql = 'DELETE FROM [t_MobMinibusFares]'
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()
for i, r in df.iterrows():
    if i % 1000 == 0:
        sql = f'INSERT INTO [t_MobMinibusFares] ([Route ID], [District], [Route Name], [Source zh-hk], [Source zh-cn], [Source en-us], [Dest zh-hk], [Dest zh-cn], [Dest en-us], [Fare]) \n'
    else:
Example #18
# (c) Copyright IBM Corporation 2020.

# LICENSE: Apache License 2.0 (Apache-2.0)
# http://www.apache.org/licenses/LICENSE-2.0

# Preliminary processing code, used to create the csv files. Requires special installation to open `.mdb` files
try:
    import pandas_access as mdb  # This is a pandas wrapper for `mdbtools`, mdbtools must be installed separately with homebrew (on mac)

    mdb_file = 'isear/isear_databank.mdb'

    df_full = mdb.read_table(mdb_file, 'DATA')
    df = df_full[['Field1', 'SIT']]
    df = df.rename(columns={"Field1": "label", "SIT": "text"})
    df['text'] = df['text'].apply(lambda x: x.replace('á\n', ''))

    df.to_csv('isear/isear_data.csv', index=False)

except Exception as e:
    print(
        '\n********************************************************************************************************\n'
        '****** Loading the ISEAR dataset requires special dependencies.                                *********\n'
        '****** On Mac/Linux, install https://github.com/mdbtools/mdbtools and `pip install pandas_access`, *****\n'
        '****** and then rerun the main script.                                                             *****\n'
        '********************************************************************************************************\n'
    )
Example #19
                        help='Prometea Dataset file')

    args = parser.parse_args()

    # connect to MongoDB prometea database
    print('STEP 01: Connect to Prometea MongoDB database')
    print('---------------------------------------------')
    client = MongoClient(port=27017)
    db = client.prometeadb
    print()

    # load to MS Access prometea database
    print('STEP 02: Load Prometea MS Access dataset')
    print('----------------------------------------')
    print('Loading Prometea DGP 5 Años Dataset from ' + args.dataset)
    dataset = mdb.read_table(args.dataset, 'DGP_5Anios')
    print()

    # Iterate over parent datasets
    print('STEP 03: Start Merge DGP 5 Años Dataset')
    print('--------------------------------------')

    # insert child pacientes dataset
    pbar = tqdm(total=dataset.shape[0], desc='DGP 5 Años Dataset')
    num = 0

    for index, rows in dataset.iterrows():
        db.Paciente.update_one({
            'centro': rows.centro,
            'nhc': rows.nhc
        }, {
Example #20
import pandas_access as mdb
import json

db_filename = 'l2.mdb'

# Listing the tables.
#for tbl in mdb.list_tables(db_filename):
#  print(tbl)


# Read a small table.
df = mdb.read_table(db_filename, "drops").sort_values(by=['npc_id', 'item_id']).to_dict()

result = {}
current = None
drops = []
spoil = []
for k, v in df['npc_id'].items():
    if current != v:
        if current:
            #print(sorted(drops, key=lambda x: int(x[0])))
            result[current] = {"drop": drops, "spoil": spoil}
        print("New mob: %s " % v)
        current = v
        drops = []
        spoil = []
    print(v)
    #print(k)
#   print("Item id: {} percentage: {} is_sweep: {} min: {} max: {}".format(v, drops_list['percentage'][k], drops_list['sweep'][k], drops_list['min'][k], drops_list['max'][k]))
    data = [df['item_id'][k], df['min'][k], df['max'][k], df['percentage'][k] / 100.0]
    if df['sweep'][k] == "1":
Example #21
def phabconvert(infile, login_info='test-test-test-test-test'):

    login_info = login_info.strip().split("-")
    login = str(login_info[0])
    agency = str(login_info[1])
    owner = str(login_info[2])
    year = str(login_info[3])
    project = str(login_info[4])

    errorLog("READ IN TABLES REQUIRED TO RUN QUERY:")
    sample_entry = mdb.read_table(infile,
                                  "Sample_Entry",
                                  dtype={'s_Generation': str})
    errorLog(sample_entry)
    event_lookup = mdb.read_table(infile, "EventLookUp")
    protocol_lookup = mdb.read_table(infile, "ProtocolLookUp")
    station_lookup = mdb.read_table(infile,
                                    "StationLookUp",
                                    dtype={'s_Generation': str})
    agency_lookup = mdb.read_table(infile,
                                   "AgencyLookUp",
                                   dtype={'s_Generation': str})
    project_lookup = mdb.read_table(infile,
                                    "ProjectLookUp",
                                    dtype={'s_Generation': str})
    qa_lookup = mdb.read_table(infile, "QALookUp", dtype={'s_Generation': str})
    resqual_lookup = mdb.read_table(infile,
                                    "ResQualLookUp",
                                    dtype={'s_Generation': str})
    stationdetail_lookup = mdb.read_table(infile,
                                          "StationDetailLookUp",
                                          dtype={'s_Generation': str})
    location_entry = mdb.read_table(infile,
                                    "Location_Entry",
                                    dtype={'s_Generation': str})
    location_lookup = mdb.read_table(infile,
                                     "LocationLookUp",
                                     dtype={'s_Generation': str})
    parent_project_lookup = mdb.read_table(infile,
                                           "ParentProjectLookUp",
                                           dtype={'s_Generation': str})

    collectionmethod_lookup = mdb.read_table(infile,
                                             "CollectionMethodLookUp",
                                             dtype={'s_Generation': str})
    constituent_lookup = mdb.read_table(infile,
                                        "ConstituentLookUp",
                                        dtype={'s_Generation': str})
    matrix_lookup = mdb.read_table(infile,
                                   "MatrixLookUp",
                                   dtype={'s_Generation': str})
    method_lookup = mdb.read_table(infile,
                                   "MethodLookUp",
                                   dtype={'s_Generation': str})
    analyte_lookup = mdb.read_table(infile,
                                    "AnalyteLookUp",
                                    dtype={'s_Generation': str})
    unit_lookup = mdb.read_table(infile,
                                 "UnitLookUp",
                                 dtype={'s_Generation': str})
    fraction_lookup = mdb.read_table(infile,
                                     "FractionLookUp",
                                     dtype={'s_Generation': str})
    collectiondevice_lookup = mdb.read_table(infile,
                                             "CollectionDeviceLookUp",
                                             dtype={'s_Generation': str})

    compliance_lookup = mdb.read_table(infile,
                                       "ComplianceLookUp",
                                       dtype={'s_Generation': str})
    batchverification_lookup = mdb.read_table(infile,
                                              "BatchVerificationLookUp",
                                              dtype={'s_Generation': str})

    ##### field specific tables
    fieldcollection_entry = mdb.read_table(infile,
                                           "FieldCollection_Entry",
                                           dtype={'s_Generation': str})
    fieldresult_entry = mdb.read_table(infile,
                                       "FieldResult_Entry",
                                       dtype={'s_Generation': str})

    #### habitat specific tables
    habitatcollection_entry = mdb.read_table(infile,
                                             "HabitatCollection_Entry",
                                             dtype={'s_Generation': str})
    habitatresult_entry = mdb.read_table(infile,
                                         "HabitatResult_Entry",
                                         dtype={'s_Generation': str})

    errorLog("STARTING MERGING TOGETHER FIELDS - IE. RUNNING QUERY")
    ### pull together sample and location data - used by both field and habitat queries
    sample = pd.merge(sample_entry[[
        'AgencyCode', 'EventCode', 'ProjectCode', 'ProtocolCode',
        'StationCode', 'SampleDate', 'SampleComments', 'SampleRowID'
    ]],
                      agency_lookup[['AgencyCode', 'AgencyName']],
                      on='AgencyCode',
                      how='left')

    sample = pd.merge(sample,
                      event_lookup[['EventCode', 'EventName']],
                      on='EventCode',
                      how='left')

    sample = pd.merge(
        sample,
        project_lookup[['ParentProjectCode', 'ProjectCode', 'ProjectName']],
        on='ProjectCode',
        how='left')

    sample = pd.merge(sample,
                      protocol_lookup[['ProtocolCode', 'ProtocolName']],
                      on='ProtocolCode',
                      how='left')

    station = pd.merge(station_lookup[[
        'StationCode', 'StationName', 'EcoregionLevel3Code', 'HydrologicUnit',
        'County', 'LocalWatershed', 'UpstreamArea'
    ]],
                       stationdetail_lookup[[
                           'StationCode', 'TargetLatitude', 'TargetLongitude',
                           'Datum'
                       ]],
                       on='StationCode',
                       how='left')

    sample = pd.merge(sample,
                      station[[
                          'StationCode', 'StationName', 'EcoregionLevel3Code',
                          'HydrologicUnit', 'County', 'LocalWatershed',
                          'UpstreamArea', 'TargetLatitude', 'TargetLongitude',
                          'Datum'
                      ]],
                      on='StationCode',
                      how='left')

    location = pd.merge(location_entry[[
        'SampleRowID', 'LocationRowID', 'LocationCode', 'GeometryShape'
    ]],
                        location_lookup[['LocationCode', 'LocationName']],
                        on='LocationCode',
                        how='left')

    sample = pd.merge(sample,
                      location[[
                          'SampleRowID', 'LocationCode', 'LocationName',
                          'LocationRowID', 'GeometryShape'
                      ]],
                      on='SampleRowID',
                      how='left')

    # Robert 20 Feb 2019
    # Here we add the ParentProjectName to the sample dataframe
    sample = pd.merge(
        sample,
        parent_project_lookup[['ParentProjectCode', 'ParentProjectName']],
        on="ParentProjectCode",
        how='left')

    #### pull together constituent entries - - used by both field and habitat queries
    constituent = pd.merge(constituent_lookup[[
        'ConstituentRowID', 'AnalyteCode', 'FractionCode', 'MatrixCode',
        'MethodCode', 'UnitCode'
    ]],
                           fraction_lookup[['FractionCode', 'FractionName']],
                           on='FractionCode',
                           how='left')

    constituent = pd.merge(constituent[[
        'ConstituentRowID', 'AnalyteCode', 'FractionCode', 'FractionName',
        'MatrixCode', 'MethodCode', 'UnitCode'
    ]],
                           analyte_lookup[['AnalyteCode', 'AnalyteName']],
                           on='AnalyteCode',
                           how='left')

    constituent = pd.merge(constituent[[
        'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
        'FractionName', 'MatrixCode', 'MethodCode', 'UnitCode'
    ]],
                           matrix_lookup[['MatrixCode', 'MatrixName']],
                           on='MatrixCode',
                           how='left')

    constituent = pd.merge(constituent[[
        'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
        'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'UnitCode'
    ]],
                           method_lookup[['MethodCode', 'MethodName']],
                           on='MethodCode',
                           how='left')

    constituent = pd.merge(constituent[[
        'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
        'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName',
        'UnitCode'
    ]],
                           unit_lookup[['UnitCode', 'UnitName']],
                           on='UnitCode',
                           how='left')

    ##### FIELD SPECIFIC CODE
    #### pull together field collection entry
    fieldcollection = pd.merge(fieldcollection_entry[[
        'FieldCollectionRowID', 'LocationRowID', 'CollectionMethodCode',
        'CollectionTime', 'Replicate', 'CollectionDepth',
        'UnitCollectionDepth', 'FieldCollectionComments'
    ]],
                               collectionmethod_lookup[[
                                   'CollectionMethodCode',
                                   'CollectionMethodName'
                               ]],
                               on='CollectionMethodCode',
                               how='left')

    #### pull together field result entries

    fieldresult = pd.merge(fieldresult_entry[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'CollectionDeviceCode', 'ComplianceCode',
        'QACode', 'ResQualCode', 'ExportData'
    ]],
                           batchverification_lookup[[
                               'BatchVerificationCode',
                               'BatchVerificationDescr'
                           ]],
                           on='BatchVerificationCode',
                           how='left')

    fieldresult = pd.merge(fieldresult[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'ComplianceCode', 'QACode', 'ResQualCode',
        'ExportData'
    ]],
                           collectiondevice_lookup[[
                               'CollectionDeviceCode', 'CollectionDeviceDescr'
                           ]],
                           on='CollectionDeviceCode',
                           how='left')

    fieldresult = pd.merge(fieldresult[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'QACode', 'ResQualCode', 'ExportData'
    ]],
                           compliance_lookup[[
                               'ComplianceCode', 'ComplianceName'
                           ]],
                           on='ComplianceCode',
                           how='left')

    fieldresult = pd.merge(fieldresult[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'ResQualCode', 'ExportData'
    ]],
                           qa_lookup[['QACode', 'QAName', 'QADescr']],
                           on='QACode',
                           how='left')

    fieldresult = pd.merge(fieldresult[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ExportData'
    ]],
                           resqual_lookup[['ResQualCode', 'ResQualName']],
                           on='ResQualCode',
                           how='left')

    # combine fieldresult and constituent
    fieldresult = pd.merge(fieldresult[[
        'ConstituentRowID', 'FieldResultRowID', 'FieldCollectionRowID',
        'CalibrationDate', 'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName',
        'ExportData'
    ]],
                           constituent[[
                               'ConstituentRowID', 'AnalyteCode',
                               'AnalyteName', 'FractionCode', 'FractionName',
                               'MatrixCode', 'MatrixName', 'MethodCode',
                               'MethodName', 'UnitCode', 'UnitName'
                           ]],
                           on='ConstituentRowID',
                           how='left')

    field = pd.merge(fieldresult[[
        'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate',
        'FieldReplicate', 'Result', 'FieldResultComments',
        'BatchVerificationCode', 'BatchVerificationDescr',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName',
        'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
        'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName',
        'UnitCode', 'UnitName', 'ExportData'
    ]],
                     fieldcollection[[
                         'FieldCollectionRowID', 'LocationRowID',
                         'CollectionMethodCode', 'CollectionMethodName',
                         'CollectionTime', 'Replicate', 'CollectionDepth',
                         'UnitCollectionDepth', 'FieldCollectionComments'
                     ]],
                     on='FieldCollectionRowID')

    field_query = pd.merge(
        field[[
            'FieldResultRowID', 'FieldCollectionRowID', 'CalibrationDate',
            'FieldReplicate', 'Result', 'FieldResultComments',
            'BatchVerificationCode', 'BatchVerificationDescr',
            'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
            'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName',
            'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
            'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode',
            'MethodName', 'UnitCode', 'UnitName', 'FieldCollectionRowID',
            'LocationRowID', 'CollectionMethodCode', 'CollectionMethodName',
            'CollectionTime', 'Replicate', 'CollectionDepth',
            'UnitCollectionDepth', 'FieldCollectionComments'
        ]],
        sample[[
            'LocationRowID', 'AgencyCode', 'AgencyName', 'EventCode',
            'EventName', 'ProjectCode', 'ProjectName', 'ProtocolCode',
            'ProtocolName', 'StationCode', 'StationName',
            'EcoregionLevel3Code', 'HydrologicUnit', 'County',
            'LocalWatershed', 'UpstreamArea', 'TargetLatitude',
            'TargetLongitude', 'Datum', 'SampleDate', 'SampleComments',
            'SampleRowID', 'LocationCode', 'LocationName', 'GeometryShape'
        ]],
        on='LocationRowID')

    # lowercase all fieldname
    field_query.columns = [x.lower() for x in field_query.columns]

    errorLog("created fields -- datetime:")
    ### calculated fields
    ### month, year, ecoregionlayer, ecoregionlevel, ecoregioncode, rwqcb
    # month from sampledate
    # year from sampledate
    # make sampledate type object into datetime
    field_query['sampledate'] = pd.to_datetime(field_query['sampledate'])
    field_query["month"] = field_query.sampledate.dt.month
    field_query["year"] = field_query.sampledate.dt.year
    errorLog("--- end created fields")

    # ecoregionlayer = empty why?
    field_query['ecoregionlayer'] = ''
    # ecoregionlevel = 3 (habitat) or 33 (field)
    field_query['ecoregionlevel'] = 33
    # ecoregioncode = stationlookup.ecoregionlevel3code
    field_query.rename(columns={'ecoregionlevel3code': 'ecoregioncode'},
                       inplace=True)
    field_query.rename(columns={
        'agencycode': 'sampleagencycode',
        'agencyname': 'sampleagencyname'
    },
                       inplace=True)

    # rwqcb = empty why?
    field_query['rwqcb'] = ''

    ### IMPORTANT THE FIELDS BELOW NEED TO BE FILLED IN ####
    #analytewfraction
    #analytewfractionwunit
    #analytewfractionwmatrixwunit

    #result_textraw
    #resultraw

    # f-h = field or habitat
    field_query['f_h'] = 'f'

    #submittingagency = login_agency
    field_query['submittingagency'] = ''
    #databasefilepath = ''
    field_query['databasefilepath'] = ''
    #dateloaded = submissiondate?
    errorLog("start Timestamp")
    field_query['dateloaded'] = pd.Timestamp(datetime.datetime(2017, 1, 1))
    errorLog("end Timestamp")
    ##dataloadedby
    field_query['dataloadedby'] = 'checker'
    ##cleaned
    field_query['cleaned'] = 1
    ##qaed
    field_query['qaed'] = 1
    ##metricscalculated
    field_query['metricscalculated'] = 1
    ##deactivate
    field_query['deactivate'] = 1
    ##projectcode
    field_query['projectcode'] = ''
    #loadidnum = submissionid
    field_query['loadidnum'] = -88

    field_query['rownum'] = -88

    field_query['project_code'] = project
    field_query['login_email'] = login
    field_query['login_agency'] = agency
    field_query['login_owner'] = owner
    field_query['login_year'] = year
    field_query['login_project'] = project

    # drop temp columns
    # field_query.drop(['constituentrowid','fieldresultrowid', 'fieldcollectionrowid','fieldresultcomments','qaname','fieldcollectioncomments'],axis=1,inplace=True)

    # Rename the columns to what they will be when it gets loaded into tbl_phab
    field_query.rename(columns={
        'fieldcollectionrowid': 'collectionrowid',
        'fieldresultcomments': 'resultcomments',
        'fieldcollectioncomments': 'collectioncomments'
    },
                       inplace=True)

    ##### END FIELD SPECIFIC CODE

    ##### HABITAT SPECIFIC CODE
    #### pull together field collection entry
    habitatcollection = pd.merge(habitatcollection_entry[[
        'HabitatCollectionRowID', 'LocationRowID', 'CollectionMethodCode',
        'CollectionTime', 'Replicate', 'HabitatCollectionComments'
    ]],
                                 collectionmethod_lookup[[
                                     'CollectionMethodCode',
                                     'CollectionMethodName'
                                 ]],
                                 on='CollectionMethodCode',
                                 how='left')

    #### pull together field result entries

    habitatresult = pd.merge(habitatresult_entry[[
        'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID',
        'VariableResult', 'Result', 'HabitatResultComments',
        'CollectionDeviceCode', 'ComplianceCode', 'QACode', 'ResQualCode',
        'ExportData'
    ]],
                             collectiondevice_lookup[[
                                 'CollectionDeviceCode',
                                 'CollectionDeviceDescr'
                             ]],
                             on='CollectionDeviceCode',
                             how='left')

    habitatresult = pd.merge(
        habitatresult[[
            'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID',
            'VariableResult', 'Result', 'HabitatResultComments',
            'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
            'QACode', 'ResQualCode', 'ExportData'
        ]],
        compliance_lookup[['ComplianceCode', 'ComplianceName']],
        on='ComplianceCode',
        how='left')

    habitatresult = pd.merge(habitatresult[[
        'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID',
        'VariableResult', 'Result', 'HabitatResultComments',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'ResQualCode', 'ExportData'
    ]],
                             qa_lookup[['QACode', 'QAName']],
                             on='QACode',
                             how='left')

    habitatresult = pd.merge(habitatresult[[
        'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID',
        'VariableResult', 'Result', 'HabitatResultComments',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ExportData'
    ]],
                             resqual_lookup[['ResQualCode', 'ResQualName']],
                             on='ResQualCode',
                             how='left')

    # combine fieldresult and constituent
    habitatresult = pd.merge(habitatresult[[
        'ConstituentRowID', 'HabitatResultRowID', 'HabitatCollectionRowID',
        'VariableResult', 'Result', 'HabitatResultComments',
        'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
        'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName',
        'ExportData'
    ]],
                             constituent[[
                                 'ConstituentRowID', 'AnalyteCode',
                                 'AnalyteName', 'FractionCode', 'FractionName',
                                 'MatrixCode', 'MatrixName', 'MethodCode',
                                 'MethodName', 'UnitCode', 'UnitName'
                             ]],
                             on='ConstituentRowID',
                             how='left')

    habitat = pd.merge(habitatresult[[
        'HabitatResultRowID', 'HabitatCollectionRowID', 'VariableResult',
        'Result', 'HabitatResultComments', 'CollectionDeviceCode',
        'CollectionDeviceDescr', 'ComplianceCode', 'ComplianceName', 'QACode',
        'QAName', 'ResQualCode', 'ResQualName', 'ConstituentRowID',
        'AnalyteCode', 'AnalyteName', 'FractionCode', 'FractionName',
        'MatrixCode', 'MatrixName', 'MethodCode', 'MethodName', 'UnitCode',
        'UnitName', 'ExportData'
    ]],
                       habitatcollection[[
                           'HabitatCollectionRowID', 'LocationRowID',
                           'CollectionMethodCode', 'CollectionMethodName',
                           'CollectionTime', 'Replicate',
                           'HabitatCollectionComments'
                       ]],
                       on='HabitatCollectionRowID')

    habitat_query = pd.merge(
        habitat[[
            'HabitatResultRowID', 'HabitatCollectionRowID', 'LocationRowID',
            'VariableResult', 'Result', 'HabitatResultComments',
            'CollectionDeviceCode', 'CollectionDeviceDescr', 'ComplianceCode',
            'ComplianceName', 'QACode', 'QAName', 'ResQualCode', 'ResQualName',
            'ConstituentRowID', 'AnalyteCode', 'AnalyteName', 'FractionCode',
            'FractionName', 'MatrixCode', 'MatrixName', 'MethodCode',
            'MethodName', 'UnitCode', 'UnitName', 'CollectionMethodCode',
            'CollectionMethodName', 'CollectionTime', 'Replicate',
            'HabitatCollectionComments', 'ExportData'
        ]],
        sample[[
            'LocationRowID', 'AgencyCode', 'AgencyName', 'EventCode',
            'EventName', 'ProjectCode', 'ProjectName', 'ProtocolCode',
            'ProtocolName', 'StationCode', 'StationName',
            'EcoregionLevel3Code', 'HydrologicUnit', 'County',
            'LocalWatershed', 'UpstreamArea', 'TargetLatitude',
            'TargetLongitude', 'Datum', 'SampleDate', 'SampleComments',
            'SampleRowID', 'LocationCode', 'LocationName', 'GeometryShape'
        ]],
        on='LocationRowID')

    # lowercase all fieldname
    habitat_query.columns = [x.lower() for x in habitat_query.columns]

    ### calculated fields
    ### month, year, ecoregionlayer, ecoregionlevel, ecoregioncode, rwqcb
    # month from sampledate
    # year from sampledate
    # make sampledate type object into datetime
    habitat_query['sampledate'] = pd.to_datetime(habitat_query['sampledate'])
    habitat_query["month"] = habitat_query.sampledate.dt.month
    habitat_query["year"] = habitat_query.sampledate.dt.year

    # ecoregionlayer = empty why?
    habitat_query['ecoregionlayer'] = ''
    # ecoregionlevel = 3 (habitat) or 33 (field)
    habitat_query['ecoregionlevel'] = 3
    # ecoregioncode = stationlookup.ecoregionlevel3code
    habitat_query.rename(columns={'ecoregionlevel3code': 'ecoregioncode'},
                         inplace=True)
    habitat_query.rename(columns={
        'agencycode': 'sampleagencycode',
        'agencyname': 'sampleagencyname'
    },
                         inplace=True)

    # rwqcb = empty why?
    habitat_query['rwqcb'] = ''

    #analytewfraction
    #analytewfractionwunit
    #analytewfractionwmatrixwunit

    #result_textraw
    #resultraw

    # f-h = field or habitat
    habitat_query['f_h'] = 'h'

    #submittingagency = login_agency
    habitat_query['submittingagency'] = ''
    #databasefilepath = ''
    habitat_query['databasefilepath'] = ''
    #dateloaded = submissiondate?
    habitat_query['dateloaded'] = pd.Timestamp(datetime.datetime(2017, 1, 1))
    ##dataloadedby
    habitat_query['dataloadedby'] = 'checker'
    ##cleaned
    habitat_query['cleaned'] = 1
    ##qaed
    habitat_query['qaed'] = 1
    ##metricscalculated
    habitat_query['metricscalculated'] = 1
    ##deactivate
    habitat_query['deactivate'] = 1
    ##projectcode
    habitat_query['projectcode'] = ''
    #loadidnum = submissionid
    habitat_query['loadidnum'] = -88

    habitat_query['rownum'] = -88

    habitat_query['project_code'] = project
    habitat_query['login_email'] = login
    habitat_query['login_agency'] = agency
    habitat_query['login_owner'] = owner
    habitat_query['login_year'] = year
    habitat_query['login_project'] = project

    # drop temp columns
    # habitat_query.drop(['constituentrowid','habitatresultrowid', 'habitatcollectionrowid','habitatresultcomments','qaname','habitatcollectioncomments'],axis=1,inplace=True)

    # Rename columns to what they will be when it gets put into tbl_phab
    habitat_query.rename(columns={
        'habitatcollectionrowid': 'collectionrowid',
        'habitatresultcomments': 'resultcomments',
        'habitatcollectioncomments': 'collectioncomments'
    },
                         inplace=True)

    ##### END HABITAT SPECIFIC CODE
    #return field_query, habitat_query

    # For some reason there was a duplicated column in the field query dataframe, and it caused the script to break at the concatenation part.
    # Here is the workaround, to drop duplicated columns
    field_query = field_query.loc[:, ~field_query.columns.duplicated()]

    ## CONCATENATE FIELD AND HABITAT QUERIES
    rawdata = pd.concat([field_query, habitat_query])

    # drop unnecessary columns
    rawdata.drop([
        'constituentrowid', 'fieldresultrowid', 'habitatresultrowid', 'qaname'
    ],
                 axis=1,
                 inplace=True)

    ## FILL IN MISSING FIELDS
    # create engine connection
    eng = create_engine('postgresql://*****:*****@192.168.1.17:5432/smc')
    # get list of tbl_phab columns
    metadata = MetaData()
    tbl_phab = Table('tbl_phab', metadata, autoload=True, autoload_with=eng)
    tbl_phab_cols = [x.name for x in tbl_phab.columns]
    tbl_phab_cols = [
        x for x in tbl_phab_cols if x not in [
            'objectid', 'globalid', 'created_user', 'created_date',
            'last_edited_user', 'last_edited_date'
        ]
    ]
    # get list of missing columns
    missing_cols = [c for c in tbl_phab_cols if c not in rawdata.columns]
    # fill in missing columns with default values. (Needed to load into raw phab data table)
    for x in missing_cols:
        if isinstance(tbl_phab.columns[x].type, VARCHAR):
            rawdata[x] = ''
        if isinstance(tbl_phab.columns[x].type, INTEGER) | isinstance(
                tbl_phab.columns[x].type, NUMERIC) | isinstance(
                    tbl_phab.columns[x].type, SMALLINT):
            rawdata[x] = -88
        if isinstance(tbl_phab.columns[x].type, TIMESTAMP):
            rawdata[x] = datetime.date(1950, 1, 1)

    phabtable = pd.read_sql("SELECT * FROM tbl_phab LIMIT 1", eng)
    phabcolumnsmatched = len(phabtable.columns) - 1

    phabmetricstable = pd.read_sql("SELECT * FROM tmp_phabmetrics LIMIT 1",
                                   eng)
    phabmetricscolumnsmatched = len(phabmetricstable.columns) - 1

    return rawdata
Example #22
    #     try:
    #         if element == i.encode("utf-8") :
    #             return True
    #     except:
    #         print (i,db.index(i))
    # return False


print("Loading First DataSet ")
df1 = pd.read_excel('PersianWords.xlsx', sheet_name='Sheet1')
persian_words = set(df1['*Total farsi Word (Moin+openoffice.fa+wikipedia)'])
print("Finish Loading Fisrt DataSet")
#Load First DataSet to Memory

print("Loading Second DataSet")
df2 = mdb.read_table("FLEXICON.mdb", "Entries")
affixes = mdb.read_table("FLEXICON.mdb", "Affixes")
# words = list(df2["WrittenForm"])
flexicon_words = set(df2["WrittenForm"])
#Load Second DataSet to Memory
print("Finish Loading Second DataSet")

search_word = u'کیارش'
answer_list = set()
gen = all_perms(search_word)
print("start executing ...")

while 1:
    try:
        next_perm = next(gen)
        if search(next_perm, flexicon_words) or search(next_perm,
Example #23
    parser.add_argument('-d', '--dataset', type=str, help='Prometea Dataset file')

    args = parser.parse_args()
            
    # connect to MongoDB prometea database
    print('STEP 01: Connect to Prometea MongoDB database')
    print('---------------------------------------------')
    client = MongoClient(port=27017)
    db=client.prometeadb
    print()

    # load to MS Access prometea database        
    print('STEP 02: Load Prometea MS Access dataset')
    print('----------------------------------------')
    print('Loading Prometea DGP Ultimos Dataset from ' + args.dataset)    
    dataset = mdb.read_table(args.dataset, 'DGP_Ultimo')
    print()
          
    # Iterate over parent datasets
    print('STEP 03: Start Merge DGP Ultimos Dataset')    
    print('--------------------------------------')
    
    # insert child pacientes dataset
    pbar = tqdm(total=dataset.shape[0], desc='DGP Ultimos Dataset')
    num = 0
    
    for index, rows in dataset.iterrows(): 
        db.Pacientes.update_one({'centro': rows.centro, 'nhc': rows.nhc}, 
                                {'$push': {'dgps': {'enfermedad': 'epoc',
                                                    'dgp': rows.dgp, 
                                                    'dgpTxt': rows.dgpTxt, 
Example #24
def convert(input_name, output_name=None, options={}):
    """Convert MDB table to a GLM object list
    """

    debug(f"input_name = '{input_name}'")
    debug(f"output_name = '{output_name}'")
    debug(f"options = '{options}'")

    # options
    if "class" in options.keys():
        global CLASS
        CLASS = options.pop("class")
    if "modules" in options.keys():
        global MODULES
        MODULES = options.pop("modules").split(",")
    if "chunksize" in options.keys():
        global CHUNKSIZE
        CHUNKSIZE = int(options.pop("chunksize"))
    if "columns" in options.keys():
        global COLUMNS
        columns = options.pop("columns").split(",")
        COLUMNS = {}
        for column in columns:
            spec = column.split(":")
            if len(spec) != 2:
                error(f"columns spec '{spec}'' is invalid", 1)
            COLUMNS[spec[0]] = spec[1]
    if "dtypes" in options.keys():
        global DTYPES
        dtypes = options.pop("dtypes").split(",")
        DTYPES = {}
        for dtype in dtypes:
            spec = dtype.split(":")
            if len(spec) != 2:
                error(f"dtype spec '{spec}'' is invalid", 1)
            DTYPES[spec[0]] = spec[1]
    if "table" in options.keys():
        global TABLE
        TABLE = options.pop("table")
    if options:
        error(f"option '{list(options.keys())[0]}' not valid", 1)

    # check mdb name
    if not os.path.exists(input_name):
        error(f"'{input_name}' not found", 1)
    debug(f"file '{input_name}' ok")

    # check schema
    schema = mdb.read_schema(input_name)
    if TABLE not in schema.keys():
        error(f"table '{TABLE}' not found")
    debug(f"table '{TABLE}' ok")

    # check for class
    if not CLASS and "class" not in schema.keys():
        error("class must be specified if not found in data", 1)

    # load data
    try:
        DB = mdb.read_table(input_name, TABLE, chunksize=CHUNKSIZE)
    except Exception as err:
        error(f"{input_name} open failed ({err})", 1)
    objects = []
    for chunk in DB:
        objects.append(chunk)

    # save data
    if not output_name:
        output_name = TABLE.lower() + ".glm"
    try:
        GLM = open(output_name, "w")
    except Exception as err:
        error(f"{output_name} open failed ({err})", 1)
    GLM.write(f"// generated by '{' '.join(sys.argv)}'\n")
    for module in MODULES:
        GLM.write(f'module {module};\n')

    # process records
    classes = []
    modules = MODULES
    for obj in objects:

        # write module (first time only)
        if "module" in obj.columns:
            module = obj["module"]
        else:
            module = None
        if module and module not in modules:
            modules.append(module)
            GLM.write(f'module {module};\n')

        # write class (first time only)
        if "class" in obj.columns:
            classname = obj["class"]
        elif CLASS:
            classname = CLASS
        else:
            error(f"no class specified (obj={obj})", 2)
        if classname not in classes and not module:
            classes.append(classname)
            GLM.write(f'class {classname} {{\n')
            for item, dtype in zip(obj.columns, obj.dtypes):
                if item in COLUMNS.keys():
                    item = COLUMNS[item]
                if item in [
                        "id", "class", "groupid", "name", "next", "parent",
                        "child_count", "rank", "valid_to_", "schedule_skew",
                        "latitude", "longitude", "in_svc", "out_svc",
                        "rng_state", "heartbeat", "guid", "flags"
                ]:
                    continue
                if item in DTYPES.keys():
                    dtype = DTYPES[item]
                elif str(dtype) not in DATATYPES.keys():
                    dtype = "object"
                else:
                    dtype = DATATYPES[str(dtype)]
                GLM.write(f'\t{dtype} {item};\n')
            GLM.write('}\n')

        # write object
        for num, values in obj.to_dict('index').items():
            GLM.write(f'object {classname} {{\n')
            for key, value in values.items():
                if key in COLUMNS.keys():
                    GLM.write(f'\t{COLUMNS[key]} "{value}";\n')
                else:
                    GLM.write(f'\t{key} "{value}";\n')
            GLM.write('}\n')
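
# A hedged usage sketch (not part of the original converter): how convert()
# above might be invoked. The MDB file name, table/class names and the
# column/dtype mappings are illustrative assumptions; only the option keys
# mirror the ones actually parsed at the top of the function.
convert("meters.mdb",
        output_name="meters.glm",
        options={
            "table": "Meters",
            "class": "meter",
            "modules": "powerflow",
            "columns": "MeterName:name",
            "dtypes": "MeterName:char32",
        })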
# Does what it says on the tin.
# Note that it takes a while to read all the mdb thingies.
# To run pandas_access successfully, install mdbtools with your system package manager.
# -Matt

import pandas
import numpy as np
import pandas_access
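
# A hedged addition (not in the original script): pandas_access shells out to
# the mdbtools command-line utilities, so fail early with a clear message if
# they are missing (e.g. `apt-get install mdbtools` or `brew install mdbtools`).
import shutil

if shutil.which("mdb-export") is None:
    raise RuntimeError("mdbtools not found on PATH; install it before running this script")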

DATASET = "./FDOC_January_2017.mdb"

if __name__ == "__main__":
    table_list = []
    for tablename in pandas_access.list_tables(DATASET):
        print(tablename)
        table_list.append(tablename)

    tables_dict = {}

    for table in table_list[0:2]:  #doing more makes it take forever.
        tables_dict[table] = pandas_access.read_table(DATASET, table)
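
    # A hedged sketch (not in the original): for the remaining, larger tables,
    # stream rows in chunks rather than loading each table whole; read_table()
    # passes chunksize through to pandas and yields DataFrame chunks.
    for table in table_list[2:]:
        row_count = 0
        for chunk in pandas_access.read_table(DATASET, table, chunksize=100000):
            row_count += len(chunk)
        print(table, row_count)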
Example #26
0
# imports and retry counter needed to make this fragment runnable
from os.path import expanduser
from urllib.request import urlopen
from urllib.error import URLError

import pandas_access as mdb
import pymssql

attempts = 0
while attempts < 3:
    try:
        response = urlopen(
            'http://static.data.gov.hk/td/routes-and-fares/RSTOP_BUS.mdb',
            timeout=10)
        content = response.read()
        with open(expanduser('~/Desktop/RSTOP_BUS.mdb'), 'wb') as f:
            f.write(content)
        break
    except URLError as e:
        if attempts == 2:
            exit(1)
        attempts += 1

df = mdb.read_table(expanduser('~/Desktop/RSTOP_BUS.mdb'),
                    mdb.list_tables(expanduser('~/Desktop/RSTOP_BUS.mdb'))[0])
df.drop_duplicates(subset=['ROUTE_ID', 'ROUTE_SEQ', 'STOP_SEQ'],
                   keep='last',
                   inplace=True)
#df.fillna(-1, inplace=True)
conn = pymssql.connect(server='192.168.13.60',
                       user='******',
                       password='******',
                       database='DWHRMS_DEMO')
last = len(df) - 1
sql = 'DELETE FROM [t_MobBusStops]'
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()

folder = expanduser('~/Desktop/rstops')
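
# A hedged sketch (not in the original fragment): one way the deduplicated
# frame might be bulk-inserted into t_MobBusStops with pymssql. Only the three
# dedupe-key columns are shown; the real table presumably has more columns, so
# treat the column list and placeholders as illustrative assumptions.
rows = df[['ROUTE_ID', 'ROUTE_SEQ', 'STOP_SEQ']].values.tolist()
cursor.executemany(
    'INSERT INTO [t_MobBusStops] (ROUTE_ID, ROUTE_SEQ, STOP_SEQ) VALUES (%s, %s, %s)',
    rows)
conn.commit()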
Example #27
0
def convert(input_name,
            output_name = None,
            options = {}):
    """Convert MDB table to a GLM object list
    """

    debug(f"input_name = '{input_name}'")
    debug(f"output_name = '{output_name}'")
    debug(f"options = '{options}'")

    global PROPERTIES
    # options
    if "chunksize" in options.keys():
        global CHUNKSIZE
        CHUNKSIZE = int(options.pop("chunksize"))
        debug(f"chunksize = {CHUNKSIZE}")
    if "table" in options.keys():
        global TABLE
        TABLE = options.pop("table")
        debug(f"table = {TABLE}")
    if "index" in options.keys():
        global INDEXCOLS
        INDEXCOLS = options.pop("index").split(",")
        debug(f"index = {INDEXCOLS}")
    if "parent" in options.keys():
        global PARENTCOL
        PARENTCOL = options.pop("parent")
        debug(f"parent = {PARENTCOL}")
    if "values" in options.keys():
        global VALUECOLS
        VALUECOLS = options.pop("values").split(",")
        PROPERTIES = VALUECOLS
        debug(f"values = {VALUECOLS}")
    if "csvfile" in options.keys():
        global CSVFILE
        CSVFILE = options.pop("csvfile")
        debug(f"csvfile = {CSVFILE}")
    if "scale" in options.keys():
        global SCALE
        SCALE = options.pop("scale").split(",")
        try:
            SCALE = list(map(lambda x:float(x),SCALE))
        except Exception as err:
            error(f"scale is invalid ({err})",1)
        debug(f"scale = {SCALE}")
    if "round" in options.keys():
        global ROUND
        ROUND = options.pop("round").split(",")
        try:
            ROUND = list(map(lambda x:int(x),ROUND))
        except Exception as err:
            error(f"round is invalid ({err}),1)")
        debug(f"round = {ROUND}")
    if "properties" in options.keys():
        PROPERTIES = options.pop("properties").split(",")
        if len(PROPERTIES) != len(VALUECOLS):
            error("incorrect number of properties",1)
        debug(f"properties = {PROPERTIES}")
    if options:
        error(f"option '{list(options.keys())[0]}' not valid",1)

    # check mdb name
    if not os.path.exists(input_name):
        error(f"'{input_name}' not found",1)
    debug(f"file '{input_name}' ok")

    # check schema
    schema = mdb.read_schema(input_name)
    if TABLE not in schema.keys():
        error(f"table '{TABLE}' not found")
    debug(f"table '{TABLE}' ok")

    # load data
    try:
        DB = mdb.read_table(input_name,TABLE,chunksize=CHUNKSIZE)
    except Exception as err:
        error(f"{input_name} open failed ({err})",1)
    data = []
    index_col = [PARENTCOL]
    index_col.extend(INDEXCOLS)
    usecols = VALUECOLS.copy()
    usecols.extend(index_col)
    for chunk in DB:
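        # Round-trip each chunk through CSV text so that pandas.read_csv can
        # re-parse it with the usecols/index_col/parse_dates options given below.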
        data.append(pandas.read_csv(StringIO(chunk.to_csv()),
            index_col = index_col,
            usecols = usecols,
            parse_dates = [INDEXCOLS[0]],
            infer_datetime_format = False,
            # date_parser = lambda x:[datetime.strptime(y,DATEFORMAT) for y in x],
            ).dropna())
        debug(f"*** Block {len(data)} ***\n {data[-1]}")
        break
    data = pandas.concat(data).sort_index()
    data.index.names = ["parent","date","hour"]
    data["datetime"] = data.index.get_level_values(1) + pandas.TimedeltaIndex(data.index.get_level_values(2),unit="h")
    data.reset_index(inplace=True)
    data.set_index(["parent","datetime"],inplace=True)
    data.drop(["date","hour"],inplace=True,axis=1)
    for n, name in enumerate(VALUECOLS):
        if type(SCALE) == float:
            data[name] *= SCALE
        elif n >= len(SCALE):
            data[name] *= SCALE[-1]
        else:
            data[name] *= SCALE[n]
        if type(ROUND) == int:
            data[name] = data[name].round(ROUND)
        elif n >= len(ROUND):
            data[name] = data[name].round(ROUND[-1])
        else:
            data[name] = data[name].round(ROUND[n])

    # write player object
    if not output_name:
        output_name = TABLE.lower() + ".glm"
    try:
        GLM = open(output_name,"w")
    except Exception as err:
        error(f"{output_name} open failed ({err})",1)
    GLM.write(f'''// generated by '{' '.join(sys.argv)}'
#define {TABLE}_STARTTIME={data.index.get_level_values(1).min()}
#define {TABLE}_STOPTIME={data.index.get_level_values(1).max()}
#define {TABLE}_PLAYERS={" ".join(data.index.get_level_values(0).unique())}
module tape;
''')
    for parent in data.index.get_level_values(0).unique():
        GLM.write(f'''object tape.player
{{
    parent "{parent}";
    file "{TABLE}_{parent}.csv";
    property "{','.join(PROPERTIES)}";
}}
''')
        data.loc[parent].to_csv(f"{TABLE}_{parent}.csv",header=False,index=True)
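
# A hedged usage sketch (not part of the original converter): how this
# player-oriented convert() might be invoked. The MDB name, table and column
# names are illustrative assumptions; only the option keys mirror the ones
# parsed above.
convert("ami_data.mdb",
        output_name="ami_loads.glm",
        options={
            "table": "Readings",
            "index": "ReadDate,ReadHour",
            "parent": "MeterID",
            "values": "kW,kVAR",
            "properties": "measured_real_power,measured_reactive_power",
            "scale": "1000,1000",
            "round": "3,3",
        })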
Example #28
0
    parser.add_argument('-d', '--dataset', type=str, help='Prometea Dataset file')

    args = parser.parse_args()
            
    # connect to MongoDB prometea database
    print('STEP 01: Connect to Prometea MongoDB database')
    print('---------------------------------------------')
    client = MongoClient(port=27017)
    db=client.prometeadb
    print()

    # load to MS Access prometea database        
    print('STEP 02: Load Prometea MS Access dataset')
    print('----------------------------------------')
    print('Loading Prometea Pacientes Dataset from ' + args.dataset)    
    datasetPaciente = mdb.read_table(args.dataset, 'Pacientes')        
    print()
          
    # Iterate over parent datasets
    print('STEP 03: Start Update Pacientes Dataset')    
    print('--------------------------------------')
    # insert child pacientes dataset
    pbar = tqdm(total=datasetPaciente.shape[0], desc='Pacientes Dataset')
    num = 0
    
    for index, rows in datasetPaciente.iterrows(): 
        # get first paciente from centro and nhc
        paciente = db.Pacientes.find_one({'centro': rows.centro, 'nhc': rows.nhc})
        
        # if not exist the paciente insert the new one
        if paciente is None:
Example #29
0
import pandas_access as mdb

db_filename = inputfile

# Listing the tables.
for tbl in mdb.list_tables(db_filename):
    print(tbl)

# import all the necessary libraries
import csv
import pyodbc
import pandas as pd
import numpy as np

# In[4]:
dfcl = mdb.read_table(db_filename, table_name="CRASH_LEVEL")
dfcl.to_csv(outputcl, index=False)

# In[5]:
dfvl = mdb.read_table(db_filename, table_name="VEHICLE_LEVEL")
dfvl.to_csv(outputvl, index=False)

# In[6]:
dfpl = mdb.read_table(db_filename, table_name="PERSON_LEVEL")
dfpl.to_csv(outputpl, index=False)

Example #30
0
import pandas
import numpy as np

import geopandas

from simpledbf import Dbf5
import pandas_access as mdb

# Parcel Shape file
prcl_shp = geopandas.GeoDataFrame.from_file(
    'data/open-data-extracted/prcl_shape/prcl.shp')

# Join Parcel Info
Prcl = mdb.read_table("data/open-data-extracted/prcl/prcl.mdb", "Prcl")
parcels = pandas.merge(prcl_shp, Prcl, left_on='HANDLE', right_on='Handle')
parcels.loc[:, 'Nbrhd'] = parcels.Nbrhd.astype(int)

# Join in Neighborhood names
neighborhoods = geopandas.read_file(
    'data/open-data-extracted/nbrhds_wards/BND_Nhd88_cw.shp')
parcels = pandas.merge(parcels,
                       neighborhoods[['NHD_NAME', 'NHD_NUM']],
                       left_on='Nbrhd',
                       right_on='NHD_NUM')

# Join More Parcel info, this one has nice address names to join to CSB data
par = Dbf5('data/open-data-extracted/par/par.dbf',
           codec='cp1250').to_dataframe()
parcels = pandas.merge(parcels, par, on='HANDLE')

parcels.loc[:, 'parcel_address'] = parcels.SITEADDR.apply(