def get_time_system_from_data_file(in_file):
    """Get the time system from the data file.

    The basic format of the header line is:
        #<time_system> [<time_zone>], e.g., #LOCALTIME 8, #UTCTIME
    """
    time_sys = 'LOCALTIME'
    time_zone = time.timezone // -3600  # integer hours, e.g., 8 for UTC+8
    with open(in_file) as f:
        for line in f:
            str_line = line
            for LF in LFs:
                if LF in line:
                    str_line = line.split(LF)[0]
                    break
            if str_line[0] != '#':
                break
            if str_line.lower().find('utc') >= 0:
                time_sys = 'UTCTIME'
                time_zone = 0
                break
            if str_line.lower().find('local') >= 0:
                # header fields are space-separated (see the writer in
                # interpolate_observed_data_to_regular_interval); accept comma too
                line_list = StringClass.split_string(str_line, [',', ' '])
                if len(line_list) == 2 and MathClass.isnumerical(line_list[1]):
                    time_zone = -1 * int(line_list[1])
                break
    return time_sys, time_zone
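# A minimal usage sketch (hypothetical file name), assuming the header line
# conventions documented above:
#
#     time_sys, time_zone = get_time_system_from_data_file('pcp_daily.txt')
#     # '#UTCTIME'     in the header  ==> ('UTCTIME', 0)
#     # '#LOCALTIME 8' in the header  ==> ('LOCALTIME', -8)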
def read_data_items_from_txt(txt_file):
    """Read data items (including the header row) from a text file.

    Be aware, the separator of each line can only be TAB or COMMA,
    and COMMA is recommended.

    Args:
        txt_file: full path of the text data file

    Returns:
        2D data array
    """
    data_items = []
    with open(txt_file) as f:
        for line in f:
            str_line = line
            for LF in LFs:
                if LF in line:
                    str_line = line.split(LF)[0]
                    break
            if str_line != '' and str_line.find('#') < 0:
                line_list = StringClass.split_string(str_line, ['\t'])
                if len(line_list) <= 1:
                    line_list = StringClass.split_string(str_line, [','])
                data_items.append(line_list)
    return data_items
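# A minimal usage sketch (hypothetical file name and content). Given a file
# 'obs_flow.txt' containing:
#
#     #UTCTIME
#     DATETIME,FLOW
#     2013-02-03 00:00:00,1.5
#
# the call below would return the header row plus one data row; the comment
# line starting with '#' is skipped:
#
#     items = read_data_items_from_txt('obs_flow.txt')
#     # items == [['DATETIME', 'FLOW'], ['2013-02-03 00:00:00', '1.5']]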
def model_io_configuration(cfg, maindb):
    """Import the input and output configuration of SEIMS, i.e., file.in and file.out.

    Args:
        cfg: SEIMS config object
        maindb: MongoDB database object
    """
    file_in_path = cfg.modelcfgs.filein
    file_out_path = cfg.paramcfgs.init_outputs_file
    # create the collections if they do not exist, otherwise drop and recreate them
    c_list = maindb.collection_names()
    conf_tabs = [DBTableNames.main_filein, DBTableNames.main_fileout]
    for item in conf_tabs:
        if not StringClass.string_in_list(item, c_list):
            maindb.create_collection(item)
        else:
            maindb.drop_collection(item)
    file_in_items = read_data_items_from_txt(file_in_path)
    file_out_items = read_data_items_from_txt(file_out_path)

    for item in file_in_items:
        file_in_dict = dict()
        values = StringClass.split_string(StringClass.strip_string(item[0]), ['|'])
        if len(values) != 2:
            raise ValueError("Each item in file.in must have exactly one tag and one"
                             " value string, separated by '|'!")
        file_in_dict[ModelCfgFields.tag] = values[0]
        file_in_dict[ModelCfgFields.value] = values[1]
        maindb[DBTableNames.main_filein].insert(file_in_dict)

    # begin to import initial output settings
    bulk = maindb[DBTableNames.main_fileout].initialize_unordered_bulk_op()
    out_field_array = file_out_items[0]
    out_data_array = file_out_items[1:]
    known_flds = [ModelCfgFields.mod_cls, ModelCfgFields.output_id, ModelCfgFields.desc,
                  ModelCfgFields.unit, ModelCfgFields.type, ModelCfgFields.stime,
                  ModelCfgFields.etime, ModelCfgFields.interval,
                  ModelCfgFields.interval_unit, ModelCfgFields.filename,
                  ModelCfgFields.use, ModelCfgFields.subbsn]
    for item in out_data_array:
        file_out_dict = dict()
        for i, v in enumerate(out_field_array):
            for fld in known_flds:
                if StringClass.string_match(fld, v):
                    file_out_dict[fld] = item[i]
                    break
        if not file_out_dict:
            raise ValueError("There is no valid output item stored in file.out!")
        bulk.insert(file_out_dict)
    bulk.execute()

    # begin to import the desired outputs
    # create bulk operator
    bulk = maindb[DBTableNames.main_fileout].initialize_ordered_bulk_op()
    # read desired outputs from txt file
    data_items = read_data_items_from_txt(cfg.modelcfgs.fileout)
    for i, cur_data_item in enumerate(data_items):
        data_import = dict()
        cur_filter = dict()
        if len(cur_data_item) == 7:
            data_import[ModelCfgFields.output_id] = cur_data_item[0]
            data_import[ModelCfgFields.type] = cur_data_item[1]
            data_import[ModelCfgFields.stime] = cur_data_item[2]
            data_import[ModelCfgFields.etime] = cur_data_item[3]
            data_import[ModelCfgFields.interval] = cur_data_item[4]
            data_import[ModelCfgFields.interval_unit] = cur_data_item[5]
            data_import[ModelCfgFields.subbsn] = cur_data_item[6]
            data_import[ModelCfgFields.use] = 1
            cur_filter[ModelCfgFields.output_id] = cur_data_item[0]
        else:
            raise RuntimeError("Items in file.out must have 7 columns, i.e., OUTPUTID,"
                               "TYPE,STARTTIME,ENDTIME,INTERVAL,INTERVAL_UNIT,SUBBASIN.")
        bulk.find(cur_filter).update({'$set': data_import})
    # execute import operations
    bulk.execute()
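# Hypothetical examples of the two configuration files parsed above (field
# names and values are illustrative only, not the authoritative SEIMS lists).
# file.in holds 'Tag|Value' pairs, one per line:
#
#     MODE|Daily
#     INTERVAL|1440
#
# file.out holds 7 comma-separated columns per item, matching the column
# names in the RuntimeError above:
#
#     OUTPUTID,TYPE,STARTTIME,ENDTIME,INTERVAL,INTERVAL_UNIT,SUBBASIN
#     QRECH,SUM,2013-01-01 00:00:00,2013-12-31 23:59:59,-9999,-9999,ALL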
def data_from_txt(hydro_clim_db, obs_txts_list, sites_info_txts_list, subbsn_file):
    """Read observed data from txt files.

    Args:
        hydro_clim_db: hydro-climate database
        obs_txts_list: txt file paths of observed data
        sites_info_txts_list: txt file paths of site information
        subbsn_file: subbasin raster file

    Returns:
        True or False
    """
    # 1. Read monitor station information, and store variable information
    #    and station IDs
    variable_lists = []
    site_ids = []
    for site_file in sites_info_txts_list:
        site_data_items = read_data_items_from_txt(site_file)
        site_flds = site_data_items[0]
        for i in range(1, len(site_data_items)):
            dic = {}
            types = []
            for j in range(len(site_data_items[i])):
                if StringClass.string_match(site_flds[j], StationFields.id):
                    dic[StationFields.id] = int(site_data_items[i][j])
                    site_ids.append(dic[StationFields.id])
                elif StringClass.string_match(site_flds[j], StationFields.name):
                    dic[StationFields.name] = StringClass.strip_string(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.type):
                    types = StringClass.split_string(
                        StringClass.strip_string(site_data_items[i][j]), ',')
                elif StringClass.string_match(site_flds[j], StationFields.lat):
                    dic[StationFields.lat] = float(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.lon):
                    dic[StationFields.lon] = float(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.x):
                    dic[StationFields.x] = float(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.y):
                    dic[StationFields.y] = float(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.unit):
                    dic[StationFields.unit] = StringClass.strip_string(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.elev):
                    dic[StationFields.elev] = float(site_data_items[i][j])
                elif StringClass.string_match(site_flds[j], StationFields.outlet):
                    dic[StationFields.outlet] = float(site_data_items[i][j])
            for j, cur_type in enumerate(types):
                site_dic = dict()
                site_dic[StationFields.id] = dic[StationFields.id]
                site_dic[StationFields.name] = dic[StationFields.name]
                site_dic[StationFields.type] = cur_type
                site_dic[StationFields.lat] = dic[StationFields.lat]
                site_dic[StationFields.lon] = dic[StationFields.lon]
                site_dic[StationFields.x] = dic[StationFields.x]
                site_dic[StationFields.y] = dic[StationFields.y]
                site_dic[StationFields.elev] = dic[StationFields.elev]
                site_dic[StationFields.outlet] = dic[StationFields.outlet]
                # Add SubbasinID field
                matched, cur_subbsn_id = ImportObservedData.match_subbasin(subbsn_file,
                                                                           site_dic)
                if not matched:
                    break
                cur_subbsn_id_str = ','.join(str(tmp_id) for tmp_id in cur_subbsn_id
                                             if tmp_id is not None)
                site_dic[StationFields.id] = cur_subbsn_id_str
                curfilter = {StationFields.id: site_dic[StationFields.id],
                             StationFields.type: site_dic[StationFields.type]}
                hydro_clim_db[DBTableNames.sites].find_one_and_replace(curfilter, site_dic,
                                                                       upsert=True)
                var_dic = dict()
                var_dic[StationFields.type] = cur_type
                var_dic[StationFields.unit] = dic[StationFields.unit]
                if var_dic not in variable_lists:
                    variable_lists.append(var_dic)
    site_ids = list(set(site_ids))

    # 2. Read measurement data and import to MongoDB
    bulk = hydro_clim_db[DBTableNames.observes].initialize_ordered_bulk_op()
    count = 0
    for measDataFile in obs_txts_list:
        obs_data_items = read_data_items_from_txt(measDataFile)
        # If the data items are EMPTY or there is only a header row, go to
        # the next data file.
        if obs_data_items == [] or len(obs_data_items) == 1:
            continue
        obs_flds = obs_data_items[0]
        required_flds = [StationFields.id, DataValueFields.y, DataValueFields.m,
                         DataValueFields.d, DataValueFields.type, DataValueFields.value]
        for fld in required_flds:
            if not StringClass.string_in_list(fld, obs_flds):
                # the data cannot meet the requirements!
                raise ValueError("The file %s does not meet the required format!"
                                 % measDataFile)
        for i in range(1, len(obs_data_items)):
            dic = dict()
            cur_y = 0
            cur_m = 0
            cur_d = 0
            for j in range(len(obs_data_items[i])):
                if StringClass.string_match(obs_flds[j], StationFields.id):
                    dic[StationFields.id] = int(obs_data_items[i][j])
                elif StringClass.string_match(obs_flds[j], DataValueFields.y):
                    cur_y = int(obs_data_items[i][j])
                elif StringClass.string_match(obs_flds[j], DataValueFields.m):
                    cur_m = int(obs_data_items[i][j])
                elif StringClass.string_match(obs_flds[j], DataValueFields.d):
                    cur_d = int(obs_data_items[i][j])
                elif StringClass.string_match(obs_flds[j], DataValueFields.type):
                    dic[DataValueFields.type] = obs_data_items[i][j]
                elif StringClass.string_match(obs_flds[j], DataValueFields.value):
                    dic[DataValueFields.value] = float(obs_data_items[i][j])
            # if the current site ID is not included, go to the next data item
            if dic.get(StationFields.id) not in site_ids:
                continue
            dt = datetime(cur_y, cur_m, cur_d, 0, 0)
            sec = time.mktime(dt.timetuple())
            utc_time = time.gmtime(sec)
            dic[DataValueFields.local_time] = dt
            dic[DataValueFields.time_zone] = time.timezone // 3600
            dic[DataValueFields.utc] = datetime(utc_time[0], utc_time[1], utc_time[2],
                                                utc_time[3])
            curfilter = {StationFields.id: dic[StationFields.id],
                         DataValueFields.type: dic[DataValueFields.type],
                         DataValueFields.utc: dic[DataValueFields.utc]}
            bulk.find(curfilter).replace_one(dic)
            count += 1
            if count % 500 == 0:  # execute every 500 records
                bulk.execute()
                bulk = hydro_clim_db[DBTableNames.observes].initialize_ordered_bulk_op()
    if count % 500 != 0:
        bulk.execute()

    # 3. Add measurement data with unit converted
    # loop the variables list
    added_dics = []
    for curVar in variable_lists:
        # If the unit is mg/L, suffix the Type name with "Conc" and convert the
        # corresponding data to kg if the discharge data is available, and
        # vice versa for kg.
        cur_type = curVar[StationFields.type]
        cur_unit = curVar[StationFields.unit]
        # find data by Type
        for item in hydro_clim_db[DBTableNames.observes].find({StationFields.type: cur_type}):
            dic = dict()
            dic[StationFields.id] = item[StationFields.id]
            dic[DataValueFields.value] = item[DataValueFields.value]
            dic[StationFields.type] = item[StationFields.type]
            dic[DataValueFields.local_time] = item[DataValueFields.local_time]
            dic[DataValueFields.time_zone] = item[DataValueFields.time_zone]
            dic[DataValueFields.utc] = item[DataValueFields.utc]
            if cur_unit == "mg/L":
                # update the Type name
                dic[StationFields.type] = cur_type + "Conc"
                curfilter = {StationFields.id: dic[StationFields.id],
                             DataValueFields.type: cur_type,
                             DataValueFields.utc: dic[DataValueFields.utc]}
                hydro_clim_db[DBTableNames.observes].find_one_and_replace(curfilter, dic,
                                                                          upsert=True)
                dic[StationFields.type] = cur_type
            # find discharge on the current day
            cur_filter = {StationFields.type: "Q",
                          DataValueFields.utc: dic[DataValueFields.utc],
                          StationFields.id: dic[StationFields.id]}
            q_dic = hydro_clim_db[DBTableNames.observes].find_one(filter=cur_filter)
            if q_dic is not None:
                q = q_dic[DataValueFields.value]
            else:
                continue
            if cur_unit == "mg/L":
                # convert mg/L to kg
                dic[DataValueFields.value] = round(
                    dic[DataValueFields.value] * q * 86400. / 1000., 2)
            elif cur_unit == "kg":
                dic[StationFields.type] = cur_type + "Conc"
                # convert kg to mg/L
                dic[DataValueFields.value] = round(
                    dic[DataValueFields.value] / q * 1000. / 86400., 2)
            # add the new data item
            added_dics.append(dic)
    # import to MongoDB
    for dic in added_dics:
        curfilter = {StationFields.id: dic[StationFields.id],
                     DataValueFields.type: dic[DataValueFields.type],
                     DataValueFields.utc: dic[DataValueFields.utc]}
        hydro_clim_db[DBTableNames.observes].find_one_and_replace(curfilter, dic,
                                                                  upsert=True)
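# Worked example of the unit conversion above (values are illustrative):
# a sediment concentration of 2.5 mg/L (i.e., g/m^3) with a same-day mean
# discharge of 10 m^3/s converts to a daily load of
#     2.5 * 10 * 86400 / 1000 = 2160 kg,
# and, inversely, a 2160 kg daily load with the same discharge converts to
#     2160 / 10 * 1000 / 86400 = 2.5 mg/L.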
def interpolate_observed_data_to_regular_interval(in_file, time_interval, start_time,
                                                  end_time, eliminate_zero=False,
                                                  time_sys_output='UTCTIME',
                                                  day_divided_hour=0):
    """Interpolate irregular observed data to regular time-interval data.

    Args:
        in_file: input data file, the basic format is as follows:
            line 1: #<time_system> [<time_zone>], e.g., #LOCALTIME 8, #UTCTIME
            line 2: DATETIME,field1,field2,...
            line 3: YYYY-mm-dd HH:MM:SS,field1_value,field2_value,...
            line 4: ...
            Field names can be PCP, FLOW, and SED; the units are mm/h, m3/s,
            and g/L (i.e., kg/m3), respectively.
        time_interval: time interval in minutes, e.g., daily output is 1440
        start_time: start time, the format must be 'YYYY-mm-dd HH:MM:SS', and the
            time system is based on time_sys_output.
        end_time: end time, see also start_time.
        eliminate_zero: Boolean flag. If True, time intervals without original
            records will not be output.
        time_sys_output: time system of the output, the format must be
            '<time_system> [<time_zone>]', e.g.,
            'LOCALTIME'
            'LOCALTIME 8'
            'UTCTIME' (default)
        day_divided_hour: If time_interval equals N*1440, this parameter should be
            carefully specified. The value must range from 0 to 23, e.g.,
            day_divided_hour ==> day range (all expressed as 2013-02-03)
            0  ==> 2013-02-03 00:00:00 to 2013-02-03 23:59:59 (default)
            8  ==> 2013-02-03 08:00:00 to 2013-02-04 07:59:59
            20 ==> 2013-02-03 20:00:00 to 2013-02-04 19:59:59

    Returns:
        The output data files are located in the same directory as the input file.
        The nomenclature is <field name>_<time system>_<time interval>[_nonzero].txt,
        e.g., pcp_utctime_1440_nonzero.txt, flow_localtime_60.txt.
    """
    FileClass.check_file_exists(in_file)
    time_sys_input, time_zone_input = HydroClimateUtilClass.get_time_system_from_data_file(
        in_file)
    data_items = read_data_items_from_txt(in_file)
    flds = data_items[0][:]
    data_items.remove(flds)
    if not 0 <= day_divided_hour <= 23:
        raise ValueError("Day divided hour must range from 0 to 23!")
    try:
        date_idx = flds.index('DATETIME')
        flds.remove('DATETIME')
    except ValueError:
        raise ValueError("DATETIME must be one of the fields!")
    # available fields
    available_flds = ['FLOW', 'SED', 'PCP']

    def check_available_field(cur_fld):
        """Check if the given field name is supported."""
        support_flag = False
        for fff in available_flds:
            if fff.lower() in cur_fld.lower():
                support_flag = True
                break
        return support_flag

    ord_data = OrderedDict()
    time_zone_output = time.timezone // -3600
    if time_sys_output.lower().find('local') >= 0:
        tmpstrs = StringClass.split_string(time_sys_output, [' '])
        if len(tmpstrs) == 2 and MathClass.isnumerical(tmpstrs[1]):
            time_zone_output = int(tmpstrs[1])
        time_sys_output = 'LOCALTIME'
    else:
        time_sys_output = 'UTCTIME'
        time_zone_output = 0
    for item in data_items:
        org_datetime = HydroClimateUtilClass.get_datetime_from_string(item[date_idx])
        if time_sys_input == 'LOCALTIME':
            org_datetime -= timedelta(hours=time_zone_input)
        # now, org_datetime is UTC time.
        if time_sys_output == 'LOCALTIME':
            org_datetime += timedelta(hours=time_zone_output)
        # now, org_datetime is consistent with the output time system
        ord_data[org_datetime] = []
        for i, v in enumerate(item):
            if i == date_idx:
                continue
            if MathClass.isnumerical(v):
                ord_data[org_datetime].append(float(v))
            else:
                ord_data[org_datetime].append(v)
    itp_data = OrderedDict()
    out_time_delta = timedelta(minutes=time_interval)
    sdatetime = HydroClimateUtilClass.get_datetime_from_string(start_time)
    edatetime = HydroClimateUtilClass.get_datetime_from_string(end_time)
    item_dtime = sdatetime
    if time_interval % 1440 == 0:
        item_dtime = sdatetime.replace(hour=0, minute=0, second=0) + \
                     timedelta(minutes=day_divided_hour * 60)
    while item_dtime <= edatetime:
        sdt = item_dtime  # start datetime of records
        edt = item_dtime + out_time_delta  # end datetime of records
        # get original data items
        org_items = []
        pre_dt = list(ord_data.keys())[0]
        pre_added = False
        for i, v in ord_data.items():
            if sdt <= i < edt:
                if not pre_added and pre_dt < sdt < i and sdt - pre_dt < out_time_delta:
                    # only add one item that is earlier than sdt.
                    org_items.append([pre_dt] + ord_data.get(pre_dt))
                    pre_added = True
                org_items.append([i] + v)
            if i > edt:
                break
            pre_dt = i
        if len(org_items) > 0:
            org_items.append([edt])  # just add the end time for computing convenience
            if org_items[0][0] < sdt:
                org_items[0][0] = sdt  # set the begin datetime of the current interval
        # initialize the interpolated list
        itp_data[item_dtime] = [0.] * len(flds)
        if len(org_items) == 0:
            # eliminate time intervals without original records if specified
            if eliminate_zero:
                itp_data.popitem()
            item_dtime += out_time_delta
            continue
        # core interpolation code
        flow_idx = -1
        for v_idx, v_name in enumerate(flds):
            if not check_available_field(v_name):
                continue
            if 'SED' in v_name.upper():
                # FLOW must exist to interpolate SED
                for v_idx2, v_name2 in enumerate(flds):
                    if 'FLOW' in v_name2.upper():
                        flow_idx = v_idx2
                        break
                if flow_idx < 0:
                    raise RuntimeError("To interpolate SED, FLOW must be provided!")
        for v_idx, v_name in enumerate(flds):
            if not check_available_field(v_name):
                continue
            itp_value = 0.
            itp_auxiliary_value = 0.
            for org_item_idx, org_item_dtv in enumerate(org_items):
                if org_item_idx == 0:
                    continue
                org_item_dt = org_item_dtv[0]
                pre_item_dtv = org_items[org_item_idx - 1]
                pre_item_dt = pre_item_dtv[0]
                tmp_delta_dt = org_item_dt - pre_item_dt
                tmp_delta_secs = tmp_delta_dt.days * 86400 + tmp_delta_dt.seconds
                if 'SED' in v_name.upper():
                    itp_value += pre_item_dtv[v_idx + 1] * pre_item_dtv[flow_idx + 1] * \
                                 tmp_delta_secs
                    itp_auxiliary_value += pre_item_dtv[flow_idx + 1] * tmp_delta_secs
                else:
                    itp_value += pre_item_dtv[v_idx + 1] * tmp_delta_secs
            if 'SED' in v_name.upper():
                # flow-weighted mean concentration; guard against zero flow
                if MathClass.floatequal(itp_auxiliary_value, 0.):
                    itp_value = 0.
                    print("WARNING: Flow is 0 for %s, please check!"
                          % item_dtime.strftime('%Y-%m-%d %H:%M:%S'))
                else:
                    itp_value /= itp_auxiliary_value
            elif 'FLOW' in v_name.upper():
                itp_value /= (out_time_delta.days * 86400 + out_time_delta.seconds)
            elif 'PCP' in v_name.upper():
                # the input is mm/h, and the output is mm
                itp_value /= 3600.
            itp_data[item_dtime][v_idx] = round(itp_value, 4)
        item_dtime += out_time_delta

    # output to files
    work_path = os.path.dirname(in_file)
    header_str = '#' + time_sys_output
    if time_sys_output == 'LOCALTIME':
        header_str = header_str + ' ' + str(time_zone_output)
    for idx, fld in enumerate(flds):
        if not check_available_field(fld):
            continue
        file_name = fld + '_' + time_sys_output + '_' + str(time_interval)
        if eliminate_zero:
            file_name += '_nonzero'
        file_name += '.txt'
        out_file = work_path + os.sep + file_name
        with open(out_file, 'w') as f:
            f.write(header_str + '\n')
            f.write('DATETIME,' + fld + '\n')
            for i, v in itp_data.items():
                cur_line = i.strftime('%Y-%m-%d %H:%M:%S') + ',' + str(v[idx]) + '\n'
                f.write(cur_line)
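# A minimal usage sketch (hypothetical path and dates): interpolate hourly
# records to daily (1440 min) values in UTC, writing e.g. 'FLOW_UTCTIME_1440.txt'
# next to the input file:
#
#     interpolate_observed_data_to_regular_interval(
#         'discharge_hourly.txt', 1440,
#         '2013-01-01 00:00:00', '2013-12-31 23:59:59',
#         eliminate_zero=False, time_sys_output='UTCTIME', day_divided_hour=0)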