def updateFull(quant_engine, spider_engine): # fetch data from source tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName) full_data = pd.read_sql(sql_statement, spider_engine) # rename columns full_data = renameDF(full_data, sourceFields, targetFields) # 爬虫库周末也有运行,会爬到重复的数据,所以需要用drop_duplicates full_data = full_data.drop_duplicates(targetTimeStamp) # change data type full_data = chgDFDataType(full_data, chgDataTypeCol, 'float') # change datetime format full_data.loc[:, targetTimeStamp] = full_data[targetTimeStamp].apply( lambda x: x[:10]) full_data = full_data.sort_values(targetTimeStamp) # add time stamp full_data[targetNewTimeStamp] = datetime.now() # write data tot target if not full_data.empty: full_data.to_sql(targetTableName, quant_engine, index=False, if_exists='replace')
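# The routines in this section rely on two small helpers defined elsewhere in
# the repo. The sketches below are minimal reconstructions inferred from the
# call sites (renameDF maps source column names to target column names
# positionally; chgDFDataType casts the listed columns to the given dtype).
# They are assumptions for readability, not the canonical implementations.

def renameDF(df, source_fields, target_fields):
    # rename columns by zipping source names to target names
    rename_dict = dict(zip(source_fields, target_fields))
    return df.rename(columns=rename_dict)


def chgDFDataType(df, columns, data_type):
    # cast the listed columns to the target dtype (e.g. 'float');
    # string values like 'nan' convert to float NaN under astype(float)
    for col in columns:
        df.loc[:, col] = df[col].astype(data_type)
    return df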
def updateFull(start_date='2007-01-01'):
    # note: start_date is currently unused; the full reload always pulls every row
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))

    tmp_fields = list(map(lambda x: '`%s`' % x, sourceField))
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s" % (tmp_fields, sourceTableName)
    data_full = pd.read_sql(sql_statement, spider_engine)

    # change column name
    data_full = data_full.rename(columns={sourceTimestampField: targetTimeStampField})

    # change data type
    data_full = chgDFDataType(data_full, chgDataTypeCol, 'float')

    # add time stamp
    data_full.loc[:, targetNewTimeStampField] = datetime.now()

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    data_full.to_sql(targetTableName, quant_engine, index=False, if_exists='replace')
def updateIncrm():
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    # get latest trade date in the target table
    sql_statement = "select max(`%s`) from %s" % (targetTimeStampField, targetTableName)
    latest_date = pd.read_sql(sql_statement, quant_engine)
    if not latest_date.empty:
        latest_date = latest_date.iloc[0, 0]

    tmp_fields = list(map(lambda x: '`%s`' % x, sourceField))
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s where %s > '%s'" % (
        tmp_fields, sourceTableName, sourceTimestampField, latest_date)
    data_incrm = pd.read_sql(sql_statement, spider_engine)
    if data_incrm.empty:
        return

    # change column name
    data_incrm = data_incrm.rename(columns={sourceTimestampField: targetTimeStampField})

    # change data type
    data_incrm = chgDFDataType(data_incrm, chgDataTypeCol, 'float')

    # add time stamp
    data_incrm.loc[:, targetNewTimeStampField] = datetime.now()

    data_incrm.to_sql(targetTableName, quant_engine, index=False, if_exists='append')
def updateFull(quant_engine, spider_engine): # fetch data from source tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName) full_data = pd.read_sql(sql_statement, spider_engine) # rename columns full_data = renameDF(full_data, sourceFields, targetFields) # change data type full_data = full_data.replace('--', 'nan') full_data.loc[:, 'change_ratio'] = full_data['change_ratio'].apply( lambda x: float(x.strip('%')) / 100.) full_data.loc[:, 'volume'] = full_data['volume'].apply( lambda x: x.replace(',', '')) full_data.loc[:, 'amount'] = full_data['amount'].apply( lambda x: x.replace(',', '')) full_data = chgDFDataType(full_data, chgDataTypeCol, 'float') # drop duplicates full_data = full_data.drop_duplicates(['code', 'date']) # add time stamp full_data[targetNewTimeStamp] = datetime.now() # write data tot target if not full_data.empty: full_data.to_sql(targetTableName, quant_engine, index=False, if_exists='replace')
def updateIncrm(quant_engine, spider_engine):
    # get the latest date in each target table (quant schema)
    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableNameMarketIndex)
    target_max_timestamp = pd.read_sql(sql_statement, quant_engine)
    target_max_timestamp = target_max_timestamp.iloc[0, 0]

    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableNameHS)
    target_max_timestamp_hs = pd.read_sql(sql_statement, quant_engine)
    target_max_timestamp_hs = target_max_timestamp_hs.iloc[0, 0]

    # fetch from the older of the two dates so neither table misses rows
    sql_timestamp = min(target_max_timestamp, target_max_timestamp_hs)

    # fetch data from source (spider schema)
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, sql_timestamp)
    incrm_data = pd.read_sql(sql_statement, spider_engine)
    if incrm_data.empty:
        return

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # change data type: amounts are scraped as strings with Chinese unit
    # suffixes (亿 = 1e8, 万 = 1e4)
    incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(
        lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿'
        else (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # split market indexes from the HS300 index; copy so later assignments
    # do not write into a view (SettingWithCopyWarning)
    market_index_data = incrm_data.loc[incrm_data['code'].isin(list(marketToCode.values()))].copy()
    hs300_data = incrm_data.loc[incrm_data['code'] == '000300'].copy()

    # trim by date
    market_index_data = market_index_data.loc[market_index_data['date'] > target_max_timestamp]
    hs300_data = hs300_data.loc[hs300_data['date'] > target_max_timestamp_hs]

    # convert code to market
    market_index_data['market'] = ''
    for (tmp_market, tmp_code) in marketToCode.items():
        market_index_data.loc[market_index_data['code'] == tmp_code, 'market'] = tmp_market

    # drop column
    market_index_data = market_index_data.drop('code', axis=1)
    hs300_data = hs300_data.drop('code', axis=1)

    # add time stamp & write data to target (guard each frame separately)
    if not market_index_data.empty:
        market_index_data[targetNewTimeStamp] = datetime.now()
        market_index_data.to_sql(targetTableNameMarketIndex, quant_engine, index=False, if_exists='append')
    if not hs300_data.empty:
        hs300_data[targetNewTimeStamp] = datetime.now()
        hs300_data.to_sql(targetTableNameHS, quant_engine, index=False, if_exists='append')
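# The 亿/万 suffix conversion above is repeated verbatim in several update
# routines in this section. A hypothetical helper that factors it out could
# look like this (parse_cn_amount is not part of the original codebase):

def parse_cn_amount(x):
    # '1.5亿' -> 150000000.0, '3万' -> 30000.0; plain numeral strings pass through
    if x[-1] == u'亿':
        return float(x[:-1]) * 100000000
    if x[-1] == u'万':
        return float(x[:-1]) * 10000
    return float(x)

# usage: incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(parse_cn_amount)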
def updateXueQiuIncrm(sql_conn_quant, sql_conn_spider, target_max_timestamp, supposed_date_num):
    target_max_timestamp_format = target_max_timestamp.replace('-', '')  # 2015-01-01 --> 20150101

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    # some records may be updated after first being scraped (so use ">=" instead
    # of ">"): fetch data including the latest day already in the target, then
    # drop duplicates below
    sql_statement = "select %s from `%s` where `%s` >= '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp_format)
    incrm_data = pd.read_sql(sql_statement, sql_conn_spider)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # change date format (20150101 --> 2015-01-01)
    incrm_data.loc[:, 'date'] = incrm_data['date'].apply(
        lambda x: '-'.join([x[:4], x[4:6], x[6:8]]))
    incrm_data = incrm_data.loc[incrm_data['date'] >= target_max_timestamp]

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # fetch the latest data already in the target table
    tmp_fields = list(map(lambda x: '`%s`' % x, targetFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` >= '%s'" % (
        tmp_fields, targetTableName, targetTimeStamp, target_max_timestamp)
    existing_data = pd.read_sql(sql_statement, sql_conn_quant)

    # combine existing and incremental rows, then drop every duplicated pair:
    # what remains is the true increment plus previously missing data
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    incrm_data = pd.concat([existing_data, incrm_data])
    incrm_data = incrm_data.drop_duplicates(['date', 'code'], keep=False)

    # check whether this source is missing dates
    incrm_data_date_num = incrm_data['date'].unique().size
    if incrm_data_date_num < supposed_date_num:
        return False  # signal: missing data from this source

    # change data type
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # # calculate from raw data to fill the missings in spider data
    # sup_data = supplementByRawData(quant_engine, target_max_timestamp, incrm_data)

    # sort by date
    incrm_data = incrm_data.sort_values('date')

    # add time stamp
    incrm_data[targetNewTimeStamp] = datetime.now()

    writeDB(sql_conn_quant, targetTableName, incrm_data)
    return True  # signal: data successfully written to database
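# writeDB (used above and in updateTSOldIncrm below) is also defined elsewhere
# in the repo. A minimal sketch inferred from the call sites -- an assumption,
# not the canonical version:

def writeDB(sql_conn, table_name, df):
    # append the frame to the target table without writing the index column
    df.to_sql(table_name, sql_conn, index=False, if_exists='append')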
def updateIncrm(quant_engine, spider_engine):
    # get trade calendar
    sql_statement = 'select `%s` from %s' % (calendarField, tradeCalendarTableName)
    trade_calendar = pd.read_sql(sql_statement, quant_engine)
    trade_calendar = trade_calendar.values.T[0]

    # update each index table from its source code
    for (table_name, index_code) in zip(targetTableName, sourceIndexCode):
        # get target latest date
        sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, table_name)
        target_max_timestamp = pd.read_sql(sql_statement, quant_engine)  # quant schema
        target_max_timestamp = target_max_timestamp.iloc[0, 0]

        # fetch data from source
        tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
        tmp_fields = ','.join(tmp_fields)
        sql_statement = "select %s from `%s` where `%s` > '%s' and `%s` = '%s'" % (
            tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp,
            sourceCodeField, index_code)
        incrm_data = pd.read_sql(sql_statement, spider_engine)  # spider schema
        if incrm_data.empty:
            continue  # move on to the next index table rather than abort the loop

        # rename columns
        incrm_data = renameDF(incrm_data, sourceFields, targetFields)

        # change date format: parse the Chinese-format date (e.g. '2018年01月05日')
        # and shift back one day
        tmp_date = incrm_data['date'].apply(
            lambda x: datetime.strptime(x[:11], r'%Y年%m月%d日')) - timedelta(days=1)
        incrm_data.loc[:, 'date'] = tmp_date.apply(lambda x: datetime.strftime(x, '%Y-%m-%d'))
        incrm_data = incrm_data.loc[incrm_data['date'] > target_max_timestamp]

        # drop duplicates
        incrm_data = incrm_data.drop_duplicates(['date'])
        # the spider also picks up rows on holidays, so keep trading days only
        incrm_data = incrm_data.loc[incrm_data['date'].isin(trade_calendar)]
        if incrm_data.empty:
            continue

        # change data type
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')
        incrm_data.loc[:, 'change'] = incrm_data['change'] / 100

        # add time stamp & write data to target
        incrm_data[targetNewTimeStamp] = datetime.now()
        incrm_data.to_sql(table_name, quant_engine, index=False, if_exists='append')
def updateFull(quant_engine, spider_engine, chunk_size, start_date='2007-01-01'):
    # get distinct codes
    sql_statement = "select distinct `%s` from %s where %s >= '%s'" % (
        sourceCode, sourceTableName, sourceTimeStamp, start_date)
    tot_codes = pd.read_sql(sql_statement, spider_engine)
    tot_codes = tot_codes.values.T[0]

    # drop B-share codes (codes starting with '9')
    tmp_idx = list(map(lambda x: x[0] != '9', tot_codes))
    tot_codes = tot_codes[tmp_idx]

    # fetch data from source, one chunk of codes at a time
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)

    write_method = 'replace'  # first chunk replaces the table, later chunks append
    loop_num = int(tot_codes.size / chunk_size)
    if tot_codes.size > loop_num * chunk_size:
        loop_num += 1

    for i in range(loop_num):
        tmp_code = tot_codes[i * chunk_size:(i + 1) * chunk_size]
        tmp_code_str = list(map(lambda x: "'%s'" % x, tmp_code))
        tmp_code_str = ','.join(tmp_code_str)
        sql_statement = "select %s from %s where (`%s` > '%s') and (`%s` != 'null') and (`%s` in (%s))" % (
            tmp_fields, sourceTableName, sourceTimeStamp, start_date,
            sourceTimeStamp, sourceCode, tmp_code_str)
        chunk_data = pd.read_sql(sql_statement, spider_engine)

        # rename columns
        rename_dict = dict(zip(sourceFields, targetFields))
        chunk_data = chunk_data.rename(columns=rename_dict)

        # drop duplicates
        chunk_data = chunk_data.drop_duplicates(['date', 'code'])

        # change data type (亿 = 1e8, 万 = 1e4)
        chunk_data.loc[:, 'amount'] = chunk_data['amount'].apply(
            lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿'
            else (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
        chunk_data.loc[:, 'turnover'] = chunk_data['turnover'].apply(lambda x: x if x != '-' else 0)
        chunk_data.loc[:, 'turnover'] = chunk_data['turnover'].apply(
            lambda x: float(x) if x != 'null' else np.nan)
        chunk_data = chgDFDataType(chunk_data, chgDataTypeCol, 'float')

        # add time stamp
        chunk_data[targetNewTimeStamp] = datetime.now()

        # write data to db
        chunk_data.to_sql(targetTableName, quant_engine, index=False, if_exists=write_method)
        write_method = 'append'
def updateIncrm(quant_engine, spider_engine): # get target latest date sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableName) target_max_timestamp = pd.read_sql(sql_statement, quant_engine) target_max_timestamp = target_max_timestamp.iloc[0, 0] # fetch data from source tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from `%s` where `%s` > '%s'" % ( tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp) incrm_data = pd.read_sql(sql_statement, spider_engine) # rename columns incrm_data = renameDF(incrm_data, sourceFields, targetFields) # drop duplicates incrm_data = incrm_data.drop_duplicates(['code', 'date']) if not incrm_data.empty: # change data type incrm_data = incrm_data.replace('--', 'nan') incrm_data.loc[:, 'change_ratio'] = incrm_data['change_ratio'].apply( lambda x: float(x.strip('%')) / 100.) incrm_data.loc[:, 'volume'] = incrm_data['volume'].apply( lambda x: x.replace(',', '')) incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply( lambda x: x.replace(',', '')) incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float') # sort by date incrm_data = incrm_data.sort_values(targetTimeStamp) # add time stamp incrm_data[targetNewTimeStamp] = datetime.now() # write data tot target incrm_data.to_sql(targetTableName, quant_engine, index=False, if_exists='append')
def updateFull(quant_engine, spider_engine): # fetch data from source tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName) full_data = pd.read_sql(sql_statement, spider_engine) # spider schema if full_data.empty: return # rename columns full_data = renameDF(full_data, sourceFields, targetFields) # drop duplicates (if any) full_data = full_data.drop_duplicates(['date', 'code']) # change data type full_data.loc[:, 'amount'] = full_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x))) full_data = chgDFDataType(full_data, chgDataTypeCol, 'float') market_index_data = full_data.loc[full_data['code'].isin(list(marketToCode.values()))] hs300_data = full_data.loc[full_data['code'] == '000300'] # convert code to market market_index_data['market'] = '' for (tmp_market, tmp_code) in marketToCode.items(): market_index_data.loc[market_index_data['code'] == tmp_code, 'market'] = tmp_market market_index_data = market_index_data.drop('code', axis=1) hs300_data = hs300_data.drop('code', axis=1) # add time stamp market_index_data[targetNewTimeStamp] = datetime.now() hs300_data[targetNewTimeStamp] = datetime.now() # write data to target if not market_index_data.empty: market_index_data.to_sql(targetTableNameMarketIndex, quant_engine, index=False, if_exists='replace') if not hs300_data.empty: hs300_data.to_sql(targetTableNameHS, quant_engine, index=False, if_exists='replace') pass
def updateIncrm(quant_engine, spider_engine): # get lastest tradedate sql_statement = "select max(`%s`) from %s where `%s` != 'null'" % (targetTimeStamp, targetTableName, targetTimeStamp) latest_date = pd.read_sql(sql_statement, quant_engine).iloc[0,0] # get incremental data tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from %s where (%s > '%s') and (%s != 'null')" % (tmp_fields, sourceTableName, sourceTimeStamp, latest_date, sourceTimeStamp) incrm_data = pd.read_sql(sql_statement, spider_engine) print('data from spider:', incrm_data.shape) # rename data rename_dict = {} for field in zip(sourceFields, targetFields): rename_dict[field[0]] = field[1] incrm_data = incrm_data.rename(columns=rename_dict) # drop duplicates incrm_data = incrm_data.drop_duplicates(['date', 'code']) # drop B shares incrm_data = incrm_data.loc[incrm_data['code'].apply(lambda x: x[0] != '9')] # change data type incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else ( float(x[:-1]) * 10000 if x[-1] == u'万' else float(x))) incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply(lambda x: x if x != '-' else 0) incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply(lambda x: float(x) if x != 'null' else np.nan) incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float') # add time stamp incrm_data[targetNewTimeStamp] = datetime.now() print('data to write:', incrm_data.shape) # write data to db if not incrm_data.empty: incrm_data.to_sql(targetTableName, quant_engine, index=False, if_exists='append') pass
def updateIncrm(quant_engine, spider_engine): # get target latest date sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableName) target_max_timestamp = pd.read_sql(sql_statement, quant_engine) target_max_timestamp = target_max_timestamp.iloc[0, 0] # fetch data from source tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) sql_statement = "select %s from `%s` where `%s` > '%s'" % ( tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp) incrm_data = pd.read_sql(sql_statement, spider_engine) # rename columns incrm_data = renameDF(incrm_data, sourceFields, targetFields) # 爬虫库周末也有运行,会爬到重复的数据,所以需要用drop_duplicates incrm_data = incrm_data.drop_duplicates(targetTimeStamp) # change data type incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float') # change datetime format incrm_data.loc[:, targetTimeStamp] = incrm_data[targetTimeStamp].apply( lambda x: x[:10]) incrm_data = incrm_data.loc[ incrm_data[targetTimeStamp] > target_max_timestamp] incrm_data = incrm_data.sort_values(targetTimeStamp) # add time stamp incrm_data[targetNewTimeStamp] = datetime.now() # write data tot target if not incrm_data.empty: incrm_data.to_sql(targetTableName, quant_engine, index=False, if_exists='append')
def updateFull(start_date='2007-01-01'):
    # note: start_date is currently unused; the full reload always pulls every row
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))

    tmp_fields = list(map(lambda x: '`%s`' % x, sourceField))
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s" % (tmp_fields, sourceTableName)
    data_full = pd.read_sql(sql_statement, spider_engine)

    # change column names
    tmp_rename_dict = dict(zip(sourceField, targetField))
    data_full = data_full.rename(columns=tmp_rename_dict)
    data_full.loc[:, targetTimeStampField] = data_full[targetTimeStampField].apply(lambda x: x[:10])

    # drop duplicates
    data_full = data_full.drop_duplicates(targetTimeStampField)

    # change data type (every target field except the timestamp)
    tmp_fields = targetField.copy()
    tmp_fields.remove(targetTimeStampField)
    data_full = chgDFDataType(data_full, tmp_fields, 'float')

    # add time stamp
    data_full.loc[:, targetNewTimeStampField] = datetime.now()

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    data_full.to_sql(targetTableName, quant_engine, index=False, if_exists='replace')
def updateTSOldIncrm(calendar, sql_conn_quant, sql_conn_spider, target_max_timestamp, supposed_date_num):
    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp)
    incrm_data = pd.read_sql(sql_statement, sql_conn_spider)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # change date format
    incrm_data.loc[:, 'date'] = incrm_data['date'].apply(lambda x: x[:10])
    incrm_data = incrm_data.loc[incrm_data['date'] > target_max_timestamp]

    # use the trade calendar to drop non-trading days
    incrm_data = incrm_data.loc[incrm_data['date'].isin(calendar)]

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    incrm_data_date_num = incrm_data['date'].unique().size
    if incrm_data_date_num == supposed_date_num:  # records from source 2 are complete
        # change data type
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

        # add time stamp
        incrm_data[targetNewTimeStamp] = datetime.now()

        # write data to target
        writeDB(sql_conn_quant, targetTableName, incrm_data)
        return True  # written successfully
    else:
        return False  # missing data in this source
def updateIncrm(quant_engine, spider_engine): # get lastest tradedate sql_statement = "select max(`%s`) from %s where `%s` != 'null'" % ( targetTimeStamp, targetTableName, targetTimeStamp) quant_conn = quant_engine.connect() latest_date = pd.read_sql(sql_statement, quant_conn).iloc[0, 0] quant_conn.close() # get incremental data tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields)) tmp_fields = ','.join(tmp_fields) tmp_sup_fields = list(map(lambda x: '`%s`' % x, sourceSupFields)) tmp_sup_fields = ','.join(tmp_sup_fields) sql_statement = "select %s from %s where (%s > '%s') and (%s != 'null')" % ( tmp_fields, sourceTableName, sourceTimeStamp, latest_date, sourceTimeStamp) spider_conn = spider_engine.connect() incrm_data = pd.read_sql(sql_statement, spider_conn) # ========== supplement ============== sql_statement = "select %s from %s where (`%s` > '%s') and (`%s` != 'null')" % ( tmp_sup_fields, sourceSupplementTableName, sourceTimeStamp, latest_date, sourceTimeStamp) incrm_sup_data = pd.read_sql(sql_statement, spider_conn) spider_conn.close() # process raw data incrm_sup_data = incrm_sup_data.drop_duplicates( [sourceCode, sourceTimeStamp]) incrm_sup_data.loc[:, sourceTurnoverField] = incrm_sup_data[ sourceTurnoverField].apply(lambda x: x if x != '-' else 0) # merge main data set with supplement data set incrm_data = incrm_data.merge(incrm_sup_data, how='inner', on=[sourceTimeStamp, sourceCode], suffixes=['', '_sup']) incrm_data.loc[:, sourceTurnoverField] = incrm_data[ sourceTurnoverField].fillna(0) # change column name set sourceFields.append(sourceTurnoverField + '_sup') # =========================== print('data from spider:', incrm_data.shape) # rename data rename_dict = {} for field in zip(sourceFields, targetFields): rename_dict[field[0]] = field[1] incrm_data = incrm_data.rename(columns=rename_dict) # drop duplicates incrm_data = incrm_data.drop_duplicates(['date', 'code']) # change data type incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply( lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x))) incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply( lambda x: float(x) if x != 'null' else np.nan) incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float') # add time stamp incrm_data[targetNewTimeStamp] = datetime.now() print('data to write:', incrm_data.shape) # write data to db if not incrm_data.empty: quant_conn = quant_engine.connect() incrm_data.to_sql(targetTableName, quant_conn, index=False, if_exists='append') quant_conn.close()
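# A minimal driver sketch for the engine-parameterized routines above, assuming
# the same ConfigSpider2/ConfigQuant dicts that the self-contained updateFull
# variants already use; the __main__ wiring itself is an assumption, not part
# of the original scripts.

if __name__ == '__main__':
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    # rebuild the target table once, then run incremental updates on later runs
    updateFull(quant_engine, spider_engine)
    updateIncrm(quant_engine, spider_engine)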