def updateFull(quant_engine, spider_engine):
    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName)
    full_data = pd.read_sql(sql_statement, spider_engine)

    # rename columns
    full_data = renameDF(full_data, sourceFields, targetFields)

    # the spider DB also runs on weekends and crawls duplicate records, so drop_duplicates is needed
    full_data = full_data.drop_duplicates(targetTimeStamp)

    # change data type
    full_data = chgDFDataType(full_data, chgDataTypeCol, 'float')

    # change datetime format
    full_data.loc[:, targetTimeStamp] = full_data[targetTimeStamp].apply(
        lambda x: x[:10])
    full_data = full_data.sort_values(targetTimeStamp)

    # add time stamp
    full_data[targetNewTimeStamp] = datetime.now()

    # write data to target
    if not full_data.empty:
        full_data.to_sql(targetTableName,
                         quant_engine,
                         index=False,
                         if_exists='replace')
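# The examples in this file call two project helpers, renameDF and
# chgDFDataType, that are not defined here. A minimal sketch of what they are
# assumed to do, inferred from the call sites (the real implementations may
# differ):
def renameDF(df, source_fields, target_fields):
    # rename columns positionally: source_fields[i] -> target_fields[i]
    return df.rename(columns=dict(zip(source_fields, target_fields)))

def chgDFDataType(df, cols, dtype):
    # cast the listed columns to the given dtype (e.g. 'float')
    for col in cols:
        df[col] = df[col].astype(dtype)
    return df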
def updateFull(start_date='2007-01-01'):
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))

    tmp_fields = list(map(lambda x: '`%s`' % x, sourceField))
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s" % (tmp_fields, sourceTableName)
    data_full = pd.read_sql(sql_statement, spider_engine)

    # change column name
    data_full = data_full.rename(columns={sourceTimestampField: targetTimeStampField})

    # change data type
    data_full = chgDFDataType(data_full, chgDataTypeCol, 'float')

    # add time stamp
    data_full.loc[:, targetNewTimeStampField] = datetime.now()

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    data_full.to_sql(targetTableName, quant_engine, index=False, if_exists='replace')
def updateIncrm():
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigSpider2))

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.format(**ConfigQuant))

    # get latest trade date
    sql_statement = "select max(`%s`) from %s" % (targetTimeStampField, targetTableName)
    latest_date = pd.read_sql(sql_statement, quant_engine)
    if not latest_date.empty:
        latest_date = latest_date.iloc[0, 0]

    tmp_fields = map(lambda x: '`%s`' % x, sourceField)
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s where %s > '%s'" % (tmp_fields, sourceTableName, sourceTimestampField, latest_date)
    data_incrm = pd.read_sql(sql_statement, spider_engine)

    if data_incrm.empty:
        return

    # change column name
    data_incrm = data_incrm.rename(columns={sourceTimestampField: targetTimeStampField})

    # change data type
    data_incrm = chgDFDataType(data_incrm, chgDataTypeCol, 'float')

    # add time stamp
    data_incrm.loc[:, targetNewTimeStampField] = datetime.now()

    data_incrm.to_sql(targetTableName, quant_engine, index=False, if_exists='append')
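# A hypothetical entry point for the pair above (not in the original code):
# updateFull rebuilds the target table from scratch, updateIncrm appends only
# rows newer than the target's max timestamp.
if __name__ == '__main__':
    updateFull()
    updateIncrm()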
def updateFull(quant_engine, spider_engine):
    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName)
    full_data = pd.read_sql(sql_statement, spider_engine)

    # rename columns
    full_data = renameDF(full_data, sourceFields, targetFields)

    # change data type
    full_data = full_data.replace('--', 'nan')
    full_data.loc[:, 'change_ratio'] = full_data['change_ratio'].apply(
        lambda x: float(x.strip('%')) / 100.)
    full_data.loc[:, 'volume'] = full_data['volume'].apply(
        lambda x: x.replace(',', ''))
    full_data.loc[:, 'amount'] = full_data['amount'].apply(
        lambda x: x.replace(',', ''))
    full_data = chgDFDataType(full_data, chgDataTypeCol, 'float')

    # drop duplicates
    full_data = full_data.drop_duplicates(['code', 'date'])

    # add time stamp
    full_data[targetNewTimeStamp] = datetime.now()

    # write data to target
    if not full_data.empty:
        full_data.to_sql(targetTableName,
                         quant_engine,
                         index=False,
                         if_exists='replace')
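# The queries in these examples are built by raw string interpolation. A
# sketch of the same kind of incremental SELECT with SQLAlchemy bound
# parameters is safer where values come from outside (identifiers still have
# to be interpolated; only values can be bound). read_since is a hypothetical
# helper, not part of the original code:
from sqlalchemy import text

def read_since(engine, fields, table, ts_col, since):
    # build the backtick-quoted column list, then bind the timestamp value
    cols = ','.join('`%s`' % f for f in fields)
    stmt = text("select %s from `%s` where `%s` > :since" % (cols, table, ts_col))
    return pd.read_sql(stmt, engine, params={'since': since})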
# ========== Example 5 ==========
def updateIncrm(quant_engine, spider_engine):
    # get target latest date
    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableNameMarketIndex)
    target_max_timestamp = pd.read_sql(sql_statement, quant_engine) # quant schema
    target_max_timestamp = target_max_timestamp.iloc[0, 0]

    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp, targetTableNameHS)
    target_max_timestamp_hs = pd.read_sql(sql_statement, quant_engine)  # quant schema
    target_max_timestamp_hs = target_max_timestamp_hs.iloc[0, 0]

    sql_timestamp = min(target_max_timestamp, target_max_timestamp_hs)

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, sql_timestamp)
    incrm_data = pd.read_sql(sql_statement, spider_engine) # spider schema

    if incrm_data.empty:
        return

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # change data type
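    # unit suffixes below: '亿' scales by 1e8 (hundred million), '万' by 1e4 (ten thousand)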
    incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (
        float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # .copy() avoids pandas' SettingWithCopyWarning when 'market' is assigned below
    market_index_data = incrm_data.loc[incrm_data['code'].isin(list(marketToCode.values()))].copy()
    hs300_data = incrm_data.loc[incrm_data['code'] == '000300'].copy()

    # trim by date
    market_index_data = market_index_data.loc[market_index_data['date'] > target_max_timestamp]
    hs300_data = hs300_data.loc[hs300_data['date'] > target_max_timestamp_hs]

    # convert code to market
    market_index_data['market'] = ''
    for (tmp_market, tmp_code) in marketToCode.items():
        market_index_data.loc[market_index_data['code'] == tmp_code, 'market'] = tmp_market

    # drop column
    market_index_data = market_index_data.drop('code', axis=1)
    hs300_data = hs300_data.drop('code', axis=1)

    # add time stamp & write data to target
    if not incrm_data.empty:
        market_index_data[targetNewTimeStamp] = datetime.now()
        market_index_data.to_sql(targetTableNameMarketIndex, quant_engine, index=False, if_exists='append')
    if not hs300_data.empty:
        hs300_data[targetNewTimeStamp] = datetime.now()
        hs300_data.to_sql(targetTableNameHS, quant_engine, index=False, if_exists='append')
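# The '亿'/'万' suffix conversion above recurs in several of these examples.
# Pulled out as a named helper (hypothetical, not in the original code) the
# intent is clearer:
def parse_cn_amount(x):
    # '3.5亿' -> 350000000.0, '2万' -> 20000.0, plain numbers pass through
    if x.endswith(u'亿'):
        return float(x[:-1]) * 100000000
    if x.endswith(u'万'):
        return float(x[:-1]) * 10000
    return float(x)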
# ========== Example 6 ==========
def updateXueQiuIncrm(sql_conn_quant, sql_conn_spider, target_max_timestamp,
                      supposed_date_num):
    target_max_timestamp_format = target_max_timestamp.replace(
        '-', '')  # 2015-01-01 --> 20150101

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    # some records may be updated later (hence ">=" instead of ">"): fetch data
    # including the latest day already in the target, then drop duplicates later
    sql_statement = "select %s from `%s` where `%s` >= '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp,
        target_max_timestamp_format)
    incrm_data = pd.read_sql(sql_statement, sql_conn_spider)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # change date format
    incrm_data.loc[:, 'date'] = incrm_data['date'].apply(
        lambda x: '-'.join([x[:4], x[4:6], x[6:8]]))
    incrm_data = incrm_data.loc[incrm_data['date'] >= target_max_timestamp]

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # fetch latest data in target table
    tmp_fields = list(map(lambda x: '`%s`' % x, targetFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` >= '%s'" % (
        tmp_fields, targetTableName, targetTimeStamp, target_max_timestamp)
    existing_data = pd.read_sql(sql_statement, sql_conn_quant)

    # combine existing data and the increment, then drop every duplicated
    # (date, code) pair --> what remains is the real increment plus previously
    # missing data (DataFrame.append is deprecated, so use pd.concat)
    incrm_data = pd.concat([existing_data, incrm_data])
    incrm_data = incrm_data.drop_duplicates(['date', 'code'], keep=False)

    # check if there are missing data from this source
    incrm_data_date_num = incrm_data['date'].unique().size
    if incrm_data_date_num < supposed_date_num:
        return False  # signal of missing data from this source
    else:
        # change data type
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

        # # calculate from raw data to fill the missings in spider data
        # sup_data = supplementByRawData(quant_engine, target_max_timestamp, incrm_data)

        # sort by date
        incrm_data = incrm_data.sort_values('date')

        # add time stamp
        incrm_data[targetNewTimeStamp] = datetime.now()

        writeDB(sql_conn_quant, targetTableName, incrm_data)

        return True  # signal of successfully written data to database
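# writeDB is another project helper that is not shown here. From its call
# sites it is assumed to be roughly equivalent to an append-mode to_sql:
def writeDB(sql_conn, table_name, df):
    # append the DataFrame to the target table without writing the index
    df.to_sql(table_name, sql_conn, index=False, if_exists='append')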
# ========== Example 7 ==========
def updateIncrm(quant_engine, spider_engine):
    # get trade calendar
    sql_statement = 'select `%s` from %s' % (calendarField,
                                             tradeCalendarTableName)
    trade_calendar = pd.read_sql(sql_statement, quant_engine)
    trade_calendar = trade_calendar.values.T[0]

    # get target latest date
    for (table_name, index_code) in zip(targetTableName, sourceIndexCode):
        sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp,
                                                        table_name)
        target_max_timestamp = pd.read_sql(sql_statement,
                                           quant_engine)  # quant schema
        target_max_timestamp = target_max_timestamp.iloc[0, 0]

        # fetch data from source
        tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
        tmp_fields = ','.join(tmp_fields)
        sql_statement = "select %s from `%s` where `%s` > '%s' and `%s` = '%s'" % (
            tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp,
            sourceCodeField, index_code)
        incrm_data = pd.read_sql(sql_statement, spider_engine)  # spider schema

        if incrm_data.empty:
            continue  # no new rows for this index; move on instead of aborting the loop

        # rename columns
        incrm_data = renameDF(incrm_data, sourceFields, targetFields)

        # change date format
        tmp_date = incrm_data['date'].apply(lambda x: datetime.strptime(
            x[:11], r'%Y年%m月%d日')) - timedelta(days=1)
        incrm_data.loc[:, 'date'] = tmp_date.apply(
            lambda x: datetime.strftime(x, '%Y-%m-%d'))
        incrm_data = incrm_data.loc[incrm_data['date'] > target_max_timestamp]

        # drop duplicates
        incrm_data = incrm_data.drop_duplicates(['date'])
        incrm_data = incrm_data.loc[incrm_data['date'].isin(
            trade_calendar)]  # the spider also picks up records on holidays

        if incrm_data.empty:
            continue

        # change data type
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')
        incrm_data.loc[:, 'change'] = incrm_data['change'] / 100

        # add time stamp & write data to target
        if not incrm_data.empty:
            incrm_data[targetNewTimeStamp] = datetime.now()
            incrm_data.to_sql(table_name,
                              quant_engine,
                              index=False,
                              if_exists='append')
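# Worked illustration of the date handling above (hypothetical input): source
# dates look like '2018年01月02日...'; the code keeps the first 11 characters,
# parses them, and steps back one day before reformatting. Assumes the
# datetime/timedelta imports used elsewhere in these examples.
_tmp_date = datetime.strptime('2018年01月02日', '%Y年%m月%d日') - timedelta(days=1)
assert datetime.strftime(_tmp_date, '%Y-%m-%d') == '2018-01-01'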
def updateFull(quant_engine, spider_engine, chunk_size, start_date='2007-01-01'):
    # get distinct code
    sql_statement = "select distinct `%s` from %s where %s >= '%s'" % (sourceCode, sourceTableName, sourceTimeStamp, start_date)
    tot_codes = pd.read_sql(sql_statement, spider_engine)
    tot_codes = tot_codes.values.T[0]

    # drop B share codes
    tmp_idx = list(map(lambda x: x[0] != '9', tot_codes))
    tot_codes = tot_codes[tmp_idx]

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    write_method = 'replace'
    loop_num = int(tot_codes.size / chunk_size)
    if tot_codes.size > loop_num * chunk_size:
        loop_num += 1
    for i in range(loop_num):
        tmp_code = tot_codes[i*chunk_size:(i+1)*chunk_size]
        tmp_code_str = list(map(lambda x:"'%s'"%x, tmp_code))
        tmp_code_str = ','.join(tmp_code_str)

        sql_statement = "select %s from %s where (`%s` > '%s') and (`%s` != 'null') and (`%s` in (%s))" % (tmp_fields, sourceTableName, sourceTimeStamp,
                        start_date, sourceTimeStamp, sourceCode, tmp_code_str)
        chunk_data = pd.read_sql(sql_statement, spider_engine)

        # rename columns
        rename_dict = {}
        for field in zip(sourceFields, targetFields):
            rename_dict[field[0]] = field[1]
        chunk_data = chunk_data.rename(columns=rename_dict)

        # drop duplicates
        chunk_data = chunk_data.drop_duplicates(['date', 'code'])

        # change data type
        chunk_data.loc[:, 'amount'] = chunk_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (
                float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
        chunk_data.loc[:, 'turnover'] = chunk_data['turnover'].apply(lambda x: x if x != '-' else 0)
        chunk_data.loc[:, 'turnover'] = chunk_data['turnover'].apply(lambda x: float(x) if x != 'null' else np.nan)
        chunk_data = chgDFDataType(chunk_data, chgDataTypeCol, 'float')

        # add time stamp
        chunk_data[targetNewTimeStamp] = datetime.now()

        # write data to db
        chunk_data.to_sql(targetTableName, quant_engine, index=False, if_exists=write_method)
        write_method = 'append'

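# Note on the chunking arithmetic above: loop_num is a ceiling division. For
# example, 3500 codes with chunk_size=1000 gives int(3500/1000) = 3, then +1
# because 3500 > 3*1000, i.e. four chunks of at most 1000 codes; the first
# chunk is written with if_exists='replace', the rest with 'append'.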
def updateIncrm(quant_engine, spider_engine):
    # get target latest date
    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp,
                                                    targetTableName)
    target_max_timestamp = pd.read_sql(sql_statement, quant_engine)
    target_max_timestamp = target_max_timestamp.iloc[0, 0]

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp)
    incrm_data = pd.read_sql(sql_statement, spider_engine)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['code', 'date'])

    if not incrm_data.empty:
        # change data type
        incrm_data = incrm_data.replace('--', 'nan')
        incrm_data.loc[:, 'change_ratio'] = incrm_data['change_ratio'].apply(
            lambda x: float(x.strip('%')) / 100.)
        incrm_data.loc[:, 'volume'] = incrm_data['volume'].apply(
            lambda x: x.replace(',', ''))
        incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(
            lambda x: x.replace(',', ''))
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

        # sort by date
        incrm_data = incrm_data.sort_values(targetTimeStamp)

        # add time stamp
        incrm_data[targetNewTimeStamp] = datetime.now()

        # write data to target
        incrm_data.to_sql(targetTableName,
                          quant_engine,
                          index=False,
                          if_exists='append')
# ========== Example 10 ==========
def updateFull(quant_engine, spider_engine):
    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s`" % (tmp_fields, sourceTableName)
    full_data = pd.read_sql(sql_statement, spider_engine)  # spider schema

    if full_data.empty:
        return

    # rename columns
    full_data = renameDF(full_data, sourceFields, targetFields)

    # drop duplicates (if any)
    full_data = full_data.drop_duplicates(['date', 'code'])

    # change data type
    full_data.loc[:, 'amount'] = full_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
    full_data = chgDFDataType(full_data, chgDataTypeCol, 'float')

    # .copy() avoids pandas' SettingWithCopyWarning when 'market' is assigned below
    market_index_data = full_data.loc[full_data['code'].isin(list(marketToCode.values()))].copy()
    hs300_data = full_data.loc[full_data['code'] == '000300'].copy()

    # convert code to market
    market_index_data['market'] = ''
    for (tmp_market, tmp_code) in marketToCode.items():
        market_index_data.loc[market_index_data['code'] == tmp_code, 'market'] = tmp_market

    market_index_data = market_index_data.drop('code', axis=1)
    hs300_data = hs300_data.drop('code', axis=1)

    # add time stamp
    market_index_data[targetNewTimeStamp] = datetime.now()
    hs300_data[targetNewTimeStamp] = datetime.now()

    # write data to target
    if not market_index_data.empty:
        market_index_data.to_sql(targetTableNameMarketIndex, quant_engine, index=False, if_exists='replace')
    if not hs300_data.empty:
        hs300_data.to_sql(targetTableNameHS, quant_engine, index=False, if_exists='replace')

def updateIncrm(quant_engine, spider_engine):
    # get latest trade date
    sql_statement = "select max(`%s`) from %s where `%s` != 'null'" % (targetTimeStamp, targetTableName, targetTimeStamp)
    latest_date = pd.read_sql(sql_statement, quant_engine).iloc[0,0]

    # get incremental data
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from %s where (%s > '%s') and (%s != 'null')" % (tmp_fields, sourceTableName, sourceTimeStamp,
                                                                                latest_date, sourceTimeStamp)
    incrm_data = pd.read_sql(sql_statement, spider_engine)

    print('data from spider:', incrm_data.shape)

    # rename data
    rename_dict = {}
    for field in zip(sourceFields, targetFields):
        rename_dict[field[0]] = field[1]
    incrm_data = incrm_data.rename(columns=rename_dict)

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # drop B shares
    incrm_data = incrm_data.loc[incrm_data['code'].apply(lambda x: x[0] != '9')]

    # change data type
    incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else (
        float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
    incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply(lambda x: x if x != '-' else 0)
    incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply(lambda x: float(x) if x != 'null' else np.nan)
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # add time stamp
    incrm_data[targetNewTimeStamp] = datetime.now()

    print('data to write:', incrm_data.shape)

    # write data to db
    if not incrm_data.empty:
        incrm_data.to_sql(targetTableName, quant_engine, index=False, if_exists='append')
def updateIncrm(quant_engine, spider_engine):
    # get target latest date
    sql_statement = 'select max(`%s`) from `%s`' % (targetTimeStamp,
                                                    targetTableName)
    target_max_timestamp = pd.read_sql(sql_statement, quant_engine)
    target_max_timestamp = target_max_timestamp.iloc[0, 0]

    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp)
    incrm_data = pd.read_sql(sql_statement, spider_engine)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # the spider DB also runs on weekends and crawls duplicate records, so drop_duplicates is needed
    incrm_data = incrm_data.drop_duplicates(targetTimeStamp)

    # change data type
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # change datetime format
    incrm_data.loc[:, targetTimeStamp] = incrm_data[targetTimeStamp].apply(
        lambda x: x[:10])
    incrm_data = incrm_data.loc[
        incrm_data[targetTimeStamp] > target_max_timestamp]
    incrm_data = incrm_data.sort_values(targetTimeStamp)

    # add time stamp
    incrm_data[targetNewTimeStamp] = datetime.now()

    # write data to target
    if not incrm_data.empty:
        incrm_data.to_sql(targetTableName,
                          quant_engine,
                          index=False,
                          if_exists='append')
# ========== Example 13 ==========
def updateFull(start_date='2007-01-01'):
    # create source engine
    spider_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.
        format(**ConfigSpider2))

    tmp_fields = list(map(lambda x: '`%s`' % x, sourceField))
    tmp_fields = ','.join(tmp_fields)

    # get data from source
    sql_statement = "select %s from %s" % (tmp_fields, sourceTableName)
    data_full = pd.read_sql(sql_statement, spider_engine)

    # change column name
    tmp_rename_dict = dict(zip(sourceField, targetField))
    data_full = data_full.rename(columns=tmp_rename_dict)
    data_full.loc[:, targetTimeStampField] = data_full[
        targetTimeStampField].apply(lambda x: x[:10])

    # drop duplicates
    data_full = data_full.drop_duplicates(targetTimeStampField)

    # change data type
    tmp_fields = targetField.copy()
    tmp_fields.remove(targetTimeStampField)
    data_full = chgDFDataType(data_full, tmp_fields, 'float')

    # add time stamp
    data_full.loc[:, targetNewTimeStampField] = datetime.now()

    # create target engine
    quant_engine = create_engine(
        'mysql+pymysql://{user}:{password}@{host}/{db}?charset={charset}'.
        format(**ConfigQuant))

    data_full.to_sql(targetTableName,
                     quant_engine,
                     index=False,
                     if_exists='replace')
def updateTSOldIncrm(calendar, sql_conn_quant, sql_conn_spider,
                     target_max_timestamp, supposed_date_num):
    # fetch data from source
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    sql_statement = "select %s from `%s` where `%s` > '%s'" % (
        tmp_fields, sourceTableName, sourceTimeStamp, target_max_timestamp)
    incrm_data = pd.read_sql(sql_statement, sql_conn_spider)

    # rename columns
    incrm_data = renameDF(incrm_data, sourceFields, targetFields)

    # change date format
    incrm_data.loc[:, 'date'] = incrm_data['date'].apply(lambda x: x[:10])
    incrm_data = incrm_data.loc[incrm_data['date'] > target_max_timestamp]

    # use trade calendar to drop duplicates
    incrm_data = incrm_data.loc[incrm_data['date'].isin(calendar)]

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    incrm_data_date_num = incrm_data['date'].unique().size

    if incrm_data_date_num == supposed_date_num:  # records from source 2 are complete
        # change data type
        incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

        # add time stamp
        incrm_data[targetNewTimeStamp] = datetime.now()

        # write data to target
        writeDB(sql_conn_quant, targetTableName, incrm_data)

        return True  # successfully
    else:
        return False  # missing data in this source

def updateIncrm(quant_engine, spider_engine):
    # get latest trade date
    sql_statement = "select max(`%s`) from %s where `%s` != 'null'" % (
        targetTimeStamp, targetTableName, targetTimeStamp)
    quant_conn = quant_engine.connect()
    latest_date = pd.read_sql(sql_statement, quant_conn).iloc[0, 0]
    quant_conn.close()

    # get incremental data
    tmp_fields = list(map(lambda x: '`%s`' % x, sourceFields))
    tmp_fields = ','.join(tmp_fields)
    tmp_sup_fields = list(map(lambda x: '`%s`' % x, sourceSupFields))
    tmp_sup_fields = ','.join(tmp_sup_fields)
    sql_statement = "select %s from %s where (%s > '%s') and (%s != 'null')" % (
        tmp_fields, sourceTableName, sourceTimeStamp, latest_date,
        sourceTimeStamp)
    spider_conn = spider_engine.connect()
    incrm_data = pd.read_sql(sql_statement, spider_conn)

    #  ========== supplement ==============
    sql_statement = "select %s from %s where (`%s` > '%s') and (`%s` != 'null')" % (
        tmp_sup_fields, sourceSupplementTableName, sourceTimeStamp,
        latest_date, sourceTimeStamp)
    incrm_sup_data = pd.read_sql(sql_statement, spider_conn)
    spider_conn.close()
    # process raw data
    incrm_sup_data = incrm_sup_data.drop_duplicates(
        [sourceCode, sourceTimeStamp])
    incrm_sup_data.loc[:, sourceTurnoverField] = incrm_sup_data[
        sourceTurnoverField].apply(lambda x: x if x != '-' else 0)
    # merge main data set with supplement data set
    incrm_data = incrm_data.merge(incrm_sup_data,
                                  how='inner',
                                  on=[sourceTimeStamp, sourceCode],
                                  suffixes=['', '_sup'])
    incrm_data.loc[:, sourceTurnoverField] = incrm_data[
        sourceTurnoverField].fillna(0)
    # change column name set
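    # note: this appends to the module-level sourceFields list in place, so a
    # second call to updateIncrm would append the '_sup' suffix again; copying
    # the list first (e.g. list(sourceFields)) would avoid the side effect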
    sourceFields.append(sourceTurnoverField + '_sup')
    # ===========================
    print('data from spider:', incrm_data.shape)

    # rename data
    rename_dict = {}
    for field in zip(sourceFields, targetFields):
        rename_dict[field[0]] = field[1]
    incrm_data = incrm_data.rename(columns=rename_dict)

    # drop duplicates
    incrm_data = incrm_data.drop_duplicates(['date', 'code'])

    # change data type
    incrm_data.loc[:, 'amount'] = incrm_data['amount'].apply(
        lambda x: float(x[:-1]) * 100000000 if x[-1] == u'亿' else
        (float(x[:-1]) * 10000 if x[-1] == u'万' else float(x)))
    incrm_data.loc[:, 'turnover'] = incrm_data['turnover'].apply(
        lambda x: float(x) if x != 'null' else np.nan)
    incrm_data = chgDFDataType(incrm_data, chgDataTypeCol, 'float')

    # add time stamp
    incrm_data[targetNewTimeStamp] = datetime.now()

    print('data to write:', incrm_data.shape)

    # write data to db
    if not incrm_data.empty:
        quant_conn = quant_engine.connect()
        incrm_data.to_sql(targetTableName,
                          quant_conn,
                          index=False,
                          if_exists='append')
        quant_conn.close()
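# The explicit connect()/close() pairs above leak connections if an exception
# is raised in between; SQLAlchemy's context-manager form closes them
# automatically and would be the safer idiom here, e.g.:
# with quant_engine.connect() as quant_conn:
#     latest_date = pd.read_sql(sql_statement, quant_conn).iloc[0, 0]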