Example #1
 def return_url_for_bankuai_stock(self, bankuai, page=1, page_size=10000):
     bankuai_tree = self.__bankuai_tree
     def return_bankuai_code(bankuai_tree, bankuai):
         # bankuai parameter is a list from the top bankuai to the bottom bankuai in the format below
         # ['板块','概念板块','AB股票']
         # Parse the url of the bankuai: for bankuais under [概念板块, 地域板块, 行业板块], the number before the first underscore is the key to get the stocks belonging to that bankuai
         # e.g. For 板块->概念板块->AB股, the url is list.html#28003498_0_2, 28003498 is the key to get the stocks belonging to AB股
         def drill_to_sub_bankuai(bankuai_dict,sub_bankuai):
             if sub_bankuai in bankuai_dict:
                 return bankuai_dict[sub_bankuai]
             elif "children" in bankuai_dict and sub_bankuai in bankuai_dict["children"]:
                 return bankuai_dict["children"][sub_bankuai]
             else:
                 # This error should not be captured by the except block below
                 raise RuntimeError(sub_bankuai + " is not found.", "in Eastmoney.py") 
                 
         try:
             bankuai_code = re.search(r'#(?P<bankuai_code>\d+)', reduce(drill_to_sub_bankuai, bankuai, bankuai_tree)["url"]).group("bankuai_code")
         except AttributeError:
             # The exception block only captures AttributeError: 'NoneType' object has no attribute 'group'
             print_log("The url of [" + ",".join(bankuai) + "] doesn't contain digits.")
             bankuai_code = "-99"
             
         return bankuai_code
             
     base_url = "http://hqdigi2.eastmoney.com/EM_Quote2010NumericApplication/index.aspx?type=s&sortType=C&sortRule=-1&jsName=quote_123"
     p_page_size = "&pageSize=%(page_size)s"
     p_page = "&page=%(page)s"
     p_bankuai_code = "&style=%(bankuai_code)s" 
     bankuai_url = ( base_url + p_page_size + p_page + p_bankuai_code ) % {"page_size": page_size, "page": page, "bankuai_code": return_bankuai_code(bankuai_tree, bankuai)}
     print_log(bankuai_url) 
     return bankuai_url
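A standalone demonstration of the code-extraction step above, runnable with just the standard library; the sample url is taken from the comment's own example:

import re

url = "list.html#28003498_0_2"  # sample bankuai url from the comment above
m = re.search(r'#(?P<bankuai_code>\d+)', url)
bankuai_code = m.group("bankuai_code") if m else "-99"  # same fallback as the except branch
print(bankuai_code)  # 28003498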
def download_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date,'%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date,'%Y%m%d')
    
    # get stock ids which is_download_success=N
    chk_sql = '''
    select t.biz_date, 
      t.stock_id
    from (
    select 
      biz_date, 
      stock_id, 
      is_download_success, 
      row_number() over(partition by biz_date, stock_id order by download_end_time desc nulls last) rankid
    from dw.log_stock_transaction
    where biz_date between '{start_date}' and '{end_date}' 
    ) t where t.rankid = 1
    and t.is_download_success = 'N' '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if stock_id is not None: chk_sql = chk_sql + ' and t.stock_id = \'' + stock_id + '\''

    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been downloaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to download.')
    return len(rows)
def load_log_checker(conn,
                     start_date=options.start_date,
                     end_date=options.end_date,
                     stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')

    chk_sql = '''
    select biz_date, stock_id
    from dw.log_stock_transaction
    where biz_date between '{start_date}' and '{end_date}'
    and is_download_success = 'Y'
    and (is_load_success = 'N' or is_load_success is null)
    '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if stock_id is not None:
        chk_sql = chk_sql + ' and stock_id = \'' + stock_id + '\''

    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been loaded successfully.')
    else:
        for row in rows:
            error_log(
                str(row['biz_date']) + ':' + row['stock_id'] +
                ' failed to load.')
    return len(rows)
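A hedged usage sketch for the two checkers above; get_conn is a hypothetical connection helper standing in for whatever the surrounding module provides. Both functions return the number of failing rows, so zero means clean:

conn = get_conn()  # hypothetical connection helper
failed = download_log_checker(conn, start_date='20150901', end_date='20150910')
failed += load_log_checker(conn, start_date='20150901', end_date='20150910')
if failed == 0:
    print_log('Download and load are both verified clean.')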
 def save_formatted_data(self):
     # save formatted data into file, \t as delimiter
     # 9:25:00    50.34   0.15    141 709794  买盘
     with open(self.out_file, 'w') as file:
         file.write(self.stock_trans_object.get_stock_content()[
             self.stock_id][self.date])
     print_log('Formatted data saved to ' + self.out_file)
 def download_to_local(self):
     # save raw data into local
     # if the file already exists and its size is >= 5KB, it won't be downloaded again.
     if os.path.exists(self.stock_trans_object.download_file) and os.path.getsize(self.stock_trans_object.download_file) >= 1024 * 5:
         print_log(self.stock_trans_object.download_file + ' already exists.')
     else:
         self.stock_trans_object.download_to_local()
     return self.stock_trans_object.download_file
 def delete_existing_records(self):
     del_sql = '''
     delete from dw.stock_transaction where stock_id = '{0}' and biz_date = '{1}'
     '''.format(self.stock_id,
                datetime.datetime.strptime(self.date, '%Y%m%d'))
     get_query_result(self.conn, del_sql)
     print_log('Deletion for {0} {1} completed successfully.'.format(
         self.stock_id, self.date))
Example #7
 def delete_existing_records(self):
     del_sql = """
     delete from dw.stock_transaction where stock_id = '{0}' and biz_date = '{1}'
     """.format(
         self.stock_id, datetime.datetime.strptime(self.date, "%Y%m%d")
     )
     get_query_result(self.conn, del_sql)
     print_log("Deletion for {0} {1} completed successfully.".format(self.stock_id, self.date))
Example #8
def inserter(conn, tabname, colnames, source_type, value, delimiter):
    # insert a value (a single line, or each line of a file) into a specific table
    # tabname: table name
    # colnames: comma-separated column names in the table
    # source_type: file|str
    # value: when source_type is 'file', the full path of the file; when source_type is 'str', the value to insert into the table
    # delimiter: delimiter for columns

    column_type_sql = '''select lower(column_name) as column_name, lower(data_type) as data_type
    from information_schema.columns where table_schema || '.' || table_name = '{tabname}' '''.format(
        tabname=tabname)
    rows = get_query_result(conn, column_type_sql)
    sys_col_types = {}
    for row in rows:
        sys_col_types[row['column_name']] = row['data_type']

    types = []
    for colname in colnames.split(','):
        types.append(sys_col_types[colname.strip()])

    if source_type == 'file':  # insert rows in a file into table
        if os.path.exists(value):
            with open(value) as file:
                for row in file:
                    row_splitted = row.strip().split(delimiter)
                    if len(types) == len(row_splitted):
                        out_value = insert_value_formatter(
                            zip(types, row_splitted))
                        ins_sql = 'insert into {0}({1}) values ({2})'.format(
                            tabname, colnames, out_value)
                        sql_result = get_query_result(conn, ins_sql)
                    else:
                        raise RuntimeError(
                            'Lengths of types and value don\'t match [type:{0}, value:{1}]'
                            .format(','.join(types), value))
        else:
            raise RuntimeError('File doesn\'t exist. [{0}]'.format(value))
    elif source_type == 'str':  # insert a row into table
        # a,b,1,2
        if len(types) == len(value.split(delimiter)):
            out_value = insert_value_formatter(
                zip(types, value.split(delimiter)))
            ins_sql = 'insert into {0}({1}) values ({2})'.format(
                tabname, colnames, out_value)
            sql_result = get_query_result(conn, ins_sql)
        else:
            raise RuntimeError(
                'Lengths of types and value don\'t match [type:{0}, value:{1}]'.
                format(','.join(types), value))
    else:
        raise RuntimeError('Unknown source type [{0}]'.format(source_type))

    conn.commit()
    print_log('Insertion for {0} is done.'.format(tabname))
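A hedged usage sketch for inserter; the column list, values, and file path below are illustrative, not taken from the original code:

# single row given as a tab-delimited string
inserter(conn, 'dw.stock_transaction', 'biz_date, stock_id, price', 'str',
         '2015-09-10\t600000\t10.50', '\t')

# every line of a tab-delimited file
inserter(conn, 'dw.stock_transaction', 'biz_date, stock_id, price', 'file',
         '/tmp/transactions.tsv', '\t')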
 def download_to_local(self):
     # save raw data into local
     # if file already exists and size >= 5KB, it won't be downloaded again.
     if os.path.exists(
             self.stock_trans_object.download_file) and os.path.getsize(
                 self.stock_trans_object.download_file) >= 1024 * 5:
         print_log(self.stock_trans_object.download_file +
                   ' already exists.')
     else:
         self.stock_trans_object.download_to_local()
     return self.stock_trans_object.download_file
Example #10
 def export_bankuai_stock(self, out_file, in_bk=[]):
     # If in_bk parameter is not assigned, export all the bankuai stocks
     # in_bk could be [行业板块, 浙江板块] or [行业板块] 
     bkst_exception = {}
     out_file = return_new_name_for_existing_file(out_file)
     bkstfile = open(out_file, 'wb') # open in binary mode so the csv writer doesn't add blank lines between rows
     bkstfile_writer = csv.writer(bkstfile,quoting=csv.QUOTE_NONNUMERIC)
     bkst_head = [u'板块',u'子版块',u'板块名称',u'股票代码',u'股票名称']
     bkstfile_writer.writerow(bkst_head)
     for sub_bk in self.__bankuai_tree[u'板块']["children"]:
         if len(in_bk)>0 and sub_bk != in_bk[0]: continue
         print_log("Start to process -->" + sub_bk + "...")
         for dtl_bk in self.__bankuai_tree[u'板块']["children"][sub_bk]["children"]:
             if len(in_bk)>1 and dtl_bk != in_bk[1]: continue
             print_log("Start to process -->" + sub_bk + "-->" + dtl_bk + "...")
             parent_bk = []
             for i in self.return_stock_in_bankuai([u'板块', sub_bk, dtl_bk]):
                 bkst = []
                 if not isinstance(i, list):
                     parent_bk.append(i)
                 else:
                     for j in i:
                         bkst = parent_bk + j
                         try:
                             bkstfile_writer.writerow(bkst)
                         except Exception: # most likely a UnicodeEncodeError from the csv writer
                             if j[0] not in bkst_exception: bkst_exception[j[0]] = j[1]
     bkstfile.close()
     if len(bkst_exception.keys())>0: 
         print_log("There are " + str(len(bkst_exception.keys())) + " exceptions!")
         for i in bkst_exception:
             print i + bkst_exception[i]
     else:
         print_log("Completed successfully.")
     return bkst_exception
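A hedged usage sketch; Eastmoney is used as an illustrative class name (the code above only mentions Eastmoney.py) and the file names are made up. in_bk narrows the export exactly as the comment at the top of the method describes:

em = Eastmoney()  # hypothetical constructor
em.export_bankuai_stock('bankuai_stock_all.csv')  # no in_bk: export stocks of every bankuai
em.export_bankuai_stock('bankuai_stock_zhejiang.csv', in_bk=[u'行业板块', u'浙江板块'])  # a single detail bankuai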
Example #11
def inserter(conn, tabname, colnames, source_type, value, delimiter):
    # insert a value (a single line, or each line of a file) into a specific table
    # tabname: table name
    # colnames: comma-separated column names in the table
    # source_type: file|str
    # value: when source_type is 'file', the full path of the file; when source_type is 'str', the value to insert into the table
    # delimiter: delimiter for columns
    
    column_type_sql = '''select lower(column_name) as column_name, lower(data_type) as data_type
    from information_schema.columns where table_schema || '.' || table_name = '{tabname}' '''.format(tabname=tabname)
    rows = get_query_result(conn, column_type_sql)
    sys_col_types = {}
    for row in rows:
        sys_col_types[row['column_name']] = row['data_type']
    
    types = []
    for colname in colnames.split(','):
        types.append(sys_col_types[colname.strip()])
    
    if source_type == 'file': # insert rows in a file into table
        if os.path.exists(value):
            with open(value) as file:
                for row in file:
                    row_splitted = row.strip().split(delimiter)
                    if len(types) == len(row_splitted):
                        out_value = insert_value_formatter(zip(types,row_splitted))
                        ins_sql = 'insert into {0}({1}) values ({2})'.format(tabname, colnames, out_value)
                        sql_result = get_query_result(conn, ins_sql)
                    else:
                        raise RuntimeError('Lengths of types and value don\'t match [type:{0}, value:{1}]'.format(','.join(types), value))
        else:
            raise RuntimeError('File doesn\'t exist. [{0}]'.format(value))
    elif source_type == 'str': # insert a row into table
        # a,b,1,2
        if len(types) == len(value.split(delimiter)):
            out_value = insert_value_formatter(zip(types,value.split(delimiter)))
            ins_sql = 'insert into {0}({1}) values ({2})'.format(tabname, colnames, out_value)
            sql_result = get_query_result(conn, ins_sql)
        else:
            raise RuntimeError('Lengths of types and value don\'t match [type:{0}, value:{1}]'.format(','.join(types), value))
    else:
        raise RuntimeError('Unknown source type [{0}]'.format(source_type))

    conn.commit()
    print_log('Insertion for {0} is done.'.format(tabname))
def downloader(queue,
               conn,
               start_date=options.start_date,
               end_date=options.end_date,
               stock_id=options.stock_id,
               obj_selection=options.obj_selection):
    #-- object list
    obj_mapping = {
        'T': 'Tengxun_stock_transaction',
        'N': 'Netease_stock_transaction',
        'S': 'Sina_stock_transaction',
    }
    if obj_selection is None:
        stock_objects = [
            'Tengxun_stock_transaction', 'Netease_stock_transaction',
            'Sina_stock_transaction'
        ]
    else:
        stock_objects = [
            obj_mapping[o] for o in obj_selection.split('|')
            if o in obj_mapping
        ]

    print_log('|'.join(stock_objects) + ' selected.')

    iter = len(stock_objects)

    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')
    while cur_date_dt <= end_date_dt:
        #-- stock list
        stocks = get_stock_list(conn, cur_date_dt, stock_id)
        for stock in stocks:
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            cur_stock_object = stock_objects[
                iter % len(stock_objects)]  # choose stock object
            while queue.full():
                print_log(
                    '=================> queue is full, wait for 1 second...')
                time.sleep(1)
            s = Stock_trans_downloader(queue, conn, cur_stock_object, stock,
                                       cur_date_str)
            s.start()
            #s.join()
            print_log('-----> queue size: ' + str(queue.qsize()))
            iter += 1
        cur_date_dt = cur_date_dt + datetime.timedelta(1)

    while not queue.empty():
        print_log(
            '=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
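downloader throttles thread creation by blocking while the shared queue is full; each Stock_trans_downloader puts its name on the queue when it starts and takes one item off when it finishes (see the run() examples below). A minimal self-contained sketch of the pattern, with a hypothetical Worker standing in for Stock_trans_downloader:

import Queue  # Python 2 module; it is named queue in Python 3
import threading
import time

q = Queue.Queue(maxsize=5)  # at most 5 workers in flight

class Worker(threading.Thread):
    def run(self):
        q.put(self.getName())  # occupy a slot
        try:
            time.sleep(0.1)    # stands in for the real download
        finally:
            q.get()            # free the slot

for n in range(20):
    while q.full():
        time.sleep(1)          # same wait loop as in downloader above
    Worker().start()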
Example #13
 def export_bankuai_status(self, out_file, in_bk=[]):
     # If in_bk parameter is not assigned, export all the bankuais
     # in_bk could be [行业板块]
     bkbk_exception = []
     out_file = return_new_name_for_existing_file(out_file)
     bkbkfile = open(out_file, 'wb') # open in binary mode so the csv writer doesn't add blank lines between rows
     bkbkfile_writer = csv.writer(bkbkfile,quoting=csv.QUOTE_NONNUMERIC)
     bkbk_head = [u'板块',u'子版块',u'板块名称',u'涨跌幅',u'总市值(亿)',u'换手率',u'上涨家数',u'下跌家数',u'领涨股票代码',u'领涨股票',u'领涨股票涨跌幅']
     bkbkfile_writer.writerow(bkbk_head)
     for bk in self.__bankuai_tree[u'板块']["children"]:
         if len(in_bk)>0 and bk != in_bk[0]: continue
         print_log("Start to process -->" + bk + "...")
         parent_bk = []
         for i in self.return_bankuai_in_bankuai([u'板块',bk]):
             bkbk = []
             if not isinstance(i, list):
                 parent_bk.append(i)
             else:
                 for j in i:
                     bkbk = parent_bk + j
                     try:
                         bkbkfile_writer.writerow(bkbk)
                     except Exception: # most likely a UnicodeEncodeError from the csv writer
                         if j[0] not in bkbk_exception: bkbk_exception.append(j[0])
     bkbkfile.close()
     if len(bkbk_exception)>0: 
         print_log("There are " + len(bkbk_exception) + " exceptions!")
         for i in bkbk_exception:
             print i
     else:
         print_log("Completed successfully.")
     return bkbk_exception
Example #14
 def run(self):
     self.check_row_id_existance()
     self.queue.put(self.getName())
     self.log_load_start()
     self.delete_existing_records()
     try:
         if self.enable_copy:
             print_log("psql copy...")
             psql_copy_from(
                 DB_HOST,
                 DB_NAME,
                 DB_UNAME,
                 "dw.stock_transaction",
                 self.file,
                 DB_PORT,
                 args=" with (encoding 'GBK')",
             )
         else:
             print_log("psql insert...")
             inserter(self.conn, TABLE, COLS, "file", self.file, "\t")
         self.log_load_end(is_success=True)
         print_log(
             "Loading {stock_id} for {date} completes successfully.".format(stock_id=self.stock_id, date=self.date)
         )
     except Exception:
         traceback.print_exc()
         self.log_load_end(is_success=False)
         raise RuntimeError("Loading {stock_id} for {date} failed.".format(stock_id=self.stock_id, date=self.date))
     finally:
         queue_name = self.queue.get()
 def run(self):
     self.check_row_id_existance()
     self.queue.put(self.getName())
     self.log_load_start()
     self.delete_existing_records()
     try:
         if self.enable_copy:
             print_log('psql copy...')
             psql_copy_from(DB_HOST,
                            DB_NAME,
                            DB_UNAME,
                            'dw.stock_transaction',
                            self.file,
                            DB_PORT,
                            args=' with (encoding \'GBK\')')
         else:
             print_log('psql insert...')
             inserter(self.conn, TABLE, COLS, 'file', self.file, '\t')
         self.log_load_end(is_success=True)
         print_log(
             'Loading {stock_id} for {date} completes successfully.'.format(
                 stock_id=self.stock_id, date=self.date))
     except Exception:
         traceback.print_exc()
         self.log_load_end(is_success=False)
         raise RuntimeError('Loading {stock_id} for {date} failed.'.format(
             stock_id=self.stock_id, date=self.date))
     finally:
         queue_name = self.queue.get()
def load_log_checker(conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date,'%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date,'%Y%m%d')

    chk_sql = '''
    select biz_date, stock_id
    from dw.log_stock_transaction
    where biz_date between '{start_date}' and '{end_date}'
    and is_download_success = 'Y'
    and (is_load_success = 'N' or is_load_success is null)
    '''.format(start_date=start_date_dt, end_date=end_date_dt)
    if stock_id is not None: chk_sql = chk_sql + ' and stock_id = \'' + stock_id + '\''

    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been loaded successfully.')
    else:
        for row in rows:
            error_log(str(row['biz_date']) + ':' + row['stock_id'] + ' failed to load.')
    return len(rows)
Example #17
 def return_bankuai_code(bankuai_tree, bankuai):
     # bankuai parameter is a list from the top bankuai to the bottom bankuai in the format below
     # ['板块','概念板块','AB股票']
     # Parse the url of the bankuai: for bankuais under [概念板块, 地域板块, 行业板块], the number before the first underscore is the key to get the stocks belonging to that bankuai
     # e.g. For 板块->概念板块->AB股, the url is list.html#28003498_0_2, 28003498 is the key to get the stocks belonging to AB股
     def drill_to_sub_bankuai(bankuai_dict,sub_bankuai):
         if sub_bankuai in bankuai_dict:
             return bankuai_dict[sub_bankuai]
         elif "children" in bankuai_dict and sub_bankuai in bankuai_dict["children"]:
             return bankuai_dict["children"][sub_bankuai]
         else:
             # This error should not be captured by the except block below
             raise RuntimeError(sub_bankuai + " is not found.", "in Eastmoney.py") 
             
     try:
         bankuai_code = re.search(r'#(?P<bankuai_code>\d+)', reduce(drill_to_sub_bankuai, bankuai, bankuai_tree)["url"]).group("bankuai_code")
     except AttributeError:
         # The exception block only captures AttributeError: 'NoneType' object has no attribute 'group'
         print_log("The url of [" + ",".join(bankuai) + "] doesn't contain digits.")
         bankuai_code = "-99"
         
     return bankuai_code
def download_log_checker(conn,
                         start_date=options.start_date,
                         end_date=options.end_date,
                         stock_id=options.stock_id):
    start_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')

    # get stock ids which is_download_success=N
    chk_sql = '''
    select t.biz_date, 
      t.stock_id
    from (
    select 
      biz_date, 
      stock_id, 
      is_download_success, 
      row_number() over(partition by biz_date, stock_id order by download_end_time desc nulls last) rankid
    from dw.log_stock_transaction
    where biz_date between '{start_date}' and '{end_date}' 
    ) t where t.rankid = 1
    and t.is_download_success = 'N' '''.format(start_date=start_date_dt,
                                               end_date=end_date_dt)
    if stock_id is not None:
        chk_sql = chk_sql + ' and t.stock_id = \'' + stock_id + '\''

    cur = get_cur(conn)
    cur.execute(chk_sql)
    rows = list(cur)
    if len(rows) == 0:
        print_log('All the stocks have been downloaded successfully.')
    else:
        for row in rows:
            error_log(
                str(row['biz_date']) + ':' + row['stock_id'] +
                ' failed to download.')
    return len(rows)
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file,
                     log_fh, warn_fh):
    #-- iterate stocks, download eod data from the website
    fh = open(to_file, 'a')
    num = 0
    for s in stocks:
        #-- call method of stock object to get content of url
        try:
            new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                'object': stock_obj_name,
                'stock': s,
                'start_date': start_date if stock_obj_name == 'Yahoo_stock' else 'dummy',
                'end_date': end_date if stock_obj_name == 'Yahoo_stock' else 'dummy'
            }
            print_log(new_class)

            while True:  # Infinite loop until the stock download completes successfully
                try:
                    obj = eval(new_class)
                    for k, v in obj.get_stock_content().items():
                        print_log('%(num)s - Writing %(code)s ...' % {'num': num, 'code': k}, log_fh)
                        if re.match(r'pv_none_match', v) or re.match(r'.+"";$', v):  # match empty from tengxun and sina
                            warn_log('No content fetched for ' + k, warn_fh)
                        else:
                            fh.write(v + '\n')
                            num += 1
                    break
                except Exception:
                    warn_log('Connection lost, retry in 10 seconds ...')
                    time.sleep(10)

        except KeyError:
            warn_log(s[0:2] + ' is not setup in ' + stock_obj_name, warn_fh)
            continue
        except HTTPError:  # log and skip stocks that couldn't be returned from the yahoo interface
            warn_log('Get content failed when ' + new_class, warn_fh)
            continue
    fh.close()
    print_log(
        '{num} stocks have been written into {file}.'.format(num=num,
                                                             file=to_file),
        log_fh)
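The while True loop above retries forever on any error. A hedged alternative sketch that caps the attempts; max_retries and the wait are illustrative choices, and warn_log/time come from the surrounding module:

def fetch_with_retries(new_class, max_retries=5, wait_seconds=10):
    # evaluate the constructor string at most max_retries times
    for attempt in range(1, max_retries + 1):
        try:
            return eval(new_class)
        except Exception:
            warn_log('Connection lost (attempt %d/%d), retry in %d seconds ...'
                     % (attempt, max_retries, wait_seconds))
            time.sleep(wait_seconds)
    raise RuntimeError('Giving up on %s after %d attempts.' % (new_class, max_retries))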
def downloader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, obj_selection=options.obj_selection):
    #-- object list
    obj_mapping = {
        'T': 'Tengxun_stock_transaction',
        'N': 'Netease_stock_transaction',
        'S': 'Sina_stock_transaction',
    }
    if obj_selection is None:
        stock_objects = ['Tengxun_stock_transaction', 'Netease_stock_transaction', 'Sina_stock_transaction']
    else:
        stock_objects = [ obj_mapping[o] for o in obj_selection.split('|') if o in obj_mapping ]
    
    print_log('|'.join(stock_objects) + ' selected.')
    
    iter = len(stock_objects)
    
    cur_date_dt = datetime.datetime.strptime(start_date,'%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date,'%Y%m%d')
    while cur_date_dt <= end_date_dt:  
        #-- stock list
        stocks = get_stock_list(conn, cur_date_dt, stock_id)
        for stock in stocks:
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            cur_stock_object = stock_objects[iter%len(stock_objects)] # choose stock object
            while queue.full():
                print_log('=================> queue is full, wait for 1 second...')
                time.sleep(1)
            s = Stock_trans_downloader(queue, conn, cur_stock_object, stock, cur_date_str)
            s.start()
            #s.join()
            print_log('-----> queue size: ' + str(queue.qsize()))
            iter += 1
        cur_date_dt = cur_date_dt + datetime.timedelta(1)
        
    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
Example #21
def download_to_file(stocks, stock_obj_name, start_date, end_date, to_file, log_fh, warn_fh):
    # -- iterate stocks, download eod data from the website
    fh = open(to_file, "a")
    num = 0
    for s in stocks:
        # -- call method of stock object to get content of url
        try:
            new_class = '%(object)s("%(stock)s", "%(start_date)s", "%(end_date)s")' % {
                "object": stock_obj_name,
                "stock": s,
                "start_date": start_date if stock_obj_name == "Yahoo_stock" else "dummy",
                "end_date": end_date if stock_obj_name == "Yahoo_stock" else "dummy",
            }
            print_log(new_class)

            while True:  # Infinite loop until the stock download completes successfully
                try:
                    obj = eval(new_class)
                    for k, v in obj.get_stock_content().items():
                        print_log("Writing %(code)s ..." % {"code": k}, log_fh)
                        if re.match(r"pv_none_match", v) or re.match(r'.+"";$', v):  # match empty from tengxun and sina
                            warn_log("No content fetched for " + k, warn_fh)
                        else:
                            fh.write(v + "\n")
                            num += 1
                    break
                except Exception:
                    warn_log("Connection lost, retry in 10 seconds ...")
                    time.sleep(10)

        except KeyError:
            warn_log(s[0:2] + " is not setup in " + stock_obj_name, warn_fh)
            continue
        except HTTPError:  # log and skip stocks that couldn't be returned from the yahoo interface
            warn_log("Get content failed when " + new_class, warn_fh)
            continue
    fh.close()
    print_log("{num} stocks have been written into {file}.".format(num=num, file=to_file), log_fh)
stock_object = {
    'tengxun': 'Tengxun_stock',
    'sina': 'Sina_stock',
    'yahoo': 'Yahoo_stock',
}

# validate the object class
if options.object_class not in stock_object:
    exit_error(
        '%(entered_object)s is not a valid object, it could be %(valid_objects)s'
        % {
            'entered_object': options.object_class,
            'valid_objects': '|'.join(stock_object)
        })
else:
    print_log(options.object_class + ' selected.')

# validate the mode and input file
if options.mode not in ('download', 'load', 'downloadAndLoad'):
    exit_error(
        options.mode +
        ' is not recognized, it could be download|load|downloadAndLoad.')
elif options.file is not None and not os.path.exists(options.file):
    exit_error(options.file + ' doesn\'t exist.')

# validate start_date and end_date
if options.object_class == 'yahoo' and options.mode == 'download':
    if options.start_date is None or options.end_date is None:
        exit_error(
            '--start_date|-s and --end_date|-e must be specified for yahoo class'
        )
Example #23
def load_into_bankuai(db_conn, file, biz_date=None):

    # 板块	子版块		板块名称	涨跌幅	总市值(亿)	换手率	上涨家数	下跌家数	领涨股票代码	领涨股票	领涨股票涨跌幅
    # 板块	概念板块	全息技术	3.95%	365.12		11.65	7			1			600288			大恒科技	10.03
    # 板块	概念板块	网络安全	2.95%	818.79		25.61	19			1			002308			威创股份	10.01

    # biz_date date not null,
    # bankuai_id integer not null,
    # rise varchar(16),
    # market_value_in_million decimal(12,2),
    # turnover_rate decimal(5,2),
    # num_of_rise integer,
    # num_of_drop integer,
    # leading_stock_id varchar(6),
    # rise_of_leading_stock decimal(10,2),
    # primary key(biz_date, bankuai_id)

    bk_id_dict = {}
    csv_data = []
    v_biz_date = ""

    #-- build dict for bankuai name and bankuai id from db
    select_sql = 'select t.name, t.id from dw.dim_bankuai t'
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id

    print_log("There are %(num)s records read from %(name)s" % {
        "num": len(bk_id_dict.keys()),
        "name": 'dw.dim_bankuai'
    })

    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        bk_id = bk_id_dict[bk_name]
        row_dict = {}
        row_dict[bk_id] = {}
        row_dict[bk_id]["rise"] = row[u'涨跌幅'.encode("gbk")].decode("gbk")
        row_dict[bk_id]["market_value_in_million"] = row[u'总市值(亿)'.encode(
            "gbk")]
        row_dict[bk_id]["turnover_rate"] = row[u'换手率'.encode("gbk")]
        row_dict[bk_id]["num_of_rise"] = row[u'上涨家数'.encode("gbk")]
        row_dict[bk_id]["num_of_drop"] = row[u'下跌家数'.encode("gbk")]
        row_dict[bk_id]["leading_stock_id"] = row[u'领涨股票代码'.encode("gbk")]
        row_dict[bk_id]["rise_of_leading_stock"] = row[u'领涨股票涨跌幅'.encode(
            "gbk")]

        csv_data.append(row_dict)

    csvf.close()
    print_log("%(num)s records have been read from %(name)s." % {
        "num": len(csv_data),
        "name": file
    })

    #-- determine biz_date
    if biz_date is not None:
        if re.search(r'\d{8}', biz_date):
            v_biz_date = biz_date
        else:
            raise RuntimeError(
                biz_date +
                " is not a valid date format, the date should be like YYYYMMDD."
            )
    elif re.search(r'.*(?P<date>\d{8})\.csv', file):
        v_biz_date = re.search(r'.*(?P<date>\d{8})\.csv', file).group("date")
    else:
        raise RuntimeError(
            'Cannot determine biz_date; make sure the file name includes a YYYYMMDD date, or pass biz_date when calling the function.'
        )
    v_biz_date_dt = datetime.datetime.strptime(v_biz_date, '%Y%m%d')

    #-- delete biz_date from dw.bankuai
    del_sql = 'delete from dw.bankuai where biz_date = \'%(date)s\'' % {
        'date': v_biz_date_dt
    }
    cur.execute(del_sql)
    db_conn.commit()
    print_log(
        "Deleted records from dw.bankuai where biz_date = '%(biz_date)s'." %
        {"biz_date": v_biz_date})

    #-- insert into dw.bankuai
    iter = 0
    for r in csv_data:
        k = r.keys()[0]
        iter += 1
        ins_sql = '''insert into dw.bankuai(
			biz_date, 
			bankuai_id, 
			rise, 
			market_value_in_million, 
			turnover_rate, 
			num_of_rise, 
			num_of_drop, 
			leading_stock_id, 
			rise_of_leading_stock) values(
			'%(biz_date)s',
			%(bankuai_id)s, 
			'%(rise)s', 
			%(market_value_in_million)s, 
			%(turnover_rate)s, 
			%(num_of_rise)s, 
			%(num_of_drop)s, 
			'%(leading_stock_id)s', 
			%(rise_of_leading_stock)s
			)''' % {
            'biz_date': v_biz_date_dt,
            'bankuai_id': k,
            'rise': r[k]['rise'],
            'market_value_in_million': r[k]['market_value_in_million'],
            'turnover_rate': r[k]['turnover_rate'],
            'num_of_rise': r[k]['num_of_rise'],
            'num_of_drop': r[k]['num_of_drop'],
            # sometimes eastmoney doesn't return a valid leading stock id but '-';
            # in that case '000000' replaces it as an unknown stock id
            'leading_stock_id': r[k]['leading_stock_id'] if r[k]['leading_stock_id'] != '-' else '000000',
            'rise_of_leading_stock': r[k]['rise_of_leading_stock']
        }
        cur.execute(ins_sql)

    db_conn.commit()
    print_log(str(iter) + " records inserted into dw.bankuai.")
    print_log("dw.bankuai has been refreshed successfully.")
Example #24
def insert_into_table(db_field_yaml, stock_obj_name, in_file, conn, log_fh,
                      warn_fh):
    # based on the db-to-object field mapping and the db types defined in the yaml, generate the delete and insert sql and run them against the db
    # this function could be used for any db insert, if the yaml and the object are set up properly
    # Yaml example
    # biz_date:
    #   type: date
    #   is_pk: Y
    #   stock_object:
    #         Tengxun_stock: date
    from object_impl.Sina_stock import Sina_stock
    from object_impl.Tengxun_stock import Tengxun_stock
    from object_impl.Yahoo_stock import Yahoo_stock

    db_field_mapping = get_yaml(db_field_yaml)
    tab_name = os.path.basename(db_field_yaml).replace(
        '.yml', '')  # yml file name as table name
    tab_fields = []  # table field names
    tab_pk = []  # table pk
    tab_types = []  # table field types
    obj_attrs = []  # attribute names in stock object
    for k, v in db_field_mapping.items():
        tab_type = v['type']
        obj_attr = v['stock_object'][stock_obj_name]
        if obj_attr is not None:  # If None|Null is set for fields in yml, remove the fields from insertion
            tab_fields.append(k)
            if v['is_pk'] == 'Y': tab_pk.append(k)  # pk, delete before insert
            tab_types.append(tab_type)
            obj_attrs.append(obj_attr)
    del_sql = 'delete from {tab_name} where 1=1 '.format(tab_name=tab_name)
    ins_sql = 'insert into {tab_name}({fields}) '.format(
        tab_name=tab_name, fields=','.join(tab_fields))
    # iterate each row in the file, insert into table
    num = 0
    with open(in_file) as f:
        for row in f.readlines():
            # get_stock_object_from_str is a function that should be available in all the stock objects
            # it accepts the string returned from the website and generates a dict for the stock object
            # the dict is like {stock: {date: object}}
            # dynamically import object module, class name and file name should be identical
            #exec('from object_impl.{object} import {object}'.format(object = stock_obj_name), globals())
            stock_dict = eval('{object}.get_stock_object_from_str(row)'.format(
                object=stock_obj_name, row=row))
            for stock in stock_dict:  # for Tengxun or sina interface, there is just one stock in one stock dict
                for date in stock_dict[stock]:  # for Tengxun or sina interface, there is just one date in one stock dict
                    stock_obj = stock_dict[stock][date]  # this object is the stock implementation object
                    value_sql = reduce(
                        lambda x, y: (x if re.match(r'stock_obj', x) else 'stock_obj.' + x + ', ') + "stock_obj.{attr_name}, ".format(attr_name=y),
                        obj_attrs)  # add 'stock_obj.' to the first attr, and concatenate the attrs into a string
                    value_sql = value_sql[0:-2]  # remove the trailing comma and the blank space next to it
                    value_sql = eval(value_sql)  # tuple returned
                    final_value_sql = ''
                    del_where = ''
                    for i, v in enumerate(value_sql):
                        value = "'" + v + "'" if tab_types[i] == 'date' or tab_types[i] == 'varchar' else 'Null' if len(str(v)) == 0 else str(v)  # date and varchar quoted with single quotes, otherwise bare, or Null if the value is empty
                        final_value_sql = final_value_sql + value + ', '
                        if tab_fields[i] in tab_pk:
                            del_where = del_where + ' and {field}={value}'.format(
                                field=tab_fields[i], value=value)
                    final_value_sql = final_value_sql[0:-2]
                    del_complete_sql = del_sql + del_where
                    ins_complete_sql = ins_sql + ' values( ' + final_value_sql + ')'
                    #print_log('Deleting [{stock},{date}] from {tab_name}...\n {sql}'.format(stock=stock,date=date,tab_name=tab_name,sql=del_complete_sql), log_fh)
                    cur = get_cur(conn)
                    cur.execute(del_complete_sql)
                    cur.execute(ins_complete_sql)
                    print_log(
                        'Inserted [{stock},{date}] into {tab_name}.'.format(
                            stock=stock, date=date, tab_name=tab_name), log_fh)
                    num += 1
                    if num % 1000 == 0: conn.commit()
    conn.commit()
    print_log(
        '{num} records have been written into {tab_name}.'.format(
            num=num, tab_name=tab_name), log_fh)
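The reduce call above is dense; here is a small standalone demonstration of the string it builds, assuming obj_attrs were ['date', 'price', 'volume'] (illustrative attribute names):

import re
from functools import reduce  # reduce is a builtin in Python 2; the import works there too

obj_attrs = ['date', 'price', 'volume']
value_sql = reduce(lambda x, y: (x if re.match(r'stock_obj', x) else 'stock_obj.' + x + ', ') + 'stock_obj.%s, ' % y, obj_attrs)
print(value_sql[0:-2])  # stock_obj.date, stock_obj.price, stock_obj.volume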
Example #25
def load_into_dim_stock(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    codes = {}
    codes_to_update = {}
    codes_to_valid = []
    codes_to_invalid = []

    # 板块	子版块		板块名称	股票代码	股票名称
    # 板块	概念板块	送转预期	600587		新华医疗
    for row in csvr:
        code = row[u'股票代码'.encode("gbk")].decode("gbk")
        name = row[u'股票名称'.encode("gbk")].decode("gbk")
        codes[code] = name
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {
        "num": len(codes.keys()),
        "fname": file
    })

    #---- get id, name from db, search for the combination in the csv dict
    # if id exists but different name, update
    # if id doesn't exist, mark is_valid=N
    select_sql = "select t.id, t.name, t.is_valid from dw.dim_stock t /*where t.is_valid = 'Y'*/"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)

    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        db_is_valid = db_row["is_valid"]
        if db_id in codes and db_is_valid == "Y":
            if db_name == codes[db_id]:
                #delete from codes if it's already in the table and name is not changed.
                del codes[db_id]
            else:
                #delete from codes, we will use codes_to_update dict to update the name
                codes_to_update[db_id] = codes[db_id]
                del codes[db_id]
        elif db_id in codes and db_is_valid == "N":
            codes_to_valid.append("'" + str(db_id) + "'")
            del codes[db_id]
        elif db_is_valid == "N":
            # not in csv file and it's already invalid in db, do nothing
            pass
        else:
            # not in csv, but in db it's valid, mark it to invalid
            codes_to_invalid.append("'" + str(db_id) + "'")

    #---- mark stocks is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = ",".join(codes_to_invalid)
        #print_log("Mark stock ids to invalid: " + codes_to_invalid_str)
        print_log(
            "There are %(num)s stocks will be marked invalid. %(stocks)s" % {
                "num": len(codes_to_invalid),
                "stocks": codes_to_invalid_str
            })
        upd_sql = "update dw.dim_stock t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {
            "ids": codes_to_invalid_str
        }
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked invalid.")

    #---- mark stocks is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = ",".join(codes_to_valid)
        print_log("There are %(num)s stocks will be marked valid. %(stocks)s" %
                  {
                      "num": len(codes_to_valid),
                      "stocks": codes_to_valid_str
                  })
        upd_sql = "update dw.dim_stock t set is_valid = 'Y', upd_time = now() where t.id in (%(ids)s)" % {
            "ids": codes_to_valid_str
        }
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be marked valid.")

    #---- update stock names in dim_stock
    if len(codes_to_update.keys()) > 0:
        print_log("There are %(num)s stocks will be updated." %
                  {"num": len(codes_to_update.keys())})
        for id in codes_to_update:
            print_log(id)
            upd_sql = "update dw.dim_stock t set name = '%(name)s', upd_time = now() where t.id = '%(id)s'" % {
                "id": id,
                "name": codes_to_update[id]
            }
            cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stocks need to be updated.")

    #---- insert stocks into dim_stock
    if len(codes.keys()) > 0:
        values = []
        print_log("There are %(num)s stocks will be inserted." %
                  {"num": len(codes.keys())})
        for b in codes:
            print_log(b)
            values.append("('%(id)s', '%(name)s', now(), 'Y')" % {
                "id": b,
                "name": codes[b]
            })
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock(id, name, upd_time, is_valid) values %(values)s" % {
            "values": values_str
        }
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock ids.")

    print_log("dw.dim_stock has been refreshed successfully.")
Example #26
def insert_into_table(db_field_yaml, stock_obj_name, in_file, conn, log_fh, warn_fh):
    # based on the db-to-object field mapping and the db types defined in the yaml, generate the delete and insert sql and run them against the db
    # this function could be used for any db insert, if the yaml and the object are set up properly
    # Yaml example
    # biz_date: 
    #   type: date
    #   is_pk: Y
    #   stock_object: 
    #         Tengxun_stock: date
    from object_impl.Sina_stock import Sina_stock
    from object_impl.Tengxun_stock import Tengxun_stock
    from object_impl.Yahoo_stock import Yahoo_stock
    
    db_field_mapping = get_yaml(db_field_yaml)
    tab_name = os.path.basename(db_field_yaml).replace('.yml', '') # yml file name as table name
    tab_fields = [] # table field names
    tab_pk = [] # table pk
    tab_types = [] # table field types
    obj_attrs = [] # attribute names in stock object
    for k,v in db_field_mapping.items():
        tab_type = v['type']
        obj_attr = v['stock_object'][stock_obj_name]
        if obj_attr is not None: # If None|Null is set for fields in yml, remove the fields from insertion
            tab_fields.append(k)
            if v['is_pk'] == 'Y': tab_pk.append(k) # pk, delete before insert
            tab_types.append(tab_type)
            obj_attrs.append(obj_attr)
    del_sql = 'delete from {tab_name} where 1=1 '.format(tab_name=tab_name)
    ins_sql = 'insert into {tab_name}({fields}) '.format(tab_name=tab_name, fields=','.join(tab_fields))
    # iterate each row in the file, insert into table
    num = 0
    with open(in_file) as f:
        for row in f.readlines():
            # get_stock_object_from_str is a function that should be available in all the stock objects
            # it accepts the string returned from the website and generates a dict for the stock object
            # the dict is like {stock: {date: object}}
            # dynamically import object module, class name and file name should be identical
            #exec('from object_impl.{object} import {object}'.format(object = stock_obj_name), globals())
            stock_dict = eval('{object}.get_stock_object_from_str(row)'.format(object=stock_obj_name, row=row))
            for stock in stock_dict: # for Tengxun or sina interface, there is just one stock in one stock dict
                for date in stock_dict[stock]: # for Tengxun or sina interface, there is just one date in one stock dict
                    stock_obj = stock_dict[stock][date] # this object is stock implementation object
                    value_sql = reduce(lambda x, y: ( x if re.match(r'stock_obj', x) else 'stock_obj.' + x + ', ' ) + "stock_obj.{attr_name}, ".format(attr_name=y), obj_attrs) # add 'stock_obj.' to the first attr, and concatenate attrs to a string
                    value_sql = value_sql[0:-2] # remove the trailing comma and the blank space next to it
                    value_sql = eval(value_sql) # tuple returned
                    final_value_sql = ''
                    del_where = ''
                    for i, v in enumerate(value_sql):
                        value = "'" + v + "'" if tab_types[i] == 'date' or tab_types[i] == 'varchar' else 'Null' if len(str(v)) == 0 else str(v) # date and varchar quoted by single quote, otherwise no quote or null(if length of value is 0)
                        final_value_sql = final_value_sql + value + ', '
                        if tab_fields[i] in tab_pk: 
                            del_where = del_where + ' and {field}={value}'.format(field=tab_fields[i], value=value)
                    final_value_sql = final_value_sql[0:-2]
                    del_complete_sql = del_sql + del_where
                    ins_complete_sql = ins_sql + ' values( ' + final_value_sql + ')'
                    #print_log('Deleting [{stock},{date}] from {tab_name}...\n {sql}'.format(stock=stock,date=date,tab_name=tab_name,sql=del_complete_sql), log_fh)
                    cur = get_cur(conn)
                    cur.execute(del_complete_sql)
                    cur.execute(ins_complete_sql)
                    print_log('Inserted [{stock},{date}] into {tab_name}.'.format(stock=stock,date=date,tab_name=tab_name), log_fh)
                    num += 1
                    if num % 1000 == 0: conn.commit()
    conn.commit()
    print_log('{num} records have been written into {tab_name}.'.format(num=num, tab_name=tab_name), log_fh)
def load_into_dim_stock_bankuai(db_conn, file ):
	#-- load CSV
	csvf = open(file)
	csvr = csv.DictReader(csvf)
	bk_st_pairs = []
	bk_st_pairs_dict = {}
	bk_id_dict = {}
	
	codes_to_valid = []
	codes_to_invalid = []
	
	# 板块	子版块		板块名称	股票代码	股票名称
	# 板块	概念板块	送转预期	600587		新华医疗
	for row in csvr:
		bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
		st_id = row[u'股票代码'.encode("gbk")].decode("gbk")
		bk_st_pairs.append([bk_name, st_id])
	csvf.close()
	print_log("%(num)s records have been read from %(fname)s." % {"num": len(bk_st_pairs), "fname": file})
	
	#---- get bankuai_id from dim_bankuai
	select_sql = "select t.id, t.name from dw.dim_bankuai t"
	cur = get_cur(db_conn)
	cur.execute(select_sql)
	db_rows = list(cur)
	for db_row in db_rows:
		db_name = db_row["name"].decode("utf-8")
		db_id = db_row["id"]
		bk_id_dict[db_name] = db_id
	
	#---- convert to dict 
	for i in range(len(bk_st_pairs)):
		bk_st_pairs[i][0] = bk_id_dict[bk_st_pairs[i][0]]
		bk_st_pairs[i].append(str(bk_st_pairs[i][0]) + "-" + str(bk_st_pairs[i][1])) # as PK
		bk_st_pairs_dict[bk_st_pairs[i][2]] = {"bk": bk_st_pairs[i][0], "st": bk_st_pairs[i][1]}
		
	#---- get bk_id, st_id from db, search for the combination in the csv dict
	select_sql = "select t.stock_id, t.bankuai_id, t.is_valid from dw.dim_stock_bankuai t"
	cur.execute(select_sql)
	db_rows = list(cur)
	for db_row in db_rows:
		db_bk_id = db_row["bankuai_id"]
		db_st_id = db_row["stock_id"]
		db_pk = str(db_bk_id) + "-" + db_st_id
		db_is_valid = db_row["is_valid"]
		
		if db_pk in bk_st_pairs_dict and db_is_valid == "Y":
			del bk_st_pairs_dict[db_pk]
		elif db_pk in bk_st_pairs_dict and db_is_valid == "N":
			codes_to_valid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")
			del bk_st_pairs_dict[db_pk]
		elif db_is_valid == "N":
			# not in csv file and it's already invalid in db, do nothing
			pass
		else:
			# not in csv, but in db it's valid, mark it to invalid
			codes_to_invalid.append(" ( bankuai_id = " + str(db_bk_id) + " and stock_id = '" + str(db_st_id) + "' ) ")
			
	#---- mark is_valid=N
	if len(codes_to_invalid) > 0:
		codes_to_invalid_str = " or ".join(codes_to_invalid)
		print_log("There are %(num)s stock bankuai combination will be marked invalid. %(combination)s" % {"num": len(codes_to_invalid), "combination": codes_to_invalid_str})
		upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'N', upd_time = now() where %(combinations)s" % {"combinations": codes_to_invalid_str}
		cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No stock bankuai combinations need to be marked invalid.")			

	#---- mark is_valid=Y
	if len(codes_to_valid) > 0:
		codes_to_valid_str = " or ".join(codes_to_valid)
		print_log("There are %(num)s stock bankuai combination will be marked valid. %(combination)s" % {"num": len(codes_to_valid), "combination": codes_to_valid_str})
		upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'Y', upd_time = now() where %(combinations)s" % {"combinations": codes_to_valid_str}
		cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No stock bankuai combinations need to be marked valid.")			

	#---- insert stocks into dim_stock_bankuai
	if len(bk_st_pairs_dict.keys()) > 0:
		values = []
		print_log("There are %(num)s stock bankuai combination will be inserted." % {"num": len(bk_st_pairs_dict.keys())})
		for pk in bk_st_pairs_dict:
			print_log(pk)
			values.append("('%(stock_id)s', '%(bankuai_id)s', now(), 'Y')" % {"stock_id": bk_st_pairs_dict[pk]["st"], "bankuai_id": bk_st_pairs_dict[pk]["bk"]} )
		values_str = ",".join(values)
		ins_sql = "insert into dw.dim_stock_bankuai(stock_id, bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str}
		cur.execute(ins_sql)
		db_conn.commit()
	else:
		print_log("No new stock bankuai combination.")

	print_log("dw.dim_stock_bankuai has been refreshed successfully.")
Example #28
def load_into_dim_stock(db_conn, file ):
	#-- load CSV
	csvf = open(file)
	csvr = csv.DictReader(csvf)
	codes = {}
	codes_to_update = {}
	codes_to_valid = []
	codes_to_invalid = []

	# 板块	子版块		板块名称	股票代码	股票名称
	# 板块	概念板块	送转预期	600587		新华医疗
	for row in csvr:
		code = row[u'股票代码'.encode("gbk")].decode("gbk")
		name = row[u'股票名称'.encode("gbk")].decode("gbk")
		codes[code] = name
	csvf.close()
	print_log("%(num)s records have been read from %(fname)s." % {"num": len(codes.keys()), "fname": file})
	
	
	#---- get id, name from db, search for the combination in the csv dict
	# if id exists but different name, update
	# if id doesn't exist, mark is_valid=N
	select_sql = "select t.id, t.name, t.is_valid from dw.dim_stock t /*where t.is_valid = 'Y'*/"
	cur = get_cur(db_conn)
	cur.execute(select_sql)
	db_rows = list(cur)

	for db_row in db_rows:
		db_name = db_row["name"].decode("utf-8")
		db_id = db_row["id"]
		db_is_valid = db_row["is_valid"]
		if db_id in codes and db_is_valid == "Y":
			if db_name == codes[db_id]:
				#delete from codes if it's already in the table and name is not changed.
				del codes[db_id]
			else: 
				#delete from codes, we will use codes_to_update dict to update the name 
				codes_to_update[db_id] = codes[db_id]
				del codes[db_id]
		elif db_id in codes and db_is_valid == "N":
			codes_to_valid.append("'" + str(db_id) + "'")
			del codes[db_id]
		elif db_is_valid == "N":
			# not in csv file and it's already invalid in db, do nothing
			pass
		else:
			# not in csv, but in db it's valid, mark it to invalid
			codes_to_invalid.append("'" + str(db_id) + "'")
			
	#---- mark stocks is_valid=N
	if len(codes_to_invalid) > 0:
		codes_to_invalid_str = ",".join(codes_to_invalid)
		#print_log("Mark stock ids to invalid: " + codes_to_invalid_str)
		print_log("There are %(num)s stocks will be marked invalid. %(stocks)s" % {"num": len(codes_to_invalid), "stocks": codes_to_invalid_str})
		upd_sql = "update dw.dim_stock t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_invalid_str}
		cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No stocks need to be marked invalid.")

	#---- mark stocks is_valid=Y
	if len(codes_to_valid) > 0:
		codes_to_valid_str = ",".join(codes_to_valid)
		print_log("There are %(num)s stocks will be marked valid. %(stocks)s" % {"num": len(codes_to_valid), "stocks": codes_to_valid_str})
		upd_sql = "update dw.dim_stock t set is_valid = 'Y', upd_time = now() where t.id in (%(ids)s)" % {"ids": codes_to_valid_str}
		cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No stocks need to be marked valid.")
		
	#---- update stock names in dim_stock
	if len(codes_to_update.keys()) > 0:
		print_log("There are %(num)s stocks will be updated." % {"num": len(codes_to_update.keys())})
		for id in codes_to_update:
			print_log(id)
			upd_sql = "update dw.dim_stock t set name = '%(name)s', upd_time = now() where t.id = '%(id)s'" % {"id": id, "name": codes_to_update[id]}
			cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No stocks need to be updated.")
	
	#---- insert stocks into dim_stock
	if len(codes.keys()) > 0:
		values = []
		print_log("There are %(num)s stocks will be inserted." % {"num": len(codes.keys())})
		for b in codes:
			print_log(b)
			values.append("('%(id)s', '%(name)s', now(), 'Y')" % {"id": b, "name": codes[b]} )
		values_str = ",".join(values)
		ins_sql = "insert into dw.dim_stock(id, name, upd_time, is_valid) values %(values)s" % {"values": values_str}
		cur.execute(ins_sql)
		db_conn.commit()
	else:
		print_log("No new stock ids.")
	
	print_log("dw.dim_stock has been refreshed successfully.")
Example #29
def load_into_dim_stock_bankuai(db_conn, file):
    #-- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    bk_st_pairs = []
    bk_st_pairs_dict = {}
    bk_id_dict = {}

    codes_to_valid = []
    codes_to_invalid = []

    # 板块	子版块		板块名称	股票代码	股票名称
    # 板块	概念板块	送转预期	600587		新华医疗
    for row in csvr:
        bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
        st_id = row[u'股票代码'.encode("gbk")].decode("gbk")
        bk_st_pairs.append([bk_name, st_id])
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {
        "num": len(bk_st_pairs),
        "fname": file
    })

    #---- get bankuai_id from dim_bankuai
    select_sql = "select t.id, t.name from dw.dim_bankuai t"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_name = db_row["name"].decode("utf-8")
        db_id = db_row["id"]
        bk_id_dict[db_name] = db_id

    #---- convert to dict
    for i in range(len(bk_st_pairs)):
        bk_st_pairs[i][0] = bk_id_dict[bk_st_pairs[i][0]]
        bk_st_pairs[i].append(
            str(bk_st_pairs[i][0]) + "-" + str(bk_st_pairs[i][1]))  # as PK
        bk_st_pairs_dict[bk_st_pairs[i][2]] = {
            "bk": bk_st_pairs[i][0],
            "st": bk_st_pairs[i][1]
        }

    #---- get bk_id, st_id from db, search for the combination in the csv dict
    select_sql = "select t.stock_id, t.bankuai_id, t.is_valid from dw.dim_stock_bankuai t"
    cur.execute(select_sql)
    db_rows = list(cur)
    for db_row in db_rows:
        db_bk_id = db_row["bankuai_id"]
        db_st_id = db_row["stock_id"]
        db_pk = str(db_bk_id) + "-" + db_st_id
        db_is_valid = db_row["is_valid"]

        if db_pk in bk_st_pairs_dict and db_is_valid == "Y":
            del bk_st_pairs_dict[db_pk]
        elif db_pk in bk_st_pairs_dict and db_is_valid == "N":
            codes_to_valid.append(" ( bankuai_id = " + str(db_bk_id) +
                                  " and stock_id = '" + str(db_st_id) + "' ) ")
            del bk_st_pairs_dict[db_pk]
        elif db_is_valid == "N":
            # not in csv file and it's already invalid in db, do nothing
            pass
        else:
            # not in csv, but in db it's valid, mark it to invalid
            codes_to_invalid.append(" ( bankuai_id = " + str(db_bk_id) +
                                    " and stock_id = '" + str(db_st_id) +
                                    "' ) ")

    #---- mark is_valid=N
    if len(codes_to_invalid) > 0:
        codes_to_invalid_str = " or ".join(codes_to_invalid)
        print_log(
            "%(num)s stock bankuai combinations will be marked invalid. %(combination)s"
            % {
                "num": len(codes_to_invalid),
                "combination": codes_to_invalid_str
            })
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'N', upd_time = now() where %(combinations)s" % {
            "combinations": codes_to_invalid_str
        }
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked invalid.")

    #---- mark is_valid=Y
    if len(codes_to_valid) > 0:
        codes_to_valid_str = " or ".join(codes_to_valid)
        print_log(
            "%(num)s stock bankuai combinations will be marked valid. %(combination)s"
            % {
                "num": len(codes_to_valid),
                "combination": codes_to_valid_str
            })
        upd_sql = "update dw.dim_stock_bankuai t set is_valid = 'Y', upd_time = now() where %(combinations)s" % {
            "combinations": codes_to_valid_str
        }
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No stock bankuai combinations need to be marked valid.")

    #---- insert stocks into dim_stock_bankuai
    if len(bk_st_pairs_dict.keys()) > 0:
        values = []
        print_log(
            "%(num)s stock bankuai combinations will be inserted." %
            {"num": len(bk_st_pairs_dict.keys())})
        for pk in bk_st_pairs_dict:
            print_log(pk)
            values.append(
                "('%(stock_id)s', '%(bankuai_id)s', now(), 'Y')" % {
                    "stock_id": bk_st_pairs_dict[pk]["st"],
                    "bankuai_id": bk_st_pairs_dict[pk]["bk"]
                })
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_stock_bankuai(stock_id, bankuai_id, upd_time, is_valid) values %(values)s" % {
            "values": values_str
        }
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new stock bankuai combination.")

    print_log("dw.dim_stock_bankuai has been refreshed successfully.")
예제 #30
0
    
jobs = {
    'download_stock_bankuai': Task_download_stock_bankuai('download_stock_bankuai'),
    'recon_stock_bankuai': Task_recon_stock_bankuai('recon_stock_bankuai'),
    'download_stock_eod': Task_download_stock_eod('download_stock_eod'),
    'download_stock_transaction': Task_download_stock_transaction('download_stock_transaction'),
}

#job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod', 'download_stock_transaction']
job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod']
job_to_run = []

# determine which jobs need to run
for i, job in enumerate(job_run_seq):
    status = check_job_status(conn, job)
    print_log(job + ' ====> ' + status)
    if status == 'N': # if a job failed, it and every job after it in the sequence are added to the to-run list
        job_to_run = job_run_seq[i:]
        break

# add to flow
flow = linear_flow.Flow('Eod loading')
for job in job_to_run:
    flow.add(jobs[job])

engine = taskflow.engines.load(flow)
engine.notifier.register('*', flow_watch)
engine.task_notifier.register('*', task_watch)
try:
    engine.run()
except Exception as ex:
    error_log(str(ex))  # log the failure and re-raise it so the run aborts
    raise
예제 #31
0
                        max_date = m.group("date")
        file_to_recon = file_db_recon[type]["file"].replace("$DATE", max_date)
    else:
        if not os.path.isfile(options.in_file):
            error_log("file can't be found! [" + options.in_file + "]")
            exit_process()
        else:
            file_to_recon = options.in_file

    #-- building dict for csv and db
    csvf = open(file_to_recon)
    csvr = csv.DictReader(csvf)

    #-- building dict for csv
    # based on the list of recon_fields_in_file, read the corresponding fields in csv and concatenate them together as a PK
    print_log("Start to read %(file)s..." % {"file": file_to_recon})
    for row in csvr:
        key = []
        for i in range(len(file_db_recon[type]["recon_fields_in_file"])):
            field = file_db_recon[type]["recon_fields_in_file"][i]
            key.append(row[field.encode("gbk")].decode("gbk"))
        csv_dict["-".join(key)] = ""
    print_log("%(num)s records loaded, dict for csv done." %
              {"num": len(csv_dict.keys())})
    csvf.close()

    #-- building dict for db
    # based on the list of recon_fields_in_db, read the corresponding fields in db and concatenate them together as a PK
    print_log("Start to read db...")
    select_sql = file_db_recon[type]["sql"]
    cur = get_cur(conn)
예제 #32
0
 def download_to_local(self):
     print_log('Reading data from ' + self.get_url())
     save_file_from_url(self.__download_file, self.get_url())
     print_log('Data saved to ' + self.__download_file)
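save_file_from_url is a project helper; a minimal Python 2 stand-in could stream the URL to disk in chunks. This is a hypothetical implementation, shown only to make the method above self-explanatory:

# Hypothetical minimal stand-in for the project's save_file_from_url helper.
import urllib2

def save_file_from_url(path, url, chunk_size=8192):
    resp = urllib2.urlopen(url)            # open the HTTP stream
    with open(path, 'wb') as out:
        while True:
            chunk = resp.read(chunk_size)  # bounded reads keep memory flat
            if not chunk:
                break
            out.write(chunk)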
예제 #33
0
    
jobs = {
    'download_stock_bankuai': Task_download_stock_bankuai('download_stock_bankuai'),
    'recon_stock_bankuai': Task_recon_stock_bankuai('recon_stock_bankuai'),
    'download_stock_eod': Task_download_stock_eod('download_stock_eod'),
    'download_stock_transaction': Task_download_stock_transaction('download_stock_transaction'),
}

#job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod', 'download_stock_transaction']
job_run_seq = ['download_stock_bankuai', 'recon_stock_bankuai', 'download_stock_eod']
job_to_run = []

# determine which jobs need to run
for i, job in enumerate(job_run_seq):
    status = check_job_status(conn, job)
    print_log(job + ' ====> ' + status)
    if status == 'N': # if a job failed, it and every job after it in the sequence are added to the to-run list
        job_to_run = job_run_seq[i:]
        break

# add to flow
flow = linear_flow.Flow('Eod loading')
for job in job_to_run:
    flow.add(jobs[job])

engine = taskflow.engines.load(flow)
engine.notifier.register('*', flow_watch)
#engine.task_notifier.register('*', task_watch)
try:
    engine.run()
except Exception as ex:
    error_log(str(ex))  # log the failure and re-raise it so the run aborts
    raise
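The taskflow usage above follows the library's standard pattern: tasks are added to a linear_flow.Flow, the flow is loaded into an engine, and the engine runs the chain in order. A minimal self-contained sketch (EchoTask is hypothetical; the real tasks are the project's Task_download_* classes):

# Minimal taskflow sketch of the pattern above (hypothetical EchoTask).
import taskflow.engines
from taskflow import task
from taskflow.patterns import linear_flow

class EchoTask(task.Task):
    def execute(self):
        print(self.name + ' ran')

flow = linear_flow.Flow('demo').add(EchoTask('a'), EchoTask('b'))
engine = taskflow.engines.load(flow)
engine.run()  # runs a, then b; a failure in a prevents b from running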
예제 #34
0
if not (re.match("^\d{8}$", start_date) and re.match("^\d{8}$", end_date)):
    exit_error("start_date or end_date error! [" + start_date + "][" + end_date + "]")
elif start_date > end_date:
    exit_error("start_date must be smaller than end_date! [" + start_date + "][" + end_date + "]")


# ------------------------------------------- Downloading
if options.mode in ("download", "downloadAndLoad"):
    e = Eastmoney()

    bkbkfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + "bankuai_" + recent_working_day + ".csv"
    if os.path.exists(bkbkfile_full_name):
        bk_bkbkfile_full_name = bkbkfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkbkfile_full_name, bk_bkbkfile_full_name)  # rename
        print_log("The original file " + bkbkfile_full_name + " has been renamed to " + bk_bkbkfile_full_name)
    e.export_bankuai_status(bkbkfile_full_name)

    bkstfile_full_name = (
        Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + "bankuai_stock_" + recent_working_day + ".csv"
    )
    if os.path.exists(bkstfile_full_name):
        bk_bkstfile_full_name = bkstfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkstfile_full_name, bk_bkstfile_full_name)  # rename
        print_log("The original file " + bkstfile_full_name + " has been renamed to " + bk_bkstfile_full_name)
    e.export_bankuai_stock(bkstfile_full_name)

# ------------------------------------------- LOADing
if options.mode in ("downloadAndLoad", "load"):
    # -- determine file to load, $DATE is not replaced
    if options.in_file is None:
예제 #35
0
if not (options.mode in ['download', 'load', 'downloadAndLoad']):
    exit_error(options.mode + ' is not recognized, it could be download|load|downloadAndLoad.')
    
# check validation of start_date and end_date
if not (re.match("^\d{8}$", options.start_date) and re.match("^\d{8}$", options.end_date)):
    exit_error("Not valid start_date or end_date! [" + options.start_date + "][" + options.end_date + "]")
elif options.start_date > options.end_date:
    exit_error("Start date is greater then end date! [" + options.start_date + "][" + options.end_date + "]")

#-- create queue
queue = Queue(QUEUE_DOWNLOAD_MAX_SIZE)
#-- download stock info from internet
if options.mode == 'download' or options.mode == 'downloadAndLoad':
    #-- run at most 3 times, in case some stocks fail to download
    for i in ['1st', '2nd', '3rd']:
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        print_log('downloader running for the {n} time...'.format(n=i))
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        downloader(queue, conn)
        error_num = download_log_checker(conn)
        if error_num == 0: break
        print_log('=================> waiting for 10 seconds to start the next round run...')
        time.sleep(10)
    #-- if all 3 attempts still fail, abort with an error
    if error_num > 0: exit_error('{num} stocks failed to download, please check.'.format(num=error_num))
    #queue.task_done()

#-- upsize queue size to speed up data loading 
queue = Queue(QUEUE_LOAD_MAX_SIZE)
#-- load stock info into database
if options.mode == 'load' or options.mode == 'downloadAndLoad':
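The download branch in this snippet retries the whole batch up to three rounds, consulting download_log_checker between rounds. The same retry-until-clean pattern in a compact, generic form (run_batch and count_failures are hypothetical stand-ins for downloader and download_log_checker):

# Generic retry-until-clean sketch (hypothetical run_batch/count_failures).
import time

def run_with_retries(run_batch, count_failures, attempts=3, pause=10):
    failed = 0
    for _ in range(attempts):
        run_batch()                   # e.g. downloader(queue, conn)
        failed = count_failures()     # e.g. download_log_checker(conn)
        if failed == 0:
            break
        time.sleep(pause)             # let transient errors clear
    return failed                     # 0 means every stock downloaded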
예제 #36
0
            continue
    fh.close()
    print_log("{num} stocks have been written into {file}.".format(num=num, file=to_file), log_fh)


# -- parse input parameter, var assignment
stock_object = {"tengxun": "Tengxun_stock", "sina": "Sina_stock", "yahoo": "Yahoo_stock"}

# check validation of object class
if not options.object_class in stock_object:
    exit_error(
        "%(entered_object)s is not a valid object, it could be %(valid_objects)s"
        % {"entered_object": options.object_class, "valid_objects": "|".join(stock_object)}
    )
else:
    print_log(options.object_class + " selected.")

# check validation of mode and input file
if not options.mode in ("download", "load", "downloadAndLoad"):
    exit_error(options.mode + " is not recognized, it could be download|load|downloadAndLoad.")
elif not options.file is None and not os.path.exists(options.file):
    exit_error(options.file + " doesn't exist.")

# check validation of start_date and end_date
if options.object_class == "yahoo" and options.mode == "download":
    if options.start_date is None or options.end_date is None:
        exit_error("--start_date|-s and --end_date|-e must be specified for yahoo class")
    elif not (re.match("^\d{8}$", options.start_date) and re.match("^\d{8}$", options.end_date)):
        exit_error("Not valid start_date or end_date! [" + options.start_date + "][" + options.end_date + "]")

def loader(queue,
           conn,
           start_date=options.start_date,
           end_date=options.end_date,
           stock_id=options.stock_id,
           merge_before_copy=options.merge_before_copy,
           enable_copy=options.enable_copy):

    cur_date_dt = datetime.datetime.strptime(start_date, '%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date, '%Y%m%d')

    stock_list_sql = '''
    select row_id, biz_date, stock_id
    from dw.log_stock_transaction
    where biz_date = '{biz_date}'
    and is_download_success = 'Y'
    and (is_load_success = 'N' or is_load_success is null)
    '''
    if not stock_id is None:
        stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\''

    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:
        if merge_before_copy:
            # loading the piece files into the table one by one takes too long; merging them into a single file and loading that merged file finishes in under 5 minutes
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
                        print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(
                cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log(
                'Deletion for biz_date {} completed successfully.'.format(
                    cur_date_str))
            #-- Updating is_load_success to N in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(
                cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')

            #++++++++ Starting to load the merged file into table
            psql_copy_from(DB_HOST,
                           DB_NAME,
                           DB_UNAME,
                           'dw.stock_transaction',
                           file_merged,
                           DB_PORT,
                           args=' with (encoding \'GBK\')')
            print_log('Successfully loaded {} into table.'.format(file_merged))

            #-- Updating is_load_success to Y in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(
                cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')

            #-- Cleaning up working dir
            os.remove(file_merged)

            cur_date_dt = cur_date_dt + datetime.timedelta(1)

        else:
            stock_list_sql_var_replaced = stock_list_sql.format(
                biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-', '')
                stock_id = row['stock_id']
                while queue.full():
                    print_log(
                        '=================> queue is full, wait for 1 second...'
                    )
                    time.sleep(1)
                s = Stock_trans_loader(queue,
                                       conn,
                                       row_id,
                                       stock_id,
                                       biz_date,
                                       enable_copy=enable_copy)
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
                conn.commit()

            cur_date_dt = cur_date_dt + datetime.timedelta(1)

    while not queue.empty():
        print_log(
            '=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
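The merge_before_copy path above funnels everything through psql_copy_from, a project wrapper around Postgres COPY. As a hedged alternative, psycopg2's copy_expert can perform the same bulk load in-process (an assumption about the driver, not the project's actual code path):

# Hedged sketch: bulk-loading the merged GBK file via psycopg2's COPY support.
import psycopg2  # assumed driver; the project itself shells out via psql_copy_from

def copy_merged_file(conn, path):
    cur = conn.cursor()
    with open(path) as f:
        cur.copy_expert(
            "copy dw.stock_transaction from stdin "
            "with (format csv, encoding 'GBK')",
            f,
        )
    conn.commit()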
예제 #38
0
 def save_formatted_data(self):
     # save formatted data into file, \t as delimiter
     # 9:25:00    50.34   0.15    141 709794  买盘
     with open(self.out_file, 'w') as file:
         file.write(self.stock_trans_object.get_stock_content()[self.stock_id][self.date])
     print_log('Formatted data saved to ' + self.out_file)
예제 #39
0
def loader(queue, conn, start_date=options.start_date, end_date=options.end_date, stock_id=options.stock_id, merge_before_copy=options.merge_before_copy, enable_copy=options.enable_copy):

    cur_date_dt = datetime.datetime.strptime(start_date,'%Y%m%d')
    end_date_dt = datetime.datetime.strptime(end_date,'%Y%m%d')
    
    stock_list_sql = '''
    select row_id, biz_date, stock_id
    from dw.log_stock_transaction
    where biz_date = '{biz_date}'
    and is_download_success = 'Y'
    and (is_load_success = 'N' or is_load_success is null)
    '''
    if not stock_id is None: stock_list_sql = stock_list_sql + ' and stock_id = \'' + stock_id + '\''
    
    cur = get_cur(conn)
    while cur_date_dt <= end_date_dt:  
        if merge_before_copy:
        # loading the piece files into the table one by one takes too long; merging them into a single file and loading that merged file finishes in under 5 minutes
            cur_date_str = cur_date_dt.strftime('%Y%m%d')
            working_dir = data_dir + SEP + cur_date_str
            file_merged = os.path.join(working_dir, "file_merged.csv")
            if os.path.exists(file_merged):
                warn_log('Removing old file: ' + file_merged)
                os.remove(file_merged)
            #-- Starting to merge files
            with open(file_merged, "a") as dest:
                i = 0
                for _, _, filenames in os.walk(working_dir):
                    for filename in fnmatch.filter(filenames, "[0-9]*.txt"):
                        with open(os.path.join(working_dir, filename)) as src:
                            shutil.copyfileobj(src, dest)
                        i += 1
                        print_log('Merged ' + str(i) + ' files.')
            #-- Deleting records from db
            del_sql = '''delete from dw.stock_transaction where biz_date = '{}' '''.format(cur_date_str)
            get_query_result(conn, del_sql)
            conn.commit()
            print_log('Deletion for biz_date {} completed successfully.'.format(cur_date_str))
            #-- Updating is_load_success to N in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'N' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to N')

            #++++++++ Starting to load the merged file into table
            psql_copy_from(DB_HOST, DB_NAME, DB_UNAME, 'dw.stock_transaction', file_merged, DB_PORT, args=' with (encoding \'GBK\')')
            print_log('Successfully loaded {} into table.'.format(file_merged))
            
            #-- Updating is_load_success to Y in log table
            upd_sql = '''update dw.log_stock_transaction set is_load_success = 'Y' where biz_date = '{}' and is_download_success = 'Y' '''.format(cur_date_str)
            get_query_result(conn, upd_sql)
            conn.commit()
            print_log('is_load_success is updated to Y')

            #-- Cleaning up working dir
            os.remove(file_merged)
            
            cur_date_dt = cur_date_dt + datetime.timedelta(1)
            
        else:
            stock_list_sql_var_replaced = stock_list_sql.format(biz_date=cur_date_dt)
            cur.execute(stock_list_sql_var_replaced)
            rows = list(cur)
            for row in rows:
                row_id = row['row_id']
                biz_date = str(row['biz_date']).replace('-','')
                stock_id = row['stock_id']
                while queue.full():
                    print_log('=================> queue is full, wait for 1 second...')
                    time.sleep(1)
                s = Stock_trans_loader(queue, conn, row_id, stock_id, biz_date, enable_copy=enable_copy )
                s.start()
                print_log('-----> queue size: ' + str(queue.qsize()))
                conn.commit()
                    
            cur_date_dt = cur_date_dt + datetime.timedelta(1)

    while not queue.empty():
        print_log('=================> queue is not empty yet, wait for 1 second...')
        time.sleep(1)
예제 #40
0
def load_into_dim_bankuai(db_conn, file, parent_bankuai_ids={u'概念板块': 1, u'地域板块': 2, u'行业板块': 3} ):
	#-- load CSV
	csvf = open(file)
	csvr = csv.DictReader(csvf)
	bankuais = {}
	invalid_bankuai_ids = []

	#---- get parent_bankuai_id, bankuai_name from csv
	for row in csvr:
		bankuai = row[u'板块名称'.encode("gbk")].decode("gbk")
		parent_bankuai = row[u'子版块'.encode("gbk")].decode("gbk")
		parent_bankuai_id = parent_bankuai_ids[parent_bankuai]
		bankuais[bankuai] = {}
		bankuais[bankuai]["parent_bankuai_id"] = parent_bankuai_id
		#bankuais[bankuai].setdefault("parent_bankuai_id", parent_bankuai_id)
	csvf.close()
	print_log("%(num)s records have been read from %(fname)s." % {"num": len(bankuais.keys()), "fname": file})
	
	#---- get parent_bankuai_id, bankuai_name from db, search for the combination in the csv dict; if it doesn't exist, add it to invalid_bankuai_ids
	select_sql = "select t.parent_bankuai_id, t.name, t.id from dw.dim_bankuai t where t.is_valid = 'Y'"
	cur = get_cur(db_conn)
	cur.execute(select_sql)
	db_rows = list(cur)

	for db_row in db_rows:
		db_bankuai = db_row["name"].decode("utf-8")
		db_parent_bankuai_id = db_row["parent_bankuai_id"]
		db_id = db_row["id"]
		
		if db_bankuai in bankuais:
			if db_parent_bankuai_id == bankuais[db_bankuai]["parent_bankuai_id"]:
				#delete from bankuais if it's already in the table and is_valid=Y
				del bankuais[db_bankuai]
			else: 
				invalid_bankuai_ids.append(str(db_id))
		else:
			invalid_bankuai_ids.append(str(db_id))

	#---- mark bankuais is_valid=N
	if len(invalid_bankuai_ids) > 0:
		invalid_bankuai_ids_str = ",".join(invalid_bankuai_ids)
		print_log("Invalid bankuai ids: " + invalid_bankuai_ids_str)
		upd_sql = "update dw.dim_bankuai t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {"ids": invalid_bankuai_ids_str}
		cur.execute(upd_sql)
		db_conn.commit()
	else:
		print_log("No invalid bankuai ids.")
		
	#---- insert bankuais into dim_bankuai
	if len(bankuais.keys()) > 0:
		values = []
		print_log("There are %(num)s bankuais will be inserted." % {"num": len(bankuais.keys())})
		for b in bankuais:
			values.append("('%(name)s', '%(parent_bankuai_id)s', now(), 'Y')" % {"name": b, "parent_bankuai_id": bankuais[b]["parent_bankuai_id"]} )
		values_str = ",".join(values)
		ins_sql = "insert into dw.dim_bankuai(name, parent_bankuai_id, upd_time, is_valid) values %(values)s" % {"values": values_str}
		cur.execute(ins_sql)
		db_conn.commit()
	else:
		print_log("No new bankuai ids.")
	
	print_log("dw.dim_bankuai has been refreshed successfully.")
예제 #41
0
						max_date = m.group("date")
		file_to_recon = file_db_recon[type]["file"].replace("$DATE", max_date)
	else:
		if not os.path.isfile(options.in_file):
			error_log("file can't be found! [" + options.in_file + "]")
			exit_process()
		else:
			file_to_recon = options.in_file
			
	#-- building dict for csv and db
	csvf = open(file_to_recon)
	csvr = csv.DictReader(csvf)

	#-- building dict for csv
	# based on the list of recon_fields_in_file, read the corresponding fields in csv and concatenate them together as a PK
	print_log("Start to read %(file)s..." % {"file": file_to_recon})
	for row in csvr:
		key = []
		for i in range(len(file_db_recon[type]["recon_fields_in_file"])):
			field = file_db_recon[type]["recon_fields_in_file"][i]
			key.append(row[field.encode("gbk")].decode("gbk"))
		csv_dict["-".join(key)] = ""
	print_log("%(num)s records loaded, dict for csv done." % {"num": len(csv_dict.keys()) })
	csvf.close()

	#-- building dict for db
	# based on the list of recon_fields_in_db, read the corresponding fields in db and concatenate them together as a PK
	print_log("Start to read db...")
	select_sql = file_db_recon[type]["sql"]
	cur = get_cur(conn)
	cur.execute(select_sql)
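Both sides of the recon reduce to the same composite key: the configured fields of a record joined with "-". A compact sketch of the CSV-side key builder (fields stands for file_db_recon[type]["recon_fields_in_file"]):

# Sketch of the composite-key construction used by the recon above.
def build_key(row, fields):
    # each CSV cell is a GBK byte string under Python 2; decode before joining
    return "-".join(row[field.encode("gbk")].decode("gbk") for field in fields)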
예제 #42
0
def load_into_dim_bankuai(db_conn, file, parent_bankuai_ids={u"概念板块": 1, u"地域板块": 2, u"行业板块": 3}):
    # -- load CSV
    csvf = open(file)
    csvr = csv.DictReader(csvf)
    bankuais = {}
    invalid_bankuai_ids = []

    # ---- get parent_bankuai_id, bankuai_name from csv
    for row in csvr:
        bankuai = row[u"板块名称".encode("gbk")].decode("gbk")
        parent_bankuai = row[u"子版块".encode("gbk")].decode("gbk")
        parent_bankuai_id = parent_bankuai_ids[parent_bankuai]
        bankuais[bankuai] = {}
        bankuais[bankuai]["parent_bankuai_id"] = parent_bankuai_id
        # bankuais[bankuai].setdefault("parent_bankuai_id", parent_bankuai_id)
    csvf.close()
    print_log("%(num)s records have been read from %(fname)s." % {"num": len(bankuais.keys()), "fname": file})

    # ---- get parent_bankuai_id, bankuai_name from db, search for the combination in the csv dict; if it doesn't exist, add it to invalid_bankuai_ids
    select_sql = "select t.parent_bankuai_id, t.name, t.id from dw.dim_bankuai t where t.is_valid = 'Y'"
    cur = get_cur(db_conn)
    cur.execute(select_sql)
    db_rows = list(cur)

    for db_row in db_rows:
        db_bankuai = db_row["name"].decode("utf-8")
        db_parent_bankuai_id = db_row["parent_bankuai_id"]
        db_id = db_row["id"]

        if db_bankuai in bankuais:
            if db_parent_bankuai_id == bankuais[db_bankuai]["parent_bankuai_id"]:
                # delete from bankuais if it's already in the table and is_valid=Y
                del bankuais[db_bankuai]
            else:
                invalid_bankuai_ids.append(str(db_id))
        else:
            invalid_bankuai_ids.append(str(db_id))

    # ---- mark bankuais is_valid=N
    if len(invalid_bankuai_ids) > 0:
        invalid_bankuai_ids_str = ",".join(invalid_bankuai_ids)
        print_log("Invalid bankuai ids: " + invalid_bankuai_ids_str)
        upd_sql = "update dw.dim_bankuai t set is_valid = 'N', upd_time = now() where t.id in (%(ids)s)" % {
            "ids": invalid_bankuai_ids_str
        }
        cur.execute(upd_sql)
        db_conn.commit()
    else:
        print_log("No invalid bankuai ids.")

    # ---- insert bankuais into dim_bankuai
    if len(bankuais.keys()) > 0:
        values = []
        print_log("There are %(num)s bankuais will be inserted." % {"num": len(bankuais.keys())})
        for b in bankuais:
            values.append(
                "('%(name)s', '%(parent_bankuai_id)s', now(), 'Y')"
                % {"name": b, "parent_bankuai_id": bankuais[b]["parent_bankuai_id"]}
            )
        values_str = ",".join(values)
        ins_sql = "insert into dw.dim_bankuai(name, parent_bankuai_id, upd_time, is_valid) values %(values)s" % {
            "values": values_str
        }
        cur.execute(ins_sql)
        db_conn.commit()
    else:
        print_log("No new bankuai ids.")

    print_log("dw.dim_bankuai has been refreshed successfully.")
 def download_to_local(self):
     print_log('Reading data from ' + self.get_url())
     save_file_from_url(self.__download_file, self.get_url())
     print_log('Data saved to ' + self.__download_file)
예제 #44
0
def load_into_bankuai(db_conn, file, biz_date=None):

# 板块	子版块		板块名称	涨跌幅	总市值(亿)	换手率	上涨家数	下跌家数	领涨股票代码	领涨股票	领涨股票涨跌幅
# 板块	概念板块	全息技术	3.95%	365.12		11.65	7			1			600288			大恒科技	10.03
# 板块	概念板块	网络安全	2.95%	818.79		25.61	19			1			002308			威创股份	10.01

# biz_date date not null,
# bankuai_id integer not null,
# rise varchar(16),
# market_value_in_million decimal(12,2),
# turnover_rate decimal(5,2),
# num_of_rise integer,
# num_of_drop integer,
# leading_stock_id varchar(6),
# rise_of_leading_stock decimal(10,2),
# primary key(biz_date, bankuai_id)
	
	bk_id_dict = {}
	csv_data = []
	v_biz_date = ""
	
	#-- build dict for bankuai name and bankuai id from db
	select_sql = 'select t.name, t.id from dw.dim_bankuai t'
	cur = get_cur(db_conn)
	cur.execute(select_sql)
	db_rows = list(cur)
	for db_row in db_rows:
		db_name = db_row["name"].decode("utf-8")
		db_id = db_row["id"]
		bk_id_dict[db_name] = db_id
	
	print_log("There are %(num)s records read from %(name)s" % {"num": len(bk_id_dict.keys()), "name": 'dw.dim_bankuai'})

	#-- load CSV
	csvf = open(file)
	csvr = csv.DictReader(csvf)
	for row in csvr:
		bk_name = row[u'板块名称'.encode("gbk")].decode("gbk")
		bk_id = bk_id_dict[bk_name]
		row_dict = {}
		row_dict[bk_id] = {}
		row_dict[bk_id]["rise"] = row[u'涨跌幅'.encode("gbk")].decode("gbk")
		row_dict[bk_id]["market_value_in_million"] = row[u'总市值(亿)'.encode("gbk")]
		row_dict[bk_id]["turnover_rate"] = row[u'换手率'.encode("gbk")]
		row_dict[bk_id]["num_of_rise"] = row[u'上涨家数'.encode("gbk")]
		row_dict[bk_id]["num_of_drop"] = row[u'下跌家数'.encode("gbk")]
		row_dict[bk_id]["leading_stock_id"] = row[u'领涨股票代码'.encode("gbk")]
		row_dict[bk_id]["rise_of_leading_stock"] = row[u'领涨股票涨跌幅'.encode("gbk")]
		
		csv_data.append(row_dict)
		
	csvf.close()
	print_log("%(num)s records have been read from %(name)s." % {"num": len(csv_data), "name": file})

	#-- determine biz_date
	if not biz_date is None: 
		if re.match(r'^\d{8}$', biz_date):
			v_biz_date = biz_date
		else:
			raise RuntimeError(biz_date + " is not a valid date format, the date should be like YYYYMMDD.") 
	elif re.search(r'.*(?P<date>\d{8})\.csv', file):
		v_biz_date = re.search(r'.*(?P<date>\d{8})\.csv', file).group("date")
	else:
		raise RuntimeError('Cannot determine biz_date; include a YYYYMMDD date in the file name or pass biz_date explicitly.')
	v_biz_date_dt = datetime.datetime.strptime(v_biz_date,'%Y%m%d')
	
	#-- delete biz_date from dw.bankuai
	del_sql = "delete from dw.bankuai where biz_date = '%(date)s'" % {'date': v_biz_date_dt}
	cur.execute(del_sql)
	db_conn.commit()
	print_log("Deleted records from dw.bankuai where biz_date = '%(biz_date)s'." % {"biz_date": v_biz_date})

	#-- insert into dw.bankuai
	row_count = 0  # counter; named to avoid shadowing the builtin iter
	for r in csv_data:
		k = r.keys()[0]
		row_count += 1
		ins_sql = '''insert into dw.bankuai(
			biz_date, 
			bankuai_id, 
			rise, 
			market_value_in_million, 
			turnover_rate, 
			num_of_rise, 
			num_of_drop, 
			leading_stock_id, 
			rise_of_leading_stock) values(
			'%(biz_date)s',
			%(bankuai_id)s, 
			'%(rise)s', 
			%(market_value_in_million)s, 
			%(turnover_rate)s, 
			%(num_of_rise)s, 
			%(num_of_drop)s, 
			'%(leading_stock_id)s', 
			%(rise_of_leading_stock)s
			)''' % {
			'biz_date': v_biz_date_dt, 
			'bankuai_id': k, 
			'rise': r[k]['rise'], 
			'market_value_in_million': r[k]['market_value_in_million'], 
			'turnover_rate': r[k]['turnover_rate'], 
			'num_of_rise': r[k]['num_of_rise'], 
			'num_of_drop': r[k]['num_of_drop'], 
			'leading_stock_id': r[k]['leading_stock_id'] if r[k]['leading_stock_id'] != '-' else '000000', # eastmoney sometimes returns '-' instead of a valid leading stock id; '000000' stands in as the unknown stock
			'rise_of_leading_stock': r[k]['rise_of_leading_stock']
			}
		cur.execute(ins_sql)
		
	db_conn.commit()
	print_log(str(row_count) + " records inserted into dw.bankuai.")
	print_log("dw.bankuai has been refreshed successfully.")
# check validation of start_date and end_date
if not (re.match("^\d{8}$", options.start_date)
        and re.match("^\d{8}$", options.end_date)):
    exit_error("Not valid start_date or end_date! [" + options.start_date +
               "][" + options.end_date + "]")
elif options.start_date > options.end_date:
    exit_error("Start date is greater then end date! [" + options.start_date +
               "][" + options.end_date + "]")

#-- create queue
queue = Queue(QUEUE_DOWNLOAD_MAX_SIZE)
#-- download stock info from internet
if options.mode == 'download' or options.mode == 'downloadAndLoad':
    #-- run at most 3 times, in case some stocks fail to download
    for i in ['1st', '2nd', '3rd']:
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        print_log('downloader running for the {n} time...'.format(n=i))
        print_log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        downloader(queue, conn)
        error_num = download_log_checker(conn)
        if error_num == 0: break
        print_log(
            '=================> waiting for 10 seconds to start the next round run...'
        )
        time.sleep(10)
    #-- if all 3 attempts still fail, abort with an error
    if error_num > 0:
        exit_error(
            '{num} stocks failed to download, please check.'.format(
                num=error_num))
    #queue.task_done()
예제 #46
0
if not (re.match("^\d{8}$", start_date) and re.match("^\d{8}$", end_date)):
	exit_error("start_date or end_date error! [" + start_date + "][" + end_date + "]")
elif start_date > end_date:
	exit_error("start_date must be smaller than end_date! [" + start_date + "][" + end_date + "]")


#------------------------------------------- Downloading
if options.mode in ('download', 'downloadAndLoad'):
    e = Eastmoney()
    
    bkbkfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + 'bankuai_' + recent_working_day + '.csv'
    if os.path.exists(bkbkfile_full_name):
        bk_bkbkfile_full_name = bkbkfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkbkfile_full_name, bk_bkbkfile_full_name) #rename
        print_log('The original file ' + bkbkfile_full_name + " has been renamed to " + bk_bkbkfile_full_name)
    e.export_bankuai_status(bkbkfile_full_name)
	
    bkstfile_full_name = Sys_paths.DATA_STOCK_BANKUAI_DAILY + Sys_paths.SEP + 'bankuai_stock_' + recent_working_day + '.csv'
    if os.path.exists(bkstfile_full_name):
        bk_bkstfile_full_name = bkstfile_full_name + "." + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        os.rename(bkstfile_full_name, bk_bkstfile_full_name) #rename
        print_log('The original file ' + bkstfile_full_name + " has been renamed to " + bk_bkstfile_full_name)
    e.export_bankuai_stock(bkstfile_full_name)
    
#------------------------------------------- LOADing
if options.mode in ('downloadAndLoad', 'load'):
    #-- determine file to load, $DATE is not replaced
    if options.in_file is None:
        if options.table is None:
            for tab in table_mapping: