Пример #1
0
    def insert(self, total_data):
        """Insert shareholding-change rows into ``self._coll_in``, skipping known UUIDs.

        Each element of ``total_data`` is an indexable row of strings laid
        out as follows (columns 1, 3, 4 and 6 are not read individually, but
        every column contributes to the row's UUID):

            0  company code
            1  company short name
            2  director / supervisor / senior-executive name
            3  position
            4  share type
            5  currency type
            6  shares held before this change
            7  number of shares changed
            8  average price of this change
            9  shares held after the change
            10 reason for the change
            11 change date
            12 filing date

        A row is inserted only when ``self._coll_in.get`` finds no document
        with the same UUID; otherwise a notice is printed. The collection is
        disconnected when the method finishes, whether it completes normally
        or an exception propagates out of the loop.

        :param total_data: iterable of rows in the layout above.
        """
        try:
            for item in total_data:
                currency = data_by_table_type({'zhsname': item[5]}, [('code',)], 'curr')
                secu_code, orgid = data_by_table_type(
                    {'tick': item[0]}, [('code',), ('org', 'id')], 'stock')

                exec_query = {'name.szh': item[2], 'orgid': orgid}
                name_en, pid = data_by_table_type(exec_query, [('name', 'en'), ('pid',)], 'exec')

                # The ratio helper receives the change amount without thousands
                # separators; the stored 'change' field keeps the raw text.
                change_amount = item[7].replace(',', '')
                to_ratio, cir_ratio = ratio(change_amount, secu_code, 'stock', 'vary')

                # Name-based UUID over the concatenation of every column, used
                # as the duplicate-detection key.
                uid_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, ''.join(item).encode('u8')))

                data = {
                    'secu': secu_code or item[0], 'org': orgid, 'scp': {'szh': '', 'en': ''},
                    'uuid': uid_uuid,
                    'name': {'szh': item[2], 'en': name_en}, 'relation': '', 'pid': pid,
                    'change': item[7], 'after': item[9], 'cur': currency, 'cause': item[10],
                    'cd': item[11], 'rd': item[12], 'stat': 1, 'price': item[8],
                    'upt': datetime.now(), 'upu': 'system',
                    'torat': to_ratio, 'cirrat': cir_ratio, 'typ': 'sha'
                }

                if not self._coll_in.get({'uuid': uid_uuid}, {'secu': 1}):
                    self._coll_in.insert(data)
                else:
                    # Single-argument print() emits the same text on Python 2 and 3.
                    print(' '.join(('uuid existed:', uid_uuid)))
        finally:
            # Release the connection even when a lookup or insert fails.
            self._coll_in.disconnect()
Пример #2
0
    def main(self, multi_pool_page=None):
        """Crawl listing pages and store every parsed row in ``self._coll_in``.

        For each row a name-based UUID is derived from its fields. A row
        whose change date compares greater than ``self._latest_cd_data`` is
        always inserted; any other row is inserted only when no stored
        document carries the same UUID. The collection is disconnected when
        the method finishes, whether it completes normally or an exception
        propagates.

        :param multi_pool_page: ``None`` to crawl pages ``1`` through
            ``self.crawl_pages``; an ``int`` to crawl only that page.
        :raises AssertionError: if ``multi_pool_page`` is neither ``None``
            nor an ``int`` (the check is skipped under ``python -O``).
        """
        assert multi_pool_page is None or isinstance(multi_pool_page, int), '`multi_pool_page` must None or int.'

        if multi_pool_page is None:
            first_page, last_page = 1, self.crawl_pages
        else:
            first_page = last_page = multi_pool_page

        try:
            for page in range(first_page, last_page + 1):
                url = self._base_url + self._query_string.format(page)
                secus, changes, afters, prices, scps, ncdrs = self.parse_data(url)
                # The six sequences are parallel; index the others by position so
                # a length mismatch still raises instead of silently truncating.
                for i, tick in enumerate(secus):
                    # entry[0] name, entry[1] change date, entry[2] cause, entry[3] relation
                    entry = ncdrs[i]

                    # Rows newer than the latest stored change date bypass the
                    # duplicate check below.
                    cp_flag = self._latest_cd_data < entry[1]

                    # secu_code / orgid come from the stock table, keyed by tick
                    secu_code, orgid = data_by_table_type({'tick': tick}, [('code',), ('org', 'id')], 'stock')
                    # name_en / pid come from the exec table, keyed by name and orgid
                    exec_query = {'name.szh': entry[0], 'orgid': orgid}
                    name_en, pid = data_by_table_type(exec_query, [('name', 'en'), ('pid',)], 'exec')
                    # ratios are computed from the number of shares changed and secu_code
                    to_ratio, cir_ratio = ratio(changes[i], secu_code, 'stock', 'vary')

                    scp_text = scps[i].replace('\\', '')

                    # uuid is the unique identifier of a row
                    name = ''.join([tick, entry[0], entry[1], changes[i], prices[i], entry[2],
                                    afters[i], scp_text, entry[3]])
                    uid_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
                    data = {
                        'secu': secu_code or tick, 'org': orgid, 'change': changes[i], 'after': afters[i],
                        'price': prices[i], 'cd': entry[1], 'cause': entry[2], 'relation': entry[3],
                        'stat': 1, 'name': {'szh': entry[0], 'en': name_en},
                        'scp': {
                            'szh': MarkReplace(secu_code, scp_text).replace_mark(),
                            # NOTE(review): this lookup uses the raw text while 'szh'
                            # uses the backslash-stripped one - confirm that is intended.
                            'en': data_by_table_type({'name.szh': scps[i]}, [('name', 'en')], 'exec')
                        },
                        'cur': 'CNY', 'rd': '', 'pid': pid, 'uuid': uid_uuid, 'typ': 'szx',
                        'upu': 'system', 'upt': datetime.now(), 'torat': to_ratio, 'cirrat': cir_ratio
                    }

                    # Short-circuit keeps the lookup from running for new rows.
                    if cp_flag or not self._coll_in.get({'uuid': uid_uuid}, {'secu': 1}):
                        self._coll_in.insert(data)
                # Single-argument print() emits the same text on Python 2 and 3.
                print('page: [{0}] done!'.format(page))
        finally:
            # Release the connection even when parsing, a lookup or an insert fails.
            self._coll_in.disconnect()
Пример #3
0
    def main(self, multi_pool_page=None):
        """Crawl listing pages and store every parsed row in ``self._coll_in``.

        For each row a name-based UUID is derived from its fields. A row
        whose change date compares greater than ``self._latest_cd_data`` is
        always inserted; any other row is inserted only when no stored
        document carries the same UUID. The collection is disconnected when
        the method finishes, whether it completes normally or an exception
        propagates.

        :param multi_pool_page: ``None`` to crawl pages ``1`` through
            ``self.crawl_pages``; an ``int`` to crawl only that page.
        :raises AssertionError: if ``multi_pool_page`` is neither ``None``
            nor an ``int`` (the check is skipped under ``python -O``).
        """
        assert multi_pool_page is None or isinstance(
            multi_pool_page, int), '`multi_pool_page` must None or int.'

        if multi_pool_page is None:
            first_page, last_page = 1, self.crawl_pages
        else:
            first_page = last_page = multi_pool_page

        try:
            for page in range(first_page, last_page + 1):
                url = self._base_url + self._query_string.format(page)
                secus, changes, afters, prices, scps, ncdrs = self.parse_data(
                    url)
                # The six sequences are parallel; index the others by position
                # so a length mismatch still raises instead of truncating.
                for i, tick in enumerate(secus):
                    # entry[0] name, entry[1] change date,
                    # entry[2] cause, entry[3] relation
                    entry = ncdrs[i]

                    # Rows newer than the latest stored change date bypass
                    # the duplicate check below.
                    cp_flag = self._latest_cd_data < entry[1]

                    # secu_code / orgid come from the stock table, by tick
                    secu_code, orgid = data_by_table_type(
                        {'tick': tick}, [('code', ), ('org', 'id')], 'stock')
                    # name_en / pid come from the exec table, by name + orgid
                    exec_query = {'name.szh': entry[0], 'orgid': orgid}
                    name_en, pid = data_by_table_type(
                        exec_query, [('name', 'en'), ('pid', )], 'exec')
                    # ratios use the number of shares changed and secu_code
                    to_ratio, cir_ratio = ratio(changes[i], secu_code,
                                                'stock', 'vary')

                    scp_text = scps[i].replace('\\', '')

                    # uuid is the unique identifier of a row
                    name = ''.join([
                        tick, entry[0], entry[1], changes[i], prices[i],
                        entry[2], afters[i], scp_text, entry[3]
                    ])
                    uid_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
                    data = {
                        'secu': secu_code or tick,
                        'org': orgid,
                        'change': changes[i],
                        'after': afters[i],
                        'price': prices[i],
                        'cd': entry[1],
                        'cause': entry[2],
                        'relation': entry[3],
                        'stat': 1,
                        'name': {
                            'szh': entry[0],
                            'en': name_en
                        },
                        'scp': {
                            'szh':
                            MarkReplace(secu_code, scp_text).replace_mark(),
                            # NOTE(review): this lookup uses the raw text while
                            # 'szh' uses the backslash-stripped one - confirm
                            # that is intended.
                            'en':
                            data_by_table_type({'name.szh': scps[i]},
                                               [('name', 'en')], 'exec')
                        },
                        'cur': 'CNY',
                        'rd': '',
                        'pid': pid,
                        'uuid': uid_uuid,
                        'typ': 'szx',
                        'upu': 'system',
                        'upt': datetime.now(),
                        'torat': to_ratio,
                        'cirrat': cir_ratio
                    }

                    # Short-circuit keeps the lookup from running for new rows.
                    if cp_flag or not self._coll_in.get({'uuid': uid_uuid},
                                                        {'secu': 1}):
                        self._coll_in.insert(data)
                # Single-argument print() emits the same text on Python 2 and 3.
                print('page: [{0}] done!'.format(page))
        finally:
            # Release the connection even when parsing, a lookup or an
            # insert fails.
            self._coll_in.disconnect()