def get_cst():
    """Fetch the current date/time from www.beijing-time.org.

    The response is searched for the ``nyear=``, ``nmonth=``, ``nday=``,
    ``nhrs=``, ``nmin=`` and ``nsec=`` fields.

    Returns:
        A naive ``datetime.datetime`` built from those fields, or
        ``datetime.datetime.now()`` (local clock) when the fields cannot be
        found in the response.
    """
    from opener import Opener

    reg = re.compile(r'nyear=(?P<year>\d+).*?nmonth=(?P<month>\d+).*?nday=(?P<day>\d+).*?nwday=(\d+).*?nhrs=(?P<hour>\d+).*?nmin=(?P<minute>\d+).*?nsec=(?P<second>\d+)', re.S)
    opener = Opener(encoding='utf8')
    content = opener.urlopen('http://www.beijing-time.org/time.asp', times=0)
    search_obj = reg.search(content)
    if search_obj is None:
        return datetime.datetime.now()
    # groupdict() only contains the *named* groups, so the unnamed weekday
    # group is dropped and the remaining keys match datetime's keyword names.
    fields = {name: int(value) for name, value in search_obj.groupdict().items()}
    return datetime.datetime(**fields)
class Register_corp:
    """Scraper that probes registration codes against wsnj.gdgs.gov.cn.

    Candidate 15-digit registration codes are generated from an authority
    code, an enterprise-nature number and a serial number; each candidate is
    looked up on the site and every hit is stored through ``self.model``.
    """

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'gbk'
        self.opener = Opener(encoding=self.charset)
        # Registration authority code; 6 digits.
        self.org_code = '441900'
        # Enterprise nature number; 2 digits, 00~30 is domestic capital,
        # 40~50 is foreign capital.
        self.nature_num = 0
        # Serial number; 4 digits.
        self.ord_num = 1
        # The query-string separator had been mangled into the "registered"
        # sign ("&reg" rendered as an HTML entity); restored to "&registerNo".
        self.corp_url = 'http://wsnj.gdgs.gov.cn/aiccps/SearchServlet?service=getEntityInfoByPage&registerNo=%s'
        # Matches the page the site returns when nothing was found.
        self.search_text_reg = re.compile(r'^<table width="100%" border="0"><tr><td align=center height=200 >找不到相关的数据\.\.</td></tr></table>$')
        # One pattern per field of the detail page; each contributes one
        # named group to the resulting record.
        self.regs = [
            re.compile(r'<td align=left width=100% colspan=6 height=25>(?P<name>[^<]+)', re.S),
            re.compile(r'址:</td><td align=left valign=top colspan=5>(?P<addr>[^<]*)', re.S),
            re.compile(r'号:</td><td align=left valign=top><font color="red">(?P<register_code>[^<]*)', re.S),
            re.compile(r'[名人]:</td><td align=left valign=top colspan=3>(?P<representative>[^<]*)', re.S),
            re.compile(r'型:</td><td align=left valign=top>(?P<nature>[^<]*)', re.S),
            re.compile(r'限:</td><td align=left valign=top colspan=3>(?P<period>[^<]*)', re.S),
            re.compile(r'本:</td><td align=left valign=top>(?P<capital>[^<]*)', re.S),
            re.compile(r'关:</td><td align=left valign=top colspan=3>(?P<register_department>[^<]*)', re.S),
            re.compile(r'态:</td><td align=left valign=top>(?P<status>[^<]*)', re.S),
            re.compile(r'期:</td><td align=left valign=top colspan=3>(?P<establishment_data>[^<]+)', re.S),
            re.compile(r'围:</td><td align=left valign=top colspan=5>(?P<scope>[^<]*)', re.S),
        ]
        # Counts save() calls modulo 101; a commit is issued whenever it wraps to 0.
        self._save_times = 0
        self._today = get_cst()

    def _msg(self, text):
        """Return ``text`` prefixed with the current ``time.asctime()`` stamp."""
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        """Stamp a scraped record with insert date / source and parse its date.

        Mutates and returns ``corp_dict``. A non-empty ``establishment_data``
        value is converted with ``self.model._str2date``.
        """
        corp_dict['insert_date'] = self._today
        corp_dict['info_from'] = self.info_from
        if corp_dict.get('establishment_data'):
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Return the 15-digit registration code including its check digit.

        The first 14 digits are ``org_code`` + 2-digit ``nature_num`` +
        6-digit ``ord_num``. The check digit is derived by folding every digit
        into a running value (this appears to be the ISO 7064 MOD 11,10
        scheme -- confirm against the registration-number specification).
        """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        # Explicit loop instead of reduce(): reduce is not a builtin on Python 3.
        temp = 10
        for digit in code14:
            temp = ((temp + int(digit)) % 10 or 10) * 2 % 11
        return '%s%s' % (code14, (11 - temp) % 10)

    def calc_code13(self, org_code, nature_num, ord_num):
        """Return the 13-digit-style code: org code, nature number, 4-digit serial."""
        return '%s%s%04d' % (org_code, nature_num, ord_num)

    def init_invalid_codes(self):
        """Record every not-yet-stored code below each known maximum serial.

        For each row of ``max_code_model`` all serials in ``[0, max)`` are
        turned into 15-digit codes; codes missing from ``self.model`` are
        added to ``invalid_code_model``. Commits every 200 additions and once
        more at the end.
        """
        added = 0
        for query_obj in self.max_code_model.get_all():
            org_code = query_obj.org_code
            nature_num = query_obj.nature_num
            max_ord_num = query_obj.ord_num
            for ord_num in range(max_ord_num):
                register_code = self.calc_code15(org_code, nature_num, ord_num)
                if self.model.exists_by(register_code=register_code):
                    continue
                self.invalid_code_model.add({'register_code': register_code}, is_commit=False)
                added += 1
                if added % 200 == 0:
                    self.model.commit()
                    print('Save 200 invalid codes!')
        self.model.commit()

    def fetch(self, code):
        """Look up ``code`` on the site and return the scraped records.

        Returns:
            ``[]`` when the site reports no data or when none of the field
            patterns matched (the latter case is also logged); otherwise a
            one-element list holding the merged field dict.
        """
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        url = self.corp_url % code
        content = self.opener.urlopen(url, timeout=10, times=0)
        if self.search_text_reg.match(content):
            print('没找到相关信息.')
            return []
        fields = {}
        for reg in self.regs:
            search_obj = reg.search(content)
            if search_obj:
                fields.update(search_obj.groupdict())
        print(fields)
        # Only keep a record that actually contains data; previously the dict
        # was appended unconditionally, which made the logging branch below
        # unreachable and let an empty record through to save().
        result = [fields] if fields else []
        if not result:
            logging.info('register code: %s', code)
        return result

    def save(self, register_code):
        """Fetch ``register_code`` and add any resulting records to the model.

        Every 101st call commits the pending work first.

        Returns:
            ``True`` if at least one record was added, ``False`` otherwise.
        """
        self._save_times = (self._save_times + 1) % 101
        if not self._save_times:
            self.model.commit()
        corps = self.fetch(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        """Walk serial numbers upward, switching ranges after 500 misses in a row.

        After 500 consecutive misses the serial restarts at 0 and the nature
        number moves from the current value to 40; if it is already >= 40 it
        goes back to 0 and the authority code is incremented. The walk stops
        once the authority code reaches 440200.
        """
        invalid_times = 0
        while self.nature_num <= 99:
            code = self.calc_code15(self.org_code, self.nature_num, self.ord_num)
            if self.save(code):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times < 500:
                continue
            invalid_times = 0
            self.ord_num = 0
            if self.nature_num >= 40:
                self.nature_num = 0
                self.org_code = str(int(self.org_code) + 1)
                if int(self.org_code) >= 440200:
                    break
            else:
                self.nature_num = 40
            logging.info('org_code: %s nature code: %s', self.org_code, self.nature_num)
        self.model.commit()

    def action_test(self):
        """Placeholder; does nothing."""
        pass

    def action_new(self, invalid_times=20):
        """Probe serials above each stored maximum until ``invalid_times`` misses in a row.

        On every hit the row's ``ord_num`` is advanced to the serial that was
        just saved.
        """
        for query_obj in self.max_code_model.get_all():
            org_code = query_obj.org_code
            nature_num = query_obj.nature_num
            ord_num = query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left
                    # uncommitted in the special case where save() has just
                    # wrapped its counter.
                    if not self._save_times:
                        self.model.commit()
                else:
                    times += 1
                ord_num += 1
        self.model.commit()

    def action_from_invalid_codes(self):
        """Retry every stored invalid code; delete the entry when it now resolves."""
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        """Save every registration code listed (one per line) in ``others.txt``."""
        # ``with`` closes the file even if save() raises; stripping only the
        # line terminator keeps the last character of a final line that has no
        # trailing newline (``line[:-1]`` used to cut it off).
        with open('others.txt') as f:
            for line in f:
                code = line.rstrip('\r\n')
                if code:
                    self.save(code)
        self.model.commit()

    def report(self):
        """Export today's records with status '登记成立' to a GBK-encoded CSV file."""
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('东莞红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'),
                          fields=fields, rows=corps, encoder='gbk')
class Corp(multiprocessing.Process):
    """Worker process that crawls a company listing and queues new records.

    For every listing page the companies found by ``corplist_reg`` are
    checked for duplicates; unseen ones are optionally enriched from their
    detail page and put on ``self.queue``. A final ``None`` is queued when the
    crawl is finished.
    """

    def __init__(self, corplist_url, corp_url, info_from, corplist_post_data=None,
                 corp_post_data=None, corplist_reg=None, corp_regs=None, timeout=5,
                 commit_each_times=30, has_cookie=True, charset='utf8', model=None):
        """Store the crawl configuration.

        ``corplist_url`` and ``corp_url`` are ``str.format`` templates using
        placeholders such as ``{0}``, ``{1}``.

        ``corp_regs`` defaults to an empty list (``None`` is accepted so that
        no list object is shared between instances). When ``model`` is not
        given, ``lib.models.CorpModel`` is used.
        """
        super().__init__()
        self.charset = charset
        self.info_from = info_from
        self.corplist_url = corplist_url
        self.corp_url = corp_url
        self.opener = Opener(has_cookie=has_cookie, encoding=self.charset)
        self.corplist_post_data = corplist_post_data
        self.corp_post_data = corp_post_data
        self.corplist_reg = corplist_reg
        self.corp_regs = corp_regs if corp_regs is not None else []
        self.commit_each_times = commit_each_times
        self.timeout = timeout
        if model:
            self.model = model
        else:
            from lib.models import CorpModel
            self.model = CorpModel
        self._today = datetime.date.today()

    def _msg(self, msg=''):
        """Write ``msg`` to the log at INFO level."""
        logging.info(msg)

    def set_queue(self, queue):
        """Attach the queue that ``run`` puts finished records on."""
        self.queue = queue

    def process_corp_info(self, corp_info, date_reg=r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)'):
        """Strip every value and normalise ``insert_date``; returns ``corp_info``.

        An existing ``insert_date`` string is parsed with
        ``self.model._str2date`` using ``date_reg``; otherwise the date taken
        at construction time is used.
        """
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for key, value in corp_info.items():
            corp_info[key] = value.strip()
        if 'insert_date' in corp_info:
            corp_info['insert_date'] = self.model._str2date(corp_info['insert_date'], date_reg=date_reg)
        else:
            corp_info['insert_date'] = self._today
        return corp_info

    def get_next_page_url(self):
        """Return the listing page URLs to crawl.

        Must be a plain (non-coroutine) generator or return an iterable.
        """
        # One-element tuple: the bare parenthesised string that used to be
        # returned made run() iterate over the URL character by character.
        return (self.corplist_url,)

    def get_corp_url(self, corp_info=None):
        """Return ``corp_url`` formatted with the fields of ``corp_info``."""
        return self.corp_url.format(**(corp_info or {}))

    def prepare(self):
        """Hook called at the start of ``run``; does nothing by default."""
        pass

    def fetch_corplist(self, page_url):
        """Download ``page_url`` and lazily yield one field dict per company.

        Each dict is the ``groupdict()`` of a ``corplist_reg`` match; nothing
        is yielded when the page has no match.
        """
        content = self.opener.urlopen(page_url, data=self.corplist_post_data,
                                      timeout=self.timeout, times=0)
        # finditer() only yields real match objects, so no None check is needed.
        return (match.groupdict() for match in self.corplist_reg.finditer(content))

    def fetch_corp(self, corp_info=None):
        """Download a company's detail page and merge its fields into ``corp_info``.

        ``corp_info`` is updated in place with the named groups of every
        matching pattern in ``corp_regs`` and returned. ``None`` is treated as
        an empty dict.
        """
        if corp_info is None:
            corp_info = {}
        corp_url = self.get_corp_url(corp_info)
        content = self.opener.urlopen(corp_url, data=self.corp_post_data,
                                      timeout=self.timeout, times=0)
        for reg in self.corp_regs:
            search_obj = reg.search(content)
            if search_obj:
                corp_info.update(search_obj.groupdict())
        return corp_info

    def before_save(self, corp_info):
        """Normalise ``corp_info`` and tag it with this crawler's source name."""
        corp_info = self.process_corp_info(corp_info)
        corp_info['info_from'] = self.info_from
        return corp_info

    def commit(self):
        """Commit pending work on the model."""
        self.model.commit()

    @coroutine
    def check_exists(self):
        """Generator: send a corp dict in, get its known ``info_from`` or ``None`` back.

        Names seen recently are remembered in a bounded cache (at most
        ``commit_each_times`` entries, oldest evicted first). A name that is
        not cached is looked up in the model; a cached name answers straight
        from the cache.
        """
        corp_names_cache = {}
        # Newest name at index 0, oldest at the end.
        corp_names_cache_list = []
        result = None
        while True:
            corp_info = (yield result)
            result = None
            corp_name = corp_info['name'].strip()
            if corp_name in corp_names_cache:
                result = corp_names_cache[corp_name]
                continue
            corp_names_cache[corp_name] = self.info_from
            corp_names_cache_list.insert(0, corp_name)
            if len(corp_names_cache_list) > self.commit_each_times:
                del corp_names_cache[corp_names_cache_list.pop()]
            exists_corp = self.model.filter_by(name=corp_name).first()
            if exists_corp:
                result = exists_corp.info_from
                corp_names_cache[corp_name] = result

    def run(self):
        """Crawl every listing page and queue each company not seen before."""
        self.prepare()
        check_exists = self.check_exists()
        for page_no, page_url in enumerate(self.get_next_page_url(), 1):
            print('\n%s 第%s页' % (self.info_from, page_no))
            for corp_info in self.fetch_corplist(page_url):
                self._msg('***************************************************')
                print(corp_info['name'], end=' ')
                info_from = check_exists.send(corp_info)
                if info_from:
                    print('已经存在于: %s' % info_from)
                    continue
                if self.corp_regs:
                    corp_info = self.fetch_corp(corp_info)
                corp_info = self.before_save(corp_info)
                self.queue.put(corp_info)
                print('保存成功!')
        self._msg('\n%s 抓取完毕!' % self.info_from)
        # Sentinel telling the consumer that this crawler is done.
        self.queue.put(None)

    def report(self, fields=None):
        """Export today's records from this source to a GBK-encoded CSV file.

        ``fields`` is a sequence of ``(header, attribute)`` pairs; a default
        column set is used when it is omitted.
        """
        corps = self.model.filter_by(info_from=self.info_from, insert_date=datetime.date.today())
        fields = fields or (
            ('名称', 'name'),
            ('地址', 'addr'),
            ('联系人', 'contact_person'),
            ('区号', 'contact_tel_code'),
            ('电话号码', 'contact_tel_no'),
            ('邮箱', 'mail'),
            ('网址', 'website'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
            ('链接', self.corp_url),
        )
        self.model.report('%s最新公司信息_%s.csv' % (self.info_from, time.strftime('%Y-%m-%d')),
                          fields=fields, rows=corps, encoder='gbk')
class Register_corp:
    """Scraper that probes registration codes against www.gzaic.gov.cn.

    The site is an ASP.NET form: each request re-posts the hidden
    ``__VIEWSTATE`` / ``__EVENTVALIDATION`` values taken from the previous
    response. A code is first searched, then the first result link is
    "clicked" to obtain a key, and finally the detail page for that key is
    parsed.
    """

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'utf8'
        self.opener = Opener(has_cookie=True, encoding=self.charset)
        # Registration authority code; 6 digits.
        self.org_code = '440101'
        # Enterprise nature number; 2 digits, 00~30 is domestic capital,
        # 40~50 is foreign capital.
        self.nature_num = 0
        # Serial number; 4 digits.
        self.ord_num = 0
        self.query_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseopeninfo.aspx'
        self.corp_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseOpeninfoDetail.aspx?key={0}'
        # Text whose presence in the search response means a result row exists.
        self.search_string = '查看'
        self.reg = re.compile(r'lbSREGNO">(?P<register_code>[^<]*)</span></td>.*?lbSNAME">(?P<name>[^<]*)</span></td>.*?lbSDOM">(?P<addr>[^<]*)</span></td>.*?lbSMAN">(?P<representative>[^<]*)</span></td>.*?lbSENTCLASS">\s*(?P<nature>[^<]*)</span></td>.*?lbSREGCAP">(?P<capital>[^<]*)</span></td>.*?lbSSSZB">[^<]*</span></td>.*?lbSOPSCOPE">(?P<scope>[^<]*)</span></td>.*?LbSREGRDATE">(?P<establishment_data>[^<]*)</span></td>.*?lbSAPPRDATE">[^<]*</span></td>.*?lbSREGORG">(?P<register_department>[^<]*)</span></td>', re.S)
        # Patterns extracting the hidden form fields that must be echoed back.
        self.event_regs = [
            re.compile(r'__VIEWSTATE" value="(?P<__VIEWSTATE>[^"]*)'),
            re.compile(r'__EVENTVALIDATION" value="(?P<__EVENTVALIDATION>[^"]*)'),
        ]
        self.key_reg = re.compile(r'key=(?P<key>[^\']*)')
        self.post_data = {
            '__EVENTTARGET': 'QueryButton',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': '',
            '__EVENTVALIDATION': '',
            'txtName': '',
            'txtReg': '',
        }
        # Counts save() calls modulo 101; a commit is issued whenever it wraps to 0.
        self._save_times = 0
        self._today = get_cst()
        # Original note: "initialise ufps". Loads the query page once so the
        # hidden form fields in post_data are populated before the first search.
        self.initial()

    def initial(self):
        """Fetch the query page once to pick up the initial hidden form fields."""
        self.fetch_content(self.query_url)

    def _msg(self, text):
        """Return ``text`` prefixed with the current ``time.asctime()`` stamp."""
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        """Add the fixed bookkeeping fields and parse the establishment date.

        Mutates and returns ``corp_dict``. A non-empty ``establishment_data``
        value is converted with ``self.model._str2date``.
        """
        corp_dict.update({
            'insert_date': self._today,
            'info_from': self.info_from,
            'status': '登记成立',
            'period': '长期',
        })
        if corp_dict.get('establishment_data'):
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Return the 15-digit registration code including its check digit.

        The first 14 digits are ``org_code`` + 2-digit ``nature_num`` +
        6-digit ``ord_num``. The check digit is derived by folding every digit
        into a running value (this appears to be the ISO 7064 MOD 11,10
        scheme -- confirm against the registration-number specification).
        """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        # Explicit loop instead of reduce(): reduce is not a builtin on Python 3.
        temp = 10
        for digit in code14:
            temp = ((temp + int(digit)) % 10 or 10) * 2 % 11
        return '%s%s' % (code14, (11 - temp) % 10)

    def update_events(self, content):
        """Copy the hidden form fields found in ``content`` into ``post_data``."""
        for reg in self.event_regs:
            search_obj = reg.search(content)
            if search_obj:
                self.post_data.update(search_obj.groupdict())

    def fetch_content(self, url, data=None, timeout=10, raw_string=False, update_events=True):
        """Request ``url`` (posting ``data`` if given) and return the response.

        When ``update_events`` is true the hidden form fields in the response
        are remembered for the next request. ``raw_string`` is accepted but
        not used.
        """
        content = self.opener.urlopen(url, data, timeout=timeout, times=0)
        if update_events:
            self.update_events(content)
        return content

    def fetch_query_content(self, code):
        """Submit the search form for registration code ``code``."""
        self.post_data.update({
            '__EVENTTARGET': 'QueryButton',
            'txtReg': code,
        })
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_key_content(self):
        """Post back the first result row's link button and return the response."""
        self.post_data['__EVENTTARGET'] = 'GridView1$ctl02$LinkButton1'
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_corp_key(self, code):
        """Return the detail-page key for ``code``, or ``None`` if unavailable.

        ``None`` is returned both when the search shows no result row and when
        no ``key=`` value can be found in the follow-up response.
        """
        content = self.fetch_query_content(code)
        if self.search_string not in content:
            return None
        content = self.fetch_key_content()
        search_obj = self.key_reg.search(content)
        return search_obj.groups()[0] if search_obj else None

    def fetch_corp_info(self, code):
        """Look up ``code`` and return the scraped records.

        Returns:
            ``[]`` when no key could be obtained or the detail page could not
            be parsed (the latter is logged); otherwise a one-element list
            holding the field dict.
        """
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        corp_key = self.fetch_corp_key(code)
        if corp_key is None:
            print('没找到相关信息.')
            return []
        print('key: %s' % corp_key)
        url = self.corp_url.format(corp_key)
        # The detail page is not part of the form flow, so its hidden fields
        # must not overwrite the ones kept for the next search.
        content = self.fetch_content(url, update_events=False)
        result = []
        search_obj = self.reg.search(content)
        if search_obj:
            fields = search_obj.groupdict()
            if fields:
                result.append(fields)
            print(fields)
        if not result:
            logging.info('register code: %s', code)
        return result

    def save(self, register_code):
        """Fetch ``register_code`` and add any resulting records to the model.

        Every 101st call commits the pending work first.

        Returns:
            ``True`` if at least one record was added, ``False`` otherwise.
        """
        self._save_times = (self._save_times + 1) % 101
        if not self._save_times:
            self.model.commit()
        corps = self.fetch_corp_info(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        """Walk serial numbers upward, switching ranges after 500 misses in a row.

        After 500 consecutive misses the serial restarts at 0 and the nature
        number moves from the current value to 40; if it is already >= 40 it
        goes back to 0 and the authority code is incremented. The walk stops
        once the authority code reaches 440200.
        """
        invalid_times = 0
        while self.nature_num <= 99:
            code = self.calc_code15(self.org_code, self.nature_num, self.ord_num)
            if self.save(code):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times < 500:
                continue
            invalid_times = 0
            self.ord_num = 0
            if self.nature_num >= 40:
                self.nature_num = 0
                self.org_code = str(int(self.org_code) + 1)
                if int(self.org_code) >= 440200:
                    break
            else:
                self.nature_num = 40
            logging.info('org_code: %s nature code: %s', self.org_code, self.nature_num)
        self.model.commit()

    def action_test(self):
        """Placeholder; does nothing."""
        pass

    def action_new(self, invalid_times=10):
        """Probe serials above each stored maximum until ``invalid_times`` misses in a row.

        On every hit the row's ``ord_num`` is advanced to the serial that was
        just saved.
        """
        for query_obj in self.max_code_model.get_all():
            org_code = query_obj.org_code
            nature_num = query_obj.nature_num
            ord_num = query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left
                    # uncommitted in the special case where save() has just
                    # wrapped its counter.
                    if not self._save_times:
                        self.model.commit()
                else:
                    times += 1
                ord_num += 1
        self.model.commit()

    def action_from_invalid_codes(self):
        """Retry every stored invalid code; delete the entry when it now resolves."""
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        """Save every registration code listed (one per line) in ``others.txt``."""
        # ``with`` closes the file even if save() raises; stripping only the
        # line terminator keeps the last character of a final line that has no
        # trailing newline (``line[:-1]`` used to cut it off).
        with open('others.txt') as f:
            for line in f:
                code = line.rstrip('\r\n')
                if code:
                    self.save(code)
        self.model.commit()

    def report(self):
        """Export today's records with status '登记成立' to a GBK-encoded CSV file."""
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('广州红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'),
                          fields=fields, rows=corps, encoder='gbk')