def work_thread(temp):
    """Background polling loop: watch for a new patent number and run a scrape.

    temp -- unused placeholder (thread.start_new_thread requires an arg tuple).
    Loops until get_loop() turns False, sleeping `scrap_delay` seconds between
    polls of WatchMan.is_change().
    """
    cfg = setting.objects.get(id=1)  # renamed from `set`, which shadowed the builtin
    last_num = cfg.patent_num
    delay = cfg.scrap_delay
    logger.log('last_num: ' + last_num)
    while get_loop():
        num = WatchMan.is_change()
        if num is None:  # fixed: `== None` -> `is None`
            continue
        logger.log('get the num: ' + str(num[0]))
        # NOTE(review): num[0] is logged but str(num) is what gets compared and
        # stored -- looks inconsistent; kept as-is pending confirmation of
        # WatchMan.is_change()'s return shape.
        if str(num) != last_num:
            cfg.patent_num = num
            cfg.save()
            cfg = setting.objects.get(id=1)  # re-read to pick up the saved value
            last_num = cfg.patent_num
            logger.log('last_num: ' + last_num)
            logger.log("catch it !")
            # set_scraping(True)
            scrap()
            set_scraping(False)
            # set_loop(False)
        logger.log("wathman is running !" + time.strftime("%y-%m-%d %H:%M"))
        # seconds
        time.sleep(delay)
def work_thread(temp):
    # Poll WatchMan until the loop flag is cleared; when the patent number
    # changes, persist the new value and kick off a scrape pass.
    conf = setting.objects.get(id=1)
    previous = conf.patent_num
    wait_seconds = conf.scrap_delay
    logger.log('last_num: ' + previous)
    while get_loop():
        change = WatchMan.is_change()
        if change == None:
            continue
        logger.log('get the num: ' + str(change[0]))
        if str(change) != previous:
            conf.patent_num = change
            conf.save()
            conf = setting.objects.get(id=1)
            previous = conf.patent_num
            logger.log('last_num: ' + previous)
            logger.log("catch it !")
            # set_scraping(True)
            scrap()
            set_scraping(False)
            # set_loop(False)
        logger.log("wathman is running !" + time.strftime("%y-%m-%d %H:%M"))
        # seconds
        time.sleep(wait_seconds)
def watch(request):
    """Start the background watch thread (at most one at a time).

    Refuses when a scrape is already running or the watch loop is already on;
    otherwise flips the loop flag, spawns work_thread, and waits briefly so
    the thread gets going before responding 'ok'.
    """
    if get_scraping():  # fixed: dropped redundant `== True`
        return HttpResponse('正在抓取,无法开启监控!')
    if get_loop():
        return HttpResponse('已经开启监控,请勿重复开启!')
    set_loop(True)
    thread.start_new_thread(work_thread, (1,))
    logger.log('continue')
    # Wait a while so the new thread has time to start running.
    time.sleep(20)
    return HttpResponse('ok')
def watch(request):
    # Launch the watcher thread; reject conflicting or duplicate starts.
    busy_scraping = get_scraping() == True
    if busy_scraping:
        return HttpResponse('正在抓取,无法开启监控!')
    already_watching = get_loop() == True
    if already_watching:
        return HttpResponse('已经开启监控,请勿重复开启!')
    set_loop(True)
    thread.start_new_thread(work_thread, (1, ))
    logger.log('continue')
    # Give the freshly spawned thread a moment to get running.
    time.sleep(20)
    return HttpResponse('ok')
def re_transmission(self, url, opener, post_data=None, append=None): failed = True if post_data != None: u_post_data = {} for k, v in post_data.iteritems(): u_post_data[k] = unicode(v).encode('utf-8') u_post_data = urllib.urlencode(u_post_data) if append != None: u_post_data += ('&channelId=' + append) data = None while failed: try: data = opener.open(url, u_post_data).read() #logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:" + url + " failed !") logger.log(str(e), flush=True) pass
def re_transmission(self, url, opener, post_data=None,append=None): failed = True if post_data != None: u_post_data = {} for k, v in post_data.iteritems(): u_post_data[k] = unicode(v).encode('utf-8') u_post_data = urllib.urlencode(u_post_data) if append != None: u_post_data+=('&channelId='+append) data = None while failed: try: data = opener.open(url, u_post_data).read() #logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:"+url+" failed !") logger.log(str(e), flush=True) pass
def login(self, user='', pwd=''): try_login = "******" + str( random.random()) # 尝试登陆 login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str( random.random()) # 强制登陆 check_login = "******" + str( random.random()) cnt = 3 while (cnt > 0): cnt -= 1 try: cj = cookielib.CookieJar() browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) browser.addheaders = [ ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)') ] post_data = {'username': user, 'password': pwd} ####################### data = self.re_transmission(try_login, browser, post_data) if json.loads(data)['msg'] == 'alreadylogin': logger.log("account already login !", flush=True) cj = cookielib.CookieJar() browser = urllib2.build_opener( urllib2.HTTPCookieProcessor(cj)) browser.addheaders = [ ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)') ] post_data = {'username': user, 'password': pwd} ####################### data = self.re_transmission(login_url, browser, post_data) if json.loads(data)['msg'] != 'success': logger.log('force login failed !', flush=True) continue logger.log('force login success !', flush=True) return browser if data.find('success') == -1: logger.log("login failed !", flush=True) logger.log("login success !", flush=True) return browser except Exception, e: logger.log("login catch exception !", flush=True) logger.log(str(e), flush=True) pass
def get_data_by_expr(self, expr, opener, start_day=None, end_day=None): #check_login = self.check_login(opener) if opener != None: logger.log('login is not None !') else: logger.log('login is None !') return None begin = 1 end = 2000 search_url = "http://vip.cnipr.com/search!doOverviewSearch.action" if (start_day == end_day): date = time.strftime("%Y%m%d", time.localtime(time.time())) search_expr = expr + " and (" + str(start_day) + ")/PD" else: search_expr = expr + " and (" + str(start_day) + " to " + str( end_day) + ")/PD" #logger.log(search_expr) post_data = { 'strWhere': search_expr, 'start': 1, 'saveFlag': 1, 'limit': 10, 'mpage': 'null', 'channelId': 'SYXX', 'mpage': 'advsch' } ####################### data = self.re_transmission(search_url, opener, post_data, 'FMZL') soup = BeautifulSoup(data) target = soup.findAll('div', {'class': 'g_item'}) length = len(target) logger.log("length of items:" + str(length), format(True)) if length < 1: logger.log("The search result is None,don't need to scrap !") return None # g_item # logger.log data #logger.log str(target) download_url = "http://vip.cnipr.com/downloadvip!download2000.action?rd" + str( random.random()) downlist_url = "http://vip.cnipr.com/downloadvip!downloadUserFile.action?rd=" + str( random.random()) post_data = { 'begin': begin, 'end': end, 'strWhere': search_expr, 'filename': u"著录项批量下载2014401", 'fields': u"申请号;名称;主分类号;分类号;申请(专利权)人;发明(设计)人;公开(公告)日;公开(公告)号;专利代理机构;代理人;申请日;地址;优先权;国省代码;摘要;主权项;国际申请;国际公布;进入国家日期;分案原申请号;权利要求书;法律状态;专利权状态代码", 'source': 'FMZL,SYXX,WGZL', 'strSortMethod': u"-公开(公告)日", "option": "2", } try: file_url = None ####################### data = self.re_transmission(download_url, opener, post_data) json_data = json.loads(data) assert json_data['success'] == True, "获取下载项失败" except Exception, e: logger.log("Get the download file catch exception !", flush=True) logger.log(str(e), flush=True)
#logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:" + url + " failed !") logger.log(str(e), flush=True) pass else: while failed: try: data = opener.open(url).read() #logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:" + url + " failed !") logger.log(str(e), flush=True) pass return data #登陆 def login(self, user='', pwd=''): try_login = "******" + str( random.random()) # 尝试登陆 login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str( random.random()) # 强制登陆 check_login = "******" + str( random.random()) cnt = 3 while (cnt > 0): cnt -= 1
def scrap(start_day=None, end_day=None, start=1, end=20):
    """Run the scraping pipeline for expressions with ids in [start, end].

    For each expression: verify the login session, download the result
    spreadsheet, and update patent rows that already exist.

    NOTE(review): this duplicated chunk ends at the update/`continue` path;
    the insert path for previously-unseen patents is not visible here.
    """
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    if end is not None:  # fixed: `!= None` -> `is not None`
        expressions = expression.objects.filter(id__range=(start, end)).order_by('id')
    else:
        expressions = expression.objects.filter(id__range=(start, 3000)).order_by('id')
    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()
    cnt = 0
    file_path = ''
    for item in expressions:
        cnt += 1
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name,
                   count=item.id, flush=True)
        # Verify we are still logged in; if not, wait and log in again.
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()
        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path is None:
            continue
        file_path = os.path.normpath(file_path)
        #logger.log(file_path)
        rows = excel_table_byindex(file_path, include_name=False)
        # Delete the downloaded file once parsed.
        os.remove(file_path)
        for row in rows:
            # Skip the header row.
            # NOTE(review): `row == rows[0]` also skips any later row whose
            # content equals the first; an index check would be safer.
            if row == rows[0]:
                continue
            apply_num = row[0]
            # De-duplicate: update the existing patent when already present.
            existing = patent.objects.filter(apply_number=apply_num)
            if len(existing) > 0:
                logger.log("{0} update!".format(apply_num))
                p = existing[0]  # renamed: original reused `p` for queryset and row
                records = excute_record.objects.filter(expression=item,
                                                       time_stamp=row[6])
                if len(records) > 0:
                    record = records[0]
                else:
                    record = excute_record(expression=item, time_stamp=row[6])
                    record.save()
                p.record = record
                p.apply_number = row[0]
                p.name = row[1]
                p.main_classify_code = row[2]
                p.classify_code = row[3]
                p.apply_man = row[4]
                p.invente_man = row[5]
                p.publicity_date = row[6]
                p.publicity_code = row[7]
                p.patent_agent = row[8]
                p.agent = row[9]
                p.aplly_date = row[10]
                p.address = row[11]
                p.priority = row[12]
                p.province_code = row[13]
                p.abstract = row[14]
                p.main_right = row[15]
                p.international_apply = row[16]
                p.international_publicity = row[17]
                p.enter_country_date = row[18]
                p.right_demand = row[20]
                p.valid_state = row[21]
                p.state_code = row[22]
                p.type = row[23]
                p.save()
                continue
logger.log(apply_num) #插入纪录 records = excute_record.objects.filter(expression=item, time_stamp=row[6]) # row[6]==public data # if len(records) > 0: # logger.log("record already exist !") record = records[0] else: record = excute_record(expression=item, time_stamp=row[6]) # row[6]==public data # record.save() p = patent( # 对应的执行记录 record=record, # 申请号 apply_number=(row[0]), # 名称 name=(row[1]), # 主分类号 main_classify_code=row[2], #分类号 classify_code=row[3], #申请(专利权)人 apply_man=row[4], #发明(设计)人 invente_man=row[5], #公开(公告)日 publicity_date=(row[6]), #公开(公告)号 publicity_code=row[7], # 专利代理机构 patent_agent=row[8], # 代理人 agent=row[9], # 申请日 aplly_date=row[10], # 地址 address=row[11], # 优先权 priority=row[12], # 国省代码 province_code=row[13], # 摘要 abstract=row[14], # 主权项 main_right=row[15], # 国际申请 international_apply=row[16], # 国际公布 international_publicity=row[17], # 进入国家日期 enter_country_date=row[18], # 权利要求书 right_demand=row[20], # 法律状态 valid_state=row[21], # 专利状态代码 state_code=row[22], # 专利类型 type=row[23] ) try: p.save() except Exception, e: logger.log(str(e), flush=True) logger.log('failed to save patent!',flush=True)
# 摘要 abstract=row[14], # 主权项 main_right=row[15], # 国际申请 international_apply=row[16], # 国际公布 international_publicity=row[17], # 进入国家日期 enter_country_date=row[18], # 权利要求书 right_demand=row[20], # 法律状态 valid_state=row[21], # 专利状态代码 state_code=row[22], # 专利类型 type=row[23] ) try: p.save() except Exception, e: logger.log(str(e), flush=True) logger.log('failed to save patent!',flush=True) logger.log("--------Finish---------", flush=True) logger.finished()
def get_data_by_expr(self, expr, opener, start_day=None, end_day=None): #check_login = self.check_login(opener) if opener != None: logger.log('login is not None !') else: logger.log('login is None !') return None begin = 1 end = 2000 search_url = "http://vip.cnipr.com/search!doOverviewSearch.action" if(start_day==end_day): date = time.strftime("%Y%m%d", time.localtime(time.time())) search_expr = expr + " and ("+str(start_day)+")/PD" else: search_expr = expr + " and (" + str(start_day) +" to "+str(end_day) +")/PD" #logger.log(search_expr) post_data = { 'strWhere': search_expr, 'start': 1, 'saveFlag': 1, 'limit': 10, 'mpage': 'null', 'channelId': 'SYXX', 'mpage': 'advsch' } ####################### data = self.re_transmission(search_url, opener, post_data,'FMZL') soup = BeautifulSoup(data) target = soup.findAll('div', {'class': 'g_item'}) length = len(target) logger.log ("length of items:"+str(length), format(True)) if length < 1: logger.log ("The search result is None,don't need to scrap !") return None # g_item # logger.log data #logger.log str(target) download_url = "http://vip.cnipr.com/downloadvip!download2000.action?rd" + str(random.random()) downlist_url = "http://vip.cnipr.com/downloadvip!downloadUserFile.action?rd=" + str(random.random()) post_data = {'begin': begin, 'end': end, 'strWhere': search_expr, 'filename': u"著录项批量下载2014401", 'fields': u"申请号;名称;主分类号;分类号;申请(专利权)人;发明(设计)人;公开(公告)日;公开(公告)号;专利代理机构;代理人;申请日;地址;优先权;国省代码;摘要;主权项;国际申请;国际公布;进入国家日期;分案原申请号;权利要求书;法律状态;专利权状态代码", 'source': 'FMZL,SYXX,WGZL', 'strSortMethod': u"-公开(公告)日", "option": "2", } try: file_url = None ####################### data = self.re_transmission(download_url, opener, post_data) json_data = json.loads(data) assert json_data['success'] == True, "获取下载项失败" except Exception, e: logger.log("Get the download file catch exception !",flush=True) logger.log(str(e),flush=True)
def login(self, user='', pwd=''): try_login = "******" + str(random.random()) # 尝试登陆 login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(random.random()) # 强制登陆 check_login = "******" + str(random.random()) cnt=3 while(cnt>0): cnt-=1 try: cj = cookielib.CookieJar() browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) browser.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')] post_data = {'username': user, 'password': pwd} ####################### data = self.re_transmission(try_login,browser,post_data) if json.loads(data)['msg'] == 'alreadylogin': logger.log ("account already login !",flush=True) cj = cookielib.CookieJar() browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) browser.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')] post_data = {'username': user, 'password': pwd} ####################### data = self.re_transmission(login_url,browser,post_data) if json.loads(data)['msg'] != 'success': logger.log('force login failed !',flush=True) continue logger.log('force login success !',flush=True) return browser if data.find('success')==-1: logger.log ("login failed !",flush=True) logger.log("login success !",flush=True) return browser except Exception, e: logger.log ("login catch exception !",flush=True) logger.log(str(e),flush=True) pass
#logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:"+url+" failed !") logger.log(str(e), flush=True) pass else: while failed: try: data = opener.open(url).read() #logger.log("try to visit url:"+url+" success !", flush=True) failed = False except Exception, e: failed = True logger.log("try to visit url:"+url+" failed !") logger.log(str(e), flush=True) pass return data #登陆 def login(self, user='', pwd=''): try_login = "******" + str(random.random()) # 尝试登陆 login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(random.random()) # 强制登陆 check_login = "******" + str(random.random()) cnt=3 while(cnt>0): cnt-=1 try: cj = cookielib.CookieJar() browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
def scrap(start_day=None, end_day=None, start=1, end=20):
    # Scrape every expression whose id falls in [start, end]: log in, fetch
    # the XLS for each expression, and refresh patents that already exist.
    # (This duplicated chunk stops at the update path.)
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    upper = end if end != None else 3000
    expressions = expression.objects.filter(id__range=(start, upper)).order_by('id')
    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()
    cnt = 0
    file_path = ''
    # Column layout of the downloaded spreadsheet (attribute, column index).
    field_map = (
        ('apply_number', 0), ('name', 1), ('main_classify_code', 2),
        ('classify_code', 3), ('apply_man', 4), ('invente_man', 5),
        ('publicity_date', 6), ('publicity_code', 7), ('patent_agent', 8),
        ('agent', 9), ('aplly_date', 10), ('address', 11), ('priority', 12),
        ('province_code', 13), ('abstract', 14), ('main_right', 15),
        ('international_apply', 16), ('international_publicity', 17),
        ('enter_country_date', 18), ('right_demand', 20), ('valid_state', 21),
        ('state_code', 22), ('type', 23),
    )
    for item in expressions:
        cnt += 1
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name,
                   count=item.id, flush=True)
        # Confirm the session is still valid before downloading.
        if not json.loads(s.check_login(browser))['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()
        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path != None:
            file_path = os.path.normpath(file_path)
            #logger.log(file_path)
            rows = excel_table_byindex(file_path, include_name=False)
            # Remove the spreadsheet once it has been read in.
            os.remove(file_path)
            for row in rows:
                if row == rows[0]:  # header row
                    continue
                apply_num = row[0]
                matches = patent.objects.filter(apply_number=apply_num)
                if len(matches) > 0:
                    logger.log("{0} update!".format(apply_num))
                    p = matches[0]
                    records = excute_record.objects.filter(expression=item,
                                                           time_stamp=row[6])
                    if len(records) > 0:
                        record = records[0]
                    else:
                        record = excute_record(expression=item, time_stamp=row[6])
                        record.save()
                    p.record = record
                    for attr, col in field_map:
                        setattr(p, attr, row[col])
                    p.save()
                    continue
logger.log(apply_num) #插入纪录 records = excute_record.objects.filter( expression=item, time_stamp=row[6]) # row[6]==public data # if len(records) > 0: # logger.log("record already exist !") record = records[0] else: record = excute_record( expression=item, time_stamp=row[6]) # row[6]==public data # record.save() p = patent( # 对应的执行记录 record=record, # 申请号 apply_number=(row[0]), # 名称 name=(row[1]), # 主分类号 main_classify_code=row[2], #分类号 classify_code=row[3], #申请(专利权)人 apply_man=row[4], #发明(设计)人 invente_man=row[5], #公开(公告)日 publicity_date=(row[6]), #公开(公告)号 publicity_code=row[7], # 专利代理机构 patent_agent=row[8], # 代理人 agent=row[9], # 申请日 aplly_date=row[10], # 地址 address=row[11], # 优先权 priority=row[12], # 国省代码 province_code=row[13], # 摘要 abstract=row[14], # 主权项 main_right=row[15], # 国际申请 international_apply=row[16], # 国际公布 international_publicity=row[17], # 进入国家日期 enter_country_date=row[18], # 权利要求书 right_demand=row[20], # 法律状态 valid_state=row[21], # 专利状态代码 state_code=row[22], # 专利类型 type=row[23]) try: p.save() except Exception, e: logger.log(str(e), flush=True) logger.log('failed to save patent!', flush=True)
# 摘要 abstract=row[14], # 主权项 main_right=row[15], # 国际申请 international_apply=row[16], # 国际公布 international_publicity=row[17], # 进入国家日期 enter_country_date=row[18], # 权利要求书 right_demand=row[20], # 法律状态 valid_state=row[21], # 专利状态代码 state_code=row[22], # 专利类型 type=row[23]) try: p.save() except Exception, e: logger.log(str(e), flush=True) logger.log('failed to save patent!', flush=True) logger.log("--------Finish---------", flush=True) logger.finished()