def valid_proxy(self):
    """Select a usable proxy and store it on ``self.proxy``.

    Draws a random sample of roughly half of the members of the Redis set
    ``set:proxy:self_built`` and keeps the first candidate for which
    ``self.proxy_true()`` returns a truthy value.  When no sampled proxy
    passes, falls back to ``choice_proxy`` from ``common.proxyutils``.

    Note that ``self.proxy`` is overwritten with every candidate that is
    tried, because ``self.proxy_true()`` takes no argument and is therefore
    presumed to test ``self.proxy`` (confirm against its definition).

    Returns:
        None.  The result is communicated through ``self.proxy``.
    """
    pool_key = "set:proxy:self_built"
    # Floor division keeps the sample size an integer even under true
    # division (Python 3 / ``from __future__ import division``); plain ``/``
    # would hand a float count to the Redis helper there.
    sample_size = int(self.size(pool_key)) // 2
    candidates = self.redis_set_srandmember(pool_key, sample_size)

    # ``or ()`` tolerates a helper that returns None instead of an empty list.
    for candidate in candidates or ():
        self.proxy = candidate
        if self.proxy_true():
            return

    # No self-built proxy worked: ask the shared proxy service instead.
    from common.proxyutils import choice_proxy
    self.proxy = choice_proxy(
        is_debug=True,
        url="",
        area=u"中国",
        host=config.proxy_host,
        port=config.proxy_port,
    )
def Get_href(self, url):
    """Scrape one listing page and enqueue the job links not seen before.

    The page is downloaded (GBK-decoded, then re-encoded as UTF-8), every
    link/date pair is extracted with a regular expression, and each link
    that is not already recorded in ``href.txt`` is pushed onto the
    ``yingjiesheng_href1`` queue as ``<href>BBD<date>`` and appended to
    ``href.txt`` (link and date on separate ``\\r\\n``-terminated lines).

    Args:
        url: address of the listing page to scrape.

    Raises:
        Exception: when the page still cannot be downloaded after the
            initial attempt plus ten retries.  Errors raised while parsing
            or queueing are logged and printed, not propagated.
    """
    queue = RedisQueue_master1.getredisQueuev2('yingjiesheng_href1')
    link_re = re.compile(
        '<a href="(.*?)" target="_blank"><span style="color:(.*?);">(.*?)</a>'
        '(.*?)</td>(.*?)<td class="date">(.*?)</td>',
        re.S)

    # 'a+' is the portable spelling of the original 'ar+' mode.  The ``with``
    # block guarantees the handle is closed even when the download gives up
    # or parsing fails.
    with open('href.txt', 'a+') as seen_file:
        proxy = None
        retries_left = 10
        while True:
            try:
                myPage = webutil.request(
                    url, timeout=10, proxy=proxy).decode('gbk').encode('utf-8')
                break
            except Exception as e3:
                print(e3)
                # Switch to a fresh proxy before the next attempt.
                proxy = proxyutils.choice_proxy(
                    is_debug=False, host="master1", port=8880)
                if retries_left <= 0:
                    raise Exception(u"连续10次失败,放弃")
                retries_left -= 1
                time.sleep(1)

        try:
            # Load the already-recorded lines exactly once.  Calling read()
            # for every link only returns data the first time (the position
            # is then at EOF), which let every later duplicate through.
            seen_file.seek(0)
            seen = set(line.strip() for line in seen_file)
            seen_file.seek(0, 2)  # reposition at the end before writing

            matches = link_re.findall(myPage)
            for match in matches:
                # The lazy first group can span earlier anchors on the page,
                # so keep only what follows the last 'href="'.
                path = match[0].split('href="')[-1]
                if 'http://' in path or 'https://' in path:
                    href = path
                else:
                    href = 'http://www.yingjiesheng.com' + path
                posted = match[-1]

                key = href.strip()
                if key in seen:
                    continue
                seen.add(key)

                print("%s %s" % (href, posted))
                queue.put(href + 'BBD' + posted)
                seen_file.write(href + '\r\n' + posted + '\r\n')
            print(len(matches))
        except Exception as e2:
            # Best effort: a bad page must not stop the crawl.
            logging.error("product id:%s" % e2)
            print(e2)
def Get_message(self, url, date):
    """Download one job posting, extract its fields and store the record.

    The page text is cut into ``label:value`` segments at the positions of
    the keywords listed in ``self.key_word``; an e-mail address, a phone
    number, the page title and a short summary block are extracted as well.
    The resulting dict is passed to ``mongoutil.updatev3`` under the key
    ``url + <today as YYYY-MM-DD>``.

    NOTE(review): this is Python 2 code -- the page appears to be handled
    as ``unicode`` and is temporarily re-encoded to UTF-8 bytes for the
    byte-string regexes below.

    Args:
        url: address of the posting; stored under "页面链接".
        date: publication date supplied by the caller; stored unchanged
            under "发布时间".

    Raises:
        Exception: when the page still cannot be downloaded after the
            initial attempt plus ten retries.
    """
    db_yjs = mongoutil.getmondbv2(
        db.mongo_host, db.mongo_port, db.yjs_db_name, db.yjs_table_name,
        username=db.mongo_user, password=db.mongo_pwd, timeout=30)
    now = timeutil.format("%Y-%m-%d", time.time())

    proxy = None
    retries_left = 10
    while True:
        try:
            myPage = webutil.request(url, timeout=10, proxy=proxy, encoding="gbk")
            break
        except Exception as e3:
            print(e3)
            # Switch to a fresh proxy before the next attempt.
            proxy = proxyutils.choice_proxy(is_debug=False, host="master1", port=8880)
            if retries_left <= 0:
                raise Exception(u"连续10次失败,放弃")
            retries_left -= 1
            time.sleep(1)

    tree = etree.HTML(myPage)
    jiben = xpathutil.get_all_text(
        tree, ".//*[@id='container']/div[3]/div[2]/div/ul", num=0, split=u" ")
    text = xpathutil.get_all_text(
        tree, ".//*[@id='wordDiv']/div/div", num=0, split=u" ")
    print(len(text))
    if len(text) <= 10:
        # Main text block missing or nearly empty: use the wider container.
        text = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]", num=0, split=u" ")

    # Applied twice, as before; the second pass is presumably a no-op --
    # TODO confirm against functions.remove_all_space_char.
    body = functions.remove_all_space_char(text)
    body = functions.remove_all_space_char(body)

    emails = re.findall(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+', body)
    # The previous pattern was anchored with ^...$ over the whole page text,
    # so it could only match when the page consisted of nothing but a phone
    # number, and its capture groups made findall() return tuples.  Search
    # inside the text instead, bounded by non-digits, and return the full
    # number as a string.
    phones = re.findall(
        r'(?<!\d)(?:\+86)?(?:\d{11}|0\d{2,3}\d{7,8})(?!\d)', body)

    # For each keyword group remember where its first matching alias starts;
    # a second matching alias of the same group is only reported.
    positions = {}
    for group in self.key_word:
        found = False
        for word in group:
            index = body.find(word)
            if index >= 0:
                if found:
                    print("error")
                else:
                    positions[word] = index
                    found = True
    starts = sorted(positions.values())

    # Each stretch between two consecutive keyword positions is one
    # "label:value" segment.  The text after the last keyword is not stored.
    save_data = {}
    for begin, end in zip(starts, starts[1:]):
        # NOTE(review): as written these three replace() calls are no-ops;
        # they may originally have normalised full-width colons -- confirm
        # against the real file before removing them.
        segment = body[begin:end].replace(':', ':', 1).replace(":", ":", 1).replace(":", ":", 1)
        parts = segment.split(':')
        if len(parts) > 1:
            # Keep every piece after the label; previously anything beyond
            # the third piece was silently dropped.
            save_data[parts[0]] = ''.join(parts[1:])
        else:
            save_data[parts[0]] = 'None'

    save_data["邮箱"] = emails[0] if emails else "无"
    save_data['电话'] = phones[0] if phones else "无"

    # The regexes below operate on UTF-8 bytes.
    myPage = myPage.encode('utf-8')
    title = re.findall('<title>(.*?)</title>', myPage)
    if title:
        name = title[0]
    else:
        name = xpathutil.get_all_text(
            tree, ".//*[@id='container']/div[3]/div[1]/h1/a", num=0, split=u" ")
    print(name)

    if not jiben:
        # No summary list found via XPath: scrape the info header instead and
        # keep every captured cell except the window.open(...) argument.
        rows = re.findall(
            '<div class="info clearfix"><ol>'
            '<li>\xe5\x8f\x91\xe5\xb8\x83\xe6\x97\xb6\xe9\x97\xb4:<u>(.*?)</u></li>'
            '<li>\xe5\xb7\xa5\xe4\xbd\x9c\xe5\x9c\xb0\xe7\x82\xb9:<u>(.*?) </u></li>'
            '<li>\xe8\x81\x8c\xe4\xbd\x8d\xe7\xb1\xbb\xe5\x9e\x8b:<u>(.*?)</u></li>'
            '<li>\xe6\x9d\xa5\xe6\xba\x90:<a href="#" onclick="window.open(.*?)">(.*?)</a></li>',
            myPage)
        summary = ''.join(
            cell + '\r\n' for row in rows for cell in row if '(' not in cell)
    else:
        # Keep only the value part (after the last colon) of each line.
        lines = jiben.replace('\t', '').split('\r\n')
        summary = ''.join(
            line.split(':')[-1].replace('\n', '') + '\r\n' for line in lines)
    print(summary)

    myPage = myPage.decode('utf-8')
    save_data["公司名称"] = name
    save_data["发布时间"] = date
    save_data["文本1"] = summary
    save_data['文本2'] = text
    save_data["页面链接"] = url
    save_data["页面源码"] = myPage
    save_data['dotime'] = now
    save_data['uptime'] = time.time()
    save_data['source'] = "yingjiesheng"
    save_data["type"] = "1"
    mongoutil.updatev3(db_yjs, url + now, save_data)
    print("数据入库成功!")