def search_weixin_info(keyword, is_all=False): print "search_weixin_info start, keyword="+keyword import urllib weixin_infos = [] page_url = "http://weixin.sogou.com/weixin?type=1&"+urllib.urlencode({"query":keyword}) while True: page_src = utils.getSogouContent(page_url, sleep_time=1) page_src = utils.remove_tag(page_src, "em") page_src = utils.remove_tag(page_src, "/em") page_src = utils.remove_tag(page_src, "!--red_beg--") page_src = utils.remove_tag(page_src, "!--red_end--") regular_str = re.compile(r'<h3>([^<]*)</h3>[^<]*<h4>[^<]*<span>微信号:([^<]*)</span>') datas = regular_str.findall(page_src) #openid openid_regular_str = re.compile(r'gotourl\(\'/gzh\?openid=([^\']*)') openids = openid_regular_str.findall(page_src) if len(datas)==len(openids): for index in range(len(datas)): weixin_infos.append((datas[index][0], datas[index][1], openids[index]), ) else: print "ERROR: search_weixin_info("+keyword+") error! len(datas)="+str(len(datas))+", len(openids)="+str(len(openids)) print page_src return [] # nextpage_regular_str = re.compile(r'<a id="sogou_next" href="([^"]*)') nextpage = nextpage_regular_str.findall(page_src) if is_all!=True or len(nextpage)==0: break else: page_url = "http://weixin.sogou.com/weixin"+nextpage[0] print weixin_infos return weixin_infos
def temp_get_weixin_info(weixin_no): weixin_no = weixin_no.strip() page_url = "http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key="+weixin_no+"&type=wxpub&ori=yes&pr=web&abtestid=&ipn=" headers = { "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding":"gzip,deflate,sdch", "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4", "Cache-Control":"max-age=0", "Connection":"keep-alive", "Host":"w.sugg.sogou.com", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36", } page_src = utils.getSogouContent(page_url, headers=headers) page_src = page_src.decode('GBK').encode('utf8') regular_str = re.compile(r'\["'+weixin_no+r'",\[([^]]*)') datas = regular_str.findall(page_src) print "get_weixin_info, datas="+str(datas) if datas: data_list = datas[0].split(",") for data in data_list: weixin_name = data.replace("\"", "") weixin_infos = search_weixin_info(weixin_name) print weixin_infos for weixin_info in weixin_infos: if weixin_info[1]==weixin_no: return weixin_info return None
def temp_get_weixin_info(weixin_no): weixin_no = weixin_no.strip() page_url = "http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=" + weixin_no + "&type=wxpub&ori=yes&pr=web&abtestid=&ipn=" headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip,deflate,sdch", "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Host": "w.sugg.sogou.com", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36", } page_src = utils.getSogouContent(page_url, headers=headers) page_src = page_src.decode('GBK').encode('utf8') regular_str = re.compile(r'\["' + weixin_no + r'",\[([^]]*)') datas = regular_str.findall(page_src) print "get_weixin_info, datas=" + str(datas) if datas: data_list = datas[0].split(",") for data in data_list: weixin_name = data.replace("\"", "") weixin_infos = search_weixin_info(weixin_name) print weixin_infos for weixin_info in weixin_infos: if weixin_info[1] == weixin_no: return weixin_info return None
def get_page_total(weixin_info_id, openid): page_total = 1 page_current = 1 totalPages_retrieve_str = re.compile(r'"totalPages":(\d*)') page_retrieve_str = re.compile(r'"page":(\d*)') while True: page_src = utils.getSogouContent( "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=" + openid + "&page=" + str(page_current), sleep_time=1) if page_src is None: return 0 totalPages = totalPages_retrieve_str.findall(page_src) page = page_retrieve_str.findall(page_src) if len(totalPages) == 0 or len(page) == 0: print "ERROR: get_page_total error!" print page_src return 0 else: page_total = int(totalPages[0]) page_current = int(page[0]) if (page_current >= page_total): break else: page_current = page_total print "sogou page_total(10 results per page)=" + str(page_total) return page_total
def get_page_total(weixin_info_id, openid): page_total = 1 page_current = 1 totalPages_retrieve_str = re.compile(r'"totalPages":(\d*)') page_retrieve_str = re.compile(r'"page":(\d*)') while True: page_src = utils.getSogouContent("http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid="+openid+"&page="+str(page_current), sleep_time=1) if page_src is None: return 0 totalPages = totalPages_retrieve_str.findall(page_src) page = page_retrieve_str.findall(page_src) if len(totalPages)==0 or len(page)==0: print "ERROR: get_page_total error!" print page_src return 0 else: page_total = int(totalPages[0]) page_current = int(page[0]) if(page_current>=page_total): break else: page_current = page_total print "sogou page_total(10 results per page)="+str(page_total) return page_total
def search_weixin_info(keyword, is_all=False): print "search_weixin_info start, keyword=" + keyword import urllib weixin_infos = [] page_url = "http://weixin.sogou.com/weixin?type=1&" + urllib.urlencode( {"query": keyword}) while True: page_src = utils.getSogouContent(page_url, sleep_time=1) page_src = utils.remove_tag(page_src, "em") page_src = utils.remove_tag(page_src, "/em") page_src = utils.remove_tag(page_src, "!--red_beg--") page_src = utils.remove_tag(page_src, "!--red_end--") regular_str = re.compile( r'<h3>([^<]*)</h3>[^<]*<h4>[^<]*<span>微信号:([^<]*)</span>') datas = regular_str.findall(page_src) #openid openid_regular_str = re.compile(r'gotourl\(\'/gzh\?openid=([^\']*)') openids = openid_regular_str.findall(page_src) if len(datas) == len(openids): for index in range(len(datas)): weixin_infos.append( (datas[index][0], datas[index][1], openids[index]), ) else: print "ERROR: search_weixin_info(" + keyword + ") error! len(datas)=" + str( len(datas)) + ", len(openids)=" + str(len(openids)) print page_src return [] # nextpage_regular_str = re.compile(r'<a id="sogou_next" href="([^"]*)') nextpage = nextpage_regular_str.findall(page_src) if is_all != True or len(nextpage) == 0: break else: page_url = "http://weixin.sogou.com/weixin" + nextpage[0] print weixin_infos return weixin_infos
def scan_article_list(weixin_info_id, openid, look_back=True):
    """Crawl sogou's gzhjs article-list endpoint for an account (openid).

    Returns a list of {"title", "url", "publish_date"} dicts in sogou's page
    order; [] when any page cannot be fetched or parsed. When look_back is
    True, stops early once a page's oldest publish date is older than the
    newest date already stored for weixin_info_id (incremental crawl).
    """
    article_infos = []
    page_total = 1
    page_current = 1
    totalPages_retrieve_str = re.compile(r'"totalPages":(\d*)')
    page_retrieve_str = re.compile(r'"page":(\d*)')
    publish_date_regular_str = re.compile(r'<date><!\[CDATA\[([^]]*)')
    title_url_regular_str = re.compile(r'<title><\!\[CDATA\[([^]]*)\]\]><[^/]*/title><url><\!\[CDATA\[([^]]*)')
    #
    db_max_publish_date = None
    if look_back:
        db_max_publish_date_object = dbutils.getWeixinArticleMaxPublishDate(weixin_info_id)
        if db_max_publish_date_object is None:
            db_max_publish_date = arrow.get('2000-1-1', 'YYYY-M-D')  # WeChat did not exist yet back then
        else:
            db_max_publish_date = arrow.get(db_max_publish_date_object)
    while True:
        headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "weixin.sogou.com",
            "Referer": "http://weixin.sogou.com/gzh?openid=" + openid,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36",
        }
        page_url = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=" + openid + "&page=" + str(page_current)
        page_src = utils.getSogouContent(url=page_url, headers=headers)
        # extract article info (title/url/publish_date)
        if page_src is None:
            print "ERROR: scan_article_list(" + page_url + ") error! page_src is None"
            return []
        datas = title_url_regular_str.findall(page_src)
        publish_dates = publish_date_regular_str.findall(page_src)
        if len(datas) == len(publish_dates):
            for index in range(len(datas)):
                article_infos.append({"title": datas[index][0],
                                      "url": datas[index][1],
                                      "publish_date": publish_dates[index],
                                      })
        else:
            # page_src contains special characters (regex mismatched) --
            # fall back to literal block matching on the CDATA delimiters
            title_list = utils.parse_block_match(page_src, "<title><![CDATA[", "]]><\/title><url><![CDATA[")
            url_list = utils.parse_block_match(page_src, "]]><\/title><url><![CDATA[", "]]><\/url>")
            if len(title_list) == len(url_list) and len(url_list) == len(publish_dates):
                for index in range(len(title_list)):
                    article_infos.append({"title": title_list[index],
                                          "url": url_list[index],
                                          "publish_date": publish_dates[index],
                                          })
            else:
                print "ERROR: scan_article_list(" + page_url + ") error! len(datas)=" + str(len(datas)) + ", len(publish_date)=" + str(len(publish_dates))
                print page_src
                return []
        # stop scanning when this page's oldest publish date is older than
        # the newest one already in the database
        if look_back:
            page_min_publish_dates = publish_date_regular_str.findall(page_src)
            if len(page_min_publish_dates) > 0:
                page_min_publish_date_str = page_min_publish_dates[-1]
                page_min_publish_date = arrow.get(page_min_publish_date_str, 'YYYY-M-D')
                if page_min_publish_date < db_max_publish_date:
                    return article_infos
        # stop scanning when the current page reaches or exceeds the total
        totalPages = totalPages_retrieve_str.findall(page_src)
        currentPage = page_retrieve_str.findall(page_src)
        if len(totalPages) == 0 or len(currentPage) == 0:
            # ban page encountered -- abort the crawl
            print "ERROR: get_page_total error!"
            print page_src
            return []
        else:
            page_total = int(totalPages[0])
            page_current = int(currentPage[0])
            if (page_current >= page_total):
                break
            else:
                page_current = page_current + 1
    print "scan_article_list, weixin_info_id=" + str(weixin_info_id) + ", size=" + str(len(article_infos))
    return article_infos
def search_weixin_article(keyword):
    '''
    Repost (reproduction) count statistics for an article.
    When an article is plagiarized, its title is usually edited, and titles carry many
    keywords; search engines do fuzzy matching where short phrases over-match, so judging
    reposts by title is inaccurate.
    Most reposts do not bother editing the body, so middle paragraphs survive intact;
    searching a longer sentence cut from the middle of the article matches very well.
    For accuracy in the common case, each article should be sampled several times
    (tentatively three), avoiding the beginning and the end.
    The article search result page source contains:
    <!--STATUS total 10 time 115 page 2 maxEnd 24 totalItems 24-->
    Article count: totalItems 24
    '''
    print "search_weixin_article start, keyword=" + keyword
    import urllib
    weixinArticleReproducedRecord_list = []
    # example keyword: 吹牛说起大学就预测出微博类的产品会火,比特币刚出来几乎还没什么人知道的时候还挖了
    # url-encoded: %E5%90%B9%E7%89%9B%E8%AF%B4%E8%B5%B7%E5%A4%A7%E5%AD%A6%E5%B0%B1%E9%A2%84%E6%B5%8B%E5%87%BA%E5%BE%AE%E5%8D%9A%E7%B1%BB%E7%9A%84%E4%BA%A7%E5%93%81%E4%BC%9A%E7%81%AB%EF%BC%8C%E6%AF%94%E7%89%B9%E5%B8%81%E5%88%9A%E5%87%BA%E6%9D%A5%E5%87%A0%E4%B9%8E%E8%BF%98%E6%B2%A1%E4%BB%80%E4%B9%88%E4%BA%BA%E7%9F%A5%E9%81%93%E7%9A%84%E6%97%B6%E5%80%99%E8%BF%98%E6%8C%96%E4%BA%86
    keyword_len = len(keyword)
    is_completed = True
    page = 1
    # highlight markers: </em>X<em> with a single character in between
    page_red_str = re.compile(r'<!--red_end--></em>.{1}<em><!--red_beg-->')
    # publish date comes from: var date = new Date(time * 1000);
    page_publish_date_str = re.compile(r"vrTimeHandle552write\('([^']*)'")
    while True:
        page_url = "http://weixin.sogou.com/weixin?type=2&ie=utf8&page=" + str(page) + "&" + urllib.urlencode({"query": keyword.encode('utf-8')})
        page_src = utils.getSogouContent(page_url, sleep_time=1)
        # preprocessing: collapse </em>,<em> pairs separated by one character;
        # the "," is only a one-character placeholder
        page_src, number = page_red_str.subn(",", page_src)
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page_src)
        # account display name + openid, one per result
        sogou_weixin_name_openid_list = []
        weixin_account_list = soup.find_all("a", id="weixin_account")
        for weixin_account in weixin_account_list:
            sogou_weixin_name_openid_list.append((weixin_account.get("title"), weixin_account.get("i")))
        # article title + link, one per result
        sogou_article_title_url_list = []
        h4_list = soup.find_all("h4")
        for h4 in h4_list:
            sogou_article_title_url_list.append((h4.a.get_text(), h4.a.get("href")))
        # publish dates, one per result
        sogou_publish_date_list = []
        publish_date_datas = page_publish_date_str.findall(page_src)
        for publish_date_data in publish_date_datas:
            sogou_publish_date_list.append(str(arrow.get(publish_date_data).date()))
        # result summaries; a highlighted <em> span at least as long as the
        # query means the whole sampled sentence matched
        sogou_summary_list = soup.find_all("p", id=re.compile("sogou_vr_*"))
        for index in range(len(sogou_summary_list)):
            red_item_list = sogou_summary_list[index].find_all("em")
            for red_item in red_item_list:
                print len(red_item.get_text()), red_item.get_text()
                if len(red_item.get_text()) >= keyword_len:
                    print len(red_item.get_text()), red_item.get_text()
                    # article matched -- record it as a repost
                    is_completed = False
                    from gather.models import WeixinArticleReproducedRecord
                    weixinArticleReproducedRecord = WeixinArticleReproducedRecord()
                    weixinArticleReproducedRecord.weixin_name = sogou_weixin_name_openid_list[index][0]
                    weixinArticleReproducedRecord.openid = sogou_weixin_name_openid_list[index][1]
                    weixinArticleReproducedRecord.title = sogou_article_title_url_list[index][0]
                    weixinArticleReproducedRecord.url = sogou_article_title_url_list[index][1]
                    weixinArticleReproducedRecord.publish_date = sogou_publish_date_list[index]
                    weixinArticleReproducedRecord_list.append(weixinArticleReproducedRecord)
                    break
                else:
                    is_completed = True
            if is_completed:
                break
        if is_completed:
            break
        else:
            page = page + 1
    return weixinArticleReproducedRecord_list
def scan_article_list(weixin_info_id, openid, look_back=True):
    """Crawl sogou's gzhjs article-list endpoint for an account (openid).

    Returns a list of {"title", "url", "publish_date"} dicts in sogou's page
    order; [] when any page cannot be fetched or parsed. When look_back is
    True, stops early once a page's oldest publish date is older than the
    newest date already stored for weixin_info_id (incremental crawl).
    """
    article_infos = []
    page_total = 1
    page_current = 1
    totalPages_retrieve_str = re.compile(r'"totalPages":(\d*)')
    page_retrieve_str = re.compile(r'"page":(\d*)')
    publish_date_regular_str = re.compile(r'<date><!\[CDATA\[([^]]*)')
    title_url_regular_str = re.compile(
        r'<title><\!\[CDATA\[([^]]*)\]\]><[^/]*/title><url><\!\[CDATA\[([^]]*)'
    )
    #
    db_max_publish_date = None
    if look_back:
        db_max_publish_date_object = dbutils.getWeixinArticleMaxPublishDate(
            weixin_info_id)
        if db_max_publish_date_object is None:
            db_max_publish_date = arrow.get('2000-1-1', 'YYYY-M-D')  # WeChat did not exist yet back then
        else:
            db_max_publish_date = arrow.get(db_max_publish_date_object)
    while True:
        headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "weixin.sogou.com",
            "Referer": "http://weixin.sogou.com/gzh?openid=" + openid,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36",
        }
        page_url = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=" + openid + "&page=" + str(
            page_current)
        page_src = utils.getSogouContent(url=page_url, headers=headers)
        # extract article info (title/url/publish_date)
        if page_src is None:
            print "ERROR: scan_article_list(" + page_url + ") error! page_src is None"
            return []
        datas = title_url_regular_str.findall(page_src)
        publish_dates = publish_date_regular_str.findall(page_src)
        if len(datas) == len(publish_dates):
            for index in range(len(datas)):
                article_infos.append({
                    "title": datas[index][0],
                    "url": datas[index][1],
                    "publish_date": publish_dates[index],
                })
        else:
            # page_src contains special characters (regex mismatched) --
            # fall back to literal block matching on the CDATA delimiters
            title_list = utils.parse_block_match(page_src, "<title><![CDATA[",
                                                 "]]><\/title><url><![CDATA[")
            url_list = utils.parse_block_match(page_src,
                                               "]]><\/title><url><![CDATA[",
                                               "]]><\/url>")
            if len(title_list) == len(url_list) and len(url_list) == len(
                    publish_dates):
                for index in range(len(title_list)):
                    article_infos.append({
                        "title": title_list[index],
                        "url": url_list[index],
                        "publish_date": publish_dates[index],
                    })
            else:
                print "ERROR: scan_article_list(" + page_url + ") error! len(datas)=" + str(
                    len(datas)) + ", len(publish_date)=" + str(
                        len(publish_dates))
                print page_src
                return []
        # stop scanning when this page's oldest publish date is older than
        # the newest one already in the database
        if look_back:
            page_min_publish_dates = publish_date_regular_str.findall(page_src)
            if len(page_min_publish_dates) > 0:
                page_min_publish_date_str = page_min_publish_dates[-1]
                page_min_publish_date = arrow.get(page_min_publish_date_str,
                                                  'YYYY-M-D')
                if page_min_publish_date < db_max_publish_date:
                    return article_infos
        # stop scanning when the current page reaches or exceeds the total
        totalPages = totalPages_retrieve_str.findall(page_src)
        currentPage = page_retrieve_str.findall(page_src)
        if len(totalPages) == 0 or len(currentPage) == 0:
            # ban page encountered -- abort the crawl
            print "ERROR: get_page_total error!"
            print page_src
            return []
        else:
            page_total = int(totalPages[0])
            page_current = int(currentPage[0])
            if (page_current >= page_total):
                break
            else:
                page_current = page_current + 1
    print "scan_article_list, weixin_info_id=" + str(
        weixin_info_id) + ", size=" + str(len(article_infos))
    return article_infos
def search_weixin_article(keyword):
    '''
    Repost (reproduction) count statistics for an article.
    When an article is plagiarized, its title is usually edited, and titles carry many
    keywords; search engines do fuzzy matching where short phrases over-match, so judging
    reposts by title is inaccurate.
    Most reposts do not bother editing the body, so middle paragraphs survive intact;
    searching a longer sentence cut from the middle of the article matches very well.
    For accuracy in the common case, each article should be sampled several times
    (tentatively three), avoiding the beginning and the end.
    The article search result page source contains:
    <!--STATUS total 10 time 115 page 2 maxEnd 24 totalItems 24-->
    Article count: totalItems 24
    '''
    print "search_weixin_article start, keyword=" + keyword
    import urllib
    weixinArticleReproducedRecord_list = []
    # example keyword: 吹牛说起大学就预测出微博类的产品会火,比特币刚出来几乎还没什么人知道的时候还挖了
    # url-encoded: %E5%90%B9%E7%89%9B%E8%AF%B4%E8%B5%B7%E5%A4%A7%E5%AD%A6%E5%B0%B1%E9%A2%84%E6%B5%8B%E5%87%BA%E5%BE%AE%E5%8D%9A%E7%B1%BB%E7%9A%84%E4%BA%A7%E5%93%81%E4%BC%9A%E7%81%AB%EF%BC%8C%E6%AF%94%E7%89%B9%E5%B8%81%E5%88%9A%E5%87%BA%E6%9D%A5%E5%87%A0%E4%B9%8E%E8%BF%98%E6%B2%A1%E4%BB%80%E4%B9%88%E4%BA%BA%E7%9F%A5%E9%81%93%E7%9A%84%E6%97%B6%E5%80%99%E8%BF%98%E6%8C%96%E4%BA%86
    keyword_len = len(keyword)
    is_completed = True
    page = 1
    # highlight markers: </em>X<em> with a single character in between
    page_red_str = re.compile(r'<!--red_end--></em>.{1}<em><!--red_beg-->')
    # publish date comes from: var date = new Date(time * 1000);
    page_publish_date_str = re.compile(r"vrTimeHandle552write\('([^']*)'")
    while True:
        page_url = "http://weixin.sogou.com/weixin?type=2&ie=utf8&page=" + str(
            page) + "&" + urllib.urlencode({"query": keyword.encode('utf-8')})
        page_src = utils.getSogouContent(page_url, sleep_time=1)
        # preprocessing: collapse </em>,<em> pairs separated by one character;
        # the "," is only a one-character placeholder
        page_src, number = page_red_str.subn(",", page_src)
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page_src)
        # account display name + openid, one per result
        sogou_weixin_name_openid_list = []
        weixin_account_list = soup.find_all("a", id="weixin_account")
        for weixin_account in weixin_account_list:
            sogou_weixin_name_openid_list.append(
                (weixin_account.get("title"), weixin_account.get("i")))
        # article title + link, one per result
        sogou_article_title_url_list = []
        h4_list = soup.find_all("h4")
        for h4 in h4_list:
            sogou_article_title_url_list.append(
                (h4.a.get_text(), h4.a.get("href")))
        # publish dates, one per result
        sogou_publish_date_list = []
        publish_date_datas = page_publish_date_str.findall(page_src)
        for publish_date_data in publish_date_datas:
            sogou_publish_date_list.append(
                str(arrow.get(publish_date_data).date()))
        # result summaries; a highlighted <em> span at least as long as the
        # query means the whole sampled sentence matched
        sogou_summary_list = soup.find_all("p", id=re.compile("sogou_vr_*"))
        for index in range(len(sogou_summary_list)):
            red_item_list = sogou_summary_list[index].find_all("em")
            for red_item in red_item_list:
                print len(red_item.get_text()), red_item.get_text()
                if len(red_item.get_text()) >= keyword_len:
                    print len(red_item.get_text()), red_item.get_text()
                    # article matched -- record it as a repost
                    is_completed = False
                    from gather.models import WeixinArticleReproducedRecord
                    weixinArticleReproducedRecord = WeixinArticleReproducedRecord(
                    )
                    weixinArticleReproducedRecord.weixin_name = sogou_weixin_name_openid_list[
                        index][0]
                    weixinArticleReproducedRecord.openid = sogou_weixin_name_openid_list[
                        index][1]
                    weixinArticleReproducedRecord.title = sogou_article_title_url_list[
                        index][0]
                    weixinArticleReproducedRecord.url = sogou_article_title_url_list[
                        index][1]
                    weixinArticleReproducedRecord.publish_date = sogou_publish_date_list[
                        index]
                    weixinArticleReproducedRecord_list.append(
                        weixinArticleReproducedRecord)
                    break
                else:
                    is_completed = True
            if is_completed:
                break
        if is_completed:
            break
        else:
            page = page + 1
    return weixinArticleReproducedRecord_list