# Checks whether a crawl of links.cn returned a usable page. The Chinese
# markers below are matched against the raw page content and must stay as-is:
# "站长帮手网" is the site's name ("Webmaster Helper"), "暂无数据" means
# "no data yet", "为无效的域名格式" means "is an invalid domain format", and
# "您的查询量比较大" means the query volume is too high (rate limited).
def is_crawl_success(self, url, content):
    content = util.html_encode(content)
    if content.find("站长帮手网") > 0:
        return True
    if content.find("暂无数据") > 0:
        return True
    if content.find("为无效的域名格式") > 0:
        return True
    if content.find("HTTP Error 400. The request URL is invalid.") > 0:
        return True
    if content.find("您的查询量比较大") > 0:
        logger.info("Rate limited: query volume too high")
        # Rate limited: register a fresh throwaway account so later requests
        # can log in with it. Credentials and the email template are redacted
        # ("******") in this source.
        if len(login_users) < 100:
            while True:
                opener = urllib2.build_opener()
                username = util.id_generator(10)
                data = {
                    "username": username,
                    "password": "******",
                    "confirmpassword": "******",
                    "opaction": "reg",
                    "qq": "",
                    "isqqopen": "1",
                    "email": "*****@*****.**" % username
                }
                data = urllib.urlencode(data)
                logger.info(data)
                headers = {"Referer": "http://my.links.cn/reg.asp"}
                user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                              'AppleWebKit/600.8.9 (KHTML, like Gecko) '
                              'Version/8.0.8 Safari/600.8.9')
                headers['User-Agent'] = user_agent
                try:
                    request = urllib2.Request("http://my.links.cn/regpost.asp", data, headers)
                    r = opener.open(request, timeout=30)
                    try:
                        content = util.html_encode(r.read())
                        login_users.append({"name": username, "pwd": "ann123456",
                                            "date": datetime.datetime.now()})
                        logger.info(login_users)
                        break
                    except Exception as e:
                        traceback.print_exc()
                except Exception as e:
                    traceback.print_exc()
    return False
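# NOTE: illustrative sketch only. util.id_generator is defined elsewhere in
# this repo; a minimal implementation consistent with its use above (a
# 10-character random username) might look like this. The alphabet is an
# assumption.
import random
import string

def id_generator(size=10, chars=string.ascii_lowercase + string.digits):
    # Build a random identifier by sampling `size` characters.
    return ''.join(random.choice(chars) for _ in range(size))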
# Async HTTP callback: on failure drop the website record; on success parse
# the <meta> tags and store the page's keywords and description.
def handle_result(response, website):
    global total
    if response.error:
        delete(website)
    else:
        try:
            if response.code != 200:
                delete(website)
            else:
                html = util.html_encode(response.body)
                doc = pq(html)
                metas = doc('meta')
                description = None
                keywords = None
                for meta in metas:
                    name = pq(meta).attr('name')
                    content = pq(meta).attr('content')
                    if name == 'keywords':
                        keywords = content
                    if name == 'description':
                        description = content
                update(description, keywords, website)
        except Exception:
            traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
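# Minimal, self-contained demo of the <meta> scan used by handle_result
# above. The sample HTML is made up; pyquery is the same library the handler
# uses.
from pyquery import PyQuery as pq

sample = ('<html><head>'
          '<meta name="keywords" content="a,b,c">'
          '<meta name="description" content="demo page">'
          '</head><body></body></html>')
doc = pq(sample)
for meta in doc('meta'):
    if pq(meta).attr('name') == 'keywords':
        print pq(meta).attr('content')  # -> a,b,c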
# Log in to links.cn with a pooled account. Accounts older than five minutes
# are dropped from the pool; if the pool ends up empty a default account is
# seeded.
def login(self, url, redirect=True):
    global login_users
    # Keep only accounts registered within the last five minutes.
    _login_users = []
    for user in login_users:
        date = user["date"]
        if (datetime.datetime.now() - date).seconds < 5 * 60:
            _login_users.append(user)
    login_users = _login_users
    if len(login_users) == 0:
        login_users.append({"name": "ann201", "pwd": "ann123456",
                            "date": datetime.datetime.now()})
    retries = 0
    while True:
        retries += 1
        if retries > 3:
            break
        self.init_http_session(url)
        # Pick a random account from the (non-empty) pool.
        login_user = random.choice(login_users)
        logger.info(login_user)
        data = {
            "backurl": "http://beian.links.cn",
            "bsave": "1",
            "opaction": "login",
            "username": login_user["name"],
            "password": login_user["pwd"],
        }
        data = urllib.urlencode(data)
        headers = {"Referer": "http://beian.links.cn"}
        user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                      'AppleWebKit/600.8.9 (KHTML, like Gecko) '
                      'Version/8.0.8 Safari/600.8.9')
        headers['User-Agent'] = user_agent
        try:
            request = urllib2.Request("http://my.links.cn/checklogin.asp", data, headers)
            r = self.opener.open(request, timeout=30)
        except Exception:
            continue
        try:
            content = util.html_encode(r.read())
        except Exception:
            continue
        # The logged-in page calls "loaduserinfo"; use that as the success marker.
        if content.find("loaduserinfo") > 0:
            break
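# NOTE: illustrative sketch only. init_http_session is defined elsewhere in
# this repo; for the login above to keep its session cookie, it presumably
# installs a cookie-aware opener roughly like this. The attribute name
# self.opener comes from the code above; everything else is an assumption.
import cookielib
import urllib2

def init_http_session(self, url):
    # A fresh cookie jar per session so the ASP login cookie is retained
    # across the subsequent requests made through self.opener.
    self.cj = cookielib.CookieJar()
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))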
# Crawl a news article for a company, extract its title, publish date and
# contents, and store it in mongo. Returns the inserted _id, or None after
# more than ten failed attempts.
def crawler(company_id, link):
    retry_time = 0
    while True:
        result = news_crawler.crawl(link, agent=False)
        if result['get'] == 'success':
            html = util.html_encode(result["content"])
            contents = extract.extractContents(link, html)
            title = extract.extractTitle(html)
            date = extractArticlePublishedDate.extractArticlePublishedDate(link, html)
            dnews = {
                "companyId": company_id,
                "date": date,
                "title": title,
                "link": link,
                "createTime": datetime.datetime.now(),
                "source": 13001
            }
            # Flatten the extracted blocks into ranked text/image rows.
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            logger.info(dnews)
            mongo = db.connect_mongo()
            _id = mongo.article.news.insert_one(dnews).inserted_id
            mongo.close()
            return _id
        retry_time += 1
        if retry_time > 10:
            break
    return None
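# Hypothetical usage of crawler() above: both the company id and the link
# are made-up values. A None return means all attempts failed.
article_id = crawler(123, "http://example.com/news/456.html")
if article_id is None:
    logger.info("giving up on the article")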
# Fetch a URL and return its basic meta info (title, keywords, description,
# final redirect target and domain). Returns None after three failures.
def get_meta_info(url):
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                  'AppleWebKit/600.8.9 (KHTML, like Gecko) '
                  'Version/8.0.8 Safari/600.8.9')
    headers = {
        'User-Agent': user_agent,
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except Exception:
        return None
    opener = urllib2.build_opener()
    retries = 0
    while True:
        try:
            r = opener.open(request, timeout=17)
            # We advertise gzip, so decompress the body when the server used it.
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = r.read()
            content = util.html_encode(data)
            redirect_url = url_helper.url_normalize(r.geturl())
            d = pq(html.fromstring(content))
            title = d("title").text()
            # Meta names vary in case; try both spellings.
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")
            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
        except Exception:
            retries += 1
            if retries >= 3:
                return None
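# Example call (the URL is a made-up value): get_meta_info returns a dict on
# success and None once all three attempts fail.
info = get_meta_info("http://www.example.com/")
if info is not None:
    print info["title"], info["domain"], info["redirect_url"]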
# Parse a Baidu app-store search result page and save each matched app.
def process(search_name, from_doc_id, content):
    d = pq(util.html_encode(content))
    divs = d('div.app')
    for div in divs:
        e = pq(div)
        a = e('a.app-name')
        name = a.text().strip()
        href = a.attr("href")
        # The doc id is embedded in the link's query string.
        result = util.re_get_result(r"docid=(\d*)", href)
        if result:
            (docid_str,) = result
            try:
                docid = long(docid_str)
            except Exception:
                continue
        else:
            continue
        # The install button carries the package metadata as data_ attributes.
        data = e('a.inst-btn')
        if len(data) == 0:
            data = e('a.inst-btn-big')
        if len(data) == 0:
            continue
        app_type = data.attr("data_detail_type")
        apkname = data.attr("data_package")
        version = data.attr("data_versionname")
        size = None
        try:
            size = long(data.attr("data_size"))
        except Exception:
            pass
        item = {
            "key_int": docid,
            "search_name": search_name,
            "name": name,
            "link": "http://shouji.baidu.com/software/%s.html" % docid,
            "type": app_type,
            "apkname": apkname,
            "version": version,
            "size": size
        }
        try:
            android.save_baidu_search(collection_search, item)
        except Exception as ex:
            logger.info(ex)
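# NOTE: illustrative sketch only. util.re_get_result lives elsewhere in this
# repo; from its use above it evidently returns the regex groups as a tuple,
# or a falsy value when nothing matches. A minimal version could be:
import re

def re_get_result(pattern, text):
    m = re.search(pattern, text)
    return m.groups() if m else None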
# Async callback for toutiao.com news pages. A permanent 404 (the site's
# "404.jpg" error page) gives up on the URL; other errors re-queue it. A
# page that redirected off toutiao.com is saved as external content; a
# toutiao page is saved only once its footer (the "京ICP备12025439号" ICP
# licence string) is present, i.e. the page loaded completely.
def handle_news_result(response, company, summary):
    global total
    if response.error:
        if response.code == 404:
            content = response.body
            if content.find("404.jpg") > 0:
                # The article is gone for good; stop retrying it.
                total -= 1
                if total <= 0:
                    begin()
                return
        # Transient error: re-queue the same URL.
        request(response.request.url,
                lambda r, company=company: handle_news_result(r, company, summary))
        return
    logger.info(summary["title"])
    content_from_toutiao = True
    if response.effective_url != response.request.url:
        if 'toutiao.com' not in response.effective_url:
            logger.info(response.request.url)
            logger.info('url changed .....')
            content_from_toutiao = False
    try:
        content = util.html_encode(response.body)
        if content_from_toutiao:
            if content.find(u"京ICP备12025439号") > 0:
                save_news(company, response.request.url, summary, content,
                          content_from_toutiao)
            else:
                # Footer missing, page truncated: fetch it again.
                request(response.request.url,
                        lambda r, company=company: handle_news_result(r, company, summary))
                return
        else:
            save_news(company, response.request.url, summary, content,
                      content_from_toutiao)
    except Exception:
        traceback.print_exc()
    total -= 1
    if total <= 0:
        begin()
# A company's news page embeds the real article in an <iframe>; pull the
# iframe URL, crawl it, and store the raw content in mongo. Gives up after
# more than ten failed attempts.
def process_news(content, news_crawler, news_key, company_key_int, title,
                 news_date, tags):
    d = pq(html.fromstring(content))
    actual_news_url = d("iframe").attr("src")
    if actual_news_url is None:
        return
    if not actual_news_url.startswith("http"):
        return
    logger.info("actual_news_url: %s", actual_news_url)
    retry_time = 0
    while True:
        result = news_crawler.crawl(actual_news_url, agent=True)
        if result['get'] == 'success' and result.get("code") == 200:
            news_content = util.html_encode(result["content"])
            try:
                collection_content = {
                    "date": datetime.datetime.now(),
                    "source": SOURCE,
                    "type": TYPE,
                    "url": actual_news_url,
                    "key": news_key,
                    "key_int": int(news_key),
                    "content": news_content,
                    "company_key_int": company_key_int,
                    "title": title,
                    "news_date": news_date,
                    "original_tags": tags
                }
                collection.insert_one(collection_content)
                break
            except Exception as ex:
                # Insert failed; fall through and retry the whole fetch.
                pass
        retry_time += 1
        if retry_time > 10:
            break
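# Self-contained demo of the iframe extraction above (the sample HTML is
# made up; lxml.html and pyquery are the same libraries the function uses).
from lxml import html as lxml_html
from pyquery import PyQuery as pq

page = '<html><body><iframe src="http://example.com/article/1"></iframe></body></html>'
print pq(lxml_html.fromstring(page))("iframe").attr("src")
# -> http://example.com/article/1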
# One-off registration loop: create accounts 204..209 on links.cn. The
# username and email format strings and the passwords are redacted
# ("******") in this source.
for i in range(204, 210):
    opener = urllib2.build_opener()
    username = "******" % i
    data = {
        "username": username,
        "password": "******",
        "confirmpassword": "******",
        "opaction": "reg",
        "qq": "",
        "isqqopen": "1",
        "email": "*****@*****.**" % username
    }
    data = urllib.urlencode(data)
    logger.info(data)
    headers = {"Referer": "http://my.links.cn/reg.asp"}
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                  'AppleWebKit/600.8.9 (KHTML, like Gecko) '
                  'Version/8.0.8 Safari/600.8.9')
    headers['User-Agent'] = user_agent
    try:
        request = urllib2.Request("http://my.links.cn/regpost.asp", data, headers)
        r = opener.open(request, timeout=60)
        try:
            content = util.html_encode(r.read())
        except Exception as e:
            traceback.print_exc()
    except Exception as e:
        traceback.print_exc()