def process(crawler, app, content):
    if content.find('请检查您所输入的URL地址是否有误') != -1:
        return

    key = app["key_int"]
    url = app["link"]

    d = pq(content)

    cate = d('div.nav> span >a').eq(1).text().strip()
    if cate == "游戏":
        return
    sub_cate = d('div.nav> span >a').eq(2).text().strip()

    name = d('h1.app-name> span').text().strip()

    # download count on the page, e.g. "下载次数:35万+"
    downloadstr = d("span.download-num").eq(0).text().replace("下载次数:", "").replace("+", "").strip()
    if downloadstr.endswith("千"):
        download = float(downloadstr.replace("千", "")) * 1000
    elif downloadstr.endswith("万"):
        download = float(downloadstr.replace("万", "")) * 10000
    elif downloadstr.endswith("亿"):
        download = float(downloadstr.replace("亿", "")) * 10000 * 10000
    else:
        download = int(downloadstr)
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)

    # refine download count (and score) via Baidu's mosug suggestion API; retry until the request succeeds
    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    while True:
        result = crawler.crawl(mosug_url)
        if result['get'] == 'success':
            mosug_content = result["content"]
            break

    data = json.loads(mosug_content)
    if data["result"].get("s") is None:
        return

    for dt in data["result"].get("s"):
        if dt.get("package") is None:
            continue
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            score = int(dt["score"]) * 0.05
            break

    # screenshots
    screenshots = []
    imgs = d('img.imagefix')
    for img in imgs:
        surl = pq(img).attr("src")
        screenshots.append(surl)

    # description
    desc = d('p.content').text()

    icon = d('div.app-pic> img').attr("src")

    author = d('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)

    commentbyeditor = d('span.head-content').text()

    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)
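
# Note (illustrative sketch, not part of the original crawlers): the download
# counts scraped above and in the other market parsers use Chinese magnitude
# suffixes ("千" = 1e3, "万" = 1e4, "亿" = 1e8). A standalone version of that
# conversion might look like this, assuming labels such as "下载次数:" and the
# trailing "+" have already been stripped; the helper name is hypothetical and
# is not referenced by the code above.
def _parse_cn_count_sketch(countstr):
    for suffix, factor in (("亿", 10000 * 10000), ("万", 10000), ("千", 1000)):
        if countstr.endswith(suffix):
            return int(float(countstr.replace(suffix, "")) * factor)
    return int(countstr)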
def to_parser(page, url):
    try:
        d = pq(html.fromstring(page.decode('utf-8')))
        uls = d('div.app-info> ul.app-info-ul')
        name = d(uls)('span.title').text().strip()
        logger.info('start to parser name:%s | url:%s' % (name, url))

        apkurl = url.split('app/')[-1]
        key = apkurl.split('C')[-1]
        key_int = int(key)
        icon = d(uls)('li.img> img.app-ico').attr('src')

        downloadstr = d(uls)('span.title').next().text().split(':')[-1]
        download = None
        try:
            download = int(downloadstr)
        except:
            if downloadstr.find('万次') >= 0:
                download = int(float(downloadstr.replace('万次', '')) * 10000)
            elif downloadstr.find('亿次') >= 0:
                download = int(float(downloadstr.replace('亿次', '')) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", downloadstr)

        sizestr = d(uls).eq(1)('li').eq(0)('span').text().strip()
        size = None
        if sizestr.find('KB') >= 0:
            size = int(float(sizestr.replace("KB", "").strip()) * 1024)
        elif sizestr.find("MB") >= 0:
            size = int(float(sizestr.replace("MB", "").strip()) * 1024 * 1024)

        updatedatestr = d(uls).eq(1)('li').eq(1)('span').text().strip()
        updatedate = datetime.datetime.strptime(updatedatestr, '%Y-%m-%d')
        author = d(uls).eq(1)('li').eq(2)('span').attr('title')
        version = d(uls).eq(1)('li').eq(3)('span').text().strip()
        if version.startswith("V"):
            version = version.replace("V", "")
        elif version.startswith("v"):
            version = version.replace("v", "")

        screenshots = []
        imgs = d('ul.imgul> li')
        for img in imgs:
            imgurl = d(img)('a').attr('href')
            screenshots.append(imgurl)

        desc = d('div.content').text().replace('\r', '').strip()
        re1 = re.search(r'(hide.*none)', desc, re.S)
        if re1:
            desc = desc.replace(re1.group(1), '')

        commentbyeditor = d('div#comment_list').text().replace('\r', '').strip()
        re2 = re.search(r'(var.*/;)', commentbyeditor, re.S)
        if re2:
            commentbyeditor = commentbyeditor.replace(re2.group(1), '')

        apknamestr = d('div.app-function> a').attr('onclick')
        re3 = re.search(r'apk/.*?/.*?/(.*)\.\d+\.apk', apknamestr)
        if not re3:
            return
        apkname = re3.group(1)
        if apkname.find('.huawei') >= 0:
            apkname = apkname.replace('.huawei', '')
        logger.info('------------%s------------' % apkname)

        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": None,
            "website": None,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": None,
            "version": version,
            "updates": None,
            "size": size,
            "compatibility": None,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": key,
            "key_int": key_int,
            "download": download,
        }
        # logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder, indent=2))

        mongo = db.connect_mongo()
        collection = mongo.market.android_market
        collection_android = mongo.market.android
        android.save(collection, APPMARKET, item)
        android.merge(item)
        change_android_status(apkname, found=True)
        mongo.close()
        logger.info('parser done')
    except Exception, e:
        logger.info('parser error:%s' % e)
        raise e
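
# Note (illustrative sketch, not part of the original module): to_parser() above
# and several of the other market parsers convert human-readable size strings
# ("KB"/"MB") into bytes. A minimal standalone version of that conversion,
# assuming the input is already stripped of surrounding labels; the helper name
# is hypothetical and is not referenced by the code above.
def _parse_size_sketch(sizestr):
    if sizestr.find("KB") >= 0:
        return int(float(sizestr.replace("KB", "").strip()) * 1024)
    if sizestr.find("MB") >= 0:
        return int(float(sizestr.replace("MB", "").strip()) * 1024 * 1024)
    return None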
def process(url, key, content):
    global LATEST
    if content.find('360安全中心') == -1:
        return

    # the page embeds its metadata as a quoted JavaScript object:
    # var detail = (function () { return {...}; })
    r = r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    (b, ) = result
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    type = base["type"]
    package = base["pname"].strip()

    d = pq(html.fromstring(content.decode("utf-8")))

    desc = ""
    try:
        # desc = d('div.breif').contents()[0].strip()
        desc = d('div.breif').text().strip()
        ts = desc.split("【基本信息】")
        desc = ts[0].strip()
    except:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except:
            pass

    author = d('div.base-info> table> tbody> tr> td').eq(0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    author = None

    modify_date_str = d('div.base-info> table> tbody> tr> td').eq(1).contents()[1].strip()
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")

    versionname = None
    try:
        versionname = d('div.base-info> table> tbody> tr> td').eq(2).contents()[1].strip()
        if versionname.startswith("V"):
            versionname = versionname.replace("V", "")
    except:
        pass

    compatibility = d('div.base-info> table> tbody> tr> td').eq(3).contents()[1].strip()

    language = d('div.base-info> table> tbody> tr> td').eq(4).contents()[1].strip()
    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()

    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except:
        pass

    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)

    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace("次", "").replace("+", "").strip()
    download = None
    try:
        if downloadstr.endswith("千"):
            download = float(downloadstr.replace("千", "")) * 1000
        elif downloadstr.endswith("万"):
            download = float(downloadstr.replace("万", "")) * 10000
        elif downloadstr.endswith("亿"):
            download = float(downloadstr.replace("亿", "")) * 10000 * 10000
        else:
            download = int(downloadstr)
        score = float(d("span.s-1").text().replace("分", "").strip()) * 0.5
    except:
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)

    if LATEST < key:
        LATEST = key
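
# Note (illustrative sketch, not part of the original module): process() above
# pulls the embedded "var detail" JavaScript object out of the page and makes it
# JSON-parsable by swapping single quotes for double quotes. The sample string
# below is made up for demonstration; util.re_get_result is assumed to behave
# like re.search returning the captured groups.
def _demo_detail_extraction():
    import json
    import re
    sample = "var detail = (function () { return {'sname': 'demo', 'type': 'soft', 'pname': 'com.example.demo'}; })"
    m = re.search(r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)", sample, re.S)
    if m is None:
        return None
    return json.loads(m.group(1).replace("'", '"'), strict=False)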
def process(crawler, url, apkname, content):
    # logger.info(content)
    if has_content(content, apkname):
        logger.info("hereherehere")
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        name = d('span.title').text()
        icon = d('div.app-icon> img').attr("src")
        brief = d('p.tagline').text()
        commentbyeditor = d('div.editorComment> div').text()

        screenshots = []
        imgs = d('div.overview> img')
        for img in imgs:
            imgurl = pq(img).attr("src")
            screenshots.append(imgurl)

        desc = d('div.desc-info> div').text()
        updates = d('div.change-info> div').text()

        try:
            size = int(d('meta[itemprop="fileSize"]').attr("content"))
        except:
            size = d('meta[itemprop="fileSize"]').attr("content")
            if size.find("KB") >= 0:
                size = int(float(size.replace("KB", "").strip()) * 1024)
            elif size.find("MB") >= 0:
                size = int(float(size.replace("MB", "").strip()) * 1024 * 1024)
            else:
                size = None

        tags = d('dd.tag-box >a').text().replace(" ", ",")

        datestr = d('time#baidu_time').text()
        updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")

        author = d('span.dev-sites').text()
        chinese, is_company = name_helper.name_check(author)
        if chinese and is_company:
            author = name_helper.company_name_normalize(author)

        try:
            website = d('a.dev-sites').attr("href")
            website = url_helper.url_normalize(website)
        except:
            website = None

        compatibility = None
        if content.find("查看权限要求") == -1:
            r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
        else:
            r1 = "content=\"Android\">(.*?)<div>.*"
        result1 = util.re_get_result(r1, content)
        if result1:
            (compatibility, ) = result1
            compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "")

        versionname = None
        r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
        result2 = util.re_get_result(r2, content)
        if result2:
            (versionname, ) = result2
            versionname = versionname.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "").strip()
        try:
            versionname = versionname.split()[0]
            if versionname.startswith("V"):
                versionname = versionname.replace("V", "")
        except:
            pass

        # download = int(d("i[itemprop='interactionCount']").attr("content").split(":")[1])
        dnum = d("i[itemprop='interactionCount']").attr("content").split(":")[1]
        download = None
        try:
            download = int(dnum)
        except:
            if dnum.find("万") >= 0:
                download = int(float(dnum.replace("万", "").strip()) * 10000)
            elif dnum.find("亿") >= 0:
                download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", dnum)

        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": brief,
            "website": website,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": tags,
            "version": versionname,
            "updates": updates,
            "size": size,
            "compatibility": compatibility,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": apkname,
            "key_int": None,
            "download": download,
        }
        logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        android.save(collection, APPMARKET, item)
        android.merge(item)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
    else:
        logger.info("App: %s has no content", apkname)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
def process(crawler, url, apkname, content):
    # logger.info(content)
    if has_content(content, apkname):
        if content.find(r'</br>') > 0:
            content = content.replace(r'</br>', "")
        d = pq(html.fromstring(content.decode("utf-8")))

        name = d('.intro-titles h3').text()
        icon = d('.app-info img').attr("src")

        screenshots = []
        imgs = d('.img-list img')
        for img in imgs:
            imgurl = pq(img).attr("src")
            screenshots.append(imgurl)

        desc = d('.pslide').eq(0).text().replace("\r", "")
        updates = d('.pslide').eq(1).text().replace("\r", "")

        size = d(':contains("软件大小:")+ li').text()
        try:
            size = float(size.replace("M", ""))
            size = str(round(size * 1024 * 1024))
        except:
            pass

        tag = d('.bread-crumb li a').eq(1).text().strip()

        # (datestr,) = util.re_get_result('data-apkPublishTime=\"(.*?)\"', content)
        datestr = d(':contains("更新时间:")+ li').text()
        # updatedate = datetime.datetime.fromtimestamp(int(datestr))
        updatedate = datetime.datetime.strptime(datestr, '%Y-%m-%d')

        versionname = None
        try:
            versionname = d(':contains("版本号:")+ li').text()
            if versionname.startswith("V"):
                versionname = versionname.replace("V", "")
        except:
            pass

        author = None
        try:
            author = d('.intro-titles p').eq(0).text()
            chinese, is_company = name_helper.name_check(author)
            if chinese and is_company:
                author = name_helper.company_name_normalize(author)
        except:
            pass

        # (download,) = util.re_get_result('downTimes:"(.*?)"', content)
        # download = float(download)
        download = None

        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": None,
            "website": None,
            "description": desc,
            "commentbyeditor": None,
            "updateDate": updatedate,
            "language": None,
            "tags": tag,
            "version": versionname,
            "updates": updates,
            "size": size,
            "compatibility": None,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": apkname,
            "key_int": None,
            "download": download,
        }
        logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        android.save(collection, APPMARKET, item)
        android.merge(item)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"miappprocessed": True, "miappfound": True}})
    else:
        logger.info("App: %s has no content", apkname)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"miappprocessed": True, "miappfound": False}})
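
# Note (illustrative sketch, not part of the original module): the wandoujia and
# miapp parsers above both end by flagging the apk in the android collection so
# it is not re-crawled. A generic version of that pattern might look like this;
# the helper name and the "<market>processed"/"<market>found" field construction
# are hypothetical, and collection_android is assumed to be a pymongo collection
# as in the code above.
def _mark_processed_sketch(collection_android, apkname, market, found):
    collection_android.update_one(
        {"apkname": apkname},
        {"$set": {market + "processed": True, market + "found": found}})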