Пример #1
0
def process(crawler, app, content):
    """Extract one app record from a detail page and store it.

    The page is skipped when it carries the "wrong URL" notice or when its
    second breadcrumb entry is "游戏".  The download counter printed on the
    page is parsed as a fallback value; when the m.baidu.com mosug lookup
    returns an entry whose ``docid`` equals the app key, that entry's
    ``download_num`` replaces it.

    Args:
        crawler: object whose ``crawl(url)`` returns a dict with a ``'get'``
            status and, on success, the response body under ``'content'``.
        app: dict providing ``key_int``, ``link``, ``apkname``, ``version``,
            ``size`` and ``type``.
        content: HTML of the detail page.

    Returns:
        None.  The record is written through ``android.save`` and
        ``android.merge``.
    """
    if content.find('请检查您所输入的URL地址是否有误') != -1:
        return

    key = app["key_int"]
    url = app["link"]

    d = pq(content)
    nav_links = d('div.nav> span >a')
    cate = nav_links.eq(1).text().strip()
    if cate == "游戏":
        return

    sub_cate = nav_links.eq(2).text().strip()
    name = d('h1.app-name> span').text().strip()
    downloadstr = d("span.download-num").eq(0).text().replace("下载次数:","").replace("+","").strip()

    # The counter is abbreviated with a unit suffix; an empty or unexpected
    # value must not abort the whole record, so it falls back to None.
    download = None
    try:
        for unit, factor in (("千", 1000), ("万", 10000), ("亿", 10000 * 10000)):
            if downloadstr.endswith(unit):
                # round() guards against float truncation (e.g. 0.29 * 100).
                download = int(round(float(downloadstr.replace(unit, "")) * factor))
                break
        else:
            download = int(downloadstr)
    except ValueError:
        logger.info("********download :%s cannot get", downloadstr)
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)

    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    # NOTE(review): this retries forever until the crawl succeeds; confirm
    # that the crawler throttles requests or eventually succeeds.
    while True:
        result = crawler.crawl(mosug_url)
        if result['get'] == 'success':
            mosug_content = result["content"]
            break

    data = json.loads(mosug_content)
    suggestions = data["result"].get("s")
    if suggestions is None:
        return

    # Prefer the exact download number of the suggestion matching this app.
    for dt in suggestions:
        if dt.get("package") is None:
            continue
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            break

    # screenshot
    screenshots = [pq(img).attr("src") for img in d('img.imagefix')]

    # content
    desc = d('p.content').text()

    icon = d('div.app-pic> img').attr("src")
    author = d('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    commentbyeditor = d('span.head-content').text()

    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
Пример #2
0
def to_parser(page, url):
    """Parse an app detail page and store the extracted record.

    Args:
        page: UTF-8 encoded HTML of the detail page.
        url: address of the page; the text after the last ``'C'`` in its
            ``app/`` segment is used as the numeric app key.

    Returns:
        None.  Returns early, without saving, when the package name cannot
        be extracted from the download link's ``onclick`` attribute.

    Raises:
        Exception: any parsing or storage error is logged and re-raised with
            its original traceback.
    """
    try:
        d = pq(html.fromstring(page.decode('utf-8')))

        uls = d('div.app-info> ul.app-info-ul')

        name = d(uls)('span.title').text().strip()
        logger.info('start to parser name:%s | url:%s' % (name, url))

        apkurl = url.split('app/')[-1]
        key = apkurl.split('C')[-1]
        key_int = int(key)
        icon = d(uls)('li.img> img.app-ico').attr('src')

        downloadstr = d(uls)('span.title').next().text().split(':')[-1]
        download = None
        try:
            download = int(downloadstr)
        except ValueError:
            # Not a plain integer: the counter carries a unit suffix.
            if downloadstr.find('万次') >= 0:
                download = int(float(downloadstr.replace('万次', '')) * 10000)
            elif downloadstr.find('亿次') >= 0:
                download = int(
                    float(downloadstr.replace('亿次', '')) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", downloadstr)

        details = d(uls).eq(1)('li')

        sizestr = details.eq(0)('span').text().strip()
        size = None
        if sizestr.find('KB') >= 0:
            size = int(float(sizestr.replace("KB", "").strip()) * 1024)
        elif sizestr.find("MB") >= 0:
            size = int(float(sizestr.replace("MB", "").strip()) * 1024 * 1024)

        updatedatestr = details.eq(1)('span').text().strip()
        updatedate = datetime.datetime.strptime(updatedatestr, '%Y-%m-%d')
        author = details.eq(2)('span').attr('title')
        version = details.eq(3)('span').text().strip()
        # Drop only the leading "V"/"v" marker; str.replace would also strip
        # the letter from the rest of the version string.
        if version.startswith(("V", "v")):
            version = version[1:]

        screenshots = [d(img)('a').attr('href') for img in d('ul.imgul> li')]

        desc = d('div.content').text().replace('\r', '').strip()
        re1 = re.search(r'(hide.*none)', desc, re.S)
        if re1:
            desc = desc.replace(re1.group(1), '')

        commentbyeditor = d('div#comment_list').text().replace('\r',
                                                               '').strip()
        re2 = re.search(r'(var.*/;)', commentbyeditor, re.S)
        if re2:
            commentbyeditor = commentbyeditor.replace(re2.group(1), '')

        # A missing onclick attribute is treated like a non-matching one.
        apknamestr = d('div.app-function> a').attr('onclick')
        re3 = re.search(r'apk/.*?/.*?/(.*)\.\d+\.apk', apknamestr or '')
        if not re3:
            return
        apkname = re3.group(1).replace('.huawei', '')
        logger.info('------------%s------------' % apkname)

        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": None,
            "website": None,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": None,
            "version": version,
            "updates": None,
            "size": size,
            "compatibility": None,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": key,
            "key_int": key_int,
            "download": download,
        }
        mongo = db.connect_mongo()
        try:
            collection = mongo.market.android_market
            android.save(collection, APPMARKET, item)
            android.merge(item)
            change_android_status(apkname, found=True)
        finally:
            # Release the connection even when saving fails.
            mongo.close()
        logger.info('parser done')
    except Exception as e:
        logger.info('parser error:%s' % e)
        # Bare raise keeps the original traceback.
        raise
Пример #3
0
def process(url, key, content):
    """Parse an app detail page, store the record and advance ``LATEST``.

    Pages that do not contain the "360安全中心" marker, or whose embedded
    ``var detail`` block cannot be located, are ignored.

    Args:
        url: address of the detail page.
        key: numeric app key; compared against the module-level ``LATEST``.
        content: UTF-8 encoded HTML of the page.

    Returns:
        None.  The record is written through ``android.save`` and
        ``android.merge``; ``LATEST`` is raised to ``key`` when it is larger.
    """
    global LATEST
    if content.find('360安全中心') == -1:
        return

    r = r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    if not result:
        # Without the embedded detail block there is no name/package to save.
        logger.info("detail block not found: %s", url)
        return
    (b, ) = result
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    app_type = base["type"]
    package = base["pname"].strip()

    d = pq(html.fromstring(content.decode("utf-8")))
    desc = ""
    try:
        desc = d('div.breif').text().strip()
        # Keep only the text that precedes the "basic information" section.
        desc = desc.split("【基本信息】")[0].strip()
    except Exception:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except Exception:
            pass

    cells = d('div.base-info> table> tbody> tr> td')

    # NOTE(review): the previous code read and normalized the author from the
    # first cell and then unconditionally reset it to None, so None is what
    # has always been stored; only that is kept. Confirm this is intended.
    author = None

    modify_date_str = cells.eq(1).contents()[1].strip()
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")

    versionname = None
    try:
        versionname = cells.eq(2).contents()[1].strip()
        # Drop only the leading "V"; str.replace would strip every "V".
        if versionname.startswith("V"):
            versionname = versionname[1:]
    except Exception:
        pass

    compatibility = cells.eq(3).contents()[1].strip()
    language = cells.eq(4).contents()[1].strip()

    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src")
    if icon is not None:
        icon = icon.strip()

    snaps = d('div#scrollbar').attr("data-snaps")
    screenshots = snaps.split(",") if snaps is not None else []

    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)

    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    # The counter is abbreviated with a unit suffix; it is always stored as
    # an int (or None when it cannot be parsed).
    download = None
    try:
        for unit, factor in (("千", 1000), ("万", 10000), ("亿", 10000 * 10000)):
            if downloadstr.endswith(unit):
                # round() guards against float truncation (e.g. 0.29 * 100).
                download = int(round(float(downloadstr.replace(unit, "")) * factor))
                break
        else:
            download = int(downloadstr)
    except ValueError:
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app_type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)

    if LATEST < key:
        LATEST = key
Пример #4
0
def process(crawler, url, apkname, content):
    """Parse an app detail page, store the record and flag the app.

    When ``has_content`` rejects the page, the app is only marked as
    processed-but-not-found in ``collection_android``.

    Args:
        crawler: not used by this function.
        url: address of the detail page.
        apkname: package name; also used as the record key.
        content: UTF-8 encoded HTML of the page (undecodable bytes are
            ignored).

    Returns:
        None.  The record is written through ``android.save`` and
        ``android.merge`` and the ``wandoujiaprocessed`` /
        ``wandoujiafound`` flags are updated.
    """
    if not has_content(content, apkname):
        logger.info("App: %s has no content", apkname)
        collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
        return

    logger.info("hereherehere")
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    name = d('span.title').text()
    icon = d('div.app-icon> img').attr("src")
    brief = d('p.tagline').text()
    commentbyeditor = d('div.editorComment> div').text()

    screenshots = [pq(img).attr("src") for img in d('div.overview> img')]

    desc = d('div.desc-info> div').text()
    updates = d('div.change-info> div').text()

    # fileSize is either a plain byte count or a "<n>KB" / "<n>MB" string;
    # a missing attribute leaves the size unknown instead of crashing.
    size = None
    sizestr = d('meta[itemprop="fileSize"]').attr("content")
    if sizestr:
        try:
            size = int(sizestr)
        except ValueError:
            if sizestr.find("KB") >= 0:
                size = int(float(sizestr.replace("KB", "").strip()) * 1024)
            elif sizestr.find("MB") >= 0:
                size = int(float(sizestr.replace("MB", "").strip()) * 1024 * 1024)
    tags = d('dd.tag-box >a').text().replace(" ", ",")

    datestr = d('time#baidu_time').text()
    updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")

    author = d('span.dev-sites').text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)

    # Best effort: any failure while normalizing leaves the website unset.
    try:
        website = url_helper.url_normalize(d('a.dev-sites').attr("href"))
    except Exception:
        website = None

    compatibility = None
    if content.find("查看权限要求") == -1:
        r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
    else:
        r1 = "content=\"Android\">(.*?)<div>.*"
    result1 = util.re_get_result(r1, content)
    if result1:
        (compatibility,) = result1
        # "\\s" is the literal two-character text backslash-s, exactly as
        # before; it is not a whitespace class here.
        compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\\s", "").replace(" ", "")

    versionname = None
    r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
    result2 = util.re_get_result(r2, content)
    if result2:
        (versionname,) = result2
        versionname = versionname.replace("\n", "").replace("\r", "").replace("\\s", "").replace("&nbsp;", "").strip()

    if versionname:
        # Keep the first whitespace-separated token and drop only a leading
        # "V"; str.replace would strip every "V" in the version.
        versionname = versionname.split()[0]
        if versionname.startswith("V"):
            versionname = versionname[1:]

    # interactionCount looks like "<label>:<count>"; a missing or malformed
    # attribute ends up in the "cannot get" branch below.
    interaction = d("i[itemprop='interactionCount']").attr("content")
    dnum = ""
    if interaction and ":" in interaction:
        dnum = interaction.split(":")[1]
    download = None
    try:
        download = int(dnum)
    except ValueError:
        if dnum.find("万") >= 0:
            download = int(float(dnum.replace("万", "").strip()) * 10000)
        elif dnum.find("亿") >= 0:
            download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
        else:
            logger.info("********download :%s cannot get", dnum)

    item = {
        "link": url,
        "apkname": apkname,
        "appmarket": APPMARKET,
        "name": name,
        "brief": brief,
        "website": website,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": updatedate,
        "language": None,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": None,
        "key": apkname,
        "key_int": None,
        "download": download,
    }

    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
    collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
Пример #5
0
def process(crawler, url, apkname, content):
    """Parse an app detail page, store the record and flag the app.

    When ``has_content`` rejects the page, the app is only marked as
    processed-but-not-found in ``collection_android``.

    Args:
        crawler: not used by this function.
        url: address of the detail page.
        apkname: package name; also used as the record key.
        content: UTF-8 encoded HTML of the page.

    Returns:
        None.  The record is written through ``android.save`` and
        ``android.merge`` and the ``miappprocessed`` / ``miappfound`` flags
        are updated.
    """
    if not has_content(content, apkname):
        logger.info("App: %s has no content", apkname)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {
                "miappprocessed": True,
                "miappfound": False
            }})
        return

    # Strip the stray "</br>" tags before the HTML is parsed; replace() is a
    # no-op when the tag is absent, so no prior search is needed.
    content = content.replace(r'</br>', "")

    d = pq(html.fromstring(content.decode("utf-8")))

    name = d('.intro-titles h3').text()
    icon = d('.app-info img').attr("src")

    screenshots = [pq(img).attr("src") for img in d('.img-list img')]

    slides = d('.pslide')
    desc = slides.eq(0).text().replace("\r", "")
    updates = slides.eq(1).text().replace("\r", "")

    # The size is shown in megabytes ("<n> M"); store it as an integer byte
    # count, or None when the text cannot be parsed.
    sizestr = d(':contains("软件大小:")+ li').text()
    size = None
    try:
        size = int(round(float(sizestr.replace("M", "")) * 1024 * 1024))
    except (AttributeError, ValueError):
        logger.info("********size :%s cannot get", sizestr)

    tag = d('.bread-crumb li a').eq(1).text().strip()

    datestr = d(':contains("更新时间:")+ li').text()
    updatedate = datetime.datetime.strptime(datestr, '%Y-%m-%d')

    versionname = d(':contains("版本号:")+ li').text()
    # Drop only the leading "V"; str.replace would strip every "V".
    if versionname and versionname.startswith("V"):
        versionname = versionname[1:]

    # Best effort: when the name helpers fail, the raw author text (or None
    # if it could not be read) is kept.
    author = None
    try:
        author = d('.intro-titles p').eq(0).text()
        chinese, is_company = name_helper.name_check(author)
        if chinese and is_company:
            author = name_helper.company_name_normalize(author)
    except Exception:
        logger.info("author cannot be normalized: %s", author)

    # The page exposes no usable download counter.
    download = None

    item = {
        "link": url,
        "apkname": apkname,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": None,
        "updateDate": updatedate,
        "language": None,
        "tags": tag,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": None,
        "key": apkname,
        "key_int": None,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False,
                           cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
    collection_android.update_one(
        {"apkname": apkname},
        {"$set": {
            "miappprocessed": True,
            "miappfound": True
        }})