예제 #1
0
def start_run(concurrent_num):
    """Queue candidate website URLs, then crawl them with greenlets.

    URLs are appended to the module-level ``raw_urls`` list from two sources:

    * every document of ``beian_collection`` with a non-empty ``domain``
      (queued as ``http://www.<domain>``);
    * the ``link`` column of every ``artifact`` row with ``type=4010``,
      passed through ``url_helper.url_normalize``.

    Afterwards ``concurrent_num`` greenlets running ``run`` are spawned and
    joined.

    :param concurrent_num: number of ``run`` greenlets to spawn.
    """
    global raw_urls

    logger.info("website start...")

    for item in beian_collection.find({}):
        domain = item["domain"]
        if domain is None or domain == "":
            continue

        url = "http://www." + domain
        logger.info(url)
        raw_urls.append(url)

    conn = db.connect_torndb()
    try:
        for item in conn.query("select * from artifact where type=4010"):
            url = item["link"]
            if url is None or url == "":
                continue
            url = url_helper.url_normalize(url)
            logger.info(url)
            raw_urls.append(url)
    finally:
        # Release the connection even if the query or normalisation raises.
        conn.close()

    threads = [gevent.spawn(run) for _ in xrange(concurrent_num)]
    gevent.joinall(threads)

    logger.info("website end.")
예제 #2
0
def insert(shortname, name, brief, website):
    """Save a company, its two names and (when usable) its website artifact.

    The company is stored through ``parser_db_util.save_company_yitai`` with
    source ``13100`` and a source id derived from the md5 of ``name``.  The
    full name and short name are then registered with name types ``12010``
    and ``12020``.  If ``website`` normalises to a non-empty URL that
    ``url_helper.get_market`` classifies as type ``4010`` and that does not
    point at ``sse.com``, a single artifact is saved for it.

    :param shortname: short company name, also used as the artifact name.
    :param name: full company name; the "(开业)" marker is removed first.
    :param brief: short description forwarded to ``save_company_yitai``.
    :param website: raw website string, may be ``None`` or blank.
    """
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId,
                                            brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)

    if website is None or website.strip() == "":
        return
    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return

    # The original test used ``website.find("https://")`` as a boolean, which
    # is truthy for -1 (not found) and falsy only for index 0.  Checking the
    # prefix states the intent directly: add a scheme only when none is there.
    if not website.startswith(("http://", "https://")):
        website = "http://" + website

    artifact_type, market, app_id = url_helper.get_market(website)
    if artifact_type != 4010:
        return
    if website.find('sse.com') > 0:
        # Links pointing at sse.com are not stored as company websites.
        return

    artifact = {
        "sourceCompanyId": sid,
        "name": shortname,
        "description": None,
        "link": website,
        "domain": app_id,
        "type": artifact_type
    }
    parser_db_util.save_artifacts_standard(sid, [artifact])
예제 #3
0
def run():
    """Worker loop: take URLs from ``raw_urls`` and store their page metadata.

    For each URL that is not yet present in ``collection`` the page metadata
    is fetched with ``website.get_meta_info``.  A failed fetch is recorded as
    ``{"url": url, "httpcode": 404}``.  When the page redirected to a URL on
    a different domain, the normalised redirect target is queued as well.
    The function returns once ``raw_urls`` is empty.
    """
    global raw_urls
    while True:
        if len(raw_urls) == 0:
            return
        url = raw_urls.pop(0)
        if collection.find_one({"url": url}) is not None:
            # Already stored earlier; nothing to do.
            continue

        _, domain = url_helper.get_domain(url)

        result = website.get_meta_info(url)
        logger.info(url)
        logger.info(
            json.dumps(result, ensure_ascii=False, cls=util.CJsonEncoder))
        if result is None:
            result = {"url": url, "httpcode": 404}
        elif result["url"] != result["redirect_url"]:
            new_url = url_helper.url_normalize(result["redirect_url"])
            _, redirect_domain = url_helper.get_domain(new_url)
            if domain != redirect_domain:
                raw_urls.append(new_url)

        result["createTime"] = datetime.datetime.now()
        result["modifyTime"] = result["createTime"]
        try:
            collection.insert(result)
        except Exception as e:
            # Best effort: a failed insert must not stop the worker, but it
            # is logged instead of being silently dropped.  Catching
            # ``Exception`` (not a bare ``except``) lets interpreter-level
            # signals such as KeyboardInterrupt propagate.
            logger.info(e)
예제 #4
0
def get_meta_info(url):
    """Fetch ``url`` and extract title, keywords and description.

    The page is requested with a desktop Safari user agent, accepting gzip.
    Up to three attempts are made; any failure while fetching or parsing
    triggers the next attempt.

    :param url: address to fetch.
    :returns: a dict with the keys ``url``, ``redirect_url`` (normalised
        final URL), ``domain`` (``None`` unless ``url_helper.get_domain``
        returns ``True`` as its flag), ``title``, ``tags``, ``description``
        and ``httpcode`` (always ``200``); or ``None`` when the request
        cannot be built or all three attempts fail.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
    headers = {
        'User-Agent': user_agent,
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except Exception:
        return None
    opener = urllib2.build_opener()

    for _ in range(3):
        try:
            r = opener.open(request, timeout=17)
            try:
                data = r.read()
                if r.info().get('Content-Encoding') == 'gzip':
                    data = gzip.GzipFile(fileobj=StringIO(data)).read()
                redirect_url = url_helper.url_normalize(r.geturl())
            finally:
                # Always release the socket, including on read errors.
                r.close()

            content = util.html_encode(data)
            d = pq(html.fromstring(content))
            title = d("title").text()
            # The attribute selector is case-sensitive on the value, so the
            # capitalised spelling is tried as a fallback.
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")

            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
        except Exception:
            # Network or parse failure: fall through to the next attempt.
            continue
    return None
예제 #5
0
def save(collection_market, appmarket, item):
    """Insert or update one app-market record in ``collection_market``.

    ``item`` is enriched in place with a normalised ``website``, a
    ``website_domain`` and an ``apkname_domain`` (derived from the reversed
    package name).  The existing record is looked up by ``appmarket`` and
    ``item["apkname"]``:

    * no record: ``item`` is inserted with fresh ``createTime`` and
      ``modifyTime``;
    * record with an older version: ``item`` replaces the fields and the old
      record (without ``_id``, ``key`` and ``key_int``) is added to
      ``histories``;
    * record with the same version, or without a usable version: the fields
      are refreshed;
    * ``item`` has no usable version, or an older one: nothing is written.

    :param collection_market: Mongo collection holding the market records.
    :param appmarket: market identifier used in the lookup.
    :param item: dict describing the app; mutated in place.
    """
    item["website"] = url_helper.url_normalize(item["website"])
    flag, domain = url_helper.get_domain(item["website"])
    item["website_domain"] = domain if flag else None

    # Reverse the package name ("com.foo.app" -> "app.foo.com") so it can be
    # parsed like a host name.
    temp = "http://" + ".".join(item["apkname"].split(".")[::-1])
    flag, domain = url_helper.get_domain(temp)
    # NOTE(review): unlike website_domain, the flag is not checked here.
    # Confirm whether that is intended.
    item["apkname_domain"] = domain

    record = collection_market.find_one(
        {
            "appmarket": appmarket,
            "apkname": item["apkname"]
        },
        projection={'histories': False})

    if not record:
        item["createTime"] = datetime.datetime.now()
        item["modifyTime"] = item["createTime"]
        if item["updateDate"] is None:
            item["updateDate"] = datetime.datetime.now()
        try:
            collection_market.insert(item)
        except Exception as e:
            logger.info(e)
        return

    _id = record.pop("_id")
    # Drop bookkeeping fields before the record is archived in histories;
    # tolerate records that do not carry them.
    record.pop("key", None)
    record.pop("key_int", None)

    if item["version"] is None or item["version"].strip() == "":
        return

    record_has_version = (record["version"] is not None
                          and record["version"].strip() != "")
    if record_has_version and LooseVersion(item["version"]) > LooseVersion(
            record["version"]):
        item["createTime"] = record["createTime"]
        item["modifyTime"] = datetime.datetime.now()
        if item["updateDate"] is None:
            item["updateDate"] = datetime.datetime.now()
        collection_market.update_one({"_id": _id}, {
            '$set': item,
            '$addToSet': {
                "histories": record
            }
        })
    elif not record_has_version or LooseVersion(
            item["version"]) == LooseVersion(record["version"]):
        item["modifyTime"] = datetime.datetime.now()
        collection_market.update_one({"_id": _id}, {'$set': item})
예제 #6
0
def parse_base(item):
    """Build the base company record for a crawled ``item``.

    :param item: dict with ``key`` and ``content`` (which provides ``name``,
        ``desc``, ``score`` and ``website``), or ``None``.
    :returns: ``None`` for a ``None`` item; otherwise a dict of company
        fields with one artifact whose link is the normalised website.
    """
    if item is None:
        return None

    source_id = item["key"]
    content = item["content"]
    name = content["name"]
    desc = content["desc"]
    score = content["score"]
    link = url_helper.url_normalize(content["website"])

    base = {
        "shortName": name,
        "fullName": None,
        "productName": name,
        "description": None,
        "brief": desc,
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": source_id,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": score,
    }
    base["artifacts"] = [{"name": name, "desc": desc, "link": link}]
    return base
예제 #7
0
def process(item):
    """Turn one ``deal_artifact_new`` row into a ``source_artifact`` row.

    Steps:

    1. If the row has no ``dealId`` or the deal does not exist, mark the row
       as failed via ``set_deal_artifact_new_proceed(id, "F")`` and stop.
    2. Find or create the ``source_company`` row for the deal's company
       (source ``13001``, sourceId = deal id).
    3. If the row has no ``sourceArtifactId`` yet, insert a
       ``source_artifact`` (for type ``4010`` the link is normalised and its
       domain resolved), link it back to the row with ``proceed='Y'`` and
       reset the source company's ``processStatus`` to 0.

    :param item: row from ``deal_artifact_new`` (mapping access by column).
    """
    logger.info("process: %s, %s", item["id"], item["name"])
    deal_id = item["dealId"]
    if deal_id is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return

    conn = db.connect_torndb()
    try:
        deal = conn.get("select * from deal where id=%s", deal_id)
    finally:
        conn.close()
    if deal is None:
        set_deal_artifact_new_proceed(item["id"], "F")
        return

    company_id = deal["companyId"]

    conn = db.connect_torndb()
    try:
        sc = conn.get(
            "select * from source_company where companyId=%s and source=13001 and sourceId=%s",
            company_id, str(deal_id))
        if sc is None:
            source_company_id = conn.insert(
                "insert source_company(companyId,source,sourceId,createTime,processStatus) "
                "values(%s,%s,%s,now(),%s)", company_id, 13001, str(deal_id), 2)
        else:
            source_company_id = sc["id"]

        if item["sourceArtifactId"] is None:
            link = item["link"]
            domain = None
            if item["type"] == 4010:
                link = url_helper.url_normalize(link)
                flag, domain = url_helper.get_domain(link)
                if flag is False:
                    domain = None

            sourceArtifactId = conn.insert(
                "insert source_artifact(sourceCompanyId,name,description,link,domain,type,createTime) "
                "values(%s,%s,%s,%s,%s,%s,now())", source_company_id, item["name"],
                item["description"], link, domain, item["type"])
            conn.update(
                "update deal_artifact_new set sourceArtifactId=%s, proceed='Y' where id=%s",
                sourceArtifactId, item["id"])
            conn.update("update source_company set processStatus=0 where id=%s",
                        source_company_id)
    finally:
        # Close the connection even when one of the statements fails.
        conn.close()
예제 #8
0
def parser(item):
    """Extract investor details from a crawled investor page.

    :param item: dict with ``key`` and ``content`` (the page HTML), or
        ``None``.
    :returns: ``None`` when ``item`` is ``None`` or no investor name can be
        determined; otherwise the tuple ``(investor_key, investor_name,
        logo, website, stageStr, fieldsStr, desc)``.  ``website`` is
        ``None`` when the page has no link, shows "暂无", or
        ``url_helper.get_domain`` returns ``None`` as its flag.
    """
    if item is None:
        return None

    investor_key = item["key"]

    d = pq(item["content"])
    investor_name = d('div.picinfo> p> span.title').text()
    investor_name = name_helper.company_name_normalize(investor_name)

    # Check before logging: concatenating None into the message used to
    # raise TypeError and hide the "no name" case.
    if investor_name is None:
        logger.info("No investor name!!!")
        return None
    logger.info("investor_name: %s", investor_name)

    logo = d('div.pic> img').attr("src")
    if logo is not None:
        logo = logo.strip()
    logger.info("Investor Logo: %s" % logo)

    website = d('span.links >a[target="_black"]').attr("href")
    if website is None or website.strip() == "暂无":
        website = None
    else:
        website = url_helper.url_normalize(website)
        if website is not None:
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                website = None

    logger.info("Investor website: %s" % website)

    stageStr = d('div.pad.block> div.list-tags.yellow').text().replace(
        " ", ",").strip()
    logger.info("Investor rounds: %s" % stageStr)

    fieldsStr = d('div.pad.block> div.list-tags.darkblue').text().replace(
        " ", ",").strip()
    logger.info("Investor fields: %s" % fieldsStr)

    desc = d('div.des').text().strip()
    logger.info("Investor desc: %s" % desc)

    return investor_key, investor_name, logo, website, stageStr, fieldsStr, desc
예제 #9
0
def parse_base(item):
    """Build the base company record, with an artifact when the link is usable.

    :param item: dict with ``key`` and ``content`` (which provides ``name``,
        ``desc``, ``score`` and ``website``), or ``None``.
    :returns: ``None`` for a ``None`` item; otherwise a dict of company
        fields.  ``artifacts`` holds one entry when the normalised website
        is of type ``4010``, or of type ``4040``/``4050`` with a truthy app
        id; otherwise it is empty.
    """
    if item is None:
        return None

    source_id = item["key"]
    content = item["content"]

    link = url_helper.url_normalize(content["website"])
    artifact_type, app_market, app_id = url_helper.get_market(link)

    name = content["name"]
    desc = content["desc"]

    is_website = artifact_type == 4010
    is_app = artifact_type in (4040, 4050) and app_id
    artifacts = []
    if is_website or is_app:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": app_id,
        })

    base = {
        "shortName": name,
        "fullName": None,
        "productName": name,
        "description": None,
        "brief": desc,
        "round": 0,
        "roundDesc": "",
        "companyStatus": 2010,
        "fundingType": 0,
        "locationId": 0,
        "establishDate": None,
        "logo": None,
        "sourceId": source_id,
        "field": None,
        "subField": None,
        "tags": None,
        "type": 41020,
        "score": content["score"],
        "artifacts": artifacts,
    }
    return base
예제 #10
0
def parse_artifact(source_company_id,item):
    """Build the artifact list for a company from its ``baseinfo`` website.

    :param source_company_id: id stored as ``sourceCompanyId`` on each
        artifact.
    :param item: dict whose ``baseinfo`` entry provides ``name`` and,
        optionally, ``website``.
    :returns: a list with at most one artifact dict.  A type ``4010`` link
        is kept unless it contains "neeq"; a type ``4040``/``4050`` link is
        kept when ``get_android_domain`` resolves a domain for it.
    """
    logger.info("parse_artifact")
    c = item["baseinfo"]
    artifacts = []

    # ``or ""`` also covers a website key that is present but set to None.
    website = (c.get("website") or "").strip()
    website = url_helper.url_normalize(website)
    if website is None or website == "":
        return artifacts

    # The original test used ``website.find("https://")`` as a boolean, which
    # is truthy for -1 (not found).  Check the prefix explicitly instead.
    if not website.startswith(("http://", "https://")):
        website = "http://"+website

    artifact_type, market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        # Links containing "neeq" are not kept as company websites.
        if website.find('neeq') <= 0:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": app_id,
                "type": artifact_type
            })
    elif artifact_type in (4040, 4050) and app_id is not None:
        domain = get_android_domain(market, app_id)
        if domain is not None:
            artifacts.append({
                "sourceCompanyId": source_company_id,
                "name": c["name"],
                "description": None,
                "link": website,
                "domain": domain,
                "type": artifact_type
            })

    return artifacts
예제 #11
0
def parse_company(item):
    """Parse a crawled company page into a source-company record.

    :param item: dict with ``key``, ``content`` (UTF-8 encoded HTML) and
        ``url``, or ``None``.
    :returns: ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}``
        when the short or full name is missing or ``name_helper.name_check``
        does not confirm the full name; otherwise a dict with the company
        fields, its ``artifacts`` (product links) and ``members``.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    name = d('h1.name').text().strip()

    fullName = d('div.company-business> h4').text()
    if fullName.find("来源")>=0:
        fullName = fullName.split(" ")[-1]

    fullName = name_helper.company_name_normalize(fullName)

    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    # Prefer the full name when it is shorter than the page heading.
    if len(name) > len(fullName):
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    # ``attr`` returns None when the page has no logo <img>; guard before
    # calling string methods on it.  Only absolute URLs or paths containing
    # "default" are kept.
    logo = d('div.company-logo> img').attr('src')
    if logo is not None:
        if not (logo.startswith("http") or logo.find("default") >= 0):
            logo = None

    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)

    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:',"").replace("收起","").replace("展开","").replace("&nbsp;","").strip()

    # The summary paragraph is expected to read "<stage> <headcount> <field>".
    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        parts = d('div.info-primary> p').text().strip().split(" ")
        if len(parts) == 3:
            stage, headCount, field = parts
    except Exception:
        pass

    headCount = headCount.replace("人", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): the range bounds stay strings here while the
            # other branches yield ints or None. Confirm what consumers expect.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except ValueError:
                min_staff = None
                max_staff = None

    # NOTE(review): the mapped stage code is computed but the record below
    # hard-codes "stage": 0 and "round": None; only funding_type is emitted.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    # Product links become artifacts. Types 4020/4030 never produced an
    # artifact in the original code (their domain was always None), so they
    # are simply not handled here.
    artifacts = []
    for linkp in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)

        artifact_type, app_market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            # Skip the crawled page itself and any link back to zhipin.
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": None,
                        "link": website,
                        "domain": domain
                    })
        elif artifact_type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif artifact_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })

    # Management team members.
    members = []
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            # A member without a photo raises here (attr returns None) and
            # is skipped, as in the original best-effort behaviour.
            logo_url = mem('div.info-user> img').attr('src')
            if not logo_url.startswith("http"):
                logo_url = "http:" + logo_url
            members.append({'name': mem('p> span.name').text(),
                            'photo_url': logo_url,
                            'weibo': None,
                            'location': None,
                            'role': mem('p> span.job-title').text(),
                            'description': mem('div.item_manager_content').text(),
                            'education': None,
                            'work': None
                            })
        except Exception:
            pass

    sourceId2link =  d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >=0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html","")
    else:
        sourceId2 =  None

    source_company = {
                      "name": name,
                      "fullName": fullName  if fullName is not None and fullName.strip() != "" else None,
                      "description": desc,
                      "brief": brief,
                      "round": None,
                      "roundDesc": None,
                      "companyStatus": 2010,
                      'fundingType': funding_type,
                      "locationId": int(0),
                      "address": address,
                      "phone": None,
                      "establishDate": None,
                      "logo": logo,
                      "source": SOURCE,
                      "sourceId": company_key,
                      "sourceId2": sourceId2,
                      "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
                      "field": field,
                      "headCountMin": min_staff,
                      "headCountMax": max_staff,
                      "artifacts": artifacts,
                      "members": members,
                      "status": 1,
                      "stage": 0,
                      }

    return source_company
예제 #12
0
def parse_artifact(source_company_id, item):
    """Build the artifact list from a company's overview website.

    :param source_company_id: accepted for interface compatibility; not
        used by this function.
    :param item: dict with ``name`` and
        ``content.company_base.{properties, overview_fields2}``.
    :returns: a list with at most one artifact dict.  Websites that contain
        "twitter", "linkedin" or "facebook" are ignored.  Type ``4010``
        links are kept when ``url_helper.get_domain`` returns a non-``None``
        flag; types ``4040``/``4050`` are kept when an app domain is known.
        Types ``4020``/``4030`` never yield an artifact.
    """
    name = item['name']
    logger.info('parse_artifact:%s' % name)

    artifacts = []
    company_base = item['content']['company_base']

    properties = company_base['properties']
    desc = ''
    if 'short_description' in properties:
        desc = properties['short_description']

    overview = company_base['overview_fields2']
    if 'website' not in overview:
        return artifacts

    website = url_helper.url_normalize(overview['website']['value'])
    if website is None:
        return artifacts
    if any(social in website
           for social in ('twitter', 'linkedin', 'facebook')):
        return artifacts

    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is None:
            return artifacts
        if flag is False:
            domain = None
        # Website artifacts are kept even when no domain could be resolved.
    elif artifact_type == 4040:
        domain = app_id
        if domain is None:
            return artifacts
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is None:
            return artifacts
    else:
        # Includes 4020/4030, for which no domain is ever available.
        return artifacts

    artifacts.append({
        "type": artifact_type,
        "name": name,
        "description": desc,
        "link": website,
        "domain": domain
    })
    return artifacts
예제 #13
0
logger = loghelper.get_logger("prepare_source_artifact_domain")


# Back-fill the ``domain`` column of ``source_artifact`` rows that lack one.
# Rows are read in pages of 1000 ordered by id:
#   * type 4010 rows get the domain resolved from the normalised link;
#   * type 4040/4050 rows get the type and track id that
#     ``url_helper.get_market`` derives from the link.
if __name__ == "__main__":
    start = 0
    conn = db.connect_torndb()
    try:
        while True:
            items = list(conn.query("select * from source_artifact order by id limit %s,1000",start))
            if len(items) == 0:
                break

            for item in items:
                if item["domain"] is not None and item["domain"].strip() != "":
                    # Already has a domain; leave it alone.
                    continue

                if item["type"] == 4010:
                    link = url_helper.url_normalize(item["link"])
                    (flag, domain) = url_helper.get_domain(link)
                    if flag is True:
                        logger.info("%s, %s %s %s", item["id"], item["type"], link, domain)
                        conn.update("update source_artifact set domain=%s where id=%s", domain, item["id"])

                elif item["type"] == 4040 or item["type"] == 4050:
                    (apptype, appmarket, trackid) = url_helper.get_market(item["link"])
                    if (apptype == 4040 or apptype == 4050) and trackid is not None:
                        logger.info("%s %s %s %s", item["id"], apptype, item["link"], trackid)
                        conn.update("update source_artifact set type=%s, domain=%s where id=%s",apptype,trackid,item["id"])

            start += 1000
    finally:
        # Close the connection even if a page fails half-way through.
        conn.close()
예제 #14
0
파일: itunes.py 프로젝트: yujiye/Codes
def _mark_itunes_index_processed(item):
    """Set ``processed`` to True on the ``market.itunes_index`` entry of ``item``."""
    mongo = db.connect_mongo()
    try:
        mongo.market.itunes_index.update({"_id": item["_id"]},
                                         {"$set": {
                                             "processed": True
                                         }})
    finally:
        mongo.close()


def run():
    """Worker loop: crawl every app queued in ``APPS`` and store it.

    For each queued index entry:

    1. Skip it (and mark it processed) when ``market.itunes`` already holds
       its ``trackId``.
    2. Fetch the app via the iTunes lookup API; mark the entry processed and
       move on when the lookup returns no result.
    3. Scrape the store page (``trackViewUrl``) for developer, support URL,
       related apps and user comments.
    4. Derive support/seller domains and the short track name, then insert
       the app, or update it when the crawled version is newer (archiving
       the old record in ``histories``), and mark the entry processed.

    Returns once ``APPS`` is empty.
    """
    crawler = ItunesCrawler()
    while True:
        if len(APPS) == 0:
            return

        item = APPS.pop(0)

        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": item["trackId"]},
                projection={'histories': False})
        finally:
            mongo.close()
        if record is not None:
            _mark_itunes_index_processed(item)
            continue

        url = "https://itunes.apple.com/cn/lookup?id=%s" % item["trackId"]
        data = None
        # NOTE(review): this loop retries without limit until the crawl
        # succeeds.
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                rjson = json.loads(result["content"])
                if rjson["resultCount"] > 0:
                    data = rjson["results"][0]
                break
        if data is None:
            _mark_itunes_index_processed(item)
            continue

        url = item["trackViewUrl"]
        # NOTE(review): retries without limit unless the crawl succeeds or
        # the store answers with its explicit error page.
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(result["content"])

                developer = d(".product-header__identity> a").text()
                if developer is not None:
                    developer = developer.replace("开发商:", "")
                data["developer"] = developer

                # Support link: the first link whose label ends with "支持"
                # and that actually carries an href.
                supportUrl = None
                for i in d('li.t-subbody>a.targeted-link.link.icon'):
                    anchor = pq(i)
                    if anchor.text().strip().endswith("支持"):
                        href = anchor.attr('href')
                        if href is not None:
                            supportUrl = href.strip()
                            break
                data["supportUrl"] = url_helper.url_normalize(supportUrl)

                # Related apps: numeric ids taken from the peek-row links.
                relatedApps = []
                try:
                    for app in d('div.l-row.l-row--peek> a'):
                        appurl = pq(app).attr('href')
                        r = util.re_get_result(r'/id(\d*)', appurl)
                        if r is not None:
                            track_id, = r
                            try:
                                relatedApps.append(int(track_id))
                            except (TypeError, ValueError):
                                pass
                except Exception:
                    # Best effort: keep whatever was collected so far.
                    pass
                data["relatedApps"] = relatedApps

                userComments = []
                for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                    c = pq(cdiv)
                    try:
                        c_title = c(
                            'div.we-customer-review> div.we-customer-review__header> h3'
                        ).eq(1).text().strip()
                        c_commentator = c('div.we-customer-review__user').eq(
                            1).text().replace("评论人:", "").strip()
                        c_content = c('p.we-customer-review__body').attr(
                            "aria-label")

                        userComments.append({
                            "title": c_title,
                            "commentator": c_commentator,
                            "content": c_content
                        })
                    except Exception:
                        # Best effort: skip a review that cannot be parsed.
                        pass

                logger.info(
                    json.dumps(userComments,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                data["userComments"] = userComments

                break
            elif result['get'] == 'fail' and result["content"] is not None:
                if result["content"].find(
                        "Your request produced an error.") >= 0:
                    break

        if data.get("supportUrl") is not None:
            flag, domain = url_helper.get_domain(data["supportUrl"])
            data["supportDomain"] = domain if flag else None
        if data.get("sellerUrl") is not None:
            data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
            flag, domain = url_helper.get_domain(data["sellerUrl"])
            data["sellerDomain"] = domain if flag else None

        short_name = name_helper.get_short_name(data["trackName"])
        data["trackShortName"] = short_name
        logger.info(json.dumps(data, ensure_ascii=False,
                               cls=util.CJsonEncoder))

        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": data["trackId"]},
                projection={'histories': False})
            if record:
                _id = record.pop("_id")
                # Only a strictly newer version replaces the stored record.
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    mongo.market.itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                mongo.market.itunes.insert(data)
            mongo.market.itunes_index.update({"_id": item["_id"]},
                                             {"$set": {
                                                 "processed": True
                                             }})
        finally:
            # Close the connection even when a write fails.
            mongo.close()
예제 #15
0
def parse_artifact(item):
    """Collect the artifact records advertised by a crawled company profile.

    Reads ``item["content"]["company_base"]["data"]["company"]`` and emits one
    dict per usable link, in this order: website, weibo, weixin, iPhone App
    Store link, iPad App Store link, Android link.

    Args:
        item: Crawled record holding the nested ``company`` dict. The
            company's ``name`` is read whenever an artifact is emitted.

    Returns:
        A list of dicts with the keys ``sourceCompanyId`` (always None here),
        ``name``, ``description`` (always None), ``link``, ``domain`` and
        ``type``.
    """
    logger.info("parse_artifact")
    c = item["content"]["company_base"]["data"]["company"]
    artifacts = []

    def text_of(key):
        # A key that is present with a null value makes ``c.get(key, "")``
        # return None, and ``.strip()`` would then raise. Treat null like a
        # missing key instead.
        return (c.get(key) or "").strip()

    def add(link, domain, artifact_type):
        artifacts.append({
            "sourceCompanyId": None,
            "name": c["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": artifact_type
        })

    # Website: either a plain site (type 4010) or an app-market page
    # (type 4040 / 4050) that happens to be listed as the website.
    website = url_helper.url_normalize(text_of("website"))
    if website is not None and website != "":
        artifact_type, market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            # 36kr.com links are dropped unless the company name itself
            # contains "36".
            if not (website.find('36kr.com') > 0
                    and c["name"].find('36') == -1):
                add(website, app_id, artifact_type)
        elif artifact_type in (4040, 4050) and app_id is not None:
            domain = get_android_domain(market, app_id)
            if domain is not None:
                add(website, domain, artifact_type)

    weibo = text_of("weibo")
    if weibo:
        add(weibo, None, 4030)

    weixin = text_of("weixin")
    if weixin:
        add(weixin, None, 4020)

    # App-store style links share one code path; the domain lookup runs
    # before the type check, exactly as the original did for each link.
    for key in ("iphoneAppstoreLink", "ipadAppstoreLink", "androidLink"):
        link = text_of(key)
        if not link:
            continue
        artifact_type, market, app_id = url_helper.get_market(link)
        domain = get_android_domain(market, app_id)
        if artifact_type in (4040, 4050) and domain is not None:
            add(link, domain, artifact_type)

    return artifacts
예제 #16
0
def parse_artifact(source_company_id, item):
    """Collect artifact records from the company and product sections.

    Both ``item["content"]["company_base"]["data"]`` and
    ``item["content"]["product"]["data"]["companyProduct"]`` are scanned, in
    that order, for website, weibo, weixin, ``ios`` and ``android`` links.
    A link string that has already produced an artifact is not emitted again.

    Args:
        source_company_id: Value stored under ``sourceCompanyId`` in every
            emitted artifact.
        item: Crawled record holding the two nested dicts described above.

    Returns:
        A list of dicts with the keys ``sourceCompanyId``, ``name``,
        ``description`` (always None), ``link``, ``domain`` and ``type``.
    """
    logger.info("parse_artifact")
    cc = item["content"]["company_base"]["data"]
    cp = item["content"]["product"]["data"]["companyProduct"]
    artifacts = []
    links = []  # link strings already emitted, used for de-duplication

    def text_of(section, key):
        # A key that is present with a null value makes ``.get(key, "")``
        # return None, and ``.strip()`` would then raise. Treat null like a
        # missing key instead.
        return (section.get(key) or "").strip()

    def add(section, link, domain, artifact_type):
        artifacts.append({
            "sourceCompanyId": source_company_id,
            "name": section["name"],
            "description": None,
            "link": link,
            "domain": domain,
            "type": artifact_type
        })
        links.append(link)

    for c in [cc, cp]:
        # Website: either a plain site (type 4010) or an app-market page
        # (type 4040 / 4050) that happens to be listed as the website.
        website = url_helper.url_normalize(text_of(c, "website"))
        if website is not None and website != "" and website not in links:
            artifact_type, market, app_id = url_helper.get_market(website)
            if artifact_type == 4010:
                # 36kr.com links are dropped unless the name contains "36".
                if not (website.find('36kr.com') > 0
                        and c["name"].find('36') == -1):
                    add(c, website, app_id, artifact_type)
            elif artifact_type in (4040, 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    add(c, website, domain, artifact_type)

        # Only values that actually mention "weibo" are accepted.
        weibo = text_of(c, "weibo")
        if weibo and weibo.find("weibo") >= 0 and weibo not in links:
            add(c, weibo, None, 4030)

        # For weixin the value itself doubles as the domain.
        weixin = text_of(c, "weixin")
        if weixin and weixin not in links:
            add(c, weixin, weixin, 4020)

        # NOTE: "ipadAppstoreLink" is not collected here; that branch was
        # commented out in the previous revision.
        for key in ("ios", "android"):
            link = text_of(c, key)
            if not link or link in links:
                continue
            artifact_type, market, app_id = url_helper.get_market(link)
            domain = get_android_domain(market, app_id)
            if artifact_type in (4040, 4050) and domain is not None:
                add(c, link, domain, artifact_type)

    return artifacts
예제 #17
0
def process(crawler, url, apkname, content):
    """Parse one app detail page and persist the extracted record.

    When ``has_content`` accepts the page, the fields are scraped, the record
    is handed to ``android.save`` / ``android.merge`` and the app is flagged
    as processed and found. Otherwise the app is flagged as processed and
    not found.

    Args:
        crawler: Not used by this function.
        url: Page URL, stored as ``link``.
        apkname: Package name; stored as ``apkname`` and ``key`` and used to
            locate the document in ``collection_android``.
        content: Raw page body; it is decoded as UTF-8 for DOM parsing and
            also searched directly with regular expressions.
    """
    if not has_content(content, apkname):
        logger.info("App: %s has no content", apkname)
        collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
        return

    logger.info("hereherehere")
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    name = d('span.title').text()
    icon = d('div.app-icon> img').attr("src")
    brief = d('p.tagline').text()
    commentbyeditor = d('div.editorComment> div').text()
    screenshots = [pq(img).attr("src") for img in d('div.overview> img')]
    desc = d('div.desc-info> div').text()
    updates = d('div.change-info> div').text()

    # File size: a plain byte count, or a "<number>KB" / "<number>MB" string.
    # A missing attribute used to raise AttributeError from inside the
    # fallback branch; it now simply yields None.
    size_attr = d('meta[itemprop="fileSize"]').attr("content")
    try:
        size = int(size_attr)
    except (TypeError, ValueError):
        size = None
        if size_attr is not None:
            if size_attr.find("KB") >= 0:
                size = int(float(size_attr.replace("KB", "").strip()) * 1024)
            elif size_attr.find("MB") >= 0:
                size = int(float(size_attr.replace("MB", "").strip()) * 1024 * 1024)

    tags = d('dd.tag-box >a').text().replace(" ", ",")

    datestr = d('time#baidu_time').text()
    updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")

    author = d('span.dev-sites').text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)

    # Best effort: any failure while normalising the link leaves no website.
    try:
        website = d('a.dev-sites').attr("href")
        website = url_helper.url_normalize(website)
    except Exception:
        website = None

    # The compatibility text is pulled from the raw markup; the pattern
    # depends on whether the page contains the permissions link text.
    compatibility = None
    if content.find("查看权限要求") == -1:
        r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
    else:
        r1 = "content=\"Android\">(.*?)<div>.*"
    result1 = util.re_get_result(r1, content)
    if result1:
        (compatibility,) = result1
        # "\\s" is the literal two-character text backslash-s, the same
        # value the original "\s" literal had; it is not a whitespace class.
        compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\\s", "").replace(" ", "")

    versionname = None
    r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
    result2 = util.re_get_result(r2, content)
    if result2:
        (versionname,) = result2
        versionname = versionname.replace("\n", "").replace("\r", "").replace("\\s", "").replace("&nbsp;", "").strip()

    # Keep only the first whitespace-separated token and drop "V" markers.
    if versionname:
        tokens = versionname.split()
        if tokens:
            versionname = tokens[0]
            if versionname.startswith("V"):
                versionname = versionname.replace("V", "")

    # Download count: the attribute is read as "<label>:<count>", where the
    # count may be a plain integer or carry a 万 (1e4) / 亿 (1e8) suffix.
    # A missing attribute or missing colon used to raise; now it is logged.
    interaction = d("i[itemprop='interactionCount']").attr("content")
    dnum = None
    if interaction is not None:
        pieces = interaction.split(":")
        if len(pieces) > 1:
            dnum = pieces[1]

    download = None
    if dnum is None:
        logger.info("********download :%s cannot get", interaction)
    else:
        try:
            download = int(dnum)
        except ValueError:
            if dnum.find("万") >= 0:
                download = int(float(dnum.replace("万", "").strip()) * 10000)
            elif dnum.find("亿") >= 0:
                download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", dnum)

    item = {
        "link": url,
        "apkname": apkname,
        "appmarket": APPMARKET,
        "name": name,
        "brief": brief,
        "website": website,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": updatedate,
        "language": None,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": None,
        "key": apkname,
        "key_int": None,
        "download": download,
    }

    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
    collection_android.update_one({"apkname": apkname}, {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
예제 #18
0
def parse_base(item):
    """Extract the base company profile from a crawled company page.

    Args:
        item: Crawled record with ``key`` (stored as ``sourceId``) and
            ``content`` (the page HTML), or None.

    Returns:
        None when ``item`` is None or when neither a short name nor a full
        name could be extracted. Otherwise a dict with the keys
        ``shortName``, ``fullName``, ``productName``, ``description``,
        ``brief``, ``round``, ``roundDesc``, ``companyStatus``,
        ``fundingType``, ``locationId``, ``establishDate``, ``logo``,
        ``sourceId``, ``field``, ``subField``, ``tags``, ``type`` and
        ``artifacts``.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The title reads "<product>" or "<product>/<company short name>".
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name is None or product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    # "暂无" / "暂未收录" are placeholder values and count as empty.
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name is None or company_name.strip() == "":
        # Best effort: fall back to the <h2>, keep the empty value on error.
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # Establish date is read as "<year>.<month>"; the day is fixed to 1.
    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except ValueError:
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # The pattern also matches an empty year; leave the date unset
            # instead of failing the whole page.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    # Location: try the city first, then the province, then whatever can be
    # derived from the company name.
    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        result = parser_db_util.get_location(city)
        if result is not None:
            locationId = result["locationId"]
        else:
            result = parser_db_util.get_location(province)
            if result is not None:
                locationId = result["locationId"]

    if locationId == 0:
        loc1, _loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result is not None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    # Both recognised labels map to 8020, as in the previous revision.
    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    if text == "融资需求 · 需要融资":
        funding_type = 8020
    elif text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)

    # str.find returns -1 (truthy) when the text is absent, so the bare
    # truthiness test used before wiped nearly every brief. Compare the
    # index explicitly so only the placeholder text is discarded.
    if brief.find("暂未收录") >= 0:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = (d("div.des,div.desc,div.introduction,div.abstract,div.summary").text()
            .replace("购买数据请联系", "")
            .replace('*****@*****.**', "")
            .replace("itjuzi是一家数据服务公司", "")
            .strip())
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    artifacts = []
    # NOTE(review): passes 2 and 3 run the same selector, so links found by
    # it are appended twice. Kept as-is; confirm whether consumers dedupe.
    for ty in [1, 2, 3]:
        if ty == 1:
            was = d('div.link-line> a')
        else:
            was = d('div.link-line> span.weblink,span.webTink> a')

        for wa in was:
            webs = []

            # Every element contributes up to two candidates: its href and
            # its visible text. A failure on one candidate skips only that one.
            for read in (lambda: pq(wa).attr("href"), lambda: pq(wa).text()):
                try:
                    website = read().strip()
                    if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    pass

            for website in webs:
                artifact_type, app_market, app_id = url_helper.get_market(website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

                elif artifact_type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })

                elif artifact_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })

                elif artifact_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id
                        })

                elif artifact_type == 4050:
                    # For markets 16010 / 16020 the domain is the apkname
                    # looked up in the database; elsewhere it is the app id.
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # Funding round.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }
예제 #19
0
def handle_lookup_result(response, app, date_num):
    """Handle the response of a lookup request for one app.

    On a transport error the same URL is requested again with this function
    as the callback and nothing else happens. Otherwise the JSON body is
    parsed and the matching document in ``collection_itunes`` is created,
    refreshed or marked offline. Finally the module-level ``total`` counter
    is decremented and ``begin()`` is called once it reaches zero or below.

    Args:
        response: Response object exposing ``error``, ``request.url`` and
            ``body``.
        app: Dict describing the artifact; ``domain`` (converted to the
            integer track id), ``companyId``, ``id`` and ``trackId`` are read.
        date_num: Value compared against 6 to decide whether the store page
            is fetched when a newer version is detected.
    """
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        logger.info("Last Total number of current patch: %s", total)
        # Retry the same URL. ``total`` is not decremented on this path, so
        # the retried request still counts as outstanding.
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(
                    r, app, date_num))
        return
    else:
        logger.info("Getting result from url: %s", response.request.url)
        trackId = int(app["domain"])
        try:
            data = json.loads(response.body)
            if data["resultCount"] > 0:
                # Only the first result whose trackId matches is processed
                # (see the ``break`` at the end of the loop body).
                for result in data["results"]:
                    if result.get("trackId") == trackId:
                        score = result.get("averageUserRating")
                        comment = result.get("userRatingCount")
                        logger.info(
                            "companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s"
                            % (app["companyId"], app["id"], score, comment,
                               date_num))

                        # NOTE(review): the track id above is taken from
                        # app["domain"], but app["trackId"] is used here --
                        # confirm that ``app`` really carries a "trackId" key.
                        if score is not None or comment is not None:
                            save_comment(app["trackId"], score, comment)

                        logger.info("Last Total number of current patch: %s",
                                    total)

                        # Normalise the seller URL and derive its domain;
                        # the domain is None when get_domain's flag is falsy.
                        if result.has_key("sellerUrl") and result[
                                "sellerUrl"] is not None:
                            result["sellerUrl"] = url_helper.url_normalize(
                                result["sellerUrl"])
                            flag, domain = url_helper.get_domain(
                                result["sellerUrl"])
                            if flag:
                                result["sellerDomain"] = domain
                            else:
                                result["sellerDomain"] = None

                        short_name = name_helper.get_short_name(
                            result["trackName"])
                        result["trackShortName"] = short_name

                        # The stored document is fetched without its
                        # ``histories`` array.
                        record = collection_itunes.find_one(
                            {"trackId": result["trackId"]},
                            projection={'histories': False})
                        if record:
                            # Stamp the time of this check on the document.
                            collection_itunes.update_one(
                                {"_id": record["_id"]}, {
                                    '$set': {
                                        "checkTime": datetime.datetime.now()
                                    }
                                })
                            # The app was flagged offline earlier but is
                            # listed again: flip the flag back to 'N' and
                            # record the transition.
                            if record.get("offline_itunes", None) == 'Y':
                                offrecord = {
                                    "offlineDetectTime":
                                    datetime.datetime.now(),
                                    "offline_itunes": 'N'
                                }
                                collection_itunes.update_one(
                                    {"_id": record["_id"]}, {
                                        '$set': {
                                            "offline_itunes":
                                            'N',
                                            "offlineitunesDetectTime":
                                            datetime.datetime.now()
                                        },
                                        '$addToSet': {
                                            "offline_itunes_histories":
                                            offrecord
                                        }
                                    })
                            # Remove _id so the old document can be stored
                            # inside ``histories`` without it.
                            _id = record.pop("_id")
                            # Nothing further is written when the version is
                            # not newer than the stored one.
                            if LooseVersion(result["version"]) > LooseVersion(
                                    record["version"]):
                                # if 1:
                                # NOTE(review): a missing "trackViewUrl"
                                # makes .replace raise here (caught by the
                                # bare except below), so the later
                                # ``page_url is not None`` test cannot fail.
                                page_url = result.get("trackViewUrl").replace(
                                    "&uo=4", "")

                                if date_num == 6 and page_url is not None and page_url.strip(
                                ) != "":
                                    # Earlier note said "only do it when date
                                    # is 6/16/226"; the code checks only
                                    # date_num == 6.
                                    logger.info(
                                        "Need to crawler page data: %s",
                                        page_url)
                                    # One more outstanding request; the
                                    # update is delegated to save_itunes.
                                    total += 1
                                    request(page_url,
                                            lambda r, appdata=result:
                                            save_itunes(r, appdata))
                                else:
                                    logger.info(
                                        json.dumps(result,
                                                   ensure_ascii=False,
                                                   cls=util.CJsonEncoder))
                                    # Overwrite the document with the fresh
                                    # result and archive the previous state.
                                    result["createTime"] = record["createTime"]
                                    result[
                                        "modifyTime"] = datetime.datetime.now(
                                        )
                                    collection_itunes.update_one(
                                        {"_id": _id}, {
                                            '$set': result,
                                            '$addToSet': {
                                                "histories": record
                                            }
                                        })
                        else:
                            # First time this track id is seen: insert it.
                            result["createTime"] = datetime.datetime.now()
                            result["modifyTime"] = result["createTime"]
                            collection_itunes.insert(result)

                        break
            elif data["resultCount"] == 0:
                # The lookup returned nothing: treat the app as offline.
                record = collection_itunes.find_one(
                    {"trackId": trackId}, projection={'histories': False})
                logger.info("***********Offline************")
                if record:
                    if record.get("offline_itunes",
                                  None) is None or record.get(
                                      "offline_itunes", None) == 'N':
                        # Not flagged yet: set the flag and log the change.
                        offrecord = {
                            "offlineDetectTime": datetime.datetime.now(),
                            "offline_itunes": 'Y'
                        }
                        collection_itunes.update_one({"_id": record["_id"]}, {
                            '$set': {
                                "offline_itunes": 'Y',
                                "offlineitunesDetectTime":
                                datetime.datetime.now(),
                                "checkTime": datetime.datetime.now()
                            },
                            '$addToSet': {
                                "offline_itunes_histories": offrecord
                            }
                        })
                    else:
                        # Already flagged: only refresh the check time.
                        collection_itunes.update_one(
                            {"_id": record["_id"]},
                            {'$set': {
                                "checkTime": datetime.datetime.now()
                            }})
        except:
            # Any failure is printed and otherwise ignored so the counter
            # below is still decremented.
            traceback.print_exc()

    # This request is finished; start the next batch once none are left.
    total -= 1
    if total <= 0:
        begin()
예제 #20
0
def parse_investor(item):
    """Extract an investor record from a crawled investor page.

    Args:
        item: Crawled record with ``key`` (stored as ``sourceId``) and
            ``content`` (the page HTML).

    Returns:
        None when no investor name can be found. Otherwise a dict with the
        keys ``name``, ``website``, ``description``, ``logo_url``, ``stage``,
        ``field``, ``type``, ``source`` and ``sourceId``.
    """
    logger.info("*** investfirm ***")

    investor_key = item["key"]
    html = item["content"]
    logger.info(investor_key)
    d = pq(html)

    # This specific image URL is treated as "no logo".
    logo = d('.logo-block > img').attr('src')
    if logo == "http://assets3.chuangyepu.com/chuangyepu/images/big-screenshot.png":
        logo = None

    basic_info = d('div.col-md-9> div> table> tr> td').eq(1)

    # The previous ``name is None`` test ran after ``.strip()`` and so could
    # never be true; an empty name is what actually signals a missing one.
    name = pq(basic_info)('div.name').text()
    name = name.strip() if name is not None else ""
    if not name:
        logger.info("No investor name!!!")
        return None

    desc = pq(basic_info)('div.desc').eq(0).text().strip()

    # Best effort: a failure while reading the link leaves no website.
    try:
        website = pq(basic_info)('div').eq(2)('a').text().strip()
    except Exception:
        website = None

    # "暂无" is a placeholder value and counts as no website.
    if website is None or website.strip() == "暂无":
        website = None

    # Only hand real strings to the URL helpers; a website for which
    # get_domain reports a None flag is discarded.
    if website is not None:
        website = url_helper.url_normalize(website)
    if website is not None:
        flag, _domain = url_helper.get_domain(website)
        if flag is None:
            website = None

    source_investor = {
        "name": name,
        "website": website,
        "description": desc,
        "logo_url": logo,
        "stage": None,
        "field": None,
        "type": 10020,
        "source": SOURCE,
        "sourceId": investor_key
    }
    logger.info(
        json.dumps(source_investor, ensure_ascii=False, cls=util.CJsonEncoder))

    return source_investor
예제 #21
0
def parse_company(item):
    """Parse a crawled Lagou company page into a source-company dict.

    Args:
        item: mapping with "key" (company id on the source site), "content"
            (raw page HTML as bytes) and "url" (the crawled page URL), or
            None.

    Returns:
        None when item is None; {"status": "No_Name"} when the page has no
        usable company name; otherwise a dict describing the company, its
        artifacts (website / apps) and its listed managers, with "status" 1.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # Placeholder page: the company's homepage is still under construction.
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = title_link.attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    # Names containing "拉勾" are rejected unless this is company key "147".
    if name is None or fullName is None or (name.find("拉勾") >= 0
                                            and company_key != "147"):
        return {
            "status": "No_Name",
        }
    # Prefer the full name when the short name is longer than it or blank.
    if len(name) > len(fullName):
        name = fullName
    if name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    # attr() returns None when the page has no logo image; only fix up and
    # filter real values instead of crashing on None.
    if logo is not None:
        if not logo.startswith("http"):
            # Scheme-relative URL ("//host/path").
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text() or ''

    desc = None
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            # Drop the "expand" toggle, then reduce the HTML to plain text.
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(desc_html, "lxml").getText()

    def _text_or_empty(selector):
        # Each field is read on its own so that one missing or malformed
        # field cannot blank out the fields read after it.
        try:
            return d(selector).text() or ''
        except Exception:
            return ''

    basic_item = '#basic_container > .item_content >ul > li:eq(%d) > span'
    field = _text_or_empty(basic_item % 0)
    stage = _text_or_empty(basic_item % 1)
    headCount = _text_or_empty(basic_item % 2)
    location = _text_or_empty(basic_item % 3)
    address = _text_or_empty('.con_mlist_ul > li:eq(0) > p:eq(1)')

    # Keep only the part before the "人" suffix when it is present; pages
    # using the "people" suffix are handled by the replace() below.
    suffix_at = headCount.find(u'人')
    if suffix_at >= 0:
        headCount = headCount[:suffix_at]
    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds are kept as strings here, while the
            # other branches produce ints -- confirm what consumers expect
            # before normalizing.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # Map the funding-stage label to the numeric stage code.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    # Classify the company link and record it as an artifact when a domain
    # (or app id / apk name) can be determined for it.
    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        # Skip links that point back at the crawled page or at lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif artifact_type == 4020 or artifact_type == 4030:
        # No domain is derived for these types, so nothing is recorded.
        pass
    elif artifact_type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # For these two markets the apk name is looked up from the
            # stored android-market record.
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })

    # Parse the listed managers.
    members = []
    member_rank = 0
    for li in d('.manager_list > li'):
        mem = pq(li)
        try:
            logo_url = mem('img').attr('src')
            if not logo_url.startswith("http"):
                logo_url = "http:" + logo_url
            member_rank += 1
            member_name = mem('p.item_manager_name > span').text()
            member_link = mem('p.item_manager_name > a').attr('href')
            member_position = mem('p.item_manager_title').text()
            member_desc = mem('div.item_manager_content').text()

            weibo = None
            if member_link is not None and 'weibo.com' in member_link:
                weibo = member_link

            members.append({
                'name': member_name,
                'photo_url': logo_url,
                'weibo': weibo,
                'location': None,
                'role': member_position,
                'description': member_desc,
                'education': None,
                'work': None
            })
        except Exception:
            # Best effort: an entry that cannot be parsed (e.g. one without
            # a photo) is skipped rather than failing the whole company.
            pass

    source_company = {
        "name": name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(location_id),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1
    }

    return source_company
예제 #22
0
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    """Expand one source company with related names, websites and apps.

    Starting from the company's stored names, main beian numbers and
    artifacts, each round looks up beian (ICP filing) records, iTunes apps
    and Android apps that share a company name or domain, saves what it
    finds, and checks every website artifact's reachability/redirect. The
    loop runs for at most 5 rounds, or stops earlier once there is nothing
    left to expand, and then flags the company as "scexpanded".

    Args:
        source: source id of the company record.
        sourceId: id of the company within that source.
        beian_links_crawler: crawler offering query_by_company_name,
            query_by_domain and query_by_main_beianhao.
        icp_chinaz_crawler: crawler offering query_by_domain and
            query_by_main_beianhao.
        screenshot_crawler: not used by this function (screenshots are
            currently disabled); kept for interface compatibility.
        test: when True, no crawler is queried; lookups that miss the local
            beian cache yield empty results instead.

    Returns:
        None. Results are written to the Mongo collections via the save_* /
        set_* helpers.
    """
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")

    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    if sourcecompany is None:
        # Nothing to expand; avoid crashing on the lookups below.
        logger.info("source: %s, sourceId: %s source company not found, skip expand", source, sourceId)
        return

    # Make sure the normalized full name is registered as a company name.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)

        scnames = sourcecompany["source_company_name"]
        already_known = any(scname["name"] == company_fullname for scname in scnames)
        if not already_known:
            (chinese, company) = name_helper.name_check(company_fullname)
            scname_data = {
                "name": company_fullname,
                "chinese": "Y" if chinese is True else "N",
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)

    expand_round = 1

    while True:
        if expand_round >= 6:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        # Presumably these return only the entries not expanded yet -- see
        # the set_*_expand calls at the end of the round; verify against
        # find_mongo_data.
        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)

        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Stop as soon as there is nothing new to expand.
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, expand_round)

        # Step A/1: beian lookup by company name.
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            # Only Chinese company names are looked up. name_check returns a
            # boolean, so it is mapped to the stored "Y"/"N" form first.
            chinese = source_company_name["chinese"]
            if chinese is None:
                (is_chinese, _company) = name_helper.name_check(source_company_name["name"])
                chinese = "Y" if is_chinese is True else "N"
            if chinese != "Y":
                continue

            # One company name may own several beian records; use the cached
            # ones when present, otherwise query the crawler.
            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                    save_collection_beian(collection_beian, items_beianlinks)
            else:
                items_beianlinks = check_name
            # May discover more website artifacts, company names and main
            # beian numbers.
            save_beian_artifacts(items_beianlinks, source, sourceId)
            save_beian_company_names(items_beianlinks, source, sourceId)
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)

        # Step A/2: beian lookup by domain.
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only website artifacts carry a domain to look up.
            if artifact["type"] != 4010:
                continue
            domain = _artifact_website_domain(artifact, normalize=True)
            if domain is None:
                continue

            check_domain = list(collection_beian.find({"domain": domain}))
            if len(check_domain) == 0:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                    save_collection_beian(collection_beian, items_merge)
            else:
                items_merge = check_domain

            # Filter by the queried domain to avoid the sinaapp.cn case.
            items_merge = filter_domain(items_merge, domain)

            save_beian_artifacts(items_merge, source, sourceId)
            save_beian_company_names(items_merge, source, sourceId)
            save_beian_mainbeianhaos(items_merge, source, sourceId)

        # Step A/3: beian lookup by main beian number.
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})

            if check_mainBeianhao is None:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                    save_collection_beian(collection_beian, items_merge)
                # Remember the number as queried only when it gave results.
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))

            save_beian_artifacts(items_merge, source, sourceId)
            save_beian_company_names(items_merge, source, sourceId)
            save_beian_mainbeianhaos(items_merge, source, sourceId)

        # Step B/1: refresh the iTunes artifacts already attached.
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)

        itunes_company_enames = {}
        app_by_name = {}

        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Resolve the iTunes track id from the domain or from the link.
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue
            else:
                try:
                    trackid = int(artifact["domain"])
                except (TypeError, ValueError):
                    pass

            if trackid is None:
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            app = collection_itunes.find_one({"trackId": trackid})
            if app is None:
                # Unknown app: mark the artifact inactive.
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            copy_from_itunes(app, artifact, source, sourceId)
            if "offline" in app and app["offline"] is True:
                set_artifact_active(artifact, "Offline", source, sourceId)
            else:
                set_artifact_active(artifact, "Y", source, sourceId)

            english, is_company = name_helper.english_name_check(app["sellerName"])
            if english and is_company:
                # Key by the actual seller name so that distinct English
                # names are counted separately.
                itunes_company_enames[app["sellerName"]] = 1
                app_by_name = app

        # Save the English company name only when it is unambiguous and the
        # company has no non-Chinese full name yet.
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})
            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)

        # Step B/2: find more iTunes artifacts by company name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) == 0:
                continue

            lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
            artifact_status = check_source_artifact(source, sourceId)

            for app in check_itunes_sellers:
                # Skip apps already attached to this source company.
                if find_itunesId(app["trackId"], source, sourceId):
                    continue
                save_itunes_artifact(app, source, sourceId)

                # Add the seller URL as a website artifact only while
                # check_source_artifact reports False and the apps agree on
                # a single seller domain. (supportUrl expansion stays
                # disabled: it produced incorrect results.)
                if "sellerUrl" in app and not artifact_status and lens_domain == 1:
                    artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                    if artifact_id is not None:
                        artifact_status = True

        # Step B/3: find more iTunes artifacts by domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            domain = _artifact_website_domain(artifact)
            if domain is None:
                continue
            # Domains listed in itunesDomainEx are excluded from this lookup.
            if domain in itunesDomainEx:
                continue

            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:
                _expand_itunes_apps_by_domain(check_itunes_sellerDomains, source, sourceId)

            # Very large result sets (100 or more apps) are ignored here.
            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:
                _expand_itunes_apps_by_domain(check_itunes_supportDomains, source, sourceId)

        # Step C/1: refresh the Android artifacts already attached.
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Resolve the apk name from the domain or from the link.
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                if apptype != 4050:
                    continue

                if appmarket == 16010 or appmarket == 16020:
                    # For these two markets the apk name comes from the
                    # stored android-market record.
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname is None:
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            app = collection_android.find_one({"apkname": apkname})
            if app is None:
                # Unknown app: mark the artifact inactive.
                set_artifact_active(artifact, "N", source, sourceId)
            else:
                copy_from_android(app, artifact, source, sourceId)
                set_artifact_active(artifact, "Y", source, sourceId)

        # Step C/2: find more Android artifacts by company name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            # Very large result sets (200 or more apps) are ignored.
            if not (len(check_android_authors) > 0 and len(check_android_authors) < 200):
                continue

            lens_domain = count_domains(check_android_authors, "website")
            artifact_status = check_source_artifact(source, sourceId)

            for app in check_android_authors:
                # Skip apps already attached to this source company.
                if find_androidAppname(app["apkname"], source, sourceId):
                    continue
                save_android_artifact(app, source, sourceId)

                # Same rule as for iTunes seller URLs in step B/2.
                if not artifact_status and lens_domain == 1:
                    artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                    if artifact_id is not None:
                        artifact_status = True

        # Step C/3: find more Android artifacts by domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            domain = _artifact_website_domain(artifact)
            if domain is None:
                continue

            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:
                _expand_android_apps_by_domain(check_android_websiteDomains, source, sourceId)

            # Threshold avoids over-matching cases such as com.wowotuan.
            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:
                _expand_android_apps_by_domain(check_android_apknameDomains, source, sourceId)

        # TODO: former company names.

        # Clean website artifacts: fetch meta info, mark unreachable sites
        # and handle redirects.
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            # Fetch fresh meta info when nothing is cached or the cached
            # entry is a 404.
            if meta is None or meta["httpcode"] == 404:
                meta = website.get_meta_info(url)
                if meta:
                    # Screenshot capture is currently disabled.
                    save_collection_website(collection_website, meta)
                else:
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)

            if not meta:
                continue

            if meta["httpcode"] == 404:
                set_artifact_active(artifact, "N", source, sourceId)
                continue
            if meta["httpcode"] != 200:
                continue

            redirect_url = meta.get("redirect_url")
            # A missing redirect_url is treated as "no redirect happened".
            if redirect_url is None or artifact["link"] == redirect_url:
                set_artifact_active(artifact, "Y", source, sourceId)
                continue

            # The site redirected: record the target as its own website.
            url = url_helper.url_normalize(redirect_url)
            (flag_new, domain_new) = url_helper.get_domain(url)

            meta_new = {
                "url": url,
                "domain": domain_new if flag_new is True else None,
                "redirect_url": url,
                "title": meta["title"],
                "tags": meta["tags"],
                "description": meta["description"],
                "httpcode": 200
            }
            save_collection_website(collection_website, meta_new)

            flag, domain = url_helper.get_domain(artifact["link"])
            if domain_new != domain:
                # Redirected away from the original domain.
                set_artifact_active(artifact, "Redirect", source, sourceId)
            elif flag is True:
                # The original link is itself a "good" address.
                set_artifact_active(artifact, "Y", source, sourceId)
            elif flag_new is True:
                # Only the redirect target is a "good" address: keep it as a
                # new artifact and mark the original as redirected.
                set_artifact_active(artifact, "Redirect", source, sourceId)
                save_website_artifact(meta_new, source, sourceId)
            else:
                set_artifact_active(artifact, "Y", source, sourceId)

        # Flag everything handled in this round as expanded.
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)

        expand_round += 1


def _artifact_website_domain(artifact, normalize=False):
    """Return the non-blank domain of a website artifact, or None to skip it."""
    domain = artifact["domain"]
    if domain is None:
        link = artifact["link"]
        if normalize:
            link = url_helper.url_normalize(link)
        (flag, domain) = url_helper.get_domain(link)
        # A None or False flag from get_domain means the link is skipped.
        if flag is None or flag is False:
            return None
    if domain is None or domain.strip() == "":
        return None
    return domain


def _expand_itunes_apps_by_domain(apps, source, sourceId):
    """Attach unseen iTunes apps and, if unambiguous, their seller name."""
    lens_company_names = count_company_names(apps, "sellerName")
    company_name_status = check_source_company_name(source, sourceId)

    for app in apps:
        # Skip apps already attached to this source company.
        if not find_itunesId(app["trackId"], source, sourceId):
            save_itunes_artifact(app, source, sourceId)

        # Save the seller name only while check_source_company_name reports
        # False and all apps share exactly one seller name.
        if company_name_status or lens_company_names != 1:
            continue

        chinese, is_company = name_helper.name_check(app["sellerName"])
        if chinese and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True

        english, is_company = name_helper.english_name_check(app["sellerName"])
        if english and is_company:
            save_company_name(app, "sellerName", source, sourceId)
            company_name_status = True


def _expand_android_apps_by_domain(apps, source, sourceId):
    """Attach unseen Android apps and, if unambiguous, their author name."""
    lens_company_names = count_company_names(apps, "author")
    company_name_status = check_source_company_name(source, sourceId)

    for app in apps:
        # Skip apps already attached to this source company.
        if not find_androidAppname(app["apkname"], source, sourceId):
            save_android_artifact(app, source, sourceId)

        # Save the author only while check_source_company_name reports False
        # and all apps share exactly one author.
        if company_name_status or lens_company_names != 1:
            continue

        chinese, is_company = name_helper.name_check(app["author"])
        if is_company:
            save_company_name(app, "author", source, sourceId)
            company_name_status = True
예제 #23
0
def save_itunes(response, data):
    """Enrich ``data`` from a fetched iTunes app page and persist it.

    Intended as the completion callback of an HTTP fetch. On a successful
    response the page body is parsed for the developer name, the support
    link, related app ids and user reviews; those values (plus derived
    domains and the short track name) are written into ``data``, which is
    then inserted into ``collection_itunes`` or, when the stored record has
    an older version, merged into it while the previous record is pushed to
    ``histories``.

    Whatever the outcome, the module-level ``total`` counter is decremented
    and ``begin()`` is called once it reaches zero or below.

    Args:
        response: fetch result exposing ``error``, ``request.url`` and
            ``body`` (presumably a tornado ``HTTPResponse`` -- confirm
            against the caller).
        data: dict describing the app; must contain ``trackId``,
            ``trackName`` and ``version``. It is mutated in place.
    """
    global total
    if response.error:
        # A failed fetch is only logged; no retry is attempted.
        logger.info("Error: %s, %s", response.error, response.request.url)
    else:
        try:
            d = pq(response.body)
            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            # The last link whose label ends with "支持" wins.
            supportUrl = None
            for i in d('li.t-subbody>a.targeted-link.link.icon'):
                anchor = pq(i)
                if anchor.text().strip().endswith("支持"):
                    href = anchor.attr('href')
                    # An anchor without href must not abort the whole save.
                    if href is not None:
                        supportUrl = href.strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)

            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            # Related apps: best effort, a malformed entry never fails the
            # record as a whole.
            relatedApps = []
            try:
                for app in d('div.l-row.l-row--peek> a'):
                    appurl = pq(app).attr('href')
                    r = util.re_get_result(r'/id(\d*)', appurl)
                    if r is None:
                        continue
                    track_id, = r
                    try:
                        # "\d*" may match an empty string, which int() rejects.
                        relatedApps.append(int(track_id))
                    except (TypeError, ValueError):
                        pass
            except Exception:
                pass
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            userComments = []
            for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                c = pq(cdiv)
                try:
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")
                    userComments.append({
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    })
                except Exception:
                    # Skip a review that cannot be parsed.
                    pass

            logger.info(
                json.dumps(userComments,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                data["supportDomain"] = domain if flag else None
            if data.get("sellerUrl") is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                data["sellerDomain"] = domain if flag else None

            data["trackShortName"] = name_helper.get_short_name(
                data["trackName"])
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                # Only a strictly newer version replaces the stored record;
                # equal or older versions leave it untouched.
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert(data)

        except Exception:
            # Keep the crawl going, but leave a trace of what went wrong.
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
예제 #24
0
def parse_company(item):
    """Build a source-company dict from a crawled company page.

    Args:
        item: crawl record with ``key`` (company id on the source site),
            ``content`` (raw page bytes, decoded here as UTF-8) and ``url``;
            may be ``None``.

    Returns:
        ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}`` when
        the page is a placeholder, has no usable name, mentions "拉勾" in
        its name, or fails ``name_helper.name_check``; otherwise a dict
        describing the company (with ``"status": 1``) including at most one
        artifact derived from the company's own link.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    logger.info(company_key)
    d = pq(html)

    # The logo id is resolved later in parser_db_util; only the URL is
    # collected here.

    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = title_link.attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    logo = d('.top_info_wrap > img').attr('src')
    # A page without a logo image yields None; leave it as None instead of
    # crashing on the string operations below.
    if logo is not None:
        if not logo.startswith("http"):
            # Scheme-less source (e.g. "//host/path"): prefix a scheme.
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()

    desc = None
    if u"该公司尚未添加公司介绍" not in desc_text and len(desc_text) >= 10:
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            # Keep only the text of the description markup.
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: fields that could not be read keep their '' default.
        pass

    # Drop the unit suffix. A missing "人" (e.g. "... people") must not
    # prevent location/address from being read, hence find() after the
    # lookups rather than index() in the middle of them.
    unit_pos = headCount.find(u'人')
    if unit_pos >= 0:
        headCount = headCount[0:unit_pos]
    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        # "<min>-<max>" or a single number; both bounds are returned as int
        # (or None when not numeric) so the two shapes agree.
        staffarr = headCount.split('-')
        try:
            min_staff = int(staffarr[0].strip())
        except ValueError:
            min_staff = None
        max_staff = None
        if len(staffarr) > 1:
            try:
                max_staff = int(staffarr[1].strip())
            except ValueError:
                max_staff = None

    # Map the stage label to its numeric code; unknown labels become 0.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    record_artifact = False
    if artifact_type == 4010:
        # Ignore links that point back to the crawled page or to lagou.com.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is False:
                domain = None
            record_artifact = flag is not None
    elif artifact_type == 4040:
        domain = app_id
        record_artifact = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        record_artifact = domain is not None
    # NOTE(review): types 4020/4030 never had a domain assigned, so no
    # artifact was ever recorded for them; that behaviour is kept -- confirm
    # whether such links should be stored with a None domain.

    if record_artifact:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    source_company = {
        "name": name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
예제 #25
0
    while True:
        logger.info("investor aggregator start")
        #get source_investors
        conn = db.connect_torndb()
        #Check verify or processStatus
        source_investors = conn.query(
            "select * from source_investor where processStatus=0 order by id")
        conn.close()

        for source_investor in source_investors:
            logger.info(source_investor["id"])
            #get Domain
            source_investor["domain"] = None
            if source_investor["website"] is not None:
                source_investor["website"] = url_helper.url_normalize(
                    source_investor["website"])
                type, market, website_domain = url_helper.get_market(
                    source_investor["website"])
                if type == 4010 and website_domain is not None:
                    source_investor["domain"] = website_domain

            if source_investor["investorId"] is not None:

                investor = find_in_investor("id",
                                            source_investor["investorId"])
                update_investor(investor, source_investor)
                set_processStatus(source_investor["id"])
                continue

            else:
                #name check
예제 #26
0
def parse_artifact(item):
    """Extract website/app artifacts from a crawled company page.

    Only product entries tagged "网站" or "app" are considered. A "网站"
    entry that ``url_helper.get_market`` classifies as 4010 is normalised
    and kept unless ``url_helper.get_domain`` returns a ``None`` flag; every
    other entry is kept only if a domain can be derived for it or its type
    is 4020/4030.

    Args:
        item: crawl record whose ``content`` holds the page HTML; may be
            ``None``.

    Returns:
        ``None`` when ``item`` is ``None``, otherwise a list of dicts with
        the keys ``type``, ``name``, ``desc``, ``link`` and ``domain``.
    """
    if item is None:
        return None

    artifacts = []
    d = pq(item["content"])

    logger.info("*** artifact ***")
    for li in d('ul.list-prod> li> div.on-edit-hide'):
        entry = pq(li)
        strtype = entry('h4> span.tag').text().strip()
        if strtype != u"网站" and strtype != "app":
            continue

        href = entry('h4> b> a').attr("href")
        # An entry without a link is skipped instead of aborting the parse.
        if href is None:
            continue
        link = href.strip()
        if link == "":
            continue

        # Classify once; the link is only rewritten on the 4010 path, so a
        # second classification of the same link would give the same answer
        # (assumes get_market is deterministic for a given URL).
        artifact_type, app_market, app_id = url_helper.get_market(link)
        domain = None
        if strtype == u"网站" and artifact_type == 4010:
            link = url_helper.url_normalize(link)
            flag, domain = url_helper.get_domain(link)
            if flag is None:
                continue
            if flag is False:
                domain = None
        else:
            if artifact_type == 4040:
                domain = app_id
            elif artifact_type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_db_util.find_android_market(
                        app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
            # Only types 4020/4030 are kept without a domain.
            if domain is None and artifact_type != 4030 and artifact_type != 4020:
                continue

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s",
                    artifact_type, name, link, desc)
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts