示例#1
0
def update_investor(investor, source_investor):
    """Refresh an investor row from a source investor and record its alias.

    If ``replace(investor, source_investor)`` returns a truthy value, the
    ``investor`` table row with id ``investor["id"]`` is rewritten from the
    values currently held in ``investor`` (presumably ``replace`` merges the
    source fields into ``investor`` -- confirm against its definition).

    Afterwards the source investor's name is stored as an alias of the
    investor, unless an active alias with that name already exists.

    :param investor: mapping with the keys id, name, website, domain,
        description, logo, stage, field and type.
    :param source_investor: mapping with at least the keys id and name.
    """
    conn = db.connect_torndb()
    try:
        investor_id = investor["id"]
        if replace(investor, source_investor):
            logger.info("Update investor : %d with source_investor: %d ",
                        investor_id, source_investor["id"])
            sql = ("update investor set name=%s,website=%s,domain=%s,"
                   "description=%s,logo=%s,stage=%s,field=%s,type=%s,"
                   "modifyTime=now() where id=%s")
            conn.update(sql, investor["name"], investor["website"],
                        investor["domain"], investor["description"],
                        investor["logo"], investor["stage"], investor["field"],
                        investor["type"], investor_id)
        else:
            logger.info("Not update investor : %d with source_investor: %d ",
                        investor_id, source_investor["id"])

        # Insert the alias only when no active alias with this name exists.
        # The previous check was inverted ("is not None") and therefore added
        # a duplicate row every time the alias was already present.
        investor_alias = conn.get(
            "select * from investor_alias where name=%s and (active is null or active='Y') limit 1",
            source_investor["name"])
        if investor_alias is None:
            # NOTE(review): the type is derived from the investor's own name,
            # not from the alias being inserted -- confirm this is intended.
            chinese, is_company = name_helper.name_check(investor["name"])
            # 12010 when name_check flags a company name, 12020 otherwise.
            alias_type = 12010 if is_company else 12020
            sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
            logger.info("Add new investor alias: %s for %s",
                        source_investor["name"], investor["id"])
            conn.insert(sql, investor["id"], source_investor["name"], alias_type)
    finally:
        # Release the connection even when a query raises.
        conn.close()
示例#2
0
def process_corporate():
    """Make sure each corporate's Chinese company full name exists as an alias.

    Walks the ``corporate`` table in ascending id order, 1000 rows per page.
    For every row whose stripped ``fullName`` is non-empty and is flagged by
    ``name_helper.name_check`` as both Chinese and a company name, a
    ``corporate_alias`` row of type 12010 is inserted when no alias with that
    corporate id and name exists yet.
    """
    last_id = -1
    conn = db.connect_torndb()
    try:
        while True:
            cs = conn.query(
                "select * from corporate where id>%s order by id limit 1000",
                last_id)
            if len(cs) == 0:
                break
            for c in cs:
                corporate_id = c["id"]
                # Advance the paging cursor before any filter can skip the
                # row; otherwise a page made only of skipped rows would be
                # fetched again and again without end.
                if corporate_id > last_id:
                    last_id = corporate_id

                fullname = c["fullName"]
                if fullname is None or fullname.strip() == "":
                    continue
                fullname = fullname.strip()
                chinese, iscompany = name_helper.name_check(fullname)
                if chinese is False or iscompany is False:
                    continue

                alias = conn.get(
                    "select * from corporate_alias where corporateId=%s and name=%s",
                    corporate_id, fullname)
                if alias is None:
                    logger.info(fullname)
                    conn.insert(
                        "insert corporate_alias(corporateId,name,type,createTime,modifyTime) values(%s,%s,12010,now(),now())",
                        corporate_id, fullname)
    finally:
        # Release the connection even when a query raises.
        conn.close()
示例#3
0
def patch_company_alias():
    """Deactivate full-name aliases and normalise the type of all others.

    Original author notes (translated): delete "fullname" (12010) aliases;
    handle rows whose type is null.

    Pages through ``company_alias`` in ascending id order, 1000 rows at a
    time.  A row is treated as a full name when ``name_helper.name_check``
    flags its name as both Chinese and a company, or when its stored type is
    already 12010; such rows get ``active='N'`` and ``modifyUser=139``.
    Every other row has its type set to 12020.
    """
    page_sql = "select * from company_alias where id>%s order by id limit 1000"
    cursor = -1
    conn = db.connect_torndb()

    page = conn.query(page_sql, cursor)
    while len(page) != 0:
        for row in page:
            alias_id = row["id"]
            logger.info(row["name"])
            cursor = max(cursor, alias_id)

            stored_type = row["type"]
            chinese, company = name_helper.name_check(row["name"])
            is_full_name = bool(chinese and company) or stored_type == 12010

            if is_full_name:
                conn.update("update company_alias set active='N', modifyUser=139 where id=%s", alias_id)
            else:
                conn.update("update company_alias set type=12020 where id=%s", alias_id)
        page = conn.query(page_sql, cursor)

    conn.close()
示例#4
0
def process_alias():
    """Copy company aliases onto their corporate as corporate aliases.

    Pages through ``company_alias`` in ascending id order, 1000 rows at a
    time.  For each alias the owning company is loaded; when that company
    has a ``corporateId`` and the corporate does not yet own an alias with
    the same name, a ``corporate_alias`` row is inserted that carries over
    verify, active, the timestamps, the users and the confidence.  The type
    is forced to 12010 when ``name_helper.name_check`` flags the name as a
    Chinese company name; otherwise the company alias type is kept.

    After the walk, every ``corporate_alias`` row whose type is still null
    is set to 12020.
    """
    last_id = -1
    conn = db.connect_torndb()
    try:
        while True:
            cas = conn.query("select * from company_alias where id>%s order by id limit 1000", last_id)
            if len(cas) == 0:
                break
            for ca in cas:
                company_alias_id = ca["id"]
                logger.info(ca["name"])
                if company_alias_id > last_id:
                    last_id = company_alias_id

                c = conn.get("select * from company where id=%s", ca["companyId"])
                if c is None:
                    # The alias points at a company row that no longer
                    # exists; skip it instead of failing on c["corporateId"].
                    logger.info("company %s not found for company_alias %s",
                                ca["companyId"], company_alias_id)
                    continue
                corporate_id = c["corporateId"]
                if corporate_id is None:
                    continue

                cpa = conn.get("select * from corporate_alias where corporateId=%s and name=%s limit 1", corporate_id, ca["name"])
                if cpa is not None:
                    continue

                alias_type = ca["type"]
                chinese, company = name_helper.name_check(ca["name"])
                if chinese and company:
                    alias_type = 12010
                conn.insert(
                    "insert corporate_alias("
                    "corporateId, name, type, verify, active, createTime, modifyTime,createUser,modifyUser,confidence)"
                    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    corporate_id,
                    ca["name"], alias_type, ca["verify"], ca["active"], ca["createTime"], ca["modifyTime"],
                    ca["createUser"], ca["modifyUser"], ca["confidence"]
                )
        # Rows copied with a null type default to 12020.
        conn.update("update corporate_alias set type=12020 where type is null")
    finally:
        # Release the connection even when a query raises.
        conn.close()
示例#5
0
def add_corporate_alias():
    """Feed already-checked Chinese corporate aliases to ``save_names``.

    Pages through active ``corporate_alias`` rows that have a
    ``gongshangCheckTime``, 2000 per page in ascending id order.  Each row
    with a non-empty name that ``name_helper.name_check`` flags as Chinese
    contributes a dict with the keys name, lastCheckTime and corporateId.
    ``save_names(batch, 3)`` is called once per page, including the final
    empty page that ends the walk.
    """
    page_sql = ("select * from corporate_alias where "
                "(active is null or active='Y') and gongshangCheckTime is not null and id>%s"
                " order by id limit 2000")
    conn = db.connect_torndb()
    cursor = 0
    more_pages = True
    while more_pages:
        rows = conn.query(page_sql, cursor)
        batch = []
        for row in rows:
            cursor = max(cursor, row["id"])
            alias_name = row["name"]
            if alias_name is None or alias_name == "":
                continue
            chinese, is_company = name_helper.name_check(alias_name)
            if chinese is not True:
                continue
            batch.append({
                "name": alias_name,
                "lastCheckTime": row["gongshangCheckTime"],
                "corporateId": int(row["corporateId"]),
            })
        save_names(batch, 3)
        # An empty page means the table has been exhausted.
        more_pages = len(rows) != 0
    conn.close()
示例#6
0
def count_company_names(apps, item_of_name):
    """Return how many distinct company names appear in ``apps``.

    :param apps: iterable of mappings supporting ``.get``.
    :param item_of_name: key under which each mapping stores the name.
    :returns: number of distinct non-None values that
        ``name_helper.name_check`` reports as company names.
    """
    distinct_names = set()
    for app in apps:
        candidate = app.get(item_of_name)
        if candidate is None:
            continue
        is_chinese, is_company = name_helper.name_check(candidate)
        # Only a flag equal to True counts as a company name.
        if is_company == True:
            distinct_names.add(candidate)
    return len(distinct_names)
示例#7
0
def _collect_company_candidates(conn, candidate_companies, company_sql, value):
    """Append companies matching ``value`` directly or through an active alias."""
    for c in conn.query(company_sql, value):
        if not is_exist(candidate_companies, c):
            candidate_companies.append(c)

    cas = conn.query(
        "select * from company_alias where name=%s and (active is null or active='Y')",
        value)
    for ca in cas:
        company = conn.get("select * from company where id=%s",
                           ca["companyId"])
        if company is None:
            # Alias refers to a company row that no longer exists.
            continue
        if company['active'] != 'N' and not is_exist(candidate_companies, company):
            candidate_companies.append(company)


def find_company_candidate(name, fullname):
    """Collect active companies that match a short name and/or a full name.

    Companies are matched by ``company.name`` and by active
    ``company_alias.name`` for ``name``.  When ``fullname`` is the empty
    string and ``name_helper.name_check`` flags ``name`` as a Chinese
    company name, ``name`` is used as the full name as well.  A non-empty
    full name is then matched against ``company.fullname`` and the active
    aliases.  ``is_exist`` decides whether a company is already in the
    result list.

    :param name: short company name to look up.
    :param fullname: full company name, or ``u""`` when unknown.
    :returns: list of company rows, without duplicates according to
        ``is_exist``.
    """
    conn = db.connect_torndb()
    try:
        candidate_companies = []
        _collect_company_candidates(
            conn, candidate_companies,
            "select * from company where name=%s and (active is null or active='Y')",
            name)

        if fullname == u"":
            isCN, isCompany = name_helper.name_check(name)
            if isCN and isCompany:
                fullname = name

        if fullname != u"":
            _collect_company_candidates(
                conn, candidate_companies,
                "select * from company where fullname=%s and (active is null or active='Y')",
                fullname)
    finally:
        # Release the connection even when a query raises.
        conn.close()

    return candidate_companies
示例#8
0
文件: gongshang.py 项目: yujiye/Codes
def start_run(concurrent_num):
    """Poll forever for unchecked company aliases and hand them to workers.

    Each round loads up to 1000 ``company_alias`` rows of type 12010 that
    have no ``gongshangCheckTime`` (newest id first) and appends those whose
    name ``name_helper.name_check`` flags as a Chinese company name to the
    module-level ``COMPANIES``.  If ``COMPANIES`` is non-empty,
    ``concurrent_num`` greenlets running ``query_goshang`` are spawned and
    joined; otherwise the loop sleeps for ten minutes.  Never returns.

    :param concurrent_num: number of greenlets to spawn per round.
    """
    pending_sql = "select * from company_alias where type=12010 and gongshangCheckTime is null order by id desc limit 1000"

    logger.info("Company gongshang start...")
    while True:
        # Keep the connection open only for the duration of the query.
        conn = db.connect_torndb()
        company_aliases = conn.query(pending_sql)
        conn.close()

        for alias in company_aliases:
            chinese, is_company = name_helper.name_check(alias["name"])
            if not (chinese and is_company):
                continue
            COMPANIES.append(alias)

        if len(COMPANIES) == 0:
            logger.info("Company gongshang end.")
            gevent.sleep(10 * 60)
            logger.info("Company gongshang start...")
            continue

        workers = []
        for _worker_no in xrange(concurrent_num):
            workers.append(gevent.spawn(query_goshang))
        gevent.joinall(workers)
示例#9
0
def find_from_gongshang(name):
    """Queue the companies related to ``name`` in its gongshang record.

    The name is normalised with ``name_helper.company_name_normalize``;
    nothing happens when that yields None or when ``name_helper.name_check``
    does not flag the result as both Chinese and a company name.  Otherwise
    the ``mongo.info.gongshang`` document with that name is loaded and
    ``add_2_company_list`` is called for every investor whose type is
    u"企业投资" and for every entry of ``invests``.

    :param name: company name to look up.
    """
    name = name_helper.company_name_normalize(name)
    if name is None:
        return
    chinese, company = name_helper.name_check(name)
    if chinese is not True or company is not True:
        return

    gs = mongo.info.gongshang.find_one({"name": name})
    if gs is None:
        return

    # Both lists are treated as optional: a missing or None value is an
    # empty list.  dict.get replaces the Python-2-only dict.has_key.
    for investor in gs.get("investors") or []:
        if investor["type"] == u"企业投资":
            logger.info("gongshang name: %s", investor["name"])
            add_2_company_list(investor["name"])
    for invest in gs.get("invests") or []:
        add_2_company_list(invest["name"])
示例#10
0
def add_2_company_list(name):
    """Register a Chinese company name in ``mongo.info.company_idx`` once.

    The name is normalised first; nothing is stored when normalisation
    yields None, when ``name_helper.name_check`` does not report it as both
    Chinese and a company name, or when a document with the same
    ``name_md5`` already exists.

    :param name: raw company name.
    """
    normalized = name_helper.company_name_normalize(name)
    if normalized is None:
        return

    chinese, company = name_helper.name_check(normalized)
    if not (chinese is True and company is True):
        return

    logger.info("fullname: %s", normalized)
    digest = util.md5str(normalized)
    # The md5 of the normalised name is the lookup key for existing entries.
    if mongo.info.company_idx.find_one({"name_md5": digest}) is not None:
        return

    mongo.info.company_idx.insert_one({
        "name": normalized,
        "name_md5": digest,
        "createTime": datetime.datetime.utcnow(),
    })
示例#11
0
def patch_corporate_fullname_new(corporate_id):
    """Repair a corporate's fullName from its aliases when it looks wrong.

    The stored ``fullName`` is considered in need of patching when it is
    empty, when ``name_helper.name_check`` reports it as not Chinese or not
    a company name, or when ``gongshang`` has no document with that name.

    In that case the first alias that has a ``gongshang`` document becomes
    the new fullName.  If no alias qualifies and the stored fullName was
    empty, the first alias (if any) is used instead.

    :param corporate_id: id of the ``corporate`` row to inspect.
    :returns: True when fullName was updated, False otherwise (including
        when no corporate row with that id exists).
    """
    conn = db.connect_torndb()
    try:
        corporate1 = conn.get("select * from corporate where id=%s", corporate_id)
        if corporate1 is None:
            # Nothing to patch; avoids a TypeError on corporate1["fullName"].
            return False

        fullname = corporate1["fullName"]
        fullname_blank = fullname is None or fullname.strip() == ""

        if fullname_blank:
            patch = True
        else:
            chinese, iscompany = name_helper.name_check(fullname)
            if chinese is False or iscompany is False:
                patch = True
            else:
                # A well-formed name is still patched when gongshang does
                # not know it.
                patch = gongshang.find_one({"name": fullname}) is None

        if not patch:
            return False

        logger.info("patch: %s, %s", corporate1, corporate1["id"])
        aliases = conn.query(
            "select * from corporate_alias where corporateId=%s",
            corporate1["id"])

        # Preferred: the first alias that gongshang has a record for.
        for alias in aliases:
            company_name = alias["name"]
            if gongshang.find_one({"name": company_name}):
                logger.info("fullname: %s", company_name)
                conn.update("update corporate set fullName=%s where id=%s",
                            company_name, corporate_id)
                return True

        # Fallback, only when there was no fullName at all: the first alias.
        if fullname_blank:
            for alias in aliases:
                company_name = alias["name"]
                logger.info("fullname: %s", company_name)
                conn.update("update corporate set fullName=%s where id=%s",
                            company_name, corporate_id)
                return True

        return False
    finally:
        # Release the connection even when a query raises.
        conn.close()
示例#12
0
def begin():
    """Fetch stale company aliases until at least one request is issued.

    Each round loads up to 100 ``company_alias`` rows of type 12010 whose
    ``gongshangCheckTime`` is null or older than 30 days (newest id first).
    An empty result makes the loop sleep for 60 seconds and try again.
    For each row, ``first_request(name, first=True)`` is called when
    ``name_helper.name_check`` flags the name as a Chinese company name;
    otherwise ``update_time`` is called with the row id.  The function
    returns after the first round in which a request was issued.
    """
    global total
    batch_size = 100
    stale_sql = ("select * from company_alias where type=12010 and "
                 "(gongshangCheckTime is null or gongshangCheckTime < date_sub(now(),interval 30 day)) "
                 "order by id desc limit %s")

    while True:
        # Keep the connection open only for the duration of the query.
        conn = db.connect_torndb()
        company_aliases = conn.query(stale_sql, batch_size)
        conn.close()

        if len(company_aliases) == 0:
            logger.info("Finish.")
            time.sleep(60)
            logger.info("Start...")
            continue

        requested = False
        for alias in company_aliases:
            company_name = alias["name"]
            chinese, is_company = name_helper.name_check(company_name)
            if not (chinese and is_company):
                update_time(alias["id"])
                continue
            logger.info(company_name)
            first_request(company_name, first=True)
            requested = True

        if requested:
            return
示例#13
0
def run():
    """Clear full names of source 13821 companies that are not company names.

    Loads all ``source_company`` rows with ``source=13821``.  For each row
    whose ``fullname`` is set but is not reported as a company name by
    ``name_helper.name_check``, the row's fullName is set to null and the
    matching ``source_company_name`` row of type 12010 is changed to 12020.
    A fresh connection is opened and closed for every row that is changed.
    """
    conn = db.connect_torndb()
    sql = '''select name,fullname,sourceid,id  from source_company where source=13821
    '''
    rows = conn.query(sql)  # TODO
    conn.close()

    for row in rows:
        fullname = row['fullname']
        if fullname is None:
            continue
        # Second element of name_check's result is the "is company" flag.
        if name_helper.name_check(fullname)[1] == True:
            continue

        logger.info('%s not company', fullname)

        conn = db.connect_torndb()
        conn.update(
            '''UPDATE source_company SET fullName=null where id = %s''',
            row['id'])
        conn.update(
            'UPDATE source_company_name SET type=12020 where sourcecompanyid = %s and type=12010 and name=%s',
            row['id'], fullname)
        conn.close()
示例#14
0
def insert_investor_alias(investorId, selected_investorId):
    """Copy the active aliases of one investor onto another investor.

    Every alias of ``investorId`` that is not marked inactive is inserted
    for ``selected_investorId`` unless that investor already has a
    non-inactive alias with the same name.  The new alias gets type 12010
    when ``name_helper.name_check`` flags the name as a company name and
    12020 otherwise.

    :param investorId: id of the investor whose aliases are copied.
    :param selected_investorId: id of the investor receiving the aliases.
    """
    lookup_sql = ("select * from investor_alias where name=%s and investorId=%s and "
                  "(active is null or active!='N') limit 1")
    insert_sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"

    conn = db.connect_torndb()
    source_aliases = conn.query(
        "select * from investor_alias where investorId=%s and (active is null or active!='N')",
        investorId)
    for source_alias in source_aliases:
        alias_name = source_alias["name"]
        if conn.get(lookup_sql, alias_name, selected_investorId) is not None:
            # The target investor already owns this alias.
            continue
        chinese, is_company = name_helper.name_check(alias_name)
        alias_type = 12010 if is_company else 12020
        logger.info("Add new investor alias: %s for %s", alias_name,
                    selected_investorId)
        conn.insert(insert_sql, selected_investorId, alias_name, alias_type)
    conn.close()
示例#15
0
def process_corporate_alias():
    """Promote corporate aliases that are Chinese company names to type 12010.

    Pages through ``corporate_alias`` rows whose type is 12020 or null, in
    ascending id order, 1000 rows per page.  A row is updated to type 12010
    when its stripped name has at least 6 characters and
    ``name_helper.name_check`` flags it as both Chinese and a company name.
    Rows without a name are skipped.
    """
    last_id = -1
    conn = db.connect_torndb()
    try:
        while True:
            cs = conn.query(
                "select * from corporate_alias where id>%s and (type=12020 or type is null) order by id limit 1000",
                last_id)
            if len(cs) == 0:
                break
            for c in cs:
                if c["id"] > last_id:
                    last_id = c["id"]
                if c["name"] is None:
                    # A NULL name cannot be stripped or checked; skip it
                    # rather than abort the whole run.
                    continue
                name = c["name"].strip()
                if len(name) < 6:
                    continue
                chinese, iscompany = name_helper.name_check(name)
                if chinese is False or iscompany is False:
                    continue
                logger.info("%s, %s", c["createTime"], name)
                conn.update("update corporate_alias set type=12010 where id=%s",
                            c["id"])
    finally:
        # Release the connection even when a query raises.
        conn.close()
示例#16
0
def save_company_name(app, item_of_name, source, sourceId):
    """Store the company name found in ``app`` for a source company.

    The value under ``item_of_name`` is normalised with
    ``name_helper.company_name_normalize``.  Nothing is saved when the raw
    value is empty, when normalisation yields None, or when the source
    company already has a ``source_company_name`` entry with the normalised
    name.  Otherwise an entry of type 12010 marked ``extended: 'Y'`` is
    saved through ``save_mongo_source_company_name``; its ``chinese`` flag
    is "Y" or "N" according to ``name_helper.name_check`` on the raw value.

    :param app: mapping that holds the name under ``item_of_name``.
    :param item_of_name: key of the company name inside ``app``.
    :param source: source identifier of the source company.
    :param sourceId: id of the source company within ``source``.
    """
    raw_name = app[item_of_name]
    if raw_name is None or raw_name.strip() == "":
        return

    company_name = name_helper.company_name_normalize(raw_name)
    if company_name is None:
        # Normalisation rejected the name; do not look up or save None.
        return

    source_company_name = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_company_name.name": company_name})
    if source_company_name is not None:
        return

    (chinese, company) = name_helper.name_check(raw_name)
    chinese_type = "Y" if chinese is True else "N"

    scnamedata = {
        "name": company_name,
        "chinese": chinese_type,
        "type": 12010,
        "extended": 'Y',
    }
    save_mongo_source_company_name(source, sourceId, scnamedata)
示例#17
0
            c["id"])
        for alias in aliases:
            name = alias["name"]
            new_name = name_helper.company_name_normalize(name)
            if name != new_name:
                logger.info("1. %s --- %s", name, new_name)
                update_company_alias(alias["id"], new_name)
            main_name = name_helper.get_main_company_name(new_name)
            if main_name != new_name:
                logger.info("2. %s --- %s", new_name, main_name)
                save_company_alias(c["id"], main_name)

        fullname = c["fullname"]
        if fullname is None or fullname.strip() == "":
            continue
        is_chinese, is_company = name_helper.name_check(fullname)
        if is_company:
            new_name = name_helper.company_name_normalize(fullname)
            if fullname != new_name:
                save_company_alias(c["id"], new_name)
                logger.info("3. %s --- %s", fullname, new_name)
            main_name = name_helper.get_main_company_name(new_name)
            if main_name != new_name:
                save_company_alias(c["id"], main_name)
                logger.info("4. %s --- %s", new_name, main_name)

            if main_name != fullname:
                update_company_fullname(c["id"], main_name)
    conn.close()

    logger.info("End.")
示例#18
0
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")

    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    # exit()
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)

        scnames = sourcecompany["source_company_name"]
        check_fullname = False
        for scname in scnames:
            if scname["name"] == company_fullname:
                check_fullname = True
                break
        if check_fullname is False:
            (chinese, company) = name_helper.name_check(company_fullname)
            if chinese is True:
                chinese_type = "Y"
            else:
                chinese_type = "N"
            scname_data ={
                "name": company_fullname,
                "chinese": chinese_type,
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)

    round = 1

    while True:
        if round >= 6:
            collection_source_company.update_one({"_id": sourcecompany["_id"]},{'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)

        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Check if there are new stuff which need to do expansion
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one({"_id": sourcecompany["_id"]}, {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break

        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round)

        # Step A/1:按公司名,备案查询
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            if source_company_name["chinese"] is None:
                (chinese, companyName) = name_helper.name_check(source_company_name["name"])
            else:
                chinese = source_company_name["chinese"]

            if chinese != "Y":
                continue

            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # Case that one company_name has multiple beian# : 上海汇翼->(today.ai/teambition.com)#If only one found in Mongo.beian(organizer) it is fine
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                    save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name
            save_beian_artifacts(items_beianlinks, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_beianlinks, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao

            # beian
            # 发现更多的artifact(website)和公司名,主备案号

        # Step A/2:按domian,备案查询
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only check is artifact is a website
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                link = url_helper.url_normalize(artifact["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            check_domain = list(collection_beian.find({"domain": domain}))

            if len(check_domain) == 0:
                if test:
                    items_merge =[]
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain

            # filer by check domain to avoid sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)

            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao

            # beian
            # 发现更多的artifact(website)和公司名,主备案号

        # Step A/3 #按主备案号查询
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})

            if check_mainBeianhao is None:
                if test:
                    items_merge =[]
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)

                    save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # if mainBeianhao could be found in two links
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))

            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
            # 发现更多的artifact(website)和公司名

        # itunes扩展
        # Step B/1 #查询itunes artifact
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)

        itunes_company_enames = {}
        app_by_name = {}

        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue

            else:
                try:
                    trackid = int(artifact["domain"])
                except:
                    pass

            if trackid is not None:
                app = collection_itunes.find_one({"trackId": trackid})

                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_itunes(app, artifact, source, sourceId)  # 存在: copy from mongo.itunes
                    if app.has_key("offline") and app["offline"] is True:
                        set_artifact_active(artifact, "Offline", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)

                    english, is_company = name_helper.english_name_check(app["sellerName"])
                    if english and is_company:
                        itunes_company_enames["sellerName"] = 1
                        app_by_name = app
            else:
                set_artifact_active(artifact, "N", source, sourceId)

        # save the only english name
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one({"source": source, "sourceId": sourceId, "source_company_name": {"$elemMatch": {"type": 12010, "chinese":"N"}}})

            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)

        # Step B/2根据公司名查询更多的itunes artifact
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            '''
            check_itunes_producers = list(collection_itunes.find({"developer": source_company_name["name"]}))
            if len(check_itunes_producers) > 0:
                for app in check_itunes_producers:
                    # Check if itunesId is already existed in artifacts
                    if find_itunesId(app["trackId"], source_company_id):
                        pass
                    else:
                        source_artifact_id = save_itunes_artifact(app, source_company_id)
                        #save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    save_company_name(app, "developer", source_company_id)
            '''
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) > 0:
                '''
                domains = {}
                for app in check_itunes_sellers:
                    sellerUrl = app.get("sellerUrl")
                    flag ,domain = url_helper.get_domain(sellerUrl)
                    if flag is not None and domain is not None:
                        domains[domain] = 1
                '''
                lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
                artifact_status = check_source_artifact(source, sourceId)

                for app in check_itunes_sellers:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)

                        if app.has_key("sellerUrl"):
                            # if find_link(app["sellerUrl"], source_company_id) or check_source_artifact(source_company_id):
                            if artifact_status:
                                pass
                            elif lens_domain == 1:
                                artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)

                                if artifact_id is not None:
                                    artifact_status = True

                            # comment due to incorrect expand
                            '''
                            if app.has_key("supportUrl"):
                                if find_link(app["supportUrl"], source_company_id):
                                    pass
                                else:
                                    save_itunesSupportUrl_artifact(app, source_company_id)
                            '''

                            # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                            # save_company_name(app, "sellerName", source_company_id)

        # Step B/3根据域名查询更多的itunes artifact
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            if domain in itunesDomainEx:
                continue

            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:

                lens_company_names = count_company_names(check_itunes_sellerDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_itunes_sellerDomains:

                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)

                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:

                lens_company_names = count_company_names(check_itunes_supportDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_itunes_supportDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True

        # 发现更多的artifact(website)和公司名,check if existed in source_art..and company_name


        # android扩展
        # Step C/1#查询android artifact
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                # Get apkname of baidu and 360 from android market
                if apptype != 4050:
                    continue

                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]

            if apkname is not None:
                app = collection_android.find_one({"apkname": apkname})

                if app is None:
                    # mark it as Noactive
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_android(app, artifact, source, sourceId)  # 存在: copy from mongo.android
                    set_artifact_active(artifact, "Y", source, sourceId)

                    # chinese, is_company = name_helper.name_check(app["author"])
                    # if is_company:
                    #     save_company_name(app, "author", source_company_id)
            else:
                set_artifact_active(artifact, "N", source, sourceId)

        # Step C/2根据公司名查询更多的android artifact
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue

            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            if len(check_android_authors) > 0 and len(check_android_authors) < 200:

                lens_domain = count_domains(check_android_authors, "website")
                artifact_status = check_source_artifact(source, sourceId)

                # check if author is consistent
                for app in check_android_authors:
                    # Check if AnId have one 4010
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)

                        if artifact_status:
                            pass
                        elif lens_domain == 1:
                            artifact_id = save_androidWebsite_artifact(app, source, sourceId)

                            if artifact_id is not None:
                                artifact_status = True

                                # save_artifact_android_rel(app["_id"], source_artifact_id)
                                # save_company_name(app, "author", source_company_id)

        # Step C/3根据域名查询更多的android artifact
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue

            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]

            if domain is None or domain.strip() == "":
                continue

            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:

                lens_company_names = count_company_names(check_android_websiteDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_android_websiteDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                        # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True

            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:

                lens_company_names = count_company_names(check_android_apknameDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)

                for app in check_android_apknameDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                        # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:

                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
        # 发现更多的artifact(website)和公司名

        # 曾用名 TODO

        # 清洗website artfiact
        # 查询meta信息, 标记不能访问的?website?, 处理转跳的website
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                # set_active("source_artifact", "N", artifact["id"])
                set_artifact_active(artifact, "N", source, sourceId)
                continue

            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            if meta is None or meta["httpcode"]==404:
                meta = website.get_meta_info(url)
                if meta:
                    websiteId = save_collection_website(collection_website, meta)
                    if websiteId is not None and not test:
                        #screenshot_wesbite(collection_website, websiteId, screenshot_crawler)
                        pass
                else:
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    websiteId = save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)

            if meta:
                # 发生转跳
                # logger.info(meta)
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)

                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200

                        }

                        websiteId_new = save_collection_website(collection_website, meta_new)
                        if websiteId_new is not None and not test:
                            #screenshot_wesbite(collection_website, websiteId_new, screenshot_crawler)
                            pass

                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:  # 跳出原域名
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        else:
                            if flag is True:  # 这是个'好'地址
                                set_artifact_active(artifact, "Y", source, sourceId)
                            else:
                                if flag_new is True:  # 转跳后是个 '好'地址
                                    set_artifact_active(artifact, "Redirect", source, sourceId)
                                    save_website_artifact(meta_new, source, sourceId)
                                else:
                                    set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)

        # verify -> source_artifacts/source_company_name set verify
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)

        round += 1
示例#19
0
def parse_company(item):
    """Build a source-company dict from one crawled company page.

    ``item`` must provide ``content`` (UTF-8 encoded HTML) and ``key`` (stored
    as ``sourceId`` in the result).

    Returns the company dict, or ``0`` when the page's company name is not
    recognised as a company by ``name_helper.name_check`` and the page carries
    no website link either.
    """
    doc = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: whitespace-separated words of .word_list, de-duplicated in order.
    tags = []
    for word in doc('.word_list').text().split():
        if word.strip() in tags:
            continue
        tags.append(word)
    tags_str = ",".join(tags)

    logo = doc('.peoimg img').attr('src')
    if logo:
        logo = logo.replace("https://", "http://")

    # The last timeline entry is read as the founding date when its caption
    # contains the marker text.
    # NOTE(review): `> 0` ignores a match at position 0 -- confirm intended.
    establish_date = None
    last_entry = doc('.time_content li:last-child')
    if doc(last_entry)('.upword').text().find('成立') > 0:
        date_text = doc(last_entry)('.time_up').text()
        establish_date = datetime.datetime.strptime(date_text, '%Y-%m-%d')

    companyName = doc('.company_div h5').text()
    location_id = 0
    city = name_helper.get_location_from_company_name(companyName)[0]
    if city is not None:
        location = parser_db_util.get_location(city)
        if location is not None:
            location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(companyName.replace("_", ""))

    # Product paragraphs: the one holding the website link is consumed as the
    # website; every other paragraph is collected as fallback description.
    desc = doc('#intro_srocll p').text()
    website = ''
    product_lines = []
    for p in doc('.procont_lis p'):
        para = doc(p)
        if para.text().find('官网') > 0:
            href = para('a').attr('href')
            if href is not None:
                website = href
                continue
        product_lines.append(para.text() + '\n')
    productDesc = ''.join(product_lines)

    if desc == '' or desc is None:
        desc = productDesc

    # Short name: heading text up to the first separator.
    # NOTE(review): ':' and ',' are listed exactly as the original chain had
    # them; the colon appears twice, so one of the two was presumably meant to
    # be a different (full-width?) character -- confirm against the encoding.
    shortName = doc('.peo_center h4').text()
    for separator in (':', ':', '——', ',', '|'):
        shortName = shortName.split(separator)[0]

    companyResult = {}
    if name_helper.name_check(companyName)[1] == True:
        # Explicit comparisons with True/False are kept on purpose so that a
        # None result from name_check is treated exactly as before.
        if name_helper.name_check(shortName)[0] != False:
            shared_chars = sum(1 for ch in shortName if ch in companyName)
            if shared_chars <= 2:
                # Short name barely overlaps the company name: use the latter.
                shortName = companyName
    elif len(website) > 0:
        # Not a recognised company name, but there is a website: keep the
        # record and park the normalised name under 'fakeName'.
        companyResult['fakeName'] = fullName
        fullName = None
    else:
        return 0

    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })

    return companyResult
示例#20
0
def kuohao_alias():
    """Reconcile investor aliases whose names contain parentheses and mail a report.

    For every active alias of type 12010 whose name contains "(" or ")" and
    whose investor is active, the name is passed through the parenthesis
    replacement below and looked up among the active aliases:

    * only the alias's own investor matches -> the alias is set inactive;
    * other investors match                 -> a line is added to the report;
    * nothing matches and ``name_helper.name_check`` flags the name as
      Chinese                               -> the alias is renamed.

    The report is written to ``me.txt`` next to the running script
    (``sys.path[0]``) and sent as a mail attachment.

    Takes no arguments and returns ``None``.
    """
    # Counters, logged once at the end:
    #   n  - aliases whose converted name matched at least one active investor
    #   n1 - aliases examined (type 12010, owning investor active)
    #   n2 - aliases deactivated (only their own investor matched)
    #   n3 - aliases reported (other investors matched)
    #   n4 - aliases renamed to the converted name
    n = n1 = n2 = n3 = n4 = 0
    report_lines = []

    conn = db.connect_torndb()
    try:
        cnames = conn.query(
            "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
            "and (name like %s or name like %s) group by name", '%(%', '%)%')

        for cname in cnames:
            wname = cname["name"]
            aliases = conn.query(
                "select * from investor_alias where (active is null or active !='N') and name=%s",
                wname)
            for inv in aliases:
                if inv["type"] != 12010:
                    continue
                wid = inv["investorId"]
                investor = conn.get(
                    "select * from investor where (active is null or active !='N') and id=%s",
                    wid)
                if investor is None:
                    continue
                n1 += 1

                # NOTE(review): as written, both replace() calls substitute a
                # parenthesis with itself, so mname equals wname.strip() and
                # the alias being examined will match itself below. Presumably
                # one side of each pair was meant to be the full-width
                # parenthesis -- confirm before running this against real data.
                mname = wname.replace("(", "(").replace(")", ")").strip()

                # Distinct ids of active investors that own an active alias
                # named mname.
                investor_ids = []
                matches = conn.query(
                    "select * from investor_alias where name=%s and (active is null or active !='N')",
                    mname)
                for match in matches:
                    owner = conn.get(
                        "select * from investor where (active is null or active !='N') and id=%s",
                        match["investorId"])
                    if owner is not None and owner["id"] not in investor_ids:
                        investor_ids.append(owner["id"])

                if not investor_ids:
                    chinese, _is_company = name_helper.name_check(mname)
                    if chinese is True:
                        n4 += 1
                        logger.info("update!!!!!")
                        conn.update(
                            "update investor_alias set name=%s,modifyUser=-561 where id=%s",
                            mname, inv["id"])
                    continue

                if len(investor_ids) == 1 and wid in investor_ids:
                    csameiid = "同一机构"
                    n2 += 1
                    conn.update(
                        "update investor_alias set active='N',modifyUser=-561 where id=%s",
                        inv["id"])
                else:
                    csameiid = "多个机构"
                    n3 += 1
                    involved = [str(wid)] + investor_ids
                    report_lines.append("%s+++%s+++%s\n" % (
                        cname["name"],
                        ";".join([str(iid) for iid in involved]),
                        get_links(involved)))
                logger.info("%s - %s - %s - %s", wname, str(wid),
                            ";".join([str(iid) for iid in investor_ids]),
                            csameiid)
                n += 1

        logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    finally:
        # Release the connection even when a query or update fails.
        conn.close()

    # Write the report to the very path that gets attached; the original wrote
    # to the current working directory but attached sys.path[0]/me.txt.
    path = os.path.join(sys.path[0], "me.txt")
    with open(path, "w") as fp2:
        fp2.write("".join(report_lines))
    logger.info(path)

    content = '''<div>Dears,    <br /><br />

                附件是目前系统中存在重复的公司,请在后台搜索
                </div>
                '''
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]), "重复机构检索--人工审查",
        content, path)
示例#21
0
def parse_company(item):
    """Parse one crawled company page into a source-company dict.

    Args:
        item: crawl record providing ``key``, ``content`` (UTF-8 encoded HTML)
            and ``url``; may be ``None``.

    Returns:
        ``None`` when ``item`` is ``None``.
        ``{"status": "No_Name"}`` when the short or full name is empty, or
        when ``name_helper.name_check`` does not flag the full name as a
        company.
        Otherwise the company dict (``"status": 1``) including the parsed
        ``artifacts`` and ``members`` lists.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))

    # --- names ---------------------------------------------------------
    name = d('h1.name').text().strip()

    fullName = d('div.company-business> h4').text()
    if fullName.find("来源") >= 0:
        # Heading carries extra text; the name is the last space-separated token.
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    _chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    # --- logo ----------------------------------------------------------
    # attr() gives None when no logo image matched; the original called
    # .startswith() on it unconditionally and crashed with AttributeError.
    # "https" is already covered by the "http" prefix test.
    logo = d('div.company-logo> img').attr('src')
    if logo is not None and not (logo.startswith("http") or logo.find("default") >= 0):
        logo = None

    # --- description ---------------------------------------------------
    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)

    # len(...) < 5 also covers the empty string.
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace("展开", "").replace("&nbsp;", "").strip()

    # --- stage / head count / field --------------------------------------
    # The info line is only used when it splits into exactly three
    # space-separated parts: stage, head count, field.
    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        parts = d('div.info-primary> p').text().strip().split(" ")
        if len(parts) == 3:
            stage, headCount, field = parts
    except Exception:
        # Best effort, as before: keep the empty defaults.
        pass

    headCount = headCount.replace("人", "")

    # NOTE(review): in the ranged case both bounds stay strings while the
    # other branches yield ints/None -- confirm what consumers expect before
    # normalising.
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # NOTE(review): the numeric stage code computed here is never emitted;
    # the result below hard-codes "stage": 0 and "round": None. Only
    # funding_type reaches the output. Kept as-is pending confirmation.
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0

    # --- product links -> artifacts --------------------------------------
    def new_artifact(kind, link, domain):
        return {
            "type": kind,
            "name": name,
            "description": None,
            "link": link,
            "domain": domain
        }

    artifacts = []
    for linkp in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)

        artifact_type, app_market, app_id = url_helper.get_market(website)
        if artifact_type == 4010:
            # Skip links back to this page or containing "zhipin".
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append(new_artifact(4010, website, domain))
        elif artifact_type == 4040:
            if app_id is not None:
                artifacts.append(new_artifact(4040, website, app_id))
        elif artifact_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                # For these two markets the apk name is looked up by market id.
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append(new_artifact(4050, website, domain))
        # Types 4020/4030 are not collected: the original branch set domain
        # to None and therefore could never append anything.

    # --- members -------------------------------------------------------
    members = []
    member_rank = 0
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            logo_url = mem('div.info-user> img').attr('src')
            if logo_url is None:
                # No photo: such entries were skipped before as well (the
                # AttributeError was swallowed); now explicit.
                continue
            if not logo_url.startswith("http"):
                logo_url = "http:" + logo_url
            member_rank += 1
            member_name = mem('p> span.name').text()
            member_position = mem('p> span.job-title').text()
            member_desc = mem('div.item_manager_content').text()

            members.append({'name': member_name,
                            'photo_url': logo_url,
                            'weibo': None,
                            'location': None,
                            'role': member_position,
                            'description': member_desc,
                            'education': None,
                            'work': None
                            })
        except Exception as e:
            # One malformed member entry must not abort the whole company.
            logger.info("skip member %s of %s: %s", member_rank, company_key, e)

    # --- secondary source id ---------------------------------------------
    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None

    source_company = {
                      "name": name,
                      "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
                      "description": desc,
                      "brief": brief,
                      "round": None,
                      "roundDesc": None,
                      "companyStatus": 2010,
                      'fundingType': funding_type,
                      "locationId": int(0),
                      "address": address,
                      "phone": None,
                      "establishDate": None,
                      "logo": logo,
                      "source": SOURCE,
                      "sourceId": company_key,
                      "sourceId2": sourceId2,
                      "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
                      "field": field,
                      "headCountMin": min_staff,
                      "headCountMax": max_staff,
                      "artifacts": artifacts,
                      "members": members,
                      "status": 1,
                      "stage": 0,
                      }

    return source_company
示例#22
0
def process(crawler, app, content):
    """Parse one app detail page and store the resulting app record.

    ``app`` supplies ``key_int``, ``link``, ``apkname``, ``version``, ``size``
    and ``type``; ``content`` is the page HTML; ``crawler.crawl(url)`` must
    return a dict with ``get`` and ``content`` keys.

    Returns ``None`` in every case. Nothing is stored when the page is the
    "wrong URL" error page, when the app's category is games, or when the
    suggestion lookup returns no ``s`` list.
    """
    if content.find('请检查您所输入的URL地址是否有误') >= 0:
        return

    key = app["key_int"]
    url = app["link"]
    page = pq(content)

    breadcrumb = 'div.nav> span >a'
    cate = page(breadcrumb).eq(1).text().strip()
    if cate == "游戏":
        return
    sub_cate = page(breadcrumb).eq(2).text().strip()
    name = page('h1.app-name> span').text().strip()

    # Download counter: a number with an optional unit suffix. Factors are
    # applied one after another, exactly as the original arithmetic did.
    raw_count = page("span.download-num").eq(0).text()
    downloadstr = raw_count.replace("下载次数:", "").replace("+", "").strip()
    unit_factors = (
        ("千", (1000,)),
        ("万", (10000,)),
        ("亿", (10000, 10000)),
    )
    for suffix, factors in unit_factors:
        if downloadstr.endswith(suffix):
            download = float(downloadstr.replace(suffix, ""))
            for factor in factors:
                download = download * factor
            break
    else:
        download = int(downloadstr)
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)

    # Query the suggestion endpoint by app name; retried until the crawler
    # reports success (there is no retry limit).
    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    result = crawler.crawl(mosug_url)
    while result['get'] != 'success':
        result = crawler.crawl(mosug_url)
    mosug_content = result["content"]

    data = json.loads(mosug_content)
    suggestions = data["result"].get("s")
    if suggestions is None:
        return

    # Prefer the exact download count of the suggestion whose docid is this app.
    for dt in suggestions:
        if dt.get("package") is None:
            continue
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            # Computed as in the original, but not stored in the record.
            score = int(dt["score"]) * 0.05
            break

    screenshots = [pq(img).attr("src") for img in page('img.imagefix')]

    desc = page('p.content').text()
    icon = page('div.app-pic> img').attr("src")

    author = page('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    commentbyeditor = page('span.head-content').text()

    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
示例#23
0
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: crawl record providing "key" (the company id, also used to
            build "sourceUrl") and "content" (the page HTML as UTF-8
            encoded bytes), or None.

    Returns:
        None if ``item`` is None, ``{"status": "No_Name"}`` if no company
        name can be read from the page, otherwise a dict with the company's
        name, description, address, head-count range and executive members.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    logger.info(company_key)
    d = pq(html.fromstring(item["content"].decode("utf-8")))

    def text_of(selector):
        # Text of the matched nodes; "" when the lookup yields nothing, so
        # callers can chain string methods without a None check.
        return d(selector).text() or ""

    # The first whitespace-separated token of <h1> is the short name. An
    # empty heading is reported as "No_Name" instead of raising IndexError.
    heading = text_of('h1').split()
    name = heading[0].strip() if heading else ""
    fullName = name_helper.company_name_normalize(name) if name else name

    if not name or not fullName:
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName

    # Keep the logo only when it is an absolute URL or a "default" image;
    # a missing src attribute simply means "no logo".
    logo = d('.bigELogo').attr('src')
    if not logo or not (logo.startswith("http") or logo.find("default") >= 0):
        logo = None

    brief = None
    desc_text = text_of('.profile')
    logger.info("desc: %s", desc_text)

    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace(
            "展开", "").replace("&nbsp;", "").strip()

    field = text_of('.comp-industry').strip()

    # The last token of the second intro item carries the head count.
    head_tokens = text_of('.new-compintro li:nth-child(2)').split()
    headCount = head_tokens[-1].replace("人", "") if head_tokens else ""
    address = text_of('.new-compintro li:nth-child(3)').replace(
        '公司地址:', '').strip()

    if headCount == "少于15":
        min_staff, max_staff = 1, 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds are passed on as the scraped
            # strings (unchanged behavior), while a single value is
            # converted to int below -- confirm what consumers expect.
            min_staff, max_staff = staffarr[0], staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # Product links are not extracted from this page; the list stays empty.
    artifacts = []

    # parse members
    members = []
    for node in d('div.executive dl'):
        mem = pq(node)
        try:
            logo_url = mem('img').attr('src')
            if logo_url is None:
                # Entries without a photo were never emitted; keep skipping
                # them, but explicitly rather than via a swallowed error.
                continue
            if not logo_url.startswith("http"):
                # presumably a protocol-relative URL ("//host/...")
                logo_url = "http:" + logo_url
            members.append({
                'name': mem('p:nth-child(2)').text(),
                'photo_url': logo_url,
                'weibo': None,
                'location': None,
                'role': mem('p:nth-child(3)').text(),
                'description': mem('dd').text(),
                'education': None,
                'work': None
            })
        except Exception as e:
            # Best effort: one malformed entry must not abort the company.
            logger.info("skip member of %s: %s", company_key, e)

    source_company = {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': None,
        "locationId": 0,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.liepin.com/company/%s/" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }

    return source_company
示例#24
0
def _parse_file_size(raw):
    """Convert the page's fileSize value to a byte count, or None.

    Accepts a plain integer string or a number followed by "KB"/"MB".
    A missing or unparsable value yields None instead of raising.
    """
    if raw is None:
        return None
    try:
        return int(raw)
    except ValueError:
        pass
    for unit, factor in (("KB", 1024), ("MB", 1024 * 1024)):
        if raw.find(unit) >= 0:
            try:
                return int(float(raw.replace(unit, "").strip()) * factor)
            except ValueError:
                return None
    return None


def _parse_download_count(raw):
    """Convert an interactionCount attribute to an int, or None.

    The number is the part after the first ":"; it may be a plain integer
    or a decimal followed by "万" (x 10,000) or "亿" (x 100,000,000).
    """
    parts = (raw or "").split(":")
    dnum = parts[1] if len(parts) > 1 else ""
    try:
        return int(dnum)
    except ValueError:
        pass
    for unit, factor in (("万", 10000), ("亿", 10000 * 10000)):
        if dnum.find(unit) >= 0:
            try:
                return int(float(dnum.replace(unit, "").strip()) * factor)
            except ValueError:
                break
    logger.info("********download :%s cannot get", dnum)
    return None


def process(crawler, url, apkname, content):
    """Parse one app detail page, store it, and flag the apk as processed.

    Args:
        crawler: unused here; kept for interface compatibility.
        url: address of the page, stored as the item's "link".
        apkname: package name; used as the item key and to update the
            matching document in ``collection_android``.
        content: raw page bytes.

    When ``has_content`` rejects the page, only the processed/not-found
    flags are written. Otherwise the parsed item is saved and merged and
    the apk is flagged as processed and found.
    """
    if not has_content(content, apkname):
        logger.info("App: %s has no content", apkname)
        collection_android.update_one(
            {"apkname": apkname},
            {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
        return

    logger.info("hereherehere")
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))

    name = d('span.title').text()
    icon = d('div.app-icon> img').attr("src")
    brief = d('p.tagline').text()
    commentbyeditor = d('div.editorComment> div').text()
    screenshots = [pq(img).attr("src") for img in d('div.overview> img')]
    desc = d('div.desc-info> div').text()
    updates = d('div.change-info> div').text()
    size = _parse_file_size(d('meta[itemprop="fileSize"]').attr("content"))
    tags = d('dd.tag-box >a').text().replace(" ", ",")

    datestr = d('time#baidu_time').text()
    updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")

    author = d('span.dev-sites').text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)

    try:
        website = url_helper.url_normalize(d('a.dev-sites').attr("href"))
    except Exception:
        # Best effort: a missing or malformed developer link means no website.
        website = None

    # The two regexes below run against the raw (undecoded) page bytes.
    compatibility = None
    if content.find("查看权限要求") == -1:
        r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
    else:
        r1 = "content=\"Android\">(.*?)<div>.*"
    result1 = util.re_get_result(r1, content)
    if result1:
        (compatibility,) = result1
        # Drop line breaks, tabs and spaces. The previous "\s" argument was
        # the literal two characters backslash + s, not a whitespace class.
        for blank in ("\n", "\r", "\t", " "):
            compatibility = compatibility.replace(blank, "")

    versionname = None
    r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
    result2 = util.re_get_result(r2, content)
    if result2:
        (versionname,) = result2
        versionname = versionname.replace("\n", "").replace(
            "\r", "").replace("&nbsp;", "").strip()

    if versionname:
        tokens = versionname.split()
        if tokens:
            versionname = tokens[0]
            if versionname.startswith("V"):
                # Strip only the leading marker; other "V"s belong to the
                # version string itself.
                versionname = versionname[1:]

    download = _parse_download_count(
        d("i[itemprop='interactionCount']").attr("content"))

    item = {
        "link": url,
        "apkname": apkname,
        "appmarket": APPMARKET,
        "name": name,
        "brief": brief,
        "website": website,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": updatedate,
        "language": None,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": None,
        "key": apkname,
        "key_int": None,
        "download": download,
    }

    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)
    collection_android.update_one(
        {"apkname": apkname},
        {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
示例#25
0
def process(url, key, content):
    """Parse one app detail page, store it, and advance ``LATEST``.

    Args:
        url: address of the page, stored as the item's "link".
        key: numeric id of the app; stored as "key"/"key_int" and compared
            against the module-level ``LATEST`` high-water mark.
        content: raw page bytes (decoded as UTF-8 for DOM parsing; the
            regex lookups run on the raw bytes).

    Pages without the expected marker text or without the embedded
    ``detail`` object are skipped without saving anything.
    """
    global LATEST
    if content.find('360安全中心') == -1:
        return

    # Basic app attributes live in an inline JavaScript object literal.
    pattern = r"var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(pattern, content)
    if not result:
        # Previously this crashed with a TypeError when unpacking None.
        logger.info("detail block not found: %s", url)
        return
    (b, ) = result
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    app_type = base["type"]
    package = base["pname"].strip()

    d = pq(html.fromstring(content.decode("utf-8")))

    # Description: text before the "basic info" marker, with a fallback
    # container when the primary one is empty.
    desc = (d('div.breif').text() or "").strip()
    try:
        desc = desc.split("【基本信息】")[0].strip()
    except Exception as e:
        # In Python 2, mixing unicode text with a byte-string separator can
        # raise depending on the default encoding; keep the full text then.
        logger.info("cannot trim description of %s: %s", key, e)
    if desc == "":
        desc = (d('div#html-brief').text() or "").strip()

    cells = d('div.base-info> table> tbody> tr> td')

    # NOTE(review): the author used to be read from cells.eq(0) and then
    # unconditionally reset to None, so None is what has always been stored.
    # The dead parsing was removed; restore it here if authors are wanted.
    author = None

    modify_date_str = cells.eq(1).contents()[1].strip()
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")

    versionname = None
    try:
        versionname = cells.eq(2).contents()[1].strip()
        if versionname.startswith("V"):
            # Strip only the leading marker, not every "V" in the string.
            versionname = versionname[1:]
    except (IndexError, AttributeError):
        # The version cell is optional; leave whatever was read so far.
        pass

    compatibility = cells.eq(3).contents()[1].strip()
    language = cells.eq(4).contents()[1].strip()

    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src")
    if icon is not None:
        icon = icon.strip()

    snaps = d('div#scrollbar').attr("data-snaps")
    screenshots = snaps.split(",") if snaps is not None else []

    commentbyeditor = None
    result = util.re_get_result("<p><strong>【小编点评】</strong>(.*?)</p>",
                                content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    result = util.re_get_result("<br/><b>【更新内容】</b><br/>(.*?)</div>",
                                content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    result = util.re_get_result("'size':'(.*?)'", content)
    if result:
        (size, ) = result
        size = int(size)

    downloadstr = (d("span.s-3").eq(0).text() or "").replace(
        "下载:", "").replace("次", "").replace("+", "").strip()
    download = None
    try:
        for unit, factor in (("千", 1000), ("万", 10000),
                             ("亿", 10000 * 10000)):
            if downloadstr.endswith(unit):
                # Always store an int; unit-suffixed counts used to be kept
                # as floats while plain counts were ints.
                download = int(
                    round(float(downloadstr.replace(unit, "")) * factor))
                break
        else:
            download = int(downloadstr)
    except ValueError:
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app_type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

    android.save(collection, APPMARKET, item)
    android.merge(item)

    if LATEST < key:
        LATEST = key
示例#26
0
def parse_company(item):
    """Parse a crawled company page into a source-company dict.

    Args:
        item: crawl record providing "key" (company id), "content" (page
            HTML as UTF-8 encoded bytes) and "url" (address of the crawled
            page), or None.

    Returns:
        None if ``item`` is None; ``{"status": "No_Name"}`` when the page is
        a placeholder, has no usable name, or the full name does not pass
        ``name_helper.name_check`` as a company; otherwise a dict describing
        the company, including website/app artifacts derived from its link.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    page = item["content"]
    logger.info(company_key)
    d = pq(page)

    # logo_id processed in parser_db_util

    if page.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    # Prefer the full name when the short one is longer or blank.
    if len(name) > len(fullName) or name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    # A missing logo used to raise AttributeError; treat it as "no logo".
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        if not logo.startswith("http"):
            # presumably a protocol-relative URL ("//host/...")
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()

    desc = None
    desc_text = d('.company_intro_text').text() or ""
    if u"该公司尚未添加公司介绍" not in desc_text and len(desc_text) >= 10:
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            # Reduce the HTML fragment to plain text.
            desc = BeautifulSoup(desc_html, "lxml").getText()

    def text_of(selector):
        # Best-effort lookup: every field is read independently so that one
        # failing field no longer discards the ones read after it.
        try:
            return d(selector).text() or ''
        except Exception as e:
            logger.info("cannot read %s: %s", selector, e)
            return ''

    basic = '#basic_container > .item_content >ul > li:eq(%d) > span'
    field = text_of(basic % 0)
    stage = text_of(basic % 1)
    headCount = text_of(basic % 2)
    location = text_of(basic % 3)
    address = text_of('.con_mlist_ul > li:eq(0) > p:eq(1)')

    # Keep only the part before the unit suffix when present. A head count
    # without it used to abort the location/address lookups as well.
    cut = headCount.find(u'人')
    if cut >= 0:
        headCount = headCount[:cut]
    headCount = headCount.replace("people", "")

    if headCount == "少于15":
        min_staff, max_staff = 1, 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds are passed on as the scraped
            # strings (unchanged behavior), while a single value is
            # converted to int below -- confirm what consumers expect.
            min_staff, max_staff = staffarr[0], staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # NOTE(review): the stage label was also mapped to a numeric code
    # ('天使轮' 1010, 'A轮' 1030, 'B轮' 1040, 'C轮' 1050, 'D轮及以上' 1060,
    # '上市公司' 1110, anything else 0), but that code was never stored in
    # the result ("round" is always None). Only the funding type is used.
    funding_type = 8010 if stage == '不需要融资' else 0

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)

    def add_artifact(domain):
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    if artifact_type == 4010:
        # Ignore links that point back to the crawled page or to the job
        # site itself.
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                add_artifact(None if flag is False else domain)
    elif artifact_type == 4040:
        if app_id is not None:
            add_artifact(app_id)
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            domain = android_app["apkname"] if android_app else None
        else:
            domain = app_id
        if domain is not None:
            add_artifact(domain)
    # Types 4020/4030 never produced an artifact (their domain was always
    # None), so they are intentionally not handled.

    source_company = {
        "name": name,
        "fullName": fullName if fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }

    return source_company
示例#27
0
def expand(company_id):
    """Discover additional artifacts (sites, iTunes and Android apps).

    Starting from the company's active aliases and active artifacts, this
    looks up, in order:
      A/1  beian records whose organizer equals an alias,
      B/2  iTunes apps whose seller name equals an alias,
      B/3  iTunes apps whose seller/support domain equals a website domain,
      C/2  Android apps whose author equals an alias,
      C/3  Android apps whose website/apk-name domain equals a website
           domain,
    and saves every app that ``find_itunesId`` / ``find_androidAppname``
    does not already report for the company.

    Args:
        company_id: id of the company row to expand.
    """
    mongo = db.connect_mongo()
    try:
        collection_itunes = mongo.market.itunes
        collection_beian = mongo.info.beian
        collection_android = mongo.market.android

        logger.info("Company id: %s Start app check!!!", company_id)

        conn = db.connect_torndb()
        try:
            company_names = conn.query(
                "select * from corporate_alias where corporateId in (select corporateId from "
                "company where id=%s) and (active is null or active='Y')",
                company_id)
            artifacts = conn.query(
                "select * from artifact where companyId=%s and (active is null or active='Y')",
                company_id)
        finally:
            # Release the connection even when a query fails.
            conn.close()
        logger.info(
            json.dumps(company_names, ensure_ascii=False,
                       cls=util.CJsonEncoder))
        logger.info(
            json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))

        # Blank aliases are ignored by every name-based step.
        names = [
            row["name"] for row in company_names
            if row["name"] is not None and row["name"].strip() != ""
        ]
        # Only website artifacts (type 4010) take part in domain lookups.
        websites = [a for a in artifacts if a["type"] == 4010]

        resolved = {}

        def domain_of(artifact):
            # Domain of a website artifact, or None when it cannot be used.
            # A domain derived from the link is written back once and then
            # remembered, so the second domain pass does not repeat the
            # lookup and the update.
            key = artifact["id"]
            if key not in resolved:
                domain = artifact["domain"]
                if domain is None:
                    (flag, domain) = url_helper.get_domain(artifact["link"])
                    if flag is None or flag is False:
                        domain = None
                    else:
                        update_domain(domain, artifact["id"])
                if domain is not None and domain.strip() == "":
                    domain = None
                resolved[key] = domain
            return resolved[key]

        # Step A/1: beian lookup by company name
        logger.info("%s 按公司名备案查询", company_id)
        for name in names:
            # Only check chinese company name
            # NOTE(review): other callers treat the first value returned by
            # name_check as a boolean; confirm that "Y" is what it returns.
            (chinese, _) = name_helper.name_check(name)
            if chinese != "Y":
                continue

            check_names = list(collection_beian.find({"organizer": name}))
            if check_names:
                # insert website/homepage into Mysql.artifact
                save_beian_artifacts(check_names, company_id)

        # iTunes expansion
        # Step B/2: more iTunes artifacts by company name
        logger.info("%s 根据公司名查询更多的itunes artifact", company_id)
        for name in names:
            check_itunes_sellers = list(
                collection_itunes.find({"sellerName": name}))
            logger.info("**********%s find %s", name,
                        len(check_itunes_sellers))
            for app in check_itunes_sellers:
                logger.info("**********%s find %s,%s", name,
                            app["trackName"], app["trackId"])
                # Skip apps already attached to this company.
                if not find_itunesId(app["trackId"], company_id):
                    save_itunes_artifact(app, company_id)

        # Step B/3: more iTunes artifacts by domain
        logger.info("%s 根据域名查询更多的itunes artifact", company_id)
        for artifact in websites:
            domain = domain_of(artifact)
            if domain is None or domain in itunesDomainEx:
                continue

            check_itunes_sellerDomains = list(
                collection_itunes.find({"sellerDomain": domain}))
            logger.info("**********%s find %s", domain,
                        len(check_itunes_sellerDomains))
            for app in check_itunes_sellerDomains:
                logger.info("**********%s find %s, %s", domain,
                            app["trackName"], app["trackId"])
                if not find_itunesId(app["trackId"], company_id):
                    save_itunes_artifact(app, company_id)

            check_itunes_supportDomains = list(
                collection_itunes.find({"supportDomain": domain}))
            logger.info("**********%s find %s", domain,
                        len(check_itunes_supportDomains))
            # Result sets of 100 or more are ignored.
            if len(check_itunes_supportDomains) < 100:
                for app in check_itunes_supportDomains:
                    logger.info("**********%s find %s, %s", domain,
                                app["trackName"], app["trackId"])
                    if not find_itunesId(app["trackId"], company_id):
                        save_itunes_artifact(app, company_id)

        # Android expansion
        # Step C/2: more Android artifacts by company name
        logger.info("%s 根据公司名查询更多的android artifact", company_id)
        for name in names:
            check_android_authors = list(
                collection_android.find({"author": name}))
            logger.info("**********%s find %s", name,
                        len(check_android_authors))
            # Result sets of 100 or more are ignored.
            if len(check_android_authors) < 100:
                for app in check_android_authors:
                    logger.info("**********%s find %s, %s", name,
                                app["name"], app["apkname"])
                    if not find_androidAppname(app["apkname"], company_id):
                        save_android_artifact(app, company_id)

        # Step C/3: more Android artifacts by domain
        logger.info("%s 根据域名查询更多的android artifact", company_id)
        for artifact in websites:
            domain = domain_of(artifact)
            if domain is None:
                continue

            check_android_websiteDomains = list(
                collection_android.find({"website_domain": domain}))
            logger.info("**********%s find %s", domain,
                        len(check_android_websiteDomains))
            for app in check_android_websiteDomains:
                logger.info("**********%s find %s, %s", domain, app["name"],
                            app["apkname"])
                if not find_androidAppname(app["apkname"], company_id):
                    save_android_artifact(app, company_id)

            check_android_apknameDomains = list(
                collection_android.find({"apkname_domain": domain}))
            logger.info("**********%s find %s", domain,
                        len(check_android_apknameDomains))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) < 100:
                for app in check_android_apknameDomains:
                    logger.info("**********%s find %s, %s", domain,
                                app["name"], app["apkname"])
                    if not find_androidAppname(app["apkname"], company_id):
                        save_android_artifact(app, company_id)
    finally:
        # Close the Mongo client on every exit path, including errors.
        mongo.close()
示例#28
0
def update_investor(investor, source_investor):
    """Merge one ``source_investor`` record into ``investor`` and its child tables.

    Steps, in order:
      1. Unless ``investor["online"]`` is ``"Y"``, call
         ``replace(investor, source_investor)``.
      2. Insert an ``investor_alias`` row for every non-blank source name
         (name, fullName, enName, enFullName) that has no active alias yet.
      3. Build artifacts from the source website / weibo / wechatId and insert
         the ones missing from ``investor_artifact``.
      4. If the source has contacts, delete this investor's contacts created
         by user 139 and re-insert them from ``source_investor_contact``.
      5. Insert ``investor_member`` rows for source members whose name is not
         already present for this investor.

    Args:
        investor: mapping read for ``id``, ``name`` and ``online``.
        source_investor: mapping read for ``id``, ``name``, ``fullName``,
            ``enName``, ``enFullName`` and ``website``; ``weibo`` and
            ``wechatId`` are optional keys.

    The database connection is always closed, even when a step raises.
    """
    conn = db.connect_torndb()
    try:
        investor_id = investor["id"]
        logger.info("****checking %s/%s/%s", investor["name"], investor["id"], source_investor["id"])
        if investor["online"] is not None and investor["online"] == "Y":
            logger.info("online not update!!!")
            time.sleep(1)
        else:
            logger.info("Update investor : %d with source_investor: %d ", investor_id, source_investor["id"])
            replace(investor, source_investor)

        # --- investor_alias -------------------------------------------------
        alias_insert_sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
        candidate_names = [source_investor["name"], source_investor["fullName"],
                           source_investor["enName"], source_investor["enFullName"]]
        for name in candidate_names:
            if name is None or name.strip() == "":
                continue
            investor_alias = conn.get("select * from investor_alias where name=%s and "
                                      "investorId=%s and (active is null or active='Y') limit 1",
                                      name, investor["id"])
            if investor_alias is not None:
                continue
            chinese, is_company = name_helper.name_check(name)
            # 12010 is used when name_check reports a company name, 12020 otherwise.
            alias_type = 12010 if is_company else 12020
            logger.info("Add new investor alias: %s for %s", name, investor["id"])
            conn.insert(alias_insert_sql, investor["id"], name, alias_type)

        # --- investor_artifact ----------------------------------------------
        def build_artifact(link, domain, artifact_type):
            # All artifacts share the investor id/name and carry no description.
            return {
                "investorId": investor["id"],
                "name": investor["name"],
                "description": None,
                "link": link,
                "domain": domain,
                "type": artifact_type
            }

        artifacts = []
        website = source_investor["website"]
        if website is not None and website != "":
            artifact_type, market, app_id = url_helper.get_market(website)
            if artifact_type == 4010:
                # NOTE(review): this only skips links containing BOTH '36kr'
                # and 'baidu', each past index 0; behavior kept as-is, but
                # confirm whether 'or' / '>= 0' was intended.
                if not (website.find('36kr') > 0 and website.find("baidu") > 0):
                    artifacts.append(build_artifact(website, app_id, artifact_type))
            elif (artifact_type == 4040 or artifact_type == 4050) and app_id is not None:
                domain = get_android_domain(market, app_id)
                if domain is not None:
                    artifacts.append(build_artifact(website, domain, artifact_type))

        weibo = source_investor.get("weibo", "")
        if weibo is not None and weibo.strip() != "" and weibo.find("weibo") >= 0:
            artifacts.append(build_artifact(weibo, None, 4030))

        weixin = source_investor.get("wechatId", "")
        if weixin is not None and weixin.strip() != "":
            artifacts.append(build_artifact(weixin, weixin, 4020))

        artifact_insert_sql = "insert investor_artifact(investorId,type, name, link, domain, createTime,modifyTime) \
                                         values(%s,%s,%s,%s,%s,now(),now())"
        for art in artifacts:
            # Type 4030 and artifacts without a usable domain are matched by
            # link; everything else is matched by domain.
            if art["type"] not in [4030] and art["domain"] is not None and art["domain"].strip() != "":
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and domain=%s limit 1",
                                art["type"], investor["id"], art["domain"])
            else:
                iart = conn.get("select * from investor_artifact where type=%s and investorId=%s and link=%s limit 1",
                                art["type"], investor["id"], art["link"])

            if iart is None:
                logger.info("add new artifact: %s/%s/%s", art["type"], art["name"], art["link"])
                conn.insert(artifact_insert_sql, investor["id"], art["type"], art["name"], art["link"], art["domain"])

        # --- investor_contact -----------------------------------------------
        contacts = conn.query("select * from source_investor_contact where sourceInvestorId=%s", source_investor["id"])
        if len(contacts) > 0:
            # Contacts written by user 139 are replaced wholesale, and only
            # when the source actually provides contacts.
            conn.execute("delete from investor_contact where investorId=%s and createUser=139", investor["id"])
            contact_insert_sql = "insert investor_contact(investorId, locationId, address, phone, email, createUser, " \
                  "createTime,modifyTime) \
                              values(%s,%s,%s,%s,%s,%s,now(),now())"
            for s in contacts:
                conn.insert(contact_insert_sql, investor["id"], s["locationId"], s["address"], s["phone"], s["email"], 139)

        # --- investor_member ------------------------------------------------
        member_insert_sql = "insert investor_member(investorId,name,logo, position, description,createUser,createTime,modifyTime) \
                              values(%s,%s,%s,%s,%s,%s,now(),now())"
        members = conn.query("select * from source_investor_member where sourceInvestorId=%s", source_investor["id"])
        for m in members:
            member = conn.get("select * from investor_member where investorId=%s and name=%s limit 1", investor["id"], m["name"])
            if member is not None:
                continue
            conn.insert(member_insert_sql, investor["id"], m["name"], m["logo"], m["position"],
                        m["description"], 139)
    finally:
        # Release the connection even if any query above fails.
        conn.close()
示例#29
0
def corp_merge3():
    """Auto-merge corporates that share an alias name and mail a report.

    For every ``corporate_alias.name`` used by more than one active alias row,
    collect the active corporates behind it that have an active company.
    Corporates with a ``corporate_stock_exchange_rel`` row are skipped and
    block the merge for that name. When more than one corporate remains,
    ``corporate_util.autoMerge`` is called and a ``+++``-separated report line
    is recorded.

    The report is written to ``me.txt`` under ``sys.path[0]`` and sent as an
    attachment via ``email_helper.send_mail_file``. The report file and the
    database connection are released even when a step raises.
    """
    report_lines = []
    n = 0    # names handed to autoMerge
    n1 = 0   # autoMerge returned 1
    n2 = 0   # autoMerge returned anything else
    n3 = 0   # Chinese names with at least one funding
    n4 = 0   # Chinese names where every corporate's fullName equals the alias
    n5 = 0   # Chinese names
    n6 = 0   # non-Chinese names
    n7 = 0   # Chinese names with compare() result <= 0.75
    conn = db.connect_torndb()
    try:
        cnames = conn.query(
            "select name,count(*) as cnt from corporate_alias where (active is null or active !='N') "
            "and name is not null and name!=''  group by name having cnt>1")
        logger.info("total names: %s", len(cnames))

        for cname in cnames:
            full_name = cname["name"]
            # Skip blank and placeholder names.
            if full_name is None or full_name.strip() in ("", "-", "个人", "扎堆"):
                continue

            pnames = []
            fundingFlag = False
            cfullFlag = True
            corporate_ids = []
            corporate_ids_f = []
            stockFlag = False

            corporate_aliases = conn.query(
                "select * from corporate_alias where name=%s and (active is null or active !='N')",
                full_name)
            for caa in corporate_aliases:
                ca = conn.get(
                    "select * from corporate where (active is null or active !='N') and id=%s",
                    caa["corporateId"])
                if ca is None:
                    continue

                c_stock = conn.get(
                    "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                    ca["id"])
                if c_stock is not None:
                    # One stock-exchange corporate vetoes the merge for this name.
                    stockFlag = True
                    continue

                company = conn.get(
                    "select * from company where corporateId=%s and (active is null or active='Y') limit 1",
                    ca["id"])
                if company is None or ca["id"] in corporate_ids:
                    continue

                corporate_ids.append(int(ca["id"]))

                if ca["fullName"] != full_name:
                    cfullFlag = False
                elif ca["id"] not in corporate_ids_f:
                    corporate_ids_f.append(int(ca["id"]))

                funding = conn.get(
                    "select * from funding where corporateId=%s and (active is null or active='Y') "
                    "order by fundingDate desc limit 1",
                    caa["corporateId"])
                if fundingFlag is False and funding is not None:
                    fundingFlag = True

                pnames.append(company["name"])

            if not (len(corporate_ids) > 1 and stockFlag is False):
                continue

            vv = compare(pnames) if len(pnames) >= 2 else 0

            (chinese, company) = name_helper.name_check(full_name)
            if chinese is True:
                chinese_type = "Y"
                n5 += 1
                if fundingFlag is True:
                    n3 += 1
                if cfullFlag is True:
                    n4 += 1
                if vv <= 0.75:
                    n7 += 1
            else:
                chinese_type = "N"
                n6 += 1

            n += 1

            logger.info("merge:%s %s-> %s", full_name, corporate_ids,
                        chinese_type)
            mflag = corporate_util.autoMerge(corporate_ids, full_name)
            if mflag == 1:
                n1 += 1
            else:
                n2 += 1

            # c1: exactly one corporate's fullName matches; c2: all of them
            # match; c3: none match.
            c1 = "是" if len(corporate_ids_f) == 1 else "否"
            c2 = "是" if len(corporate_ids_f) == len(corporate_ids) else "否"
            c3 = "是" if len(corporate_ids_f) == 0 else "否"

            # NOTE(review): the "highly similar" label is chosen when
            # compare() returns <= 0.75 - confirm that compare() yields a
            # distance rather than a similarity score.
            line = "%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
                full_name,
                ";".join([str(cid) for cid in corporate_ids]),
                get_links(corporate_ids),
                "中文名" if chinese_type == 'Y' else "英文名",
                "有融资" if fundingFlag is True else "无融资",
                "公司主要名称一致" if cfullFlag is True else "公司别名一致",
                "短名高度相似" if vv <= 0.75 else "短名不相似",
                "可以根据verify自动聚合" if mflag == 1 else " ",
                c1, c2, c3)
            report_lines.append(line)

        # Write the report to the exact path that is attached below; writing
        # to a CWD-relative "me.txt" could attach a stale or missing file
        # whenever the working directory differs from sys.path[0].
        path = os.path.join(sys.path[0], "me.txt")
        with open(path, "w") as fp2:
            fp2.write("".join(report_lines))
        logger.info("merge num %s/%s/%s/%s/%s/%s/%s/%s", n, n1, n2, n3, n4, n5, n6,
                    n7)
        content = '''<div>Dears,    <br /><br />

        附件是目前系统中存在重复的公司,请在后台搜索
        </div>
        '''
        logger.info(path)
        email_helper.send_mail_file(
            "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
            ';'.join([user + '@xiniudata.com' for user in ["bamy"]]), "重复公司检索--人工审查",
            content, path)
    finally:
        conn.close()
示例#30
0
文件: qixin_new.py 项目: yujiye/Codes
# Substrings that exclude an artifact from the uploaded product list when they
# appear in its name or description.
_QIXIN_BLOCKED_WORDS = ("av", "AV", "性爱", "做爱", "澳门", "威尼斯", "男人", "成人")


def _qixin_has_blocked_word(text):
    """Return True when ``text`` contains any of ``_QIXIN_BLOCKED_WORDS``."""
    return any(text.find(word) >= 0 for word in _QIXIN_BLOCKED_WORDS)


def _qixin_dump(obj):
    """Serialize ``obj`` to JSON text, keeping non-ASCII characters as-is."""
    return json.dumps(obj, ensure_ascii=False, cls=util.CJsonEncoder)


def _qixin_base_info(name, corporate, company):
    """Build the "base" payload section (without ``finance_info``)."""
    tags = [
        tt["name"] for tt in conn.query(
            " select t.name from company_tag_rel ctr join tag t "
            "on ctr.tagId=t.id where ctr.companyId=%s and t.type=11012 and "
            "(ctr.active='Y' or ctr.active is null)", company["id"])
    ]

    if company["logo"] is not None:
        logo_url = "http://www.xiniudata.com/file/%s" % company["logo"]
    else:
        logo_url = ""

    round_code = corporate["round"]
    if round_code is not None and int(round_code) in rmap:
        finance_rounds = rmap[int(round_code)]
    else:
        finance_rounds = "未融资"

    website = company["website"]
    if website is None or website.strip() == "":
        website = ""

    return {
        "company_name": name.strip(),
        "project_logo_url": logo_url,
        "project_name": company["name"].strip(),
        "finance_rounds": finance_rounds,
        "website_url": website,
        "key_words": ",".join(tags),
        "introduction": company["description"],
    }


def _qixin_finance_info(corporateId):
    """Return one dict per active funding of ``corporateId``."""
    fundings = conn.query(
        "select * from funding where corporateId=%s and (active='Y' or active is null)",
        corporateId)
    rows = []
    for funding in fundings:
        investors = [
            ii["name"] for ii in conn.query(
                "select i.name from funding_investor_rel fir join "
                "investor i "
                "on fir.investorId=i.id where fir.fundingId=%s and "
                "(fir.active is null or fir.active='Y') and "
                "(i.active is null or i.active='Y')", funding["id"])
        ]
        amount = get_amount(funding["investment"], funding["precise"])

        funding_date = funding["fundingDate"]
        round_code = funding["round"]
        currency_code = funding["currency"]
        rows.append({
            "date": str(funding_date.date()) if funding_date is not None else "",
            "round": rmap[int(round_code)]
            if (round_code is not None and int(round_code) in rmap) else "",
            "amount": amount,
            "currency": currentmap[int(currency_code)]
            if (currency_code is not None and int(currency_code) in currentmap) else "",
            "investor": ",".join(investors)
        })
    return rows


def _qixin_core_members(company_id):
    """Return payload dicts for members whose relation type is 5010 or 5020."""
    members = conn.query(
        "select cmr.type,cmr.position,m.* from member m join company_member_rel cmr on "
        "m.id=cmr.memberId where cmr.companyId=%s and (cmr.active is null or cmr.active='Y')"
        " and (m.active='Y' or m.active is null)", company_id)
    core = []
    for m in members:
        if int(m["type"]) not in [5010, 5020]:
            continue
        core.append({
            "avatar_url": "http://www.xiniudata.com/file/%s" % m["photo"]
            if m["photo"] is not None else "",
            "name": m["name"],
            "position": m["position"],
            "education": m["education"],
            "introduction": m["description"],
            "work": m["work"]
        })
    return core


def _qixin_products(company_id):
    """Return up to five artifacts per type (4010-4050) that pass the filters."""
    select_sql = ("select name, description, type from artifact where companyId=%s and "
                  "(active is null or active='Y') and type=%s ")
    products = []
    for tt in [4010, 4020, 4030, 4040, 4050]:
        # Type 4010 is ordered by ascending rank, every other type descending.
        if tt == 4010:
            artifacts = conn.query(select_sql + "order by rank limit 5", company_id, tt)
        else:
            artifacts = conn.query(select_sql + "order by rank desc limit 5", company_id, tt)

        for a in artifacts:
            if a["name"] is None:
                a["name"] = ""
            if a["description"] is None:
                a["description"] = ""
            if _qixin_has_blocked_word(a["name"]) or _qixin_has_blocked_word(a["description"]):
                continue
            if a["type"] not in tmap:
                continue
            if a["type"] == 4010:
                # Drop type-4010 artifacts for which count_other() finds
                # anything in the name or description (logged as garbled).
                cchinese = desc_helper.count_other(a["name"]) + \
                    desc_helper.count_other(a["description"])
                if len(cchinese) > 0:
                    logger.info("wrong luanma!!!!!: %s", a["name"])
                    continue

            products.append({
                "name": a["name"],
                "description": a["description"],
                "kind": tmap[a["type"]]
            })
    return products


def qixinCrawler(name, type, corporateIds=None, test=False):
    """Push company or investor information for ``name`` to the remote API.

    Args:
        name: company or investor name used for lookups and in the payload.
        type: selects the mode.
            1 - company: for each id in ``corporateIds`` that has an active
                alias equal to ``name``, an active corporate and an active
                company, build a payload (base, team, competitors, products)
                and POST it to ``url + '/open/xiniu/venture/capital'``. The
                first successful request ends the call. Names for which
                ``name_helper.name_check`` does not report Chinese are
                rejected.
            2 - investor: resolve ``name`` through ``investor_alias``
                (verified aliases first, then any non-disabled alias) and
                POST the first active investor found to
                ``url + '/open/xiniu/venture/institution'``.
            3 - investor by raw name: POST ``name`` as both full and short
                name to the institution endpoint without any lookup.
        corporateIds: ids to try in mode 1; ``None`` or empty returns at once.
        test: in mode 1, log the payload instead of sending it.

    Returns:
        ``(parsed_json_response, payload_json_text)`` on a successful request,
        otherwise ``(None, None)``.
    """
    if type == 1:
        if not corporateIds:
            return None, None
        (chinese, cccompany) = name_helper.name_check(name)
        if chinese is not True:
            return None, None

        for corporateId in corporateIds:
            corporate_alias = conn.get(
                "select * from corporate_alias where (active is null or active='Y') and "
                "corporateId=%s and name=%s limit 1", corporateId, name)
            if corporate_alias is None:
                continue
            corporate = conn.get(
                "select * from corporate where id=%s and "
                "(active ='Y' or active ='A' or active is null)", corporateId)
            if corporate is None:
                continue
            company = conn.get(
                "select * from company where corporateId=%s and "
                "(active ='Y' or active ='A' or active is null) limit 1",
                corporateId)
            if company is None:
                continue
            if company["name"] is None or company["name"].strip() == "" or \
                    len(desc_helper.count_other2(company["name"])) == len(company["name"]):
                logger.info("wwwwwwrong here")
                continue

            param = {}
            baseinfo = _qixin_base_info(name, corporate, company)
            baseinfo["finance_info"] = _qixin_finance_info(corporateId)
            param["base"] = baseinfo

            param["team"] = {"core_members": _qixin_core_members(company["id"])}

            coms = conn.query(
                "select c.name from companies_rel cr join company c on cr.company2Id=c.id "
                "where cr.companyId=%s and (c.active is null or c.active='Y') "
                "order by cr.distance desc limit 10", company["id"])
            param["competitors"] = [{"project_name": c["name"]} for c in coms]

            products = _qixin_products(company["id"])
            param["products"] = products

            logger.info("products upload for %s|%s, %s", corporateId,
                        len(products), len(str(products)))
            logger.info("tags: %s", param["base"]["key_words"])
            url_company = url + '/open/xiniu/venture/capital'
            try:
                if test is True:
                    for p in param:
                        logger.info(p)
                        if p == "products":
                            for pp in param[p]:
                                logger.info(_qixin_dump(pp))
                        else:
                            logger.info(_qixin_dump(param[p]))
                    return None, None
                else:
                    res = requests.post(url_company, json=param)
                    logger.info(res.text)
                    return json.loads(res.text), _qixin_dump(param)
            except Exception:
                # Best effort: log the failure and move on to the next id
                # instead of silently discarding the error.
                logger.exception("qixin company upload failed for corporate %s",
                                 corporateId)

        return None, None

    elif type == 2:
        url_investor = url + '/open/xiniu/venture/institution'
        investoras = conn.query(
            "select * from investor_alias where verify='Y' and "
            "(active is null or active!='N') and name=%s", name)
        if len(investoras) == 0:
            # No verified alias: fall back to any alias that is not disabled.
            investoras = conn.query(
                "select * from investor_alias where "
                "(active is null or active!='N') and name=%s", name)

        for investora in investoras:
            investor = conn.get(
                "select * from investor where (active is null or active!='N') and id=%s",
                investora["investorId"])
            if investor is None:
                continue
            # Only the first active investor is attempted; both success and
            # failure end the call.
            try:
                logger.info(investor)
                param = {
                    "full_name": name,
                    "short_name": investor["name"]
                    if investor["name"] is not None and investor["name"] != "" else name
                }
                res = requests.post(url_investor, json=param)
                return json.loads(res.text), _qixin_dump(param)
            except Exception:
                logger.exception("qixin investor upload failed for %s", name)
                return None, None

        return None, None

    elif type == 3:
        try:
            url_investor = url + '/open/xiniu/venture/institution'

            param = {"full_name": name, "short_name": name}
            res = requests.post(url_investor, json=param)
            return json.loads(res.text), _qixin_dump(param)
        except Exception:
            logger.exception("qixin investor upload failed for %s", name)
            return None, None

    else:
        return None, None