示例#1
0
def process():
    """Parse every pending SSE record and persist the parsed company data.

    Each pass opens a Mongo connection, loads up to 100 documents from
    ``mongo.stock.sse`` whose ``processStatus`` is 1, saves the company,
    its names, artifacts and members for each document, and then sets that
    document's ``processStatus`` to 2.  The loop ends when a pass finds no
    documents.

    The connection is closed in a ``finally`` clause so that an exception
    raised while parsing or saving does not leak it.
    """
    logger.info("sse_company_parser begin...")

    # englishName values that are treated as "no English name available".
    empty_english_names = ("", "-", "null", "无")

    while True:
        mongo = db.connect_mongo()
        try:
            collection = mongo.stock.sse
            items = list(collection.find({"processStatus": 1}).limit(100))

            for item in items:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)

                # The delete_* calls run before the names are saved again.
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                # The English name is stored unstripped, as before; only the
                # placeholder check looks at the stripped value.
                english_name = r["englishName"]
                if (english_name is not None
                        and english_name.strip() not in empty_english_names):
                    parser_db_util.save_source_company_name(
                        source_company_id, english_name, 12010)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parseMember_save(source_company_id, item, download_crawler)

                # NOTE(review): Collection.update is deprecated in PyMongo 3
                # and removed in PyMongo 4 -- switch to update_one once the
                # installed driver version is confirmed.
                collection.update({"_id": item["_id"]},
                                  {"$set": {
                                      "processStatus": 2
                                  }})
                logger.info("processed %s", item["sourceId"])
        finally:
            mongo.close()

        if not items:
            break

    logger.info("sse_company_parser end.")
示例#2
0
def process():
    """Parse pending itjuzi company records and persist the parsed data.

    Items are fetched in pages of 1000 through
    ``parser_db_util.find_process_limit``.  For each item that
    ``parse_base`` can parse, the company, its names, artifacts, footprints
    and members are saved and the item is marked processed.  Items for which
    ``parse_base`` returns None are skipped and are NOT marked processed.
    The loop ends when a page comes back empty.
    """
    logger.info("itjuzi_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            logger.info(item["url"])

            r = parse_base(item)
            if r is None:
                continue

            source_company_id = parser_db_util.save_company(
                r, SOURCE, download_crawler)

            # The delete_* calls run before the names are saved again.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(
                source_company_id, r["shortName"], 12020)
            parser_db_util.save_source_company_name(
                source_company_id, r["productName"], 12020)
            if r["fullName"] is not None:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)

            logger.info("source_company_id=%s", source_company_id)

            # Artifacts parsed from the item come first, followed by the
            # ones parse_base already attached to the result.
            artifacts = parse_artifact(item)
            artifacts.extend(r["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(source_company_id, artifacts)

            footprints = parse_footprint(item)
            parser_db_util.save_footprints(source_company_id, footprints)

            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.update_processed(item["_id"])

        # NOTE(review): the offset advances although processed items are
        # flagged above.  If find_process_limit only returns unprocessed
        # rows, this skips rows; if the offset were not advanced, items
        # skipped by the `r is None` branch would be refetched forever.
        # Confirm the query semantics before changing either side.
        start += 1000
        if not items:
            break

    logger.info("itjuzi_company_parser end.")
示例#3
0
if __name__ == '__main__':
    logger.info("Begin...")
    conn = db.connect_torndb()
    companies = conn.query("select id,fullname from company")
    for c in companies:
        aliases = conn.query(
            "select * from company_alias where companyId=%s and type=12010",
            c["id"])
        for alias in aliases:
            name = alias["name"]
            new_name = name_helper.company_name_normalize(name)
            if name != new_name:
                logger.info("1. %s --- %s", name, new_name)
                update_company_alias(alias["id"], new_name)
            main_name = name_helper.get_main_company_name(new_name)
            if main_name != new_name:
                logger.info("2. %s --- %s", new_name, main_name)
                save_company_alias(c["id"], main_name)

        fullname = c["fullname"]
        if fullname is None or fullname.strip() == "":
            continue
        is_chinese, is_company = name_helper.name_check(fullname)
        if is_company:
            new_name = name_helper.company_name_normalize(fullname)
            if fullname != new_name:
                save_company_alias(c["id"], new_name)
                logger.info("3. %s --- %s", fullname, new_name)
            main_name = name_helper.get_main_company_name(new_name)
            if main_name != new_name:
示例#4
0
            logger.info("items : %s", len(items))
            for item in items:
                # if item.has_key("processed") and item["processed"] is True:
                #     continue
                try:
                    logger.info(item)
                    r = parse_company(item)
                    # logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
                    for i in r:
                        logger.info("%s - %s",i,r[i])
                    source_company_id = parser_db_util.save_company_standard(r, download_crawler)
                    parser_db_util.delete_source_company_name(source_company_id)
                    parser_db_util.delete_source_mainbeianhao(source_company_id)
                    parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
                    parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)
                    logger.info("source_company_id=%s", source_company_id)

                    artifacts = []
                    artifacts.extend(r["artifacts"])
                    logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))


                    parser_db_util.save_artifacts_standard(source_company_id, artifacts)
                    #
                    # #
                    parser_db_util.delete_funding(source_company_id)
                    flag = parseFinance_save(source_company_id, item, download_crawler)
                    # flag = True
示例#5
0
def process():
    """Parse pending xtecher company records and persist the parsed data.

    Items are fetched 1000 at a time through
    ``parser_db_util.find_process_limit``, always at offset 0.  When
    ``parse_company`` returns 0 the item is marked processed and skipped.
    Otherwise the company, its names and artifacts are saved and the item is
    marked processed.  The loop ends when a fetch returns no items.
    """
    logger.info("xtecher_company_parser begin...")

    # The offset is never advanced (the increment was left as a todo in the
    # original), so every fetch starts at 0.
    # NOTE(review): this presumably relies on update_processed removing
    # items from the result set -- confirm against find_process_limit.
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            if r == 0:
                parser_db_util.update_processed(item["_id"])
                logger.info("missing website and companyName, processed %s",
                            item["url"])
                continue

            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)

            # The delete_* calls run before the names are saved again.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if 'fakeName' in r:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fakeName"], 12020)
            else:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            # Funding is not parsed in this function, so every parsed item
            # is marked processed unconditionally.
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])

        if not items:
            break

    logger.info("xtecher_company_parser end.")
示例#6
0
def process():
    """Parse pending 36kr company records and store the results via Mongo.

    Items are fetched in pages of 1000 through
    ``parser_mongo_util.find_process_limit``.  Items whose parsed
    ``status`` is ``"INIT"`` are marked processed and skipped.  For every
    other item the company, its names, artifacts, funding and members are
    saved, and ``update_processStatus(source, sourceId, 0)`` is called.
    ``update_processed`` is not called on that path.  The loop ends when a
    page comes back empty.
    """
    logger.info("36kr_company_parser begin...")

    # (key under item["content"], member type code passed to parseMember_save)
    member_groups = (
        ("founders", 5010),
        ("employees", 5030),
        ("former_members", 5040),
    )

    start = 0
    while True:
        items = parser_mongo_util.find_process_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                parser_mongo_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
                continue

            parser_mongo_util.save_mongo_company(r["source"], r["sourceId"], r)
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["name"],
                    "type": 12020
                })
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["fullName"],
                    "type": 12010
                })
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_mongo_util.save_mongo_source_company_name(
                    r["source"], r["sourceId"], {
                        "name": main_company_name,
                        "type": 12010
                    })

            artifacts = parse_artifact(item)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            for artifact in artifacts:
                parser_mongo_util.save_mongo_source_artifact(
                    r["source"], r["sourceId"], artifact)

            flag = parseFinance_save(r["source"], r["sourceId"], item,
                                     download_crawler)

            # A member group is saved only when its nested "data" payload is
            # present.  `in` replaces dict.has_key, which no longer exists
            # in Python 3.
            content = item["content"]
            for group_key, member_type in member_groups:
                if group_key in content and "data" in content[group_key]["data"]:
                    parseMember_save(r["source"], r["sourceId"], member_type,
                                     content[group_key]["data"]["data"],
                                     download_crawler)

            # Only the log line depends on the funding result.
            if flag:
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack somethin:  %s", item["url"])

            parser_mongo_util.update_processStatus(r["source"], r["sourceId"],
                                                   0)

        start += 1000
        if not items:
            break

    logger.info("36kr_company_parser end.")
示例#7
0
def process():
    """Parse pending 36kr company records and persist the parsed data.

    Items are fetched 1000 at a time through
    ``parser_db_util.find_process_limit``, always at offset 0.  For each
    item the company, its names, artifacts, members and funding are saved.
    An item with no full name, no description and no artifacts is flagged
    inactive (``'N'``) instead.

    Processing is best-effort: an exception raised for one item is logged
    with its traceback and the item is still marked processed, so the loop
    moves on to the next one.  The loop ends when a fetch returns no items.
    """
    logger.info("36kr_company_parser begin...")

    # The offset is never advanced in this function.
    # NOTE(review): this presumably relies on update_processed removing
    # items from the result set -- confirm against find_process_limit.
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)

                # The delete_* calls run before the names are saved again.
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parseMember_save(source_company_id, item, download_crawler)

                parser_db_util.delete_funding(source_company_id)
                # The return value is not used: the item is marked processed
                # whatever the funding parse reports.
                parseFinance_save(source_company_id, item, download_crawler)
            except Exception as E:
                # `as` works on Python 2.6+ and Python 3.  logger.exception
                # records the traceback, which logger.info(E) dropped.
                logger.exception(E)

            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])

        if not items:
            break
示例#8
0
def process():
    """Walk all 36kr company records and import the ones not yet stored.

    Items are fetched in pages of 1000 through
    ``parser_db_util.find_all_limit``.  Items whose parsed ``status`` is
    ``"INIT"`` are flagged inactive (``'N'``) and skipped; all others get
    their active flag set to None.  The company, its names, artifacts,
    funding and members are saved only when no source company exists yet
    for ``(SOURCE, item["key"])``.  The loop ends when a page comes back
    empty.
    """
    logger.info("36kr_company_parser begin...")

    # (key under item["content"], member type code passed to parseMember_save)
    member_groups = (
        ("founders", 5010),
        ("employees", 5030),
        ("former_members", 5040),
    )

    start = 0
    while True:
        items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                logger.info("processed %s", item["url"])
                continue

            parser_db_util.update_active(SOURCE, item["key"], None)

            sc = parser_db_util.get_source_company_by_source_and_sourceid(
                SOURCE, item["key"])
            if sc is not None:
                # A source company already exists; nothing is saved for it.
                continue

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)

            # The delete_* calls run before the names are saved again.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(
                source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(
                source_company_id, r["fullName"], 12010)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, item)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parser_db_util.delete_funding(source_company_id)
            # The return value is not used anywhere in this function.
            parseFinance_save(source_company_id, item, download_crawler)

            # A member group is saved only when its nested "data" payload is
            # present.  `in` replaces dict.has_key, which no longer exists
            # in Python 3.
            content = item["content"]
            for group_key, member_type in member_groups:
                if group_key in content and "data" in content[group_key]["data"]:
                    parseMember_save(source_company_id, member_type,
                                     content[group_key]["data"]["data"],
                                     download_crawler)

        start += 1000
        if not items:
            break

    logger.info("36kr_company_parser end.")
示例#9
0
def process(sourceId=0):
    """Parse pending evervc company records and persist the parsed data.

    Args:
        sourceId: when greater than 0, only the single item returned by
            ``parser_db_util.find_process_one`` for this id is handled and
            the function returns after one pass.  Otherwise items are
            fetched 1000 at a time, always at offset 0, until a fetch
            returns nothing.

    For each item the company, its names, artifacts, members and funding
    are saved and the item is marked processed.
    """
    logger.info("evervc_company_parser begin...")

    # The offset is never advanced (the increment was left as a todo in the
    # original), so every fetch starts at 0.
    # NOTE(review): this presumably relies on update_processed removing
    # items from the result set -- confirm against find_process_limit.
    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start,
                                                      1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)

            # The delete_* calls run before the names are saved again.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)

            full_name = r["fullName"]
            # A None full name used to raise TypeError in the len()
            # comparison below, so no working behaviour exists for it.  It
            # is now skipped for the 12010 saves and only the short name is
            # stored.
            # NOTE(review): confirm that skipping is the intended handling.
            if full_name is not None:
                parser_db_util.save_source_company_name(source_company_id,
                                                        full_name, 12010)

            # The None/empty checks must come before len(): the original
            # evaluated len(r["fullName"]) first and raised on None.
            if (full_name is None or full_name == ''
                    or len(r["name"]) < len(full_name)):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)

            if full_name is not None:
                main_company_name = name_helper.get_main_company_name(
                    full_name)
                if main_company_name != full_name:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.delete_funding(source_company_id)
            # The return value is not used: the item is marked processed
            # whatever the funding parse reports.
            parseFinance_save(source_company_id, item, r['sourceId'],
                              download_crawler)

            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])

        if not items or sourceId > 0:
            break

    logger.info("evervc_company_parser end.")
示例#10
0
def process():
    """Parse pending itjuzi company records and store the results via Mongo.

    Pages of 1000 items are read through
    ``parser_mongo_util.find_process_limit``.  Items that ``parse_base``
    cannot parse (it returns None) are skipped without being marked.  For
    the rest, the company, its names, artifacts and members are saved, and
    the item is flagged through ``update_processed`` and
    ``update_processStatus``.  The loop ends when a page comes back empty.
    """
    logger.info("itjuzi_company_parser begin...")

    start = 0
    while True:
        batch = parser_mongo_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for record in batch:
            logger.info(record["url"])

            parsed = parse_base(record)
            if parsed is None:
                continue

            parser_mongo_util.save_mongo_company(parsed["source"],
                                                 parsed["sourceId"], parsed)

            # NOTE(review): fullName (and the main name derived from it) is
            # stored with type 12020 here, while the parser_db_util call
            # this replaced used 12010 for fullName -- confirm 12020 is
            # intended.
            for name_key in ("shortName", "productName", "fullName"):
                parser_mongo_util.save_mongo_source_company_name(
                    parsed["source"], parsed["sourceId"],
                    {"name": parsed[name_key], "type": 12020})

            main_company_name = name_helper.get_main_company_name(
                parsed["fullName"])
            if main_company_name != parsed["fullName"]:
                parser_mongo_util.save_mongo_source_company_name(
                    parsed["source"], parsed["sourceId"],
                    {"name": main_company_name, "type": 12020})

            # Artifacts parsed from the record come first, followed by the
            # ones parse_base already attached to the result.
            artifact_list = parse_artifact(record)
            artifact_list.extend(parsed["artifacts"])
            logger.info(
                json.dumps(artifact_list,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            for one_artifact in artifact_list:
                parser_mongo_util.save_mongo_source_artifact(
                    parsed["source"], parsed["sourceId"], one_artifact)

            # Footprints are not stored on this path (open TODO in the
            # original).
            member_list = parse_member(record)
            for one_member in member_list:
                parser_mongo_util.save_mongo_source_company_member(
                    parsed["source"], parsed["sourceId"], one_member)

            parser_mongo_util.update_processed(record["_id"])
            parser_mongo_util.update_processStatus(parsed["source"],
                                                   parsed["sourceId"])

        start += 1000
        if len(batch) == 0:
            break

    logger.info("itjuzi_company_parser end.")