Exemplo n.º 1
0
def process():
    """Parse pending chuangyepu company records and persist them.

    Fetches one batch of unprocessed crawl records, converts each into a
    standard company structure via parse_company, and saves the company,
    its short name, artifacts and funding rounds through parser_db_util.
    Records with no usable data are deactivated and marked processed.
    """
    logger.info("Chuangyepu_company_parser begin...")

    start = 0
    # Single pass: the unconditional break at the bottom runs the loop once.
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            if r["status"] in ("No_Data", "No_Name"):
                # Nothing usable was parsed: deactivate the source record,
                # mark it processed, and move on to the next item.
                # (A stray exit() here previously killed the whole run and
                # made this continue unreachable.)
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("No infos for %s", item["url"])
                continue

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Re-create derived rows from scratch so re-parsing is idempotent.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            # 12020 = short-name type code.
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)

            logger.info("source_company_id=%s", source_company_id)

            artifacts = list(r["artifacts"])
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parser_db_util.delete_funding(source_company_id)
            flag = parseFinance_save(source_company_id, r['fundings'],
                                     download_crawler)
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                # Fail fast so incomplete funding data can be inspected
                # before any further records are marked processed.
                # NOTE(review): presumably deliberate — confirm before
                # running unattended.
                logger.info("lack something:  %s", item["url"])
                exit()

        break

    logger.info("Chuangyepu_company_parser end.")
Exemplo n.º 2
0
def process():
    """Parse pending crunchbase company records until the queue is empty.

    Repeatedly fetches batches of 500 unprocessed records; each record is
    parsed, saved (company, names, artifacts, members, fundings) and marked
    processed.  Parsing errors are logged and the record is still marked
    processed so a bad item cannot wedge the queue.
    """
    logger.info('crunchbase_company_parser begin ...')
    while True:
        # Always fetch from offset 0: processed items are flagged below and
        # drop out of find_process_limit's result set on the next pass.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        if len(items) == 0:
            # Queue drained; previously the end-log/return sat inside the
            # loop body, so only one batch was ever handled per invocation.
            break
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 shortname)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))

                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    # No name, no description, no artifacts: deactivate.
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                # source_member and source_company_member_rel(5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)

                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)

            except Exception as E:
                # Best-effort: log and fall through so the record is still
                # marked processed below.
                logger.info(E)

            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s" % item["url"])
    logger.info('parser end.')
Exemplo n.º 3
0
def process():
    """Scan every 36kr company record in batches of 1000 and import new ones.

    Records still in INIT status are deactivated and skipped.  Companies
    already present as a source_company row are skipped after their active
    flag is reset; new companies are saved together with their names,
    artifacts, fundings and member relations.
    """
    logger.info("36kr_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                # Crawl never completed for this record: deactivate and skip.
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                logger.info("processed %s", item["url"])
                continue

            parser_db_util.update_active(SOURCE, item["key"], None)

            sc = parser_db_util.get_source_company_by_source_and_sourceid(
                SOURCE, item["key"])
            if sc is None:
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # 12020 = short name, 12010 = full/registered name.
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parser_db_util.delete_funding(source_company_id)
                parseFinance_save(source_company_id, item, download_crawler)

                # Member relations: 5010 founders, 5030 employees,
                # 5040 former members.  ("x in d" replaces the Py2-only
                # dict.has_key.)
                content = item["content"]
                if ("founders" in content
                        and "data" in content["founders"]["data"]):
                    parseMember_save(
                        source_company_id, 5010,
                        content["founders"]["data"]["data"],
                        download_crawler)
                if ("employees" in content
                        and "data" in content["employees"]["data"]):
                    parseMember_save(
                        source_company_id, 5030,
                        content["employees"]["data"]["data"],
                        download_crawler)
                if ("former_members" in content
                        and "data" in content["former_members"]["data"]):
                    parseMember_save(
                        source_company_id, 5040,
                        content["former_members"]["data"]["data"],
                        download_crawler)

        start += 1000
        if len(items) == 0:
            break

    logger.info("36kr_company_parser end.")
Exemplo n.º 4
0
def process():
    """Parse pending 36kr company records until the queue is empty.

    Each record is parsed and saved (company, names, artifacts, members,
    fundings); records missing name, description and artifacts are
    deactivated.  Parsing errors are logged and the record is still marked
    processed so a bad item cannot wedge the queue.
    """
    logger.info("36kr_company_parser begin...")

    start = 0
    while True:
        # Processed items are flagged below and drop out of the next fetch.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # 12020 = short name, 12010 = full/registered name.
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    # No name, no description, no artifacts: deactivate.
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parseMember_save(source_company_id, item, download_crawler)

                parser_db_util.delete_funding(source_company_id)
                # Result intentionally ignored: the original clobbered it
                # with `flag = True` and never used it afterwards.
                parseFinance_save(source_company_id, item, download_crawler)
            except Exception as E:
                # Best-effort: log and fall through so the record is still
                # marked processed below.
                logger.info(E)

            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])

        if len(items) == 0:
            break
Exemplo n.º 5
0
def process():
    """Parse pending lagou company records until the queue is empty.

    Company names matching the blacklist are marked No_Name, deactivated and
    skipped.  Everything else is saved (company, names, artifacts, members,
    development history) and marked processed.
    """
    logger.info("lagou_company_parser begin...")
    bnames = get_blacklist()
    while True:

        # Processed items are flagged below and drop out of the next fetch.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000)

        for item in items:

            r = parse_company(item)

            # Blacklist check on the parsed short name.
            # ("x in d" replaces the Py2-only dict.has_key.)
            if "name" in r and r["name"].strip() != "":
                for bname in bnames:
                    if r["name"].find(bname) >= 0:
                        logger.info("黑名单")
                        r["status"] = "No_Name"
                        break

            if r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s with no data", item["url"])
                continue

            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            logger.info("sourceCompanyId : %s", source_company_id)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            # 12020 = short name (only when genuinely shorter than the
            # full name), 12010 = full/registered name.
            if len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)

            artifacts = list(r["artifacts"])
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            # Artifacts provided by lagou have no links, so parse_artifact
            # is not used here; save what parse_company produced directly.
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)
            parseMember_save(source_company_id, item)

            parserDevelop_save(source_company_id, item)

            parser_db_util.update_processed(item["_id"])

        if len(items) == 0:
            break

    logger.info("lagou_company_parser end.")