Exemplo n.º 1
0
def process():
    """Parse pending itjuzi company records page by page and store the results.

    Pages of records are fetched with ``parser_db_util.find_process_limit``
    until an empty page comes back.  For every record that ``parse_base``
    can parse, the company is saved, its stored names are rebuilt, artifacts
    and footprints are saved, members are saved through ``parseMember_save``
    and the record is finally marked as processed.  Records for which
    ``parse_base`` returns ``None`` are skipped and not marked as processed.
    """
    logger.info("itjuzi_company_parser begin...")

    # One value drives both the query limit and the offset step so the two
    # cannot drift apart.
    page_size = 1000
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, page_size)
        # To debug a single record, replace the page with
        # [parser_db_util.find_process_one(SOURCE, TYPE, <record id>)].
        if len(items) == 0:
            break

        for item in items:
            logger.info(item["url"])

            r = parse_base(item)
            if r is None:
                continue

            source_company_id = parser_db_util.save_company(r, SOURCE, download_crawler)

            # Clear what was stored for this company before re-adding names.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)

            # NOTE(review): 12020 / 12010 are passed through unchanged;
            # presumably name-type codes -- confirm against parser_db_util.
            parser_db_util.save_source_company_name(source_company_id, r["shortName"], 12020)
            parser_db_util.save_source_company_name(source_company_id, r["productName"], 12020)
            if r["fullName"] is not None:
                parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
                # Also store the main company name when the helper returns
                # something different from the full name.
                main_company_name = name_helper.get_main_company_name(r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)

            logger.info("source_company_id=%s", source_company_id)

            # Artifacts parsed from the record come first, followed by the
            # ones parse_base already attached to the result.
            artifacts = parse_artifact(item)
            artifacts.extend(r["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(source_company_id, artifacts)

            footprints = parse_footprint(item)
            parser_db_util.save_footprints(source_company_id, footprints)

            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.update_processed(item["_id"])

        start += page_size

    logger.info("itjuzi_company_parser end.")
Exemplo n.º 2
0
def process():
    """Run the Demo8 "next" parser over every pending record.

    Each record returned by ``parser_db_util.find_process`` is handed to
    ``parse_base``.  When parsing yields a result, the company, its score
    and its artifacts are saved and the record is marked as processed.
    Records for which ``parse_base`` returns ``None`` are left untouched.
    """
    logger.info("Demo8_next_parser begin...")

    for record in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(record["url"])

        parsed = parse_base(record)
        if parsed is not None:
            company_id = parser_db_util.save_company(parsed, SOURCE)
            logger.info("source_company_id=%s", company_id)

            parser_db_util.save_company_score(company_id, parsed["score"])
            parser_db_util.save_artifacts(company_id, parsed["artifacts"])

            # Only records that were parsed and stored get flagged as done.
            parser_db_util.update_processed(record["_id"])

    logger.info("Demo8_next_parser end.")
Exemplo n.º 3
0
def process():
    """Run the itjuzi "next" parser over every pending record.

    For each record returned by ``parser_db_util.find_process`` that
    ``parse_base`` can parse, the company and its score are saved, the
    parsed artifacts are filtered and annotated via
    ``url_helper.get_market``, the surviving artifacts are saved and the
    record is marked as processed.  Records for which ``parse_base`` returns
    ``None`` are skipped and not marked as processed.
    """
    logger.info("itjuzi_next_parser begin...")

    items = parser_db_util.find_process(SOURCE, TYPE)

    for item in items:
        logger.info(item["url"])

        r = parse_base(item)
        if r is None:
            continue
        source_company_id = parser_db_util.save_company(
            r, SOURCE, download_crawler)
        logger.info("source_company_id=%s", source_company_id)

        parser_db_util.save_company_score(source_company_id, r["score"])

        artifacts = []
        for artifact in r["artifacts"]:
            # The second element of the tuple (the market) is not used here.
            artifact_type, _app_market, app_id = url_helper.get_market(
                artifact["link"])
            if artifact_type is None:
                continue
            # Types 4040 and 4050 are only kept when an app id was found.
            if artifact_type in (4040, 4050) and app_id is None:
                continue
            # The artifact dict is annotated in place before being kept.
            artifact["type"] = artifact_type
            artifact["domain"] = app_id
            artifacts.append(artifact)

        parser_db_util.save_artifacts(source_company_id, artifacts)

        parser_db_util.update_processed(item["_id"])

    logger.info("itjuzi_next_parser end.")
Exemplo n.º 4
0
def process():
    """Run the 36kr "next" parser over every pending record.

    For each record returned by ``parser_db_util.find_process`` that
    ``parse_base`` can parse, the company, its score and its artifacts are
    saved and the record is marked as processed.  A failure while saving one
    record is logged and does not stop the remaining records; that record is
    not marked as processed.
    """
    logger.info("36kr_next_parser begin...")

    items = parser_db_util.find_process(SOURCE, TYPE)

    for item in items:
        logger.info(item["url"])

        r = parse_base(item)
        if r is None:
            continue
        try:
            source_company_id = parser_db_util.save_company(r, SOURCE, download_crawler)
            logger.info("source_company_id=%s", source_company_id)

            parser_db_util.save_company_score(source_company_id, r["score"])
            parser_db_util.save_artifacts(source_company_id, r["artifacts"])

            parser_db_util.update_processed(item["_id"])
        except Exception as ex:
            # Best-effort per record: keep going, but record the traceback so
            # the failure can be diagnosed.
            logger.exception(ex)
            continue