Example #1
def decompose(company_id, hard=True):
    conn = db.connect_torndb()
    company = conn.get("select * from company where id=%s", company_id)
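    # active source companies only (active is null or 'Y'), excluding source 13002
    # and the 13100-13109 range, and skipping rows with companyStatus 2020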
    scs = list(
        conn.query(
            "select * from source_company where (active is null or active='Y') and (source is not null and source != 13002 and (source < 13100 or source >= 13110)) and companyStatus!=2020 and companyId=%s order by source",
            company_id))
    conn.close()

    if len(scs) < 2:
        logger.info(
            "Company: %s has fewer than two active source companies, no need to decompose",
            company_id)
        return True

    fullName = company["fullName"]
    name = company["name"]
    description = company["description"]
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()

    for sc in scs:
        company_info_expand.expand_source_company(sc["id"],
                                                  beian_links_crawler,
                                                  icp_chinaz_crawler,
                                                  screenshot_crawler)
        company_aggregator_dev.aggregator(sc)

    return True
Example #2
def decompose(company_id, hard=True):
    conn = db.connect_torndb()
    company = conn.get("select * from company where id=%s", company_id)
    scs = list(conn.query(
        "select * from source_company where (active is null or active='Y') and (source is not null and source != 13002 and (source < 13100 or source >= 13110)) and companyStatus!=2020 and companyId=%s order by source",
        company_id))
    conn.close()

    if len(scs) < 2:
        logger.info("Company: %s has fewer than two active source companies, no need to decompose", company_id)
        return True

    fullName = company["fullName"]
    name = company["name"]
    description = company["description"]
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()

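    # pick the source company to keep: prefer the one whose name matches the company name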
    reserve_sc = None
    for sc in scs:
        logger.info("source company: %s, source: %s, sourceId: %s", sc["id"], sc["source"], sc["sourceId"])
        if sc["name"].strip() != "" and sc["name"] == name:
            # logger.info("Reserve source company: %s, %s for company: %s, %s", sc["id"], sc["name"], company["id"], company["name"])
            reserve_sc = sc
            break
            # update_column(company,sc)
            # delete_old_data(company_id)
            # company_info_expand.expand_source_company(sc["id"], beian_links_crawler, icp_chinaz_crawler,screenshot_crawler)
            # set_processStatus_zero(company_id, sc["id"])
            # company_aggregator.aggregator(sc)
            # return True
            #
    #Must find one sc for decompose

    # #if no source_company can match company
    # sc_ids = [str(sc["id"]) for sc in scs if sc.has_key("id")]
    # logger.info("Can not locate source companys (%s) for company: %s", sc_ids, company_id)
    # return False
    if reserve_sc is None:
        reserve_sc = scs[0]

    logger.info("Reserve source company: %s, %s for company: %s, %s", reserve_sc["id"], reserve_sc["name"], company["id"], company["name"])
    update_column(company, reserve_sc)
    delete_old_data(company_id)
    company_info_expand.expand_source_company(reserve_sc["id"], beian_links_crawler, icp_chinaz_crawler, screenshot_crawler)
    set_processStatus_zero(company_id, reserve_sc["id"], hard)

    for sc in scs:
        set_funding_processStatus(sc["id"])

    company_aggregator.aggregator(reserve_sc)
    return True
Example #3
def expand():
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()
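    # per-source downloaders: itjuzi is capped at 200 crawls with a 10s timeout,
    # 36kr skips the proxy, lagou goes through it, everything else uses the default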
    download_crawler_itjuzi = download.DownloadCrawler(max_crawl=200,
                                                       timeout=10)
    download_crawler_kr36 = download.DownloadCrawler(use_proxy=False)
    download_crawler_lagou = download.DownloadCrawler(use_proxy=True)
    download_crawler = download.DownloadCrawler()
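    # COMPANIES is expected to be a module-level list of pending source companies;
    # pop entries until it is drained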
    while True:
        # gevent -> list of source_companies

        if len(COMPANIES) == 0:
            return
        sc = COMPANIES.pop(0)
        source = sc["source"]
        sourceId = sc["sourceId"]

        # company_info_expand_mongo.expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler)

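        # dispatch to the downloader that matches the source id
        # (13030 -> itjuzi, 13020 -> 36kr, 13050 -> lagou, default otherwise)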
        if source == 13030:
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_itjuzi)
        elif source == 13020:
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_kr36)
        elif source == 13050:
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler_lagou)
        else:
            diff_sourceCompanyId = check_expand_diff.check_diff(
                source, sourceId, download_crawler)
        logger.info("Source: %s, sourceId: %s, Diff: %s", source, sourceId,
                    diff_sourceCompanyId)
        # mark this source company as processed (processStatus = 1) in mongo
        mongo = db.connect_mongo()
        collection_source_company = mongo.source.company
        collection_source_company.update_one(
            {
                "source": source,
                "sourceId": sourceId
            }, {'$set': {
                "processStatus": 1
            }})
        mongo.close()
        if diff_sourceCompanyId is not None:
            # recommendIds / audit_source_company handling is currently disabled
            pass
Example #4
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0],
        '../../crawler/screenshot'))
import screenshot_website

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../corporate'))
import company_info_expand
# import company_aggregator
import company_aggregator_new
import company_aggregator_baseinfo
import corporate_aggregator
import company_replacement

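# crawler instances created at import time (presumably shared by helpers elsewhere in this module)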
beian_links_crawler = beian_links.BeianLinksCrawler()
icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
screenshot_crawler = screenshot_website.phantomjsScreenshot()

#logger
loghelper.init_logger("corporate_util", stream=True)
logger = loghelper.get_logger("corporate_util")


def insert_company(name, fullName, aliases):
    conn = db.connect_torndb()
    sql = "insert company(code,name,fullName,createTime,modifyTime,active) \
           values(%s,%s,%s,now(),now(),'P')"

    code = company_aggregator_baseinfo.get_company_code(name)