def process():
    """Drain the ``stock.sse`` mongo queue and persist parsed companies.

    Repeatedly fetches documents with processStatus == 1 in batches of 100,
    saves the parsed company plus its name aliases, artifacts and members,
    then flips each document to processStatus = 2.  Stops when a batch
    comes back empty.
    """
    logger.info("sse_company_parser begin...")
    start = 0  # retained from the shared parser scaffolding; unused here
    while True:
        mongo = db.connect_mongo()
        collection = mongo.stock.sse
        batch = list(collection.find({"processStatus": 1}).limit(100))
        for doc in batch:
            parsed = parse_company(doc)
            logger.info(json.dumps(parsed, ensure_ascii=False,
                                   cls=util.CJsonEncoder))

            cid = parser_db_util.save_company_standard(parsed,
                                                      download_crawler)

            # Rebuild alias rows from scratch for this company.
            parser_db_util.delete_source_company_name(cid)
            parser_db_util.delete_source_mainbeianhao(cid)
            parser_db_util.save_source_company_name(cid, parsed["name"],
                                                    12020)
            parser_db_util.save_source_company_name(cid, parsed["fullName"],
                                                    12010)

            main_name = name_helper.get_main_company_name(parsed["fullName"])
            if main_name != parsed["fullName"]:
                parser_db_util.save_source_company_name(cid, main_name,
                                                        12010)
            logger.info("source_company_id=%s", cid)

            english = parsed["englishName"]
            # Skip placeholder values the crawler sometimes emits.
            if english is not None and english.strip() not in ("", "-",
                                                               "null", "无"):
                parser_db_util.save_source_company_name(cid, english, 12010)

            artifacts = parse_artifact(cid, doc)
            logger.info(json.dumps(artifacts, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(cid, artifacts)

            parseMember_save(cid, doc, download_crawler)

            collection.update({"_id": doc["_id"]},
                              {"$set": {"processStatus": 2}})
            logger.info("processed %s", doc["sourceId"])
        mongo.close()
        if not batch:
            break
    logger.info("sse_company_parser end.")
def process():
    """Walk the itjuzi crawl queue and persist companies into MySQL.

    Pages through unprocessed items 1000 at a time, saving the company,
    its name aliases (short/product names as 12020, full names as 12010),
    artifacts, footprints and members, then marks the item processed.
    Terminates when a page comes back empty.
    """
    logger.info("itjuzi_company_parser begin...")
    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)
        for item in batch:
            logger.info(item["url"])
            base = parse_base(item)
            if base is None:
                continue

            cid = parser_db_util.save_company(base, SOURCE, download_crawler)

            # Rebuild the alias rows for this company.
            parser_db_util.delete_source_company_name(cid)
            parser_db_util.delete_source_mainbeianhao(cid)
            parser_db_util.save_source_company_name(cid, base["shortName"],
                                                    12020)
            parser_db_util.save_source_company_name(cid, base["productName"],
                                                    12020)

            full = base["fullName"]
            if full is not None:
                parser_db_util.save_source_company_name(cid, full, 12010)
                main_name = name_helper.get_main_company_name(full)
                if main_name != full:
                    parser_db_util.save_source_company_name(cid, main_name,
                                                            12010)
            logger.info("source_company_id=%s", cid)

            artifacts = parse_artifact(item)
            flag = len(artifacts) > 0  # kept from original; currently unused
            artifacts.extend(base["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(cid, artifacts)

            parser_db_util.save_footprints(cid, parse_footprint(item))
            parseMember_save(cid, item, download_crawler)
            parser_db_util.update_processed(item["_id"])
        offset += 1000
        if not batch:
            break
    logger.info("itjuzi_company_parser end.")
if __name__ == '__main__': logger.info("Begin...") conn = db.connect_torndb() companies = conn.query("select id,fullname from company") for c in companies: aliases = conn.query( "select * from company_alias where companyId=%s and type=12010", c["id"]) for alias in aliases: name = alias["name"] new_name = name_helper.company_name_normalize(name) if name != new_name: logger.info("1. %s --- %s", name, new_name) update_company_alias(alias["id"], new_name) main_name = name_helper.get_main_company_name(new_name) if main_name != new_name: logger.info("2. %s --- %s", new_name, main_name) save_company_alias(c["id"], main_name) fullname = c["fullname"] if fullname is None or fullname.strip() == "": continue is_chinese, is_company = name_helper.name_check(fullname) if is_company: new_name = name_helper.company_name_normalize(fullname) if fullname != new_name: save_company_alias(c["id"], new_name) logger.info("3. %s --- %s", fullname, new_name) main_name = name_helper.get_main_company_name(new_name) if main_name != new_name:
logger.info("items : %s", len(items)) for item in items: # if item.has_key("processed") and item["processed"] is True: # continue try: logger.info(item) r = parse_company(item) # logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) for i in r: logger.info("%s - %s",i,r[i]) source_company_id = parser_db_util.save_company_standard(r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name(source_company_id, r["name"], 12020) parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010) main_company_name = name_helper.get_main_company_name(r["fullName"]) if main_company_name != r["fullName"]: parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010) logger.info("source_company_id=%s", source_company_id) artifacts = [] artifacts.extend(r["artifacts"]) logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_artifacts_standard(source_company_id, artifacts) # # # parser_db_util.delete_funding(source_company_id) flag = parseFinance_save(source_company_id, item, download_crawler) # flag = True
def process():
    """Parse queued xtecher pages into standard company records.

    Items whose parse yields 0 (no website and no company name) are marked
    processed and skipped.  When ``fakeName`` is present it is stored as a
    12020 alias instead of the full-name (12010) aliases.  The queue is
    drained via update_processed rather than by advancing the offset.
    """
    logger.info("xtecher_company_parser begin...")
    start = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in batch:
            parsed = parse_company(item)
            if parsed == 0:
                # Nothing usable on the page; retire the queue item.
                parser_db_util.update_processed(item["_id"])
                logger.info("missing website and companyName, processed %s",
                            item["url"])
                continue
            logger.info(json.dumps(parsed, ensure_ascii=False,
                                   cls=util.CJsonEncoder))

            cid = parser_db_util.save_company_standard(parsed,
                                                      download_crawler)

            # Rebuild alias rows for this company.
            parser_db_util.delete_source_company_name(cid)
            parser_db_util.delete_source_mainbeianhao(cid)
            parser_db_util.save_source_company_name(cid, parsed["name"],
                                                    12020)
            if 'fakeName' in parsed:
                parser_db_util.save_source_company_name(cid,
                                                        parsed["fakeName"],
                                                        12020)
            else:
                full = parsed["fullName"]
                parser_db_util.save_source_company_name(cid, full, 12010)
                main_name = name_helper.get_main_company_name(full)
                if main_name != full:
                    parser_db_util.save_source_company_name(cid, main_name,
                                                            12010)
            logger.info("source_company_id=%s", cid)

            artifacts = parse_artifact(cid, parsed)
            logger.info(json.dumps(artifacts, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(cid, artifacts)

            # parser_db_util.delete_funding(cid)
            # flag = parseFinance_save(cid, item, download_crawler)
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
        # start += 1000  # todo: offset intentionally not advanced
        if not batch:
            break
    logger.info("xtecher_company_parser end.")
def process():
    """Parse crawled 36kr company pages and persist everything into mongo.

    Pages through unprocessed items 1000 at a time.  Items whose parsed
    status is "INIT" are marked processed and skipped.  Otherwise the
    company record, its name aliases, artifacts, finance rounds and member
    groups are all written via parser_mongo_util.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_mongo_util.find_process_limit(SOURCE, TYPE, start,
                                                     1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                # Page held no real company data; retire the queue item.
                parser_mongo_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
                continue
            parser_mongo_util.save_mongo_company(r["source"], r["sourceId"],
                                                 r)
            # Alias types: 12020 appears to be short name, 12010 full
            # name — consistent with the other parsers in this file.
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["name"],
                    "type": 12020
                })
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["fullName"],
                    "type": 12010
                })
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_mongo_util.save_mongo_source_company_name(
                    r["source"], r["sourceId"], {
                        "name": main_company_name,
                        "type": 12010
                    })
            artifacts = parse_artifact(item)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False,
                           cls=util.CJsonEncoder))
            for artifact in artifacts:
                parser_mongo_util.save_mongo_source_artifact(
                    r["source"], r["sourceId"], artifact)
            # flag reports whether the finance rounds were complete.
            flag = parseFinance_save(r["source"], r["sourceId"], item,
                                     download_crawler)
            # Member groups are optional in the crawl payload; 5010/5030/
            # 5040 presumably mean founder/employee/former — TODO confirm.
            if item["content"].has_key("founders") and item["content"][
                    "founders"]["data"].has_key("data"):
                parseMember_save(r["source"], r["sourceId"], 5010,
                                 item["content"]["founders"]["data"]["data"],
                                 download_crawler)
            if item["content"].has_key("employees") and item["content"][
                    "employees"]["data"].has_key("data"):
                parseMember_save(r["source"], r["sourceId"], 5030,
                                 item["content"]["employees"]["data"]["data"],
                                 download_crawler)
            if item["content"].has_key("former_members") and item["content"][
                    "former_members"]["data"].has_key("data"):
                parseMember_save(
                    r["source"], r["sourceId"], 5040,
                    item["content"]["former_members"]["data"]["data"],
                    download_crawler)
            if flag:
                #parser_mongo_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                # Incomplete finance data: reset the mongo status so the
                # item can be picked up again.
                logger.info("lack somethin: %s", item["url"])
                parser_mongo_util.update_processStatus(r["source"],
                                                       r["sourceId"], 0)
            #break
        start += 1000
        if len(items) == 0:
            break
    logger.info("36kr_company_parser end.")
def process():
    """Parse crawled 36kr company pages and persist them into MySQL.

    Pages through unprocessed items 1000 at a time.  Per-item failures are
    logged (with traceback) and swallowed so one bad page cannot stall the
    whole queue; every item is marked processed afterwards either way.
    Items with no full name, no description and no artifacts are flagged
    inactive and retired early.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 83700389)]
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                # Rebuild alias rows for this company.
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                if (r["fullName"] is None or r["fullName"].strip() == "") \
                        and (r['description'] is None
                             or r['description'].strip() == "") \
                        and len(artifacts) == 0:
                    # Page is effectively empty: deactivate and retire it.
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s",
                                item["url"])
                    continue
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                parseMember_save(source_company_id, item, download_crawler)
                # parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                flag = True  # finance completeness deliberately ignored
            except Exception as e:
                # Fixed legacy `except Exception, E` syntax and log the
                # full traceback instead of just the message; keep the
                # best-effort swallow-and-continue behaviour.
                logger.exception(e)
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
        if len(items) == 0:
            break
def process():
    """Re-scan all 36kr items and save any company not yet in MySQL.

    Unlike the queue-draining variants, this walks every item via
    find_all_limit.  "INIT" parses are deactivated; otherwise the item is
    re-activated and, only when no source company exists yet for the key,
    the full record (aliases, artifacts, funding, members) is saved.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                # Empty page: mark the source key inactive and skip.
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                #parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
                continue
            parser_db_util.update_active(SOURCE, item["key"], None)
            sc = parser_db_util.get_source_company_by_source_and_sourceid(
                SOURCE, item["key"])
            if sc is None:
                # First sighting of this source key: persist everything.
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                # Member groups are optional in the crawl payload;
                # 5010/5030/5040 presumably mean founder/employee/former
                # — TODO confirm against parseMember_save.
                if item["content"].has_key("founders") and item["content"][
                        "founders"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5010,
                        item["content"]["founders"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("employees") and item["content"][
                        "employees"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5030,
                        item["content"]["employees"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("former_members") and item[
                        "content"]["former_members"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5040,
                        item["content"]["former_members"]["data"]["data"],
                        download_crawler)
            # if flag:
            #     parser_db_util.update_processed(item["_id"])
            #     logger.info("processed %s" ,item["url"])
            # else:
            #     logger.info("lack somethin: %s", item["url"])
            #break
        start += 1000
        if len(items) == 0:
            break
    logger.info("36kr_company_parser end.")
def process(sourceId=0):
    """Parse queued evervc pages into standard company records.

    When ``sourceId`` > 0 only that single item is processed (one loop
    iteration); otherwise the queue is drained in pages of 1000, relying
    on update_processed to shrink it.
    """
    logger.info("evervc_company_parser begin...")
    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start,
                                                      1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Rebuild alias rows for this company.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            # BUGFIX: the None/empty checks must short-circuit BEFORE the
            # len() comparison — the original evaluated
            # len(r["fullName"]) first and raised TypeError whenever
            # fullName was None.
            if r['fullName'] is None or r["fullName"] == '' \
                    or len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.delete_funding(source_company_id)  ##??
            flag = parseFinance_save(source_company_id, item, r['sourceId'],
                                     download_crawler)
            flag = True  # finance completeness deliberately ignored
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
        # start += 1000  # todo: offset intentionally not advanced
        if len(items) == 0 or sourceId > 0:
            break
    logger.info("evervc_company_parser end.")
def process():
    """Parse queued itjuzi pages and persist everything into mongo.

    Pages through unprocessed items 1000 at a time, saving the company
    record, its name aliases, artifacts and members via parser_mongo_util,
    then marks the item processed and updates its process status.
    """
    logger.info("itjuzi_company_parser begin...")
    start = 0
    while True:
        items = parser_mongo_util.find_process_limit(SOURCE, TYPE, start,
                                                     1000)
        for item in items:
            logger.info(item["url"])
            r = parse_base(item)
            if r is None:
                continue
            parser_mongo_util.save_mongo_company(r["source"], r["sourceId"],
                                                 r)
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["shortName"],
                    "type": 12020
                })
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["productName"],
                    "type": 12020
                })
            # CONSISTENCY FIX: full names were stored with type 12020 here,
            # while every other parser in this file stores full names as
            # 12010 and reserves 12020 for short/product names — almost
            # certainly a copy-paste slip.  Use 12010 for fullName and its
            # derived main name.
            parser_mongo_util.save_mongo_source_company_name(
                r["source"], r["sourceId"], {
                    "name": r["fullName"],
                    "type": 12010
                })
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_mongo_util.save_mongo_source_company_name(
                    r["source"], r["sourceId"], {
                        "name": main_company_name,
                        "type": 12010
                    })
            artifacts = parse_artifact(item)
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts, ensure_ascii=False,
                           cls=util.CJsonEncoder))
            for artifact in artifacts:
                parser_mongo_util.save_mongo_source_artifact(
                    r["source"], r["sourceId"], artifact)
            # TODO: footprints are not yet ported to the mongo pipeline.
            members = parse_member(item)
            for member in members:
                parser_mongo_util.save_mongo_source_company_member(
                    r["source"], r["sourceId"], member)
            parser_mongo_util.update_processed(item["_id"])
            parser_mongo_util.update_processStatus(r["source"],
                                                   r["sourceId"])
        start += 1000
        if len(items) == 0:
            break
    logger.info("itjuzi_company_parser end.")