def update_investor(investor, source_investor):
    """Refresh an investor row from source data and record the source name
    as an alias when it is not already known.

    :param investor: dict-like investor row (must contain "id" and the
        columns written below).
    :param source_investor: dict-like source row providing "id" and "name".

    Bug fix: the alias insert previously fired when a live alias with that
    name ALREADY existed (`is not None`), creating duplicate aliases and
    never adding missing ones; it now fires only when no live alias exists
    (matching `insert_investor_alias` elsewhere in this file).
    """
    conn = db.connect_torndb()
    investor_id = investor["id"]
    if replace(investor, source_investor):
        logger.info("Update investor : %d with source_investor: %d ",
                    investor_id, source_investor["id"])
        sql = "update investor set name=%s,website=%s,domain=%s,description=%s,logo=%s,stage=%s,\
            field=%s,type=%s,modifyTime=now() where id=%s"
        conn.update(sql, investor["name"], investor["website"], investor["domain"],
                    investor["description"], investor["logo"], investor["stage"],
                    investor["field"], investor["type"], investor_id)
    else:
        logger.info("Not update investor : %d with source_investor: %d ",
                    investor_id, source_investor["id"])
    # insert investor_alias (only when the name is not already a live alias)
    investor_alias = conn.get(
        "select * from investor_alias where name=%s and (active is null or active='Y') limit 1",
        source_investor["name"])
    if investor_alias is None:  # was `is not None` -- inverted condition
        chinese, is_company = name_helper.name_check(investor["name"])
        # 12010 = company-style name, 12020 = other alias (file convention)
        if is_company:
            type = 12010
        else:
            type = 12020
        sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
        logger.info("Add new investor alias: %s for %s", source_investor["name"], investor["id"])
        conn.insert(sql, investor["id"], source_investor["name"], type)
    conn.close()
def process_corporate():
    """Backfill a type-12010 (registered Chinese company name) alias for
    every corporate whose fullName passes name_helper.name_check.

    Pages through `corporate` in id order, 1000 rows per batch, using the
    highest id seen as a cursor.

    Bug fix: the cursor was previously advanced only AFTER the name checks,
    so a batch in which every row hit `continue` (empty or non-Chinese
    fullName) never moved the cursor and the same 1000 rows were re-fetched
    forever. The cursor now advances for every row before any filtering.
    Also renamed the cursor so it no longer shadows the builtin `id`.
    """
    last_id = -1
    conn = db.connect_torndb()
    while True:
        cs = conn.query(
            "select * from corporate where id>%s order by id limit 1000", last_id)
        if len(cs) == 0:
            break
        for c in cs:
            corporate_id = c["id"]
            # Advance the pagination cursor unconditionally (see docstring).
            if corporate_id > last_id:
                last_id = corporate_id
            fullname = c["fullName"]
            if fullname is None or fullname.strip() == "":
                continue
            fullname = fullname.strip()
            chinese, iscompany = name_helper.name_check(fullname)
            if chinese is False or iscompany is False:
                continue
            alias = conn.get(
                "select * from corporate_alias where corporateId=%s and name=%s",
                corporate_id, fullname)
            if alias is None:
                logger.info(fullname)
                conn.insert(
                    "insert corporate_alias(corporateId,name,type,createTime,modifyTime) values(%s,%s,12010,now(),now())",
                    corporate_id, fullname)
    conn.close()
def patch_company_alias():
    """Re-type company_alias rows in bulk.

    Rows whose name looks like a full Chinese company name are treated as
    type 12010 and deactivated (active='N', modifyUser=139); every other
    row is forced to type 12020 (this also resolves NULL types).
    """
    conn = db.connect_torndb()
    cursor = -1
    while True:
        batch = conn.query(
            "select * from company_alias where id>%s order by id limit 1000", cursor)
        if not batch:
            break
        for row in batch:
            row_id = row["id"]
            logger.info(row["name"])
            if row_id > cursor:
                cursor = row_id
            is_chinese, is_company = name_helper.name_check(row["name"])
            if is_chinese and is_company:
                alias_type = 12010
            else:
                alias_type = row["type"]
                if alias_type is None:
                    alias_type = 12020
            if alias_type == 12010:
                # Full registered names are retired from company_alias.
                conn.update(
                    "update company_alias set active='N', modifyUser=139 where id=%s",
                    row_id)
            else:
                conn.update(
                    "update company_alias set type=12020 where id=%s", row_id)
    conn.close()
def process_alias():
    """Copy company_alias rows up to the corporate level.

    Walks company_alias in id order (1000-row batches). For each alias whose
    company is linked to a corporate and whose name is not already a
    corporate_alias, inserts a mirrored corporate_alias row (upgrading the
    type to 12010 when the name looks like a registered Chinese company
    name). Finally normalizes any remaining NULL types to 12020.
    """
    # Pagination cursor: highest company_alias id processed so far.
    id = -1
    conn = db.connect_torndb()
    while True:
        cas = conn.query(
            "select * from company_alias where id>%s order by id limit 1000", id)
        if len(cas) == 0:
            break
        for ca in cas:
            company_alias_id = ca["id"]
            logger.info(ca["name"])
            if company_alias_id > id:
                id = company_alias_id
            c = conn.get("select * from company where id=%s", ca["companyId"])
            corporate_id = c["corporateId"]
            if corporate_id is None:
                # Company not yet linked to a corporate; nothing to mirror.
                continue
            # Skip names that are already aliases of this corporate.
            cpa = conn.get(
                "select * from corporate_alias where corporateId=%s and name=%s limit 1",
                corporate_id, ca["name"])
            if cpa is not None:
                continue
            type = ca["type"]
            chinese, company = name_helper.name_check(ca["name"])
            if chinese and company:
                # Looks like a registered Chinese company name -> 12010.
                type = 12010
            conn.insert(
                "insert corporate_alias("
                "corporateId, name, type, verify, active, createTime, modifyTime,createUser,modifyUser,confidence)"
                "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                corporate_id, ca["name"], type, ca["verify"], ca["active"],
                ca["createTime"], ca["modifyTime"], ca["createUser"],
                ca["modifyUser"], ca["confidence"]
            )
    # Cleanup pass: rows copied with a NULL type default to 12020.
    # NOTE(review): runs once after the whole scan, not per batch — confirm
    # against the original formatting if behavior matters.
    conn.update("update corporate_alias set type=12020 where type is null")
    conn.close()
def add_corporate_alias():
    """Feed every live, gongshang-checked, Chinese corporate alias to
    save_names (category 3).

    Pages through corporate_alias in id order, 2000 rows at a time; each
    batch (possibly empty) is handed to save_names before the next fetch.
    """
    conn = db.connect_torndb()
    cursor = 0
    while True:
        rows = conn.query("select * from corporate_alias where "
                          "(active is null or active='Y') and gongshangCheckTime is not null and id>%s"
                          " order by id limit 2000", cursor)
        batch = []
        for row in rows:
            if row["id"] > cursor:
                cursor = row["id"]
            alias_name = row["name"]
            if alias_name is None or alias_name == "":
                continue
            is_chinese, _is_company = name_helper.name_check(alias_name)
            if is_chinese is not True:
                continue
            batch.append({"name": alias_name,
                          "lastCheckTime": row["gongshangCheckTime"],
                          "corporateId": int(row["corporateId"])})
        save_names(batch, 3)
        if len(rows) == 0:
            break
    conn.close()
def count_company_names(apps, item_of_name):
    """Count the DISTINCT company-like names found in a list of app records.

    :param apps: iterable of dict-like records.
    :param item_of_name: key under which each record stores its name.
    :returns: number of distinct names that name_helper.name_check flags
        as company names. Records missing the key, or whose name fails the
        check, are ignored.

    Cleanup: uses a set instead of a dict-of-1s and drops the non-idiomatic
    `== True` comparison; behavior is unchanged for truthy/falsy results.
    """
    names = set()
    for app in apps:
        company_name = app.get(item_of_name)
        if company_name is not None:
            _ischinese, is_company = name_helper.name_check(company_name)
            if is_company:
                names.add(company_name)
    return len(names)
def find_company_candidate(name, fullname):
    """Collect active candidate companies matching a short name and/or a
    full registered name, searching both `company` and `company_alias`.

    :param name: short/display name to match against company.name and
        company_alias.name.
    :param fullname: full registered name (may be u""); when empty and
        `name` itself looks like a Chinese company name, `name` is used.
    :returns: de-duplicated list of company rows.

    Improvements: the alias-lookup logic (identical for both passes) is
    extracted into a helper, and a missing company row behind an alias no
    longer raises (the original indexed `company['active']` on None).
    """
    conn = db.connect_torndb()
    candidate_companies = []

    def _add(company):
        # De-duplicate via is_exist before appending.
        if not is_exist(candidate_companies, company):
            candidate_companies.append(company)

    def _add_alias_matches(alias_name):
        # Companies reached through an active alias of the given name.
        cas = conn.query(
            "select * from company_alias where name=%s and (active is null or active='Y')",
            alias_name)
        for ca in cas:
            company = conn.get("select * from company where id=%s", ca["companyId"])
            # Guard: the alias may reference a deleted/missing company row.
            if company is not None and company['active'] != 'N':
                _add(company)

    # Pass 1: match by short name (direct + via alias).
    for c in conn.query(
            "select * from company where name=%s and (active is null or active='Y')",
            name):
        _add(c)
    _add_alias_matches(name)

    # Fall back to `name` as the fullname when it looks like a Chinese company.
    if fullname == u"":
        isCN, isCompany = name_helper.name_check(name)
        if isCN and isCompany:
            fullname = name

    # Pass 2: match by full registered name (direct + via alias).
    if fullname != u"":
        for c in conn.query(
                "select * from company where fullname=%s and (active is null or active='Y')",
                fullname):
            _add(c)
        _add_alias_matches(fullname)

    conn.close()
    return candidate_companies
def start_run(concurrent_num):
    """Endless driver loop for the gongshang (business-registry) checker.

    Repeatedly loads up to 1000 unchecked type-12010 company aliases,
    queues the ones that look like Chinese company names onto the global
    COMPANIES list, and fans out `concurrent_num` gevent workers running
    query_goshang to drain it. Sleeps 10 minutes when nothing is pending.

    :param concurrent_num: number of gevent greenlets to spawn per batch.

    NOTE(review): COMPANIES is a module-level queue shared with
    query_goshang; assumed the workers consume/clear it — confirm, since
    this loop only ever appends to it.
    """
    logger.info("Company gongshang start...")
    while True:
        conn = db.connect_torndb()
        company_aliases = conn.query(
            "select * from company_alias where type=12010 and gongshangCheckTime is null order by id desc limit 1000"
        )
        conn.close()
        for alias in company_aliases:
            company_name = alias["name"]
            #NAME CHECK: only Chinese company-style names are worth querying.
            chinese, is_company = name_helper.name_check(company_name)
            if chinese and is_company:
                COMPANIES.append(alias)
        #logger.info(json.dumps(COMPANIES, ensure_ascii=False, cls=util.CJsonEncoder))
        if len(COMPANIES) > 0:
            # Fan out workers; each greenlet runs query_goshang until done.
            threads = [
                gevent.spawn(query_goshang) for i in xrange(concurrent_num)
            ]
            gevent.joinall(threads)
        else:
            # Nothing pending: back off for 10 minutes before re-polling.
            logger.info("Company gongshang end.")
            gevent.sleep(10 * 60)
            logger.info("Company gongshang start...")
def find_from_gongshang(name):
    """Look up a company in the mongo gongshang collection and enqueue the
    companies it invests in (and its corporate investors) via
    add_2_company_list.

    :param name: raw company name; normalized before lookup. Returns early
        when normalization fails or the name is not a Chinese company name.

    Robustness fix: the original indexed gs["investors"] unguarded (KeyError
    on records without that field) while guarding "invests" with has_key;
    both now use .get() with an empty default, which also replaces the
    Python-2-only has_key call.
    """
    name = name_helper.company_name_normalize(name)
    if name is None:
        return
    chinese, company = name_helper.name_check(name)
    if chinese is True and company is True:
        gs = mongo.info.gongshang.find_one({"name": name})
        if gs is not None:
            # Corporate investors of this company.
            for investor in gs.get("investors", []):
                if investor["type"] == u"企业投资":
                    logger.info("gongshang name: %s", investor["name"])
                    add_2_company_list(investor["name"])
            # Companies this company invests in.
            for invest in gs.get("invests", []):
                add_2_company_list(invest["name"])
def add_2_company_list(name):
    """Register a normalized Chinese company fullname in the mongo
    company_idx collection, keyed by its md5, unless already present."""
    normalized = name_helper.company_name_normalize(name)
    if normalized is None:
        return
    is_chinese, is_company = name_helper.name_check(normalized)
    if not (is_chinese is True and is_company is True):
        return
    logger.info("fullname: %s", normalized)
    digest = util.md5str(normalized)
    # Insert only when no record with this md5 exists yet.
    if mongo.info.company_idx.find_one({"name_md5": digest}) is None:
        mongo.info.company_idx.insert_one({
            "name": normalized,
            "name_md5": digest,
            "createTime": datetime.datetime.utcnow(),
        })
def patch_corporate_fullname_new(corporate_id):
    """Try to repair a corporate's fullName from its aliases.

    A corporate needs patching when its fullName is empty, is not a Chinese
    company name, or has no matching gongshang record. The first alias with
    a gongshang record becomes the new fullName; failing that, the first
    alias of any kind is used (only when the current fullName is empty).

    :param corporate_id: primary key of the corporate row.
    :returns: True if fullName was updated, else False.
    """
    flag = False
    conn = db.connect_torndb()
    corporate1 = conn.get("select * from corporate where id=%s", corporate_id)
    # Decide whether the current fullName is trustworthy.
    patch = False
    if corporate1["fullName"] is None or corporate1["fullName"].strip() == "":
        patch = True
    else:
        chinese, iscompany = name_helper.name_check(corporate1["fullName"])
        if chinese is False:
            patch = True
        elif iscompany is False:
            patch = True
    if patch is False:
        # Even a well-formed name is suspect without a gongshang record.
        gs = gongshang.find_one({"name": corporate1["fullName"]})
        if gs is None:
            patch = True
    if patch:
        logger.info("patch: %s, %s", corporate1, corporate1["id"])
        aliases = conn.query(
            "select * from corporate_alias where corporateId=%s",
            corporate1["id"])
        # Prefer an alias that gongshang actually knows about.
        for alias in aliases:
            company_name = alias["name"]
            gs = gongshang.find_one({"name": company_name})
            if gs:
                logger.info("fullname: %s", company_name)
                conn.update("update corporate set fullName=%s where id=%s",
                            company_name, corporate_id)
                flag = True
                break
    if flag is False:
        # Last resort: any alias, but only if fullName is currently empty.
        # NOTE(review): `aliases` is only bound inside the `patch` branch;
        # when patch is False the fullName is non-empty so this loop is
        # never reached — confirm that invariant holds.
        if corporate1["fullName"] is None or corporate1["fullName"].strip(
        ) == "":
            for alias in aliases:
                company_name = alias["name"]
                logger.info("fullname: %s", company_name)
                conn.update("update corporate set fullName=%s where id=%s",
                            company_name, corporate_id)
                flag = True
                break
    conn.close()
    return flag
def begin():
    """Fetch one batch of stale type-12010 company aliases and fire
    gongshang crawl requests for the company-like ones.

    Aliases are stale when never checked or last checked over 30 days ago.
    Non-company names just get their check time bumped via update_time.
    The function returns after the first batch that produced at least one
    request; an empty batch sleeps 60s and polls again.

    NOTE(review): `global total` is declared but `total` is not touched in
    this body — presumably maintained elsewhere; confirm before removing.
    """
    global total
    NUM = 100  # batch size per poll
    while True:
        has_request = False
        conn = db.connect_torndb()
        company_aliases = conn.query(
            "select * from company_alias where type=12010 and "
            "(gongshangCheckTime is null or gongshangCheckTime < date_sub(now(),interval 30 day)) "
            "order by id desc limit %s", NUM)
        conn.close()
        if len(company_aliases) <= 0:
            # Nothing stale right now: wait a minute and re-poll.
            logger.info("Finish.")
            time.sleep(60)
            logger.info("Start...")
            continue
        for alias in company_aliases:
            company_name = alias["name"]
            #NAME CHECK: only Chinese company-style names are crawled.
            chinese, is_company = name_helper.name_check(company_name)
            if chinese and is_company:
                logger.info(company_name)
                first_request(company_name, first=True)
                has_request = True
            else:
                # Not crawlable: just refresh its check timestamp.
                update_time(alias["id"])
        if has_request:
            break
def run():
    """Null out fullName (and demote matching 12010 name rows to 12020) for
    every source=13821 source_company whose fullname fails the company-name
    check."""
    conn = db.connect_torndb()
    sql = '''select name,fullname,sourceid,id from source_company where source=13821 '''
    results = conn.query(sql)
    # TODO
    conn.close()
    for row in results:
        fullname = row['fullname']
        # Skip rows with no fullname or whose fullname passes the check.
        if fullname is None:
            continue
        if name_helper.name_check(fullname)[1] == True:
            continue
        logger.info('%s not company', fullname)
        # Fresh connection per fix-up, mirroring the read phase above.
        conn = db.connect_torndb()
        conn.update(
            '''UPDATE source_company SET fullName=null where id = %s''',
            row['id'])
        conn.update(
            'UPDATE source_company_name SET type=12020 where sourcecompanyid = %s and type=12010 and name=%s',
            row['id'], fullname)
        conn.close()
def insert_investor_alias(investorId, selected_investorId):
    """Copy every live alias of `investorId` onto `selected_investorId`,
    skipping names the target investor already has as a live alias."""
    conn = db.connect_torndb()
    source_aliases = conn.query(
        "select * from investor_alias where investorId=%s and (active is null or active!='N')",
        investorId)
    for alias in source_aliases:
        existing = conn.get(
            "select * from investor_alias where name=%s and investorId=%s and "
            "(active is null or active!='N') limit 1",
            alias["name"], selected_investorId)
        if existing is not None:
            continue
        chinese, is_company = name_helper.name_check(alias["name"])
        # 12010 = company-style name, 12020 = other alias (file convention).
        if is_company:
            alias_type = 12010
        else:
            alias_type = 12020
        logger.info("Add new investor alias: %s for %s",
                    alias["name"], selected_investorId)
        conn.insert(
            "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())",
            selected_investorId, alias["name"], alias_type)
    conn.close()
def process_corporate_alias():
    """Promote corporate aliases that look like real Chinese company names
    from type 12020/NULL to type 12010.

    Pages through corporate_alias in id order, 1000 rows per batch.
    """
    conn = db.connect_torndb()
    cursor = -1
    while True:
        batch = conn.query(
            "select * from corporate_alias where id>%s and (type=12020 or type is null) order by id limit 1000",
            cursor)
        if not batch:
            break
        for row in batch:
            if row["id"] > cursor:
                cursor = row["id"]
            name = row["name"].strip()
            # Very short strings cannot be full registered company names.
            if len(name) < 6:
                continue
            chinese, iscompany = name_helper.name_check(name)
            if chinese is False or iscompany is False:
                continue
            logger.info("%s, %s", row["createTime"], name)
            conn.update("update corporate_alias set type=12010 where id=%s",
                        row["id"])
    conn.close()
def save_company_name(app, item_of_name, source, sourceId):
    """Persist a normalized company name for (source, sourceId) via
    save_mongo_source_company_name, unless the normalized name is already
    recorded for that source company."""
    raw_name = app[item_of_name]
    if raw_name is None or raw_name.strip() == "":
        return
    normalized = name_helper.company_name_normalize(raw_name)
    existing = collection_source_company.find_one({
        "source": source,
        "sourceId": sourceId,
        "source_company_name.name": normalized,
    })
    if existing is not None:
        return
    # Chinese-ness is judged on the RAW name, matching the original code.
    chinese, _company = name_helper.name_check(app[item_of_name])
    record = {
        "name": normalized,
        "chinese": "Y" if chinese is True else "N",
        "type": 12010,
        "extended": 'Y',
    }
    save_mongo_source_company_name(source, sourceId, record)
c["id"]) for alias in aliases: name = alias["name"] new_name = name_helper.company_name_normalize(name) if name != new_name: logger.info("1. %s --- %s", name, new_name) update_company_alias(alias["id"], new_name) main_name = name_helper.get_main_company_name(new_name) if main_name != new_name: logger.info("2. %s --- %s", new_name, main_name) save_company_alias(c["id"], main_name) fullname = c["fullname"] if fullname is None or fullname.strip() == "": continue is_chinese, is_company = name_helper.name_check(fullname) if is_company: new_name = name_helper.company_name_normalize(fullname) if fullname != new_name: save_company_alias(c["id"], new_name) logger.info("3. %s --- %s", fullname, new_name) main_name = name_helper.get_main_company_name(new_name) if main_name != new_name: save_company_alias(c["id"], main_name) logger.info("4. %s --- %s", new_name, main_name) if main_name != fullname: update_company_fullname(c["id"], main_name) conn.close() logger.info("End.")
def expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler, test=False):
    """Iteratively expand a source company's artifacts and names.

    Runs up to 5 rounds. Each round pulls the still-unexpanded company
    names, main ICP record numbers (beianhao) and artifacts for
    (source, sourceId), then cross-references them against:
      A) ICP beian records (by company name, by domain, by main beianhao),
      B) the iTunes app collection (by artifact trackId, seller name,
         seller/support domain),
      C) the Android app collection (by apkname, author, website/apkname
         domain),
    saving any newly discovered artifacts / company names / beianhaos back,
    so the next round can expand from them. Finally website artifacts get
    their meta fetched (404s deactivated, redirects followed) and all
    processed items are marked expanded.

    :param source, sourceId: identify the source company.
    :param beian_links_crawler, icp_chinaz_crawler: ICP lookup crawlers.
    :param screenshot_crawler: unused here (screenshot calls commented out).
    :param test: when True, skips all external crawler calls.
    """
    logger.info("source: %s, sourceId: %s Start expand!!!", source, sourceId)
    logger.info("clean old expanded data")
    expand_clean(source, sourceId)
    sourcecompany = collection_source_company.find_one({"source": source, "sourceId": sourceId})
    # exit()
    # Seed: make sure the normalized fullName itself is registered as a
    # source company name (type 12010) before expansion starts.
    company_fullname = sourcecompany["source_company"]["fullName"]
    if company_fullname is not None and company_fullname.strip() != "":
        company_fullname = name_helper.company_name_normalize(company_fullname)
        scnames = sourcecompany["source_company_name"]
        check_fullname = False
        for scname in scnames:
            if scname["name"] == company_fullname:
                check_fullname = True
                break
        if check_fullname is False:
            (chinese, company) = name_helper.name_check(company_fullname)
            if chinese is True:
                chinese_type = "Y"
            else:
                chinese_type = "N"
            scname_data = {
                "name": company_fullname,
                "chinese": chinese_type,
                "type": 12010,
            }
            save_mongo_source_company_name(source, sourceId, scname_data)
    round = 1
    while True:
        # Hard cap at 5 rounds to guarantee termination.
        if round >= 6:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break
        source_company_names = find_mongo_data(collection_source_company, "source_company_name", source, sourceId)
        main_beianhaos = find_mongo_data(collection_source_company, "source_mainbeianhao", source, sourceId)
        artifacts = find_mongo_data(collection_source_company, "source_artifact", source, sourceId)
        logger.info(json.dumps(source_company_names, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(main_beianhaos, ensure_ascii=False, cls=util.CJsonEncoder))
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
        # Check if there are new stuff which need to do expansion
        if len(source_company_names) == 0 and len(artifacts) == 0 and len(main_beianhaos) == 0:
            collection_source_company.update_one(
                {"_id": sourcecompany["_id"]},
                {'$set': {"scexpanded": True, "modifyTime": datetime.datetime.now()}})
            break
        logger.info("source: %s, sourceId: %s expand for round %d", source, sourceId, round)
        # Step A/1: beian (ICP registration) lookup by company name.
        logger.info("source: %s, sourceId: %s 按公司名备案查询", source, sourceId)
        for source_company_name in source_company_names:
            # Only check chinese company name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            if source_company_name["chinese"] is None:
                (chinese, companyName) = name_helper.name_check(source_company_name["name"])
            else:
                chinese = source_company_name["chinese"]
            if chinese != "Y":
                continue
            check_name = list(collection_beian.find({"organizer": source_company_name["name"]}))
            # Case that one company_name has multiple beian#: 上海汇翼 ->
            # (today.ai/teambition.com). If only one is found in
            # Mongo.beian(organizer) it is fine.
            if len(check_name) == 0:
                if test:
                    items_beianlinks = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_company_name(source_company_name["name"])
                save_collection_beian(collection_beian, items_beianlinks)  # insert infos into Mongo.beian
            else:
                items_beianlinks = check_name
            save_beian_artifacts(items_beianlinks, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_beianlinks, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_beianlinks, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian: discovers more artifacts (websites), company names and main beianhaos.
        # Step A/2: beian lookup by domain.
        logger.info("source: %s, sourceId: %s 按domian备案查询", source, sourceId)
        for artifact in artifacts:
            # Only check if artifact is a website
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                link = url_helper.url_normalize(artifact["link"])
                (flag, domain) = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_domain = list(collection_beian.find({"domain": domain}))
            if len(check_domain) == 0:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_domain(domain)
                    items_icpchinaz = icp_chinaz_crawler.query_by_domain(domain)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
            else:
                items_merge = check_domain
            # filter by check domain to avoid the sinaapp.cn case
            items_merge = filter_domain(items_merge, domain)
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # beian: discovers more artifacts (websites), company names and main beianhaos.
        # Step A/3: beian lookup by main beianhao (primary ICP record number).
        logger.info("source: %s, sourceId: %s 按主备案号查询", source, sourceId)
        for main_beianhao in main_beianhaos:
            mainBeianhao = main_beianhao["mainBeianhao"]
            check_mainBeianhao = collection_main_beianhao.find_one({"mainBeianhao": mainBeianhao})
            if check_mainBeianhao is None:
                if test:
                    items_merge = []
                else:
                    items_beianlinks = beian_links_crawler.query_by_main_beianhao(mainBeianhao)
                    items_icpchinaz = icp_chinaz_crawler.query_by_main_beianhao(mainBeianhao)
                    items_merge = merge_beian(items_beianlinks, items_icpchinaz)
                save_collection_beian(collection_beian, items_merge)  # insert infos into Mongo.beian
                # if mainBeianhao could be found in two links
                if len(items_merge) > 0:
                    items_main_beianhao = [{"mainBeianhao": mainBeianhao}]
                    save_collection_mainBeianhao(collection_main_beianhao, items_main_beianhao)  # insert mainBeianhao into Mongo.main_beianhao
            else:
                items_merge = list(collection_beian.find({"mainBeianhao": mainBeianhao}))
            save_beian_artifacts(items_merge, source, sourceId)  # insert website/homepage into Mysql.source_artifact
            save_beian_company_names(items_merge, source, sourceId)  # insert organizer into Mysql.source_company_names
            save_beian_mainbeianhaos(items_merge, source, sourceId)  # insert mainBeianhao into Mysql.source_mainbeiahao
        # Discovers more artifacts (websites) and company names.
        # --- iTunes expansion ---
        # Step B/1: resolve existing iTunes artifacts.
        logger.info("source: %s, sourceId: %s 查询itunes artifact", source, sourceId)
        itunes_company_enames = {}
        app_by_name = {}
        for artifact in artifacts:
            if artifact["type"] != 4040:
                continue
            # Get trackid
            trackid = None
            if artifact["domain"] is None:
                (apptype, appmarket, trackid) = url_helper.get_market(artifact["link"])
                if apptype != 4040:
                    continue
            else:
                try:
                    trackid = int(artifact["domain"])
                except:
                    pass
            if trackid is not None:
                app = collection_itunes.find_one({"trackId": trackid})
                if app is None:
                    # mark it as Not-active
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_itunes(app, artifact, source, sourceId)  # exists: copy from mongo.itunes
                    if app.has_key("offline") and app["offline"] is True:
                        set_artifact_active(artifact, "Offline", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                    english, is_company = name_helper.english_name_check(app["sellerName"])
                    if english and is_company:
                        # NOTE(review): constant key "sellerName" means this dict
                        # holds at most one entry regardless of how many distinct
                        # seller names were seen — confirm that is intended.
                        itunes_company_enames["sellerName"] = 1
                        app_by_name = app
            else:
                set_artifact_active(artifact, "N", source, sourceId)
        # save the only english name
        if len(itunes_company_enames) == 1:
            company_name = collection_source_company.find_one(
                {"source": source, "sourceId": sourceId,
                 "source_company_name": {"$elemMatch": {"type": 12010, "chinese": "N"}}})
            if company_name is None:
                save_company_name(app_by_name, "sellerName", source, sourceId)
        # Step B/2: find more iTunes artifacts by company (seller) name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的itunes artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            '''
            check_itunes_producers = list(collection_itunes.find({"developer": source_company_name["name"]}))
            if len(check_itunes_producers) > 0:
                for app in check_itunes_producers:
                    # Check if itunesId is already existed in artifacts
                    if find_itunesId(app["trackId"], source_company_id):
                        pass
                    else:
                        source_artifact_id = save_itunes_artifact(app, source_company_id)
                        #save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        save_company_name(app, "developer", source_company_id)
            '''
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            check_itunes_sellers = list(collection_itunes.find({"sellerName": source_company_name["name"]}))
            if len(check_itunes_sellers) > 0:
                '''
                domains = {}
                for app in check_itunes_sellers:
                    sellerUrl = app.get("sellerUrl")
                    flag ,domain = url_helper.get_domain(sellerUrl)
                    if flag is not None and domain is not None:
                        domains[domain] = 1
                '''
                lens_domain = count_domains(check_itunes_sellers, "sellerUrl")
                artifact_status = check_source_artifact(source, sourceId)
                for app in check_itunes_sellers:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if app.has_key("sellerUrl"):
                        # if find_link(app["sellerUrl"], source_company_id) or check_source_artifact(source_company_id):
                        if artifact_status:
                            pass
                        elif lens_domain == 1:
                            # Only adopt the seller URL when all apps agree on one domain.
                            artifact_id = save_itunesSellerUrl_artifact(app, source, sourceId)
                            if artifact_id is not None:
                                artifact_status = True
                    # comment due to incorrect expand
                    '''
                    if app.has_key("supportUrl"):
                        if find_link(app["supportUrl"], source_company_id):
                            pass
                        else:
                            save_itunesSupportUrl_artifact(app, source_company_id)
                    '''
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    # save_company_name(app, "sellerName", source_company_id)
        # Step B/3: find more iTunes artifacts by website domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的itunes artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            # Skip shared/app-store domains on the exclusion list.
            if domain in itunesDomainEx:
                continue
            check_itunes_sellerDomains = list(collection_itunes.find({"sellerDomain": domain}))
            if len(check_itunes_sellerDomains) > 0:
                lens_company_names = count_company_names(check_itunes_sellerDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_sellerDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
            check_itunes_supportDomains = list(collection_itunes.find({"supportDomain": domain}))
            # Threshold guards against huge shared support domains.
            if len(check_itunes_supportDomains) > 0 and len(check_itunes_supportDomains) < 100:
                lens_company_names = count_company_names(check_itunes_supportDomains, "sellerName")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_itunes_supportDomains:
                    # Check if itunesId is already existed in all artifacts in 1 sourceCompanyId
                    if find_itunesId(app["trackId"], source, sourceId):
                        pass
                    else:
                        save_itunes_artifact(app, source, sourceId)
                    # save_artifact_itunes_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["sellerName"])
                        if chinese and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
                        english, is_company = name_helper.english_name_check(app["sellerName"])
                        if english and is_company:
                            save_company_name(app, "sellerName", source, sourceId)
                            company_name_status = True
        # Discovers more artifacts (websites) and company names; existence is
        # checked against source_artifact and source_company_name first.
        # --- Android expansion ---
        # Step C/1: resolve existing Android artifacts.
        logger.info("source: %s, sourceId: %s 查询android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4050:
                continue
            # Get apkname
            apkname = None
            if artifact["domain"] is None:
                (apptype, appmarket, appid) = url_helper.get_market(artifact["link"])
                # Get apkname of baidu and 360 from android market
                if apptype != 4050:
                    continue
                if appmarket == 16010 or appmarket == 16020:
                    android_app = collection_android_market.find_one({"appmarket": appmarket, "key_int": appid})
                    if android_app:
                        apkname = android_app["apkname"]
                else:
                    apkname = appid
            else:
                apkname = artifact["domain"]
            if apkname is not None:
                app = collection_android.find_one({"apkname": apkname})
                if app is None:
                    # mark it as Not-active
                    set_artifact_active(artifact, "N", source, sourceId)
                else:
                    copy_from_android(app, artifact, source, sourceId)  # exists: copy from mongo.android
                    set_artifact_active(artifact, "Y", source, sourceId)
                    # chinese, is_company = name_helper.name_check(app["author"])
                    # if is_company:
                    #     save_company_name(app, "author", source_company_id)
            else:
                set_artifact_active(artifact, "N", source, sourceId)
        # Step C/2: find more Android artifacts by company (author) name.
        logger.info("source: %s, sourceId: %s 根据公司名查询更多的android artifact", source, sourceId)
        for source_company_name in source_company_names:
            # producer name
            if source_company_name["name"] is None or source_company_name["name"].strip() == "":
                continue
            check_android_authors = list(collection_android.find({"author": source_company_name["name"]}))
            # Upper bound guards against generic/shared author names.
            if len(check_android_authors) > 0 and len(check_android_authors) < 200:
                lens_domain = count_domains(check_android_authors, "website")
                artifact_status = check_source_artifact(source, sourceId)
                # check if author is consistent
                for app in check_android_authors:
                    # Check if AnId have one 4010
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    if artifact_status:
                        pass
                    elif lens_domain == 1:
                        # Only adopt the website when all apps agree on one domain.
                        artifact_id = save_androidWebsite_artifact(app, source, sourceId)
                        if artifact_id is not None:
                            artifact_status = True
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    # save_company_name(app, "author", source_company_id)
        # Step C/3: find more Android artifacts by website domain.
        logger.info("source: %s, sourceId: %s 根据域名查询更多的android artifact", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["domain"] is None:
                (flag, domain) = url_helper.get_domain(artifact["link"])
                if flag is None:
                    continue
                if flag is False:
                    continue
            else:
                domain = artifact["domain"]
            if domain is None or domain.strip() == "":
                continue
            check_android_websiteDomains = list(collection_android.find({"website_domain": domain}))
            if len(check_android_websiteDomains) > 0:
                lens_company_names = count_company_names(check_android_websiteDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_websiteDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
            check_android_apknameDomains = list(collection_android.find({"apkname_domain": domain}))
            # add threshold to avoid case: domain: com.wowotuan
            if len(check_android_apknameDomains) > 0 and len(check_android_apknameDomains) < 100:
                lens_company_names = count_company_names(check_android_apknameDomains, "author")
                company_name_status = check_source_company_name(source, sourceId)
                for app in check_android_apknameDomains:
                    # Check if AndroidId is already existed in artifacts
                    if find_androidAppname(app["apkname"], source, sourceId):
                        pass
                    else:
                        save_android_artifact(app, source, sourceId)
                    # save_artifact_android_rel(app["_id"], source_artifact_id)
                    if company_name_status:
                        pass
                    elif lens_company_names == 1:
                        chinese, is_company = name_helper.name_check(app["author"])
                        if is_company:
                            save_company_name(app, "author", source, sourceId)
                            company_name_status = True
        # Discovers more artifacts (websites) and company names.
        # Former names: TODO
        # --- Clean website artifacts ---
        # Fetch meta info, flag unreachable websites, handle redirects.
        logger.info("source: %s, sourceId: %s website meta", source, sourceId)
        for artifact in artifacts:
            if artifact["type"] != 4010:
                continue
            if artifact["link"] is None or artifact["link"].strip() == "":
                # set_active("source_artifact", "N", artifact["id"])
                set_artifact_active(artifact, "N", source, sourceId)
                continue
            url = artifact["link"].strip()
            meta = collection_website.find_one({"url": url})
            if meta is None or meta["httpcode"] == 404:
                # No cached meta (or cached as dead): fetch it live.
                meta = website.get_meta_info(url)
                if meta:
                    websiteId = save_collection_website(collection_website, meta)
                    if websiteId is not None and not test:
                        #screenshot_wesbite(collection_website, websiteId, screenshot_crawler)
                        pass
                else:
                    # Fetch failed: cache a 404 stub and deactivate the artifact.
                    meta = {
                        "url": artifact["link"],
                        "httpcode": 404
                    }
                    websiteId = save_collection_website(collection_website, meta)
                    set_artifact_active(artifact, "N", source, sourceId)
            if meta:
                # Redirect handling.
                # logger.info(meta)
                if meta["httpcode"] == 200:
                    redirect_url = meta.get("redirect_url")
                    if artifact["link"] != redirect_url:
                        url = url_helper.url_normalize(meta["redirect_url"])
                        (flag_new, domain_new) = url_helper.get_domain(url)
                        meta_new = {
                            "url": url,
                            "domain": domain_new if flag_new is True else None,
                            "redirect_url": url,
                            "title": meta["title"],
                            "tags": meta["tags"],
                            "description": meta["description"],
                            "httpcode": 200
                        }
                        websiteId_new = save_collection_website(collection_website, meta_new)
                        if websiteId_new is not None and not test:
                            #screenshot_wesbite(collection_website, websiteId_new, screenshot_crawler)
                            pass
                        flag, domain = url_helper.get_domain(artifact["link"])
                        if domain_new != domain:
                            # Redirect left the original domain.
                            set_artifact_active(artifact, "Redirect", source, sourceId)
                        else:
                            if flag is True:
                                # The original link is a 'good' address.
                                set_artifact_active(artifact, "Y", source, sourceId)
                            else:
                                if flag_new is True:
                                    # Only the redirect target is a 'good' address.
                                    set_artifact_active(artifact, "Redirect", source, sourceId)
                                    save_website_artifact(meta_new, source, sourceId)
                                else:
                                    set_artifact_active(artifact, "Y", source, sourceId)
                    else:
                        set_artifact_active(artifact, "Y", source, sourceId)
                elif meta["httpcode"] == 404:
                    set_artifact_active(artifact, "N", source, sourceId)
        # verify -> mark everything processed this round as expanded.
        logger.info("source: %s, sourceId: %s set verify", source, sourceId)
        for artifact in artifacts:
            set_artifact_expand(artifact, source, sourceId)
        for source_company_name in source_company_names:
            set_scname_expand(source_company_name, source, sourceId)
        for main_beianhao in main_beianhaos:
            set_scbeianhao_expand(main_beianhao, source, sourceId)
        round += 1
def parse_company(item):
    """Parse one crawled company-detail page into a source-company dict.

    Reads item["content"] (raw HTML bytes, UTF-8) and item["key"] (the
    source-side company id).  Returns the populated dict, or the int 0 when
    the page name fails the company check and no website link is present.
    """
    # logger.info("parse_company")
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]
    # company basic info: collect de-duplicated tag words into a CSV string
    tags = []
    for tag in d('.word_list').text().split():
        if tag.strip() not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)
    logo = d('.peoimg img').attr('src')
    if logo:
        # force plain-http scheme on the logo URL
        logo = logo.replace("https://", "http://")
    establish_date = None
    time_content = d('.time_content li:last-child')
    # NOTE(review): `find(...) > 0` misses a match at index 0 — presumably the
    # label never starts the text; confirm against real pages.
    if d(time_content)('.upword').text().find('成立') > 0:
        establish_date = d(time_content)('.time_up').text()
        establish_date = datetime.datetime.strptime(establish_date, '%Y-%m-%d')
    companyName = d('.company_div h5').text()
    city = name_helper.get_location_from_company_name(companyName)[0]
    location_id = 0
    if city != None:
        location = parser_db_util.get_location(city)
        if location != None:
            location_id = location["locationId"]
    # logger.info("locationid =%s",location_id)
    fullName = companyName.replace("_", "")
    fullName = name_helper.company_name_normalize(fullName)
    desc = d('#intro_srocll p').text()
    productDesc = ''
    website = ''
    # each paragraph is either the official-website link or product description
    for p in d('.procont_lis p'):
        if d(p).text().find('官网') > 0 and d(p)('a').attr('href') is not None:
            website = d(p)('a').attr('href')
            continue
        productDesc += d(p).text() + '\n'
    if desc == '' or desc is None:
        desc = productDesc
    # strip any suffix after common separators to get a short display name
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]
    companyResult = {}
    # isCompany
    # print companyName,company_key, ',', name_helper.name_check(companyName)[1], ',', len(website)>0
    if name_helper.name_check(companyName)[1] == True:
        # English name
        if name_helper.name_check(shortName)[0] == False:
            pass
        else:
            # require at least 3 chars of shortName to occur in companyName,
            # otherwise fall back to the full company name
            cnt = 0
            for s in shortName:
                if s in companyName:
                    cnt += 1
            if not cnt > 2:
                shortName = companyName
    else:
        # name is not a registered company name: keep only if a website exists
        if not len(website) > 0:
            return 0
        else:
            companyResult['fakeName'] = fullName
            fullName = None
    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })
    return companyResult
def kuohao_alias():
    """Scan investor aliases whose names contain parentheses, deduplicate them.

    For each alias containing '(' or ')': normalize full-width parens, look up
    which investors carry the normalized name, then either deactivate the
    duplicate alias (same investor), record it for a manual-review email
    (multiple investors), or rewrite the alias to the normalized form (no
    match, Chinese name).  Finally writes the review list to me.txt and mails
    it.  Counters: n=rows handled, n1=aliases examined, n2=self-duplicates
    deactivated, n3=multi-investor conflicts, n4=names normalized in place.
    """
    tline = ""
    conn = db.connect_torndb()
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    # cnames = conn.query("select * from investor_alias where (active is null or active !='N') and name like %s", '%(%')
    cnames = conn.query(
        "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
        "and (name like %s or name like %s) group by name", '%(%', '%)%')
    for cname in cnames:
        wname = cname["name"]
        investors = conn.query(
            "select * from investor_alias where (active is null or active !='N') and name=%s",
            wname)
        for inv in investors:
            # only primary (type 12010) aliases are considered
            if inv["type"] != 12010:
                continue
            wid = inv["investorId"]
            investor = conn.get(
                "select * from investor where (active is null or active !='N') and id=%s",
                wid)
            if investor is None:
                continue
            n1 += 1
            # logger.info("*****************name:%s",inv["name"])
            # normalize half-width parens to full-width
            mnames = [wname.replace("(", "(").replace(")", ")").strip()]
            # csameiid = ""
            investor_ids = []
            for mname in mnames:
                # i0 = conn.get("select * from investor_alias where name=%s and (active is null or active !='N') and "
                #               "investorId=%s limit 1", mname, wid)
                # NOTE(review): i0 is hard-wired to None, so the else branch
                # below is dead — the commented query used to drive it.
                i0 = None
                if i0 is None:
                    i1s = conn.query(
                        "select * from investor_alias where name=%s and (active is null or active !='N')",
                        mname)
                    for i1 in i1s:
                        iv1 = conn.get(
                            "select * from investor where (active is null or active !='N') and id=%s",
                            i1["investorId"])
                        if iv1 is not None and iv1["id"] not in investor_ids:
                            investor_ids.append(iv1["id"])
                else:
                    if wid not in investor_ids:
                        investor_ids.append(wid)
            if len(investor_ids) > 0:
                if wid in investor_ids and len(investor_ids) == 1:
                    # normalized name maps back to the same investor: the
                    # parenthesized alias is redundant — deactivate it
                    csameiid = "同一机构"
                    n2 += 1
                    conn.update(
                        "update investor_alias set active='N',modifyUser=-561 where id=%s",
                        inv["id"])
                else:
                    # normalized name maps to other investor(s): queue for
                    # human review in the emailed report
                    csameiid = "多个机构"
                    n3 += 1
                    line = "%s+++%s+++%s\n" % (cname["name"], ";".join([
                        str(id) for id in [str(wid)] + investor_ids
                    ]), get_links([str(wid)] + investor_ids))
                    tline += line
                logger.info("%s - %s - %s - %s", wname, str(wid),
                            ";".join([str(id) for id in investor_ids]), csameiid)
                n += 1
            else:
                # nobody owns the normalized name: rewrite the alias in place
                # if it is a Chinese name
                (chinese, cccompany) = name_helper.name_check(mnames[0])
                if chinese is True:
                    n4 += 1
                    logger.info("update!!!!!")
                    conn.update(
                        "update investor_alias set name=%s,modifyUser=-561 where id=%s",
                        mnames[0], inv["id"])
    logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    content = '''<div>Dears, <br /><br /> 附件是目前系统中存在重复的公司,请在后台搜索 </div> '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复机构检索--人工审查", content, path)
    # NOTE(review): fp2 was already closed above — this second close is a
    # harmless no-op in CPython but redundant.
    fp2.close()
    conn.close()
def parse_company(item):
    """Parse a crawled zhipin (BOSS直聘) company page into a source-company dict.

    Reads item["key"], item["content"] (HTML bytes, UTF-8) and item["url"].
    Returns None when item is None, {"status": "No_Name"} when no usable
    name/full-name can be extracted, otherwise the source_company dict with
    artifacts (website/app links) and team members attached.
    """
    if item is None:
        logger.info("here")
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))
    name = d('h1.name').text().strip()
    fullName = d('div.company-business> h4').text()
    # the heading may carry a "来源..." (source) prefix — keep only the last token
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)
    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    # prefer the shorter of the two names as the display name
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }
    logo = d('div.company-logo> img').attr('src')
    # keep absolute URLs and the site's default-logo placeholder; drop others
    if logo.startswith("http") or logo.startswith("https") or logo.find("default") >= 0:
        pass
    else:
        logo = None
    # if logo.find("default") >= 0:
    #     logo = None
    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)
    # drop placeholder ("该公司尚未添加公司介绍"), empty or too-short descriptions
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace("展开", "").replace(" ", "").strip()
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        # the info line is "<stage> <headcount> <field>" separated by spaces
        lll = d('div.info-primary> p').text().strip()
        if len(lll.split(" ")) == 3:
            field = lll.split(" ")[2]
            stage = lll.split(" ")[0]
            headCount = lll.split(" ")[1]
    except:
        pass
    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    # map the Chinese funding-stage label to the internal stage code
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    links = d('div.company-products> ul> li> div.text> div.name> a')
    artifacts = []
    for linkp in links:
        link = pq(linkp)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        # classify the link: 4010 website, 4020/4030 weibo/wechat-like,
        # 4040 iTunes app, 4050 android app
        type, app_market, app_id = url_helper.get_market(website)
        if type == 4010:
            # skip self-links back to zhipin and the page's own URL
            if item["url"] != website and website.find("zhipin") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    if flag is False:
                        domain = None
                    artifacts.append({
                        "type": 4010,
                        "name": name,
                        "description": None,
                        "link": website,
                        "domain": domain
                    })
        elif type == 4020 or type == 4030:
            # NOTE(review): domain is forced to None, so this branch never
            # appends — apparently disabled on purpose.
            domain = None
            if domain is not None:
                artifacts.append({
                    "type": type,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4040:
            domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4040,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
        elif type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                # known android markets: resolve the market listing to an apkname
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                # NOTE(review): reconstructed nesting — this else is taken to
                # pair with the app_market test (other markets use the raw
                # app id); confirm against the original file.
                domain = app_id
            if domain is not None:
                artifacts.append({
                    "type": 4050,
                    "name": name,
                    "description": None,
                    "link": website,
                    "domain": domain
                })
    # parse team members
    members = []
    lis = d('div.manager-list> div> ul >li> div')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('div.info-user> img').attr('src')
                # protocol-relative avatar URLs get an explicit http scheme
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p> span.name').text()
                # member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p> span.job-title').text()
                member_desc = mem('div.item_manager_content').text()
                # weibo = None
                # if member_link is not None:
                #     if 'weibo.com' in member_link:
                #         weibo = member_link
                source_member = {'name': member_name,
                                 'photo_url': logo_url,
                                 'weibo': None,
                                 'location': None,
                                 'role': member_position,
                                 'description': member_desc,
                                 'education': None,
                                 'work': None}
                members.append(source_member)
            except:
                pass
    # secondary source id comes from the first company-tab link (gongsi page)
    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html", "")
    else:
        sourceId2 = None
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }
    return source_company
def process(crawler, app, content):
    """Parse one Baidu app-market detail page and persist the app record.

    crawler: used to fetch the mosug suggestion API for download/score data.
    app: dict with key_int, link, apkname, version, size, type.
    content: the detail-page HTML (str).  Skips error pages and games.
    Side effects: android.save(...) and android.merge(...).
    """
    # Baidu's "URL not found" error page — nothing to parse
    if content.find('请检查您所输入的URL地址是否有误') != -1:
        return
    key = app["key_int"]
    url = app["link"]
    d = pq(content)
    cate = d('div.nav> span >a').eq(1).text().strip()
    # games are out of scope
    if cate == "游戏":
        return
    sub_cate = d('div.nav> span >a').eq(2).text().strip()
    name = d('h1.app-name> span').text().strip()
    # download count label e.g. "下载次数: 3万+" — strip label and convert
    # Chinese magnitude suffixes (千/万/亿) to integers
    downloadstr = d("span.download-num").eq(0).text().replace("下载次数:", "").replace("+", "").strip()
    if downloadstr.endswith("千"):
        download = float(downloadstr.replace("千", "")) * 1000
    elif downloadstr.endswith("万"):
        download = float(downloadstr.replace("万", "")) * 10000
    elif downloadstr.endswith("亿"):
        download = float(downloadstr.replace("亿", "")) * 10000 * 10000
    else:
        download = int(downloadstr)
    logger.info("%s-%s, %s, %s", cate, sub_cate, name, download)
    mosug_url = "http://m.baidu.com/mosug?wd=%s&type=soft" % urllib.quote(name.encode("utf-8"))
    # NOTE(review): retries forever until the crawl succeeds — no back-off or
    # attempt cap; a permanently failing URL would spin here.
    while True:
        result = crawler.crawl(mosug_url)
        if result['get'] == 'success':
            mosug_content = result["content"]
            break
    #logger.info(mosug_content)
    data = json.loads(mosug_content)
    if data["result"].get("s") is None:
        return
    # NOTE(review): `found` is never set to True nor read afterwards — dead.
    found = False
    for dt in data["result"].get("s"):
        if dt.get("package") is None:
            continue
        # refine download/score from the suggestion entry matching this docid
        if long(dt["docid"]) == key:
            download = int(dt["download_num"])
            score = int(dt["score"]) * 0.05
            break
    # screenshots
    screenshots = []
    imgs = d('img.imagefix')
    #logger.info(imgs)
    for img in imgs:
        surl = pq(img).attr("src")
        #logger.info(url)
        screenshots.append(surl)
    # description / icon / author
    desc = d('p.content').text()
    #logger.info(desc)
    icon = d('div.app-pic> img').attr("src")
    #logger.info(icon)
    author = d('div.origin-wrap> span> span').eq(1).text()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    #logger.info("author: %s", author)
    commentbyeditor = d('span.head-content').text()
    item = {
        "link": url,
        "apkname": app["apkname"],
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": None,
        "language": None,
        "tags": sub_cate,
        "version": app["version"],
        "updates": None,
        "size": app["size"],
        "compatibility": None,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": app["type"],
        "key": str(key),
        "key_int": key,
        "download": download
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)
def parse_company(item):
    """Parse a crawled liepin.com company page into a source-company dict.

    Structure mirrors the zhipin parser, but the company-name check, stage
    mapping and artifact extraction are disabled (companycheck is computed
    but unused; artifacts stays empty).  Returns None for a None item,
    {"status": "No_Name"} when no name is found, else the source_company dict.
    """
    if item is None:
        logger.info("here")
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    html1 = item["content"]
    logger.info(company_key)
    d = pq((html.fromstring(html1.decode("utf-8"))))
    # liepin puts the short name first in the <h1>
    name = d('h1').text().split()[0].strip()
    fullName = name
    fullName = name_helper.company_name_normalize(fullName)
    if (name is None or name == "") or (fullName is None or fullName == ""):
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    # company-name validation deliberately disabled for this source:
    # if companycheck is not True:
    #     logger.info("here")
    #     return {
    #         "status": "No_Name",
    #     }
    logo = d('.bigELogo').attr('src')
    # keep absolute URLs and the default placeholder; drop anything else
    if logo.startswith("http") or logo.startswith(
            "https") or logo.find("default") >= 0:
        pass
    else:
        logo = None
    brief = None
    desc_text = d('.profile').text()
    logger.info("desc: %s", desc_text)
    # drop placeholder ("该公司尚未添加公司介绍"), empty or too-short descriptions
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:', "").replace("收起", "").replace(
            "展开", "").replace(" ", "").strip()
    field = d('.comp-industry').text().strip()
    stage = ''
    headCount = d('.new-compintro li:nth-child(2)').text().split()[-1]
    location = d('.new-compintro li:nth-child(3)').attr('data-city')
    address = d('.new-compintro li:nth-child(3)').text().replace('公司地址:', '').strip()
    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    # (A large commented-out block copied from the zhipin parser — the
    # funding-stage mapping and company-products artifact extraction — was
    # removed here for readability; it was fully disabled and artifacts is
    # intentionally left empty for this source.)
    artifacts = []
    # parse team members
    members = []
    lis = d('div.executive dl')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # protocol-relative avatar URLs get an explicit http scheme
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p:nth-child(2)').text()
                # member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p:nth-child(3)').text()
                member_desc = mem('dd').text()
                # weibo = None
                # if member_link is not None:
                #     if 'weibo.com' in member_link:
                #         weibo = member_link
                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': None,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None
                }
                members.append(source_member)
            except:
                pass
    source_company = {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': None,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.liepin.com/company/%s/" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }
    return source_company
def process(crawler, url, apkname, content):
    """Parse one Wandoujia app-detail page and persist the app record.

    url/apkname identify the app; content is the page HTML (bytes).
    On success: android.save(...), android.merge(...) and the mongo
    collection_android row is flagged wandoujiaprocessed/wandoujiafound=True.
    Pages without content are only flagged (found=False).
    The `crawler` parameter is unused here — kept for call-site compatibility.
    """
    # logger.info(content)
    if has_content(content, apkname):
        logger.info("hereherehere")
        #content = content.decode('utf-8')
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        #content = unicode(content, encoding="utf-8", errors='replace')
        #d = pq(content)
        name = d('span.title').text()
        # logger.info("name: %s",name)
        icon = d('div.app-icon> img').attr("src")
        brief = d('p.tagline').text()
        # logger.info(brief)
        commentbyeditor = d('div.editorComment> div').text()
        #logger.info(editor_comment)
        screenshots = []
        imgs = d('div.overview> img')
        # logger.info(imgs)
        for img in imgs:
            imgurl = pq(img).attr("src")
            screenshots.append(imgurl)
        desc = d('div.desc-info> div').text()
        # logger.info(desc)
        updates = d('div.change-info> div').text()
        # logger.info(update_desc)
        # fileSize is usually a plain byte count; otherwise parse "x KB"/"x MB"
        try:
            size = int(d('meta[itemprop="fileSize"]').attr("content"))
        except:
            size = d('meta[itemprop="fileSize"]').attr("content")
            if size.find("KB") >= 0:
                size = int(float(size.replace("KB", "").strip()) * 1024)
            elif size.find("MB") >= 0:
                size = int(float(size.replace("MB", "").strip()) * 1024 * 1024)
            else:
                size = None
        tags = d('dd.tag-box >a').text().replace(" ", ",")
        datestr = d('time#baidu_time').text()
        updatedate = datetime.datetime.strptime(datestr, "%Y年%m月%d日")
        #versionname = d(':contains("版本")').next()
        #logger.info(versionname)
        author = d('span.dev-sites').text()
        chinese, is_company = name_helper.name_check(author)
        if chinese and is_company:
            author = name_helper.company_name_normalize(author)
        try:
            website = d('a.dev-sites').attr("href")
            website = url_helper.url_normalize(website)
        except:
            website = None
        # compatibility string sits between two page-dependent markers
        compatibility = None
        if content.find("查看权限要求") == -1:
            r1 = "content=\"Android\">(.*?)</dd>.*<dt>来自"
        else:
            r1 = "content=\"Android\">(.*?)<div>.*"
        result1 = util.re_get_result(r1, content)
        if result1:
            (compatibility,) = result1
            # NOTE(review): "\s" here is the literal two chars backslash-s,
            # not a regex class — str.replace does no pattern matching.
            compatibility = compatibility.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "")
        #logger.info(compatibility)
        versionname = None
        r2 = "<dt>版本</dt>.*<dd>(.*?)</dd>.*<dt>要求"
        result2 = util.re_get_result(r2, content)
        if result2:
            (versionname,) = result2
            versionname = versionname.replace("\n", "").replace("\r", "").replace("\s", "").replace(" ", "").strip()
        #logger.info(versionname)
        try:
            versionname = versionname.split()[0]
            # strip a leading "V" prefix like "V1.2.3"
            if versionname.startswith("V"):
                versionname = versionname.replace("V", "")
        except:
            pass
        # download = int(d("i[itemprop='interactionCount']").attr("content").split(":")[1])
        dnum = d("i[itemprop='interactionCount']").attr("content").split(":")[1]
        download = None
        # convert Chinese magnitude suffixes (万/亿) to integers
        try:
            download = int(dnum)
        except:
            if dnum.find("万") >= 0:
                download = int(float(dnum.replace("万", "").strip()) * 10000)
            elif dnum.find("亿") >= 0:
                download = int(float(dnum.replace("亿", "").strip()) * 10000 * 10000)
            else:
                logger.info("********download :%s cannot get", dnum)
        item = {
            "link": url,
            "apkname": apkname,
            "appmarket": APPMARKET,
            "name": name,
            "brief": brief,
            "website": website,
            "description": desc,
            "commentbyeditor": commentbyeditor,
            "updateDate": updatedate,
            "language": None,
            "tags": tags,
            "version": versionname,
            "updates": updates,
            "size": size,
            "compatibility": compatibility,
            "icon": icon,
            "author": author,
            "screenshots": screenshots,
            "type": None,
            "key": apkname,
            "key_int": None,
            "download": download,
        }
        logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
        android.save(collection, APPMARKET, item)
        android.merge(item)
        collection_android.update_one({"apkname": apkname},
                                      {"$set": {"wandoujiaprocessed": True, "wandoujiafound": True}})
    else:
        logger.info("App: %s has no content", apkname)
        #logger.info(content)
        collection_android.update_one({"apkname": apkname},
                                      {"$set": {"wandoujiaprocessed": True, "wandoujiafound": False}})
def process(url, key, content):
    """Parse one 360 app-market detail page and persist the app record.

    url: page URL; key: numeric source id; content: page HTML (str).
    Extracts the embedded `var detail = (function(){return {...};})` JSON for
    name/type/package, scrapes the rest from the DOM and regexes, then calls
    android.save(...) / android.merge(...) and advances the LATEST watermark.
    """
    global LATEST
    # not a real 360 page (missing the "360安全中心" marker) — skip
    if content.find('360安全中心') == -1:
        return
    #logger.info(content)
    r = "var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    (b, ) = result
    # the embedded object uses single quotes — swap before JSON parsing
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    type = base["type"]
    package = base["pname"].strip()
    #logger.info("%s, %s, %s" % (type, name, package))
    d = pq(html.fromstring(content.decode("utf-8")))
    desc = ""
    try:
        # desc = d('div.breif').contents()[0].strip()
        # keep only the text before the "【基本信息】" (basic info) section
        desc = d('div.breif').text().strip()
        ts = desc.split("【基本信息】")
        desc = ts[0].strip()
    except:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except:
            pass
    #logger.info(desc)
    author = d('div.base-info> table> tbody> tr> td').eq(
        0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    # NOTE(review): author is unconditionally discarded here, making the
    # extraction/normalization above dead work — the stored record always has
    # author=None.  The original indentation is ambiguous (this might have
    # been intended only inside the if); confirm against upstream history.
    author = None
    #logger.info(author)
    modify_date_str = d('div.base-info> table> tbody> tr> td').eq(
        1).contents()[1].strip()
    #logger.info(modify_date_str)
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")
    #logger.info(modify_date)
    versionname = None
    try:
        versionname = d('div.base-info> table> tbody> tr> td').eq(
            2).contents()[1].strip()
        # strip a leading "V" prefix like "V1.2.3"
        if versionname.startswith("V"):
            versionname = versionname.replace("V", "")
    except:
        pass
    #logger.info(versionname)
    compatibility = d('div.base-info> table> tbody> tr> td').eq(
        3).contents()[1].strip()
    language = d('div.base-info> table> tbody> tr> td').eq(
        4).contents()[1].strip()
    # "其他" (other): infer Chinese from the description text
    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"
    #logger.info(language)
    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()
    #logger.info(icon)
    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except:
        pass
    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result
    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()
    tags = d("div.app-tags> a").text().replace(" ", ",")
    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)
    # download label e.g. "下载: 3万+" — strip label, convert 千/万/亿 suffixes
    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    download = None
    try:
        if downloadstr.endswith("千"):
            download = float(downloadstr.replace("千", "")) * 1000
        elif downloadstr.endswith("万"):
            download = float(downloadstr.replace("万", "")) * 10000
        elif downloadstr.endswith("亿"):
            download = float(downloadstr.replace("亿", "")) * 10000 * 10000
        else:
            download = int(downloadstr)
        # site scores are out of 10; scale to 0..5 (computed but unused below)
        score = float(d("span.s-1").text().replace("分", "").strip()) * 0.5
    except:
        traceback.print_exc()
    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)
    # remember the highest processed key for incremental crawling
    if LATEST < key:
        LATEST = key
def parse_company(item):
    """Parse a crawled lagou.com company page into a source-company dict.

    Reads item["key"], item["content"] (HTML bytes) and item["url"].
    Returns None for a None item, {"status": "No_Name"} for unusable pages,
    otherwise the source_company dict with location and link artifacts.
    """
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    # NOTE(review): this local shadows the lxml `html` module used elsewhere
    # in the file — here it is the raw page content string.
    html = item["content"]
    logger.info(company_key)
    d = pq(html)
    # logo_id processed in parser_db_util
    '''
    logo_id = None
    if logo_url is not None:
        logo_id = parser_util.get_logo_id(source, company_key, 'company', logo_url)
    '''
    # "这个公司的主页还在建设" = placeholder page still under construction
    if html.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }
    name = d('.company_main > h1 > a').text()
    link = d('.company_main > h1 > a').attr('href')
    fullName = d('.company_main > h1 > a').attr('title')
    fullName = name_helper.company_name_normalize(fullName)
    # reject missing names and lagou's own branding pages
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {
            "status": "No_Name",
        }
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName
    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }
    logo = d('.top_info_wrap > img').attr('src')
    # protocol-relative logo URLs get an explicit http scheme
    if logo.startswith("http") or logo.startswith("https"):
        pass
    else:
        logo = "http:" + logo
    if logo.find("logo_default") >= 0:
        logo = None
    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    # drop placeholder ("该公司尚未添加公司介绍") or too-short descriptions
    if u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10:
        desc = None
    else:
        # strip the "展开" (expand) widget, then flatten the HTML to text
        desc = d('.company_intro_text > .company_content').html()
        desc = desc.replace('<span class="text_over">展开</span>', '')
        soup = BeautifulSoup(desc, "lxml")
        raw = soup.getText()
        # logger.info(desc)
        #logger.info(raw)
        desc = raw
    # if desc is None or desc.strip() == "":
    #     return {
    #         "status": "No_Name",
    #     }
    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        headCount = headCount[0:headCount.index(u'人')]
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except:
        pass
    headCount = headCount.replace("people", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            try:
                min_staff = int(staffarr[0].strip())
                max_staff = None
            except:
                min_staff = None
                max_staff = None
    # map the Chinese funding-stage label to the internal stage code
    funding_type = 0
    if stage == '不需要融资':
        stage = 0
        funding_type = 8010
    elif stage == '未融资':
        stage = 0
    elif stage == '天使轮':
        stage = 1010
    elif stage == 'A轮':
        stage = 1030
    elif stage == 'B轮':
        stage = 1040
    elif stage == 'C轮':
        stage = 1050
    elif stage == 'D轮及以上':
        stage = 1060
    elif stage == '上市公司':
        stage = 1110
    else:
        stage = 0
    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new != None:
        location_id = location_new["locationId"]
    #website = util.norm_url(link)
    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)
    artifacts = []
    # classify the company link: 4010 website, 4020/4030 social,
    # 4040 iTunes app, 4050 android app
    type, app_market, app_id = url_helper.get_market(website)
    if type == 4010:
        # skip self-links back to lagou and the page's own URL
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append({
                    "type": 4010,
                    "name": name,
                    "description": desc,
                    "link": website,
                    "domain": domain
                })
    elif type == 4020 or type == 4030:
        # NOTE(review): domain forced to None, so this branch never appends —
        # apparently disabled on purpose.
        domain = None
        if domain is not None:
            artifacts.append({
                "type": type,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4040:
        domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4040,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    elif type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            # known android markets: resolve the listing to an apkname
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            # NOTE(review): reconstructed nesting — this else is taken to pair
            # with the app_market test (other markets use the raw app id);
            # confirm against the original file.
            domain = app_id
        if domain is not None:
            artifacts.append({
                "type": 4050,
                "name": name,
                "description": desc,
                "link": website,
                "domain": domain
            })
    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }
    return source_company
def _artifact_site_domain(artifact):
    """Return the usable domain of a type-4010 (website) artifact, or None.

    If the artifact row has no domain yet, derive one from its link via
    url_helper.get_domain() and persist it with update_domain() (same side
    effect as the original inline code). Returns None when the link cannot
    be resolved or the resulting domain is blank.
    """
    if artifact["domain"] is None:
        (flag, domain) = url_helper.get_domain(artifact["link"])
        # flag None -> unparsable link; flag False -> rejected link
        if flag is None or flag is False:
            return None
        update_domain(domain, artifact["id"])
    else:
        domain = artifact["domain"]
    if domain is None or domain.strip() == "":
        return None
    return domain


def _save_new_itunes_apps(apps, company_id, label):
    """Save each iTunes app not already linked to company_id as an artifact."""
    for app in apps:
        logger.info("**********%s find %s, %s", label, app["trackName"],
                    app["trackId"])
        # Check if itunesId is already existed in all artifacts in 1 CompanyId
        if not find_itunesId(app["trackId"], company_id):
            save_itunes_artifact(app, company_id)


def _save_new_android_apps(apps, company_id, label):
    """Save each Android app not already linked to company_id as an artifact."""
    for app in apps:
        logger.info("**********%s find %s, %s", label, app["name"],
                    app["apkname"])
        if not find_androidAppname(app["apkname"], company_id):
            save_android_artifact(app, company_id)


def expand(company_id):
    """Expand a company's artifact list (website / iOS app / Android app).

    Loads the company's active corporate aliases and existing artifacts from
    MySQL, then cross-references MongoDB collections -- ICP beian records by
    organizer name, iTunes apps by seller name and seller/support domain,
    Android apps by author name and website/apkname domain -- saving any
    newly discovered artifacts back to MySQL via the save_* helpers.

    Side effects only; returns None.
    """
    mongo = db.connect_mongo()
    collection_itunes = mongo.market.itunes
    collection_beian = mongo.info.beian
    collection_android = mongo.market.android
    logger.info("Company id: %s Start app check!!!", company_id)

    conn = db.connect_torndb()
    company_names = conn.query(
        "select * from corporate_alias where corporateId in (select corporateId from "
        "company where id=%s) and (active is null or active='Y')", company_id)
    artifacts = conn.query(
        "select * from artifact where companyId=%s and (active is null or active='Y')",
        company_id)
    logger.info(
        json.dumps(company_names, ensure_ascii=False, cls=util.CJsonEncoder))
    logger.info(
        json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
    conn.close()

    # Step A/1: beian (ICP site-registration) lookup by company name.
    logger.info("%s 按公司名备案查询", company_id)
    for company_name in company_names:
        # Only check chinese company name
        if company_name["name"] is None or company_name["name"].strip() == "":
            continue
        (chinese, _is_company) = name_helper.name_check(company_name["name"])
        # BUGFIX: name_check() returns a boolean (see its other call sites in
        # this file), so the previous test `chinese != "Y"` was always true
        # and the whole beian step was dead code.
        if chinese is not True:
            continue
        check_names = list(
            collection_beian.find({"organizer": company_name["name"]}))
        if len(check_names) > 0:
            # insert website/homepage into Mysql.artifact
            save_beian_artifacts(check_names, company_id)

    # Step B/2: more itunes artifacts matched by company (seller) name.
    logger.info("%s 根据公司名查询更多的itunes artifact", company_id)
    for company_name in company_names:
        if company_name["name"] is None or company_name["name"].strip() == "":
            continue
        check_itunes_sellers = list(
            collection_itunes.find({"sellerName": company_name["name"]}))
        logger.info("**********%s find %s", company_name["name"],
                    len(check_itunes_sellers))
        for app in check_itunes_sellers:
            # (kept inline: this loop's log format differs from the helper's)
            logger.info("**********%s find %s,%s", company_name["name"],
                        app["trackName"], app["trackId"])
            if not find_itunesId(app["trackId"], company_id):
                save_itunes_artifact(app, company_id)

    # Step B/3: more itunes artifacts matched by website domain.
    logger.info("%s 根据域名查询更多的itunes artifact", company_id)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        domain = _artifact_site_domain(artifact)
        if domain is None:
            continue
        if domain in itunesDomainEx:
            continue
        check_itunes_sellerDomains = list(
            collection_itunes.find({"sellerDomain": domain}))
        logger.info("**********%s find %s", domain,
                    len(check_itunes_sellerDomains))
        _save_new_itunes_apps(check_itunes_sellerDomains, company_id, domain)
        check_itunes_supportDomains = list(
            collection_itunes.find({"supportDomain": domain}))
        logger.info("**********%s find %s", domain,
                    len(check_itunes_supportDomains))
        # Threshold guards against a generic support domain matching too many apps.
        if len(check_itunes_supportDomains) < 100:
            _save_new_itunes_apps(check_itunes_supportDomains, company_id, domain)

    # Step C/2: more android artifacts matched by company (author) name.
    logger.info("%s 根据公司名查询更多的android artifact", company_id)
    for company_name in company_names:
        # producer name
        if company_name["name"] is None or company_name["name"].strip() == "":
            continue
        check_android_authors = list(
            collection_android.find({"author": company_name["name"]}))
        logger.info("**********%s find %s", company_name["name"],
                    len(check_android_authors))
        if len(check_android_authors) < 100:
            _save_new_android_apps(check_android_authors, company_id,
                                   company_name["name"])

    # Step C/3: more android artifacts matched by website / apkname domain.
    logger.info("%s 根据域名查询更多的android artifact", company_id)
    for artifact in artifacts:
        if artifact["type"] != 4010:
            continue
        domain = _artifact_site_domain(artifact)
        if domain is None:
            continue
        check_android_websiteDomains = list(
            collection_android.find({"website_domain": domain}))
        logger.info("**********%s find %s", domain,
                    len(check_android_websiteDomains))
        _save_new_android_apps(check_android_websiteDomains, company_id, domain)
        check_android_apknameDomains = list(
            collection_android.find({"apkname_domain": domain}))
        logger.info("**********%s find %s", domain,
                    len(check_android_apknameDomains))
        # add threshold to avoid case: domain: com.wowotuan
        if len(check_android_apknameDomains) < 100:
            _save_new_android_apps(check_android_apknameDomains, company_id, domain)

    mongo.close()
def update_investor(investor, source_investor):
    """Sync one investor row with a crawled source_investor record.

    Unless the investor is flagged online (manually curated), its base
    fields are overwritten via replace().  Derived rows are then upserted:
    investor_alias (every non-empty name variant), investor_artifact
    (website / android app / weibo / weixin), investor_contact (rows
    previously created by user 139 are deleted and re-inserted) and
    investor_member (inserted only when the name is new).

    NOTE(review): this redefines the update_investor() declared earlier in
    this file; at import time this later definition wins.
    """
    conn = db.connect_torndb()
    investor_id = investor["id"]
    logger.info("****checking %s/%s/%s", investor["name"], investor["id"],
                source_investor["id"])
    if investor["online"] is not None and investor["online"] == "Y":
        # Online investors are never overwritten automatically.
        logger.info("online not update!!!")
        time.sleep(1)
        pass
    else:
        logger.info("Update investor : %d with source_investor: %d ",
                    investor_id, source_investor["id"])
        replace(investor, source_investor)

    # insert investor_alias: one row per distinct non-empty name variant.
    for name in [source_investor["name"], source_investor["fullName"],
                 source_investor["enName"], source_investor["enFullName"]]:
        if name is None or name.strip() == "":
            continue
        investor_alias = conn.get(
            "select * from investor_alias where name=%s and "
            "investorId=%s and (active is null or active='Y') limit 1",
            name, investor["id"])
        # logger.info("here: %s", investor_alias)
        if investor_alias is None:
            chinese, is_company = name_helper.name_check(name)
            # 12010 = company-style alias, 12020 = other alias.
            # NOTE: local `type` shadows the builtin of the same name.
            if is_company:
                type = 12010
            else:
                type = 12020
            sql = "insert investor_alias(investorId, name, type, createTime,modifyTime) values(%s,%s,%s, now(),now())"
            logger.info("Add new investor alias: %s for %s", name, investor["id"])
            conn.insert(sql, investor["id"], name, type)

    # insert investor_artifact:
    artifacts = []
    if source_investor["website"] is not None and source_investor["website"] != "":
        type, market, app_id = url_helper.get_market(source_investor["website"])
        if type == 4010:
            # Plain website artifact.
            # NOTE(review): `and` means only links containing BOTH '36kr'
            # and 'baidu' are skipped; an `or` may have been intended -- confirm.
            if source_investor["website"].find('36kr') > 0 and source_investor["website"].find("baidu") > 0:
                pass
            else:
                artifact = {
                    "investorId": investor["id"],
                    "name": investor["name"],
                    "description": None,
                    "link": source_investor["website"],
                    "domain": app_id,
                    "type": type
                }
                artifacts.append(artifact)
        elif (type == 4040 or type == 4050) and app_id is not None:
            # App-store link: resolve the package/bundle domain first.
            domain = get_android_domain(market, app_id)
            if (type == 4040 or type == 4050) and domain is not None:
                artifact = {
                    "investorId": investor["id"],
                    "name": investor["name"],
                    "description": None,
                    "link": source_investor["website"],
                    "domain": domain,
                    "type": type
                }
                artifacts.append(artifact)
    # Weibo account (type 4030) -- link must actually contain "weibo".
    weibo = source_investor.get("weibo", "")
    if weibo is not None and weibo.strip() != "" and weibo.find("weibo") >= 0:
        artifact = {
            "investorId": investor["id"],
            "name": investor["name"],
            "description": None,
            "link": weibo,
            "domain": None,
            "type": 4030
        }
        artifacts.append(artifact)
    # WeChat account (type 4020) -- the wechatId doubles as link and domain.
    weixin = source_investor.get("wechatId", "")
    if weixin is not None and weixin.strip() != "":
        artifact = {
            "investorId": investor["id"],
            "name": investor["name"],
            "description": None,
            "link": weixin,
            "domain": weixin,
            "type": 4020
        }
        artifacts.append(artifact)
    if len(artifacts) > 0:
        for art in artifacts:
            # Dedup by domain when one is available (except weibo, which has
            # no domain); otherwise dedup by link.
            if art["type"] not in [4030] and art["domain"] is not None and art["domain"].strip() != "":
                iart = conn.get(
                    "select * from investor_artifact where type=%s and investorId=%s and domain=%s limit 1",
                    art["type"], investor["id"], art["domain"])
            else:
                iart = conn.get(
                    "select * from investor_artifact where type=%s and investorId=%s and link=%s limit 1",
                    art["type"], investor["id"], art["link"])
            if iart is None:
                logger.info("add new artifact: %s/%s/%s", art["type"],
                            art["name"], art["link"])
                sql = "insert investor_artifact(investorId,type, name, link, domain, createTime,modifyTime) \
                values(%s,%s,%s,%s,%s,now(),now())"
                conn.insert(sql, investor["id"], art["type"], art["name"],
                            art["link"], art["domain"])

    # insert contact: replace all rows previously created by user 139.
    contacts = conn.query(
        "select * from source_investor_contact where sourceInvestorId=%s",
        source_investor["id"])
    if len(contacts) > 0:
        conn.execute(
            "delete from investor_contact where investorId=%s and createUser=139",
            investor["id"])
        for s in contacts:
            sql = "insert investor_contact(investorId, locationId, address, phone, email, createUser, " \
                  "createTime,modifyTime) values(%s,%s,%s,%s,%s,%s,now(),now())"
            conn.insert(sql, investor["id"], s["locationId"], s["address"],
                        s["phone"], s["email"], 139)

    # insert member: skip names already attached to this investor.
    members = conn.query(
        "select * from source_investor_member where sourceInvestorId=%s",
        source_investor["id"])
    for m in members:
        member = conn.get(
            "select * from investor_member where investorId=%s and name=%s limit 1",
            investor["id"], m["name"])
        if member is not None:
            continue
        sql = "insert investor_member(investorId,name,logo, position, description,createUser,createTime,modifyTime) \
        values(%s,%s,%s,%s,%s,%s,now(),now())"
        conn.insert(sql, investor["id"], m["name"], m["logo"], m["position"],
                    m["description"], 139)
    conn.close()
def corp_merge3():
    """Find alias names shared by more than one active corporate, auto-merge
    the duplicates, and email a human-review report ("me.txt").

    Counters logged at the end: n = candidates processed, n1/n2 = autoMerge
    returned 1 / anything else, n3 = funded, n4 = main names consistent,
    n5/n6 = chinese / non-chinese names, n7 = short names highly similar
    (compare() <= 0.75).

    NOTE(review): reconstructed from a whitespace-mangled source -- the
    nesting inside the alias loop (see below) should be confirmed against
    version control.
    """
    tline = ""
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    n5 = 0
    n6 = 0
    n7 = 0
    conn = db.connect_torndb()
    # All alias names attached to more than one active corporate.
    cnames = conn.query(
        "select name,count(*) as cnt from corporate_alias where (active is null or active !='N') "
        "and name is not null and name!='' group by name having cnt>1")
    # cnames = conn.query("select fullName,count(*) as cnt from corporate where (active is null or active !='N') "
    #                     "and fullName='上海中慎网络科技有限公司' group by fullName having cnt>1")
    logger.info("total names: %s", len(cnames))
    for cname in cnames:
        pnames = []
        fundingFlag = False
        cfullFlag = True
        full_name = cname["name"]
        corporate_ids = []    # corporates that have an active company row
        corporate_ids_f = []  # corporates without one
        stockFlag = False
        # Skip placeholder names that are obviously not real companies.
        if full_name is None or full_name.strip() == "" or full_name.strip() == "-" \
                or full_name.strip() == "个人" or full_name.strip() == "扎堆":
            continue
        corporate_aliases = conn.query(
            "select * from corporate_alias where name=%s and (active is null or active !='N')",
            full_name)
        for caa in corporate_aliases:
            ca = conn.get(
                "select * from corporate where (active is null or active !='N') and id=%s",
                caa["corporateId"])
            if ca is None:
                continue
            # if ca["fullName"] != full_name: continue
            # Never auto-merge listed (stock-exchange) corporates.
            c_stock = conn.get(
                "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                ca["id"])
            if c_stock is not None:
                stockFlag = True
                continue
            company = conn.get(
                "select * from company where corporateId=%s and (active is null or active='Y') limit 1",
                ca["id"])
            if company is not None:
                if ca["id"] not in corporate_ids:
                    corporate_ids.append(int(ca["id"]))
                if ca["fullName"] != full_name:
                    # The shared name is only an alias, not the main fullName.
                    cfullFlag = False
            else:
                if ca["id"] not in corporate_ids_f:
                    corporate_ids_f.append(int(ca["id"]))
            funding = conn.get(
                "select * from funding where corporateId=%s and (active is null or active='Y') "
                "order by fundingDate desc limit 1", caa["corporateId"])
            if fundingFlag is False and funding is not None:
                fundingFlag = True
                # NOTE(review): dereferences `company`, which is None when this
                # corporate has no active company row -- possible crash path;
                # nesting uncertain, confirm original indentation.
                pnames.append(company["name"])
        if len(corporate_ids) > 1 and stockFlag is False:
            # Short-name similarity; lower value = more similar.
            if len(pnames) >= 2:
                vv = compare(pnames)
            else:
                vv = 0
            (chinese, company) = name_helper.name_check(full_name)
            if chinese is True:
                chinese_type = "Y"
                n5 += 1
                if fundingFlag is True:
                    n3 += 1
                if cfullFlag is True:
                    n4 += 1
                if vv <= 0.75:
                    n7 += 1
            else:
                chinese_type = "N"
                n6 += 1
            # do merge
            n += 1
            logger.info("merge:%s %s-> %s", full_name, corporate_ids, chinese_type)
            mflag = corporate_util.autoMerge(corporate_ids, full_name)
            # if mflag is None:
            #     logger.info("wrong")
            #     exit()
            if mflag == 1:
                n1 += 1
            else:
                n2 += 1
            # elif mflag == 2:
            #     n2 += 1
            # elif mflag == 3:
            #     n3 += 1
            # elif mflag == 4:
            #     n4 += 1
            #     line = "%s+++%s+++%s\n" % (
            #         full_name, ";".join([str(id) for id in corporate_ids]), get_links(corporate_ids))
            #     fp2.write(line)
            # else:
            c1 = "否"
            c2 = "否"
            c3 = "否"
            if len(corporate_ids_f) == 1:
                c1 = "是"
            if len(corporate_ids_f) == len(corporate_ids):
                c2 = "是"
            if len(corporate_ids_f) == 0:
                c3 = "是"
            # One "+++"-separated report row per merge candidate.
            line = "%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
                full_name, ";".join([str(id) for id in corporate_ids]),
                get_links(corporate_ids),
                "中文名" if chinese_type == 'Y' else "英文名",
                "有融资" if fundingFlag is True else "无融资",
                "公司主要名称一致" if cfullFlag is True else "公司别名一致",
                "短名高度相似" if vv <= 0.75 else "短名不相似",
                "可以根据verify自动聚合" if mflag == 1 else " ",
                c1, c2, c3)
            # fp2.write(line)
            tline += line
    # Write the accumulated report and mail it for manual review.
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    logger.info("merge num %s/%s/%s/%s/%s/%s/%s/%s", n, n1, n2, n3, n4, n5, n6, n7)
    content = '''<div>Dears,
<br /><br />
附件是目前系统中存在重复的公司,请在后台搜索
</div>
'''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复公司检索--人工审查", content, path)
    conn.close()
def qixinCrawler(name, type, corporateIds=[], test=False):
    """Push xiniudata records to the qixin open API.

    type == 1: upload a full company profile (base info, funding history,
               team, competitors, products) for each id in corporateIds that
               has an active alias exactly matching `name`; the name must be
               Chinese.  With test=True the payload is only logged, not sent.
    type == 2: upload an investor institution resolved through investor_alias
               (verified aliases preferred).
    type == 3: upload an investor institution using `name` as both full and
               short name.

    Returns (parsed JSON response, serialized request payload) on success,
    otherwise (None, None).

    NOTE(review): relies on module-level `conn`, `url`, `rmap`, `currentmap`
    and `tmap` (none are defined locally); the parameter `type` shadows the
    builtin and `corporateIds=[]` is a mutable default (never mutated here).
    Python 2 only (`dict.has_key`).
    """
    if type == 1:
        if len(corporateIds) == 0:
            return None, None
        # Company upload is restricted to Chinese names.
        (chinese, cccompany) = name_helper.name_check(name)
        if chinese is True:
            pass
        else:
            return None, None
        for corporateId in corporateIds:
            # The given name must be an active alias of this corporate.
            corporate_alias = conn.get(
                "select * from corporate_alias where (active is null or active='Y') and "
                "corporateId=%s and name=%s limit 1", corporateId, name)
            # logger.info(corporateId)
            if corporate_alias is None:
                continue
            # logger.info(corporateId)
            corporate = conn.get(
                "select * from corporate where id=%s and "
                "(active ='Y' or active ='A' or active is null)", corporateId)
            # logger.info(corporateId)
            if corporate is None:
                continue
            company = conn.get(
                "select * from company where corporateId=%s and "
                "(active ='Y' or active ='A' or active is null) limit 1", corporateId)
            # logger.info(corporateId)
            if company is None:
                continue
            # Skip companies whose short name is empty or entirely made of
            # "other" (garbled) characters.
            if company["name"] is None or company["name"].strip() == "" or \
                    len(desc_helper.count_other2(company["name"])) == len(company["name"]):
                logger.info("wwwwwwrong here")
                continue
            param = {}
            # Active tags of type 11012 become the keyword list.
            tags = [
                tt["name"] for tt in conn.query(
                    " select t.name from company_tag_rel ctr join tag t "
                    "on ctr.tagId=t.id where ctr.companyId=%s and t.type=11012 and "
                    "(ctr.active='Y' or ctr.active is null)", company["id"])
            ]
            baseinfo = {
                "company_name": name.strip(),
                "project_logo_url": "http://www.xiniudata.com/file/%s" % company["logo"]
                if company["logo"] is not None else "",
                "project_name": company["name"].strip(),
                # rmap maps round codes to display names; fall back to "未融资".
                "finance_rounds": rmap[int(corporate["round"])]
                if (corporate["round"] is not None and rmap.has_key(int(corporate["round"]))) else "未融资",
                "website_url": company["website"]
                if company["website"] is not None and company["website"].strip() != "" else "",
                "key_words": ",".join(tags) if len(tags) > 0 else "",
                "introduction": company["description"],
            }
            # logger.info(baseinfo)
            # Funding history with investor names, amount and currency.
            fundings = conn.query(
                "select * from funding where corporateId=%s and (active='Y' or active is null)",
                corporateId)
            bfi = []
            for funding in fundings:
                investors = [
                    ii["name"] for ii in conn.query(
                        "select i.name from funding_investor_rel fir join "
                        "investor i "
                        "on fir.investorId=i.id where fir.fundingId=%s and "
                        "(fir.active is null or fir.active='Y') and "
                        "(i.active is null or i.active='Y')", funding["id"])
                ]
                amount = get_amount(funding["investment"], funding["precise"])
                # logger.info("**************%s, %s -> %s", funding["investment"], funding["precise"], amount)
                fi = {
                    "date": str(funding["fundingDate"].date())
                    if funding["fundingDate"] is not None else "",
                    "round": rmap[int(funding["round"])]
                    if (funding["round"] is not None and rmap.has_key(int(funding["round"]))) else "",
                    "amount": amount,
                    "currency": currentmap[int(funding["currency"])]
                    if (funding["currency"] is not None and currentmap.has_key(int(funding["currency"]))) else "",
                    "investor": ",".join(investors)
                }
                bfi.append(fi)
            baseinfo["finance_info"] = bfi
            param["base"] = baseinfo
            # members: only founders/executives (types 5010 and 5020).
            members = conn.query(
                "select cmr.type,cmr.position,m.* from member m join company_member_rel cmr on "
                "m.id=cmr.memberId where cmr.companyId=%s and (cmr.active is null or cmr.active='Y')"
                " and (m.active='Y' or m.active is null)", company["id"])
            tcm = []
            for m in members:
                if int(m["type"]) not in [5010, 5020]:
                    continue
                cm = {
                    "avatar_url": "http://www.xiniudata.com/file/%s" % m["photo"]
                    if m["photo"] is not None else "",
                    "name": m["name"],
                    "position": m["position"],
                    "education": m["education"],
                    "introduction": m["description"],
                    "work": m["work"]
                }
                tcm.append(cm)
            param["team"] = {"core_members": tcm}
            # comps: up to 10 nearest competitors by relation distance.
            coms = conn.query(
                "select c.name from companies_rel cr join company c on cr.company2Id=c.id "
                "where cr.companyId=%s and (c.active is null or c.active='Y') "
                "order by cr.distance desc limit 10", company["id"])
            param["competitors"] = [{"project_name": c["name"]} for c in coms]
            # artifact: top 5 per type; websites ranked ascending, apps descending.
            products = []
            for tt in [4010, 4020, 4030, 4040, 4050]:
                if tt == 4010:
                    artifacts = conn.query(
                        "select name, description, type from artifact where companyId=%s and "
                        "(active is null or active='Y') and type=%s "
                        "order by rank limit 5", company["id"], tt)
                else:
                    artifacts = conn.query(
                        "select name, description, type from artifact where companyId=%s and "
                        "(active is null or active='Y') and type=%s "
                        "order by rank desc limit 5", company["id"], tt)
                for a in artifacts:
                    if a["name"] is None:
                        a["name"] = ""
                    if a["description"] is None:
                        a["description"] = ""
                    # Blocklist filter: drop adult/gambling-flavored names.
                    if a["name"].find("av") >= 0 or a["name"].find("AV") >= 0 or \
                            a["name"].find("性爱") >= 0 or a["name"].find("做爱") >= 0 or \
                            a["name"].find("澳门") >= 0 or a["name"].find("威尼斯") >= 0 or \
                            a["name"].find("男人") >= 0 or a["name"].find("成人") >= 0:
                        continue
                    if a["description"].find("av") >= 0 or a["description"].find("AV") >= 0 or \
                            a["description"].find("性爱") >= 0 or a["description"].find("做爱") >= 0 or \
                            a["description"].find("澳门") >= 0 or a["description"].find("威尼斯") >= 0 or \
                            a["description"].find("男人") >= 0 or a["description"].find("成人") >= 0:
                        continue
                    if tmap.has_key(a["type"]):
                        if a["type"] == 4010:
                            # Websites: reject entries with garbled characters.
                            cchinese = desc_helper.count_other(
                                a["name"]) + desc_helper.count_other(
                                    a["description"])
                            if len(cchinese) > 0:
                                logger.info("wrong luanma!!!!!: %s", a["name"])
                                continue
                        products.append({
                            "name": a["name"],
                            "description": a["description"],
                            "kind": tmap[a["type"]]
                        })
            param["products"] = products
            logger.info("products upload for %s|%s, %s", corporateId,
                        len(products), len(str(products)))
            logger.info("tags: %s", param["base"]["key_words"])
            url_company = url + '/open/xiniu/venture/capital'
            try:
                if test is True:
                    # Dry run: log the payload instead of posting it.
                    for p in param:
                        logger.info(p)
                        if p == "products":
                            for pp in param[p]:
                                # logger.info(pp)
                                logger.info(
                                    json.dumps(pp, ensure_ascii=False,
                                               cls=util.CJsonEncoder))
                        else:
                            # logger.info(p)
                            logger.info(
                                json.dumps(param[p], ensure_ascii=False,
                                           cls=util.CJsonEncoder))
                    return None, None
                else:
                    # logger.info(json.dumps(param, ensure_ascii=False, cls=util.CJsonEncoder))
                    # exit()
                    # http://httpbin.org/posthttp://www.xiniudata.com/5977875df8716656636efb78/stat/gettest
                    # res = requests.post('http://httpbin.org/post', json=param)
                    res = requests.post(url_company, json=param)
                    # logger.info("\n\n\n\n")
                    # logger.info("result:")
                    # logger.info(json.dumps(param, ensure_ascii=False, cls=util.CJsonEncoder))
                    logger.info(res.text)
                    # logger.info("\n\n\n\n")
                    # conn.close()
                    return json.loads(res.text), json.dumps(
                        param, ensure_ascii=False, cls=util.CJsonEncoder)
            except:
                # Bare except: any post/parse failure falls through to the
                # next corporateId (NOTE(review): silently swallows errors).
                pass
        # conn.close()
        return None, None
    elif type == 2:
        # conn = db.connect_torndb()
        url_investor = url + '/open/xiniu/venture/institution'
        # Prefer verified aliases; fall back to any active alias.
        investoras = conn.query(
            "select * from investor_alias where verify='Y' and "
            "(active is null or active!='N') and name=%s", name)
        if len(investoras) == 0:
            investoras = conn.query(
                "select * from investor_alias where "
                "(active is null or active!='N') and name=%s", name)
        if len(investoras) > 0:
            for investora in investoras:
                investor = conn.get(
                    "select * from investor where (active is null or active!='N') and id=%s",
                    investora["investorId"])
                if investor is not None:
                    try:
                        logger.info(investor)
                        param = {
                            "full_name": name,
                            "short_name": investor["name"]
                            if investor["name"] is not None and investor["name"] != "" else name
                        }
                        res = requests.post(url_investor, json=param)
                        # conn.close()
                        return json.loads(res.text), json.dumps(
                            param, ensure_ascii=False, cls=util.CJsonEncoder)
                    except:
                        return None, None
        # conn.close()
        return None, None
    elif type == 3:
        try:
            # conn = db.connect_torndb()
            url_investor = url + '/open/xiniu/venture/institution'
            param = {"full_name": name, "short_name": name}
            res = requests.post(url_investor, json=param)
            # conn.close()
            return json.loads(res.text), json.dumps(param, ensure_ascii=False,
                                                    cls=util.CJsonEncoder)
        except:
            return None, None
    else:
        return None, None