def save_mongo_source_company_name(source, sourceId, scndata):
    """Attach a company name entry to the matching Mongo company document.

    The document is looked up in ``mongo.source.company`` by ``source`` and
    ``sourceId``.  If it exists and does not already hold an entry whose
    ``name`` equals the new one, the entry is added to its
    ``source_company_name`` array with ``$addToSet``.  Nothing is written
    when no document matches.

    :param source: value matched against the document's ``source`` field.
    :param sourceId: value matched against the document's ``sourceId`` field.
    :param scndata: raw name data; it is projected through
        ``populate_column(scndata, source_company_name_columns)`` and must
        yield a ``"name"`` key.
    """
    mongo = db.connect_mongo()
    try:
        collection_company = mongo.source.company
        item = populate_column(scndata, source_company_name_columns)
        # Flag the entry as Chinese ('Y') or not ('N') according to hz.
        item["chinese"] = 'Y' if hz.is_chinese_string(item["name"]) else 'N'

        record = collection_company.find_one({
            "source": source,
            "sourceId": sourceId
        })
        if record is not None:
            # Only add the name when no entry with this name is stored yet.
            source_company_name = collection_company.find_one({
                "source": source,
                "sourceId": sourceId,
                "source_company_name.name": item["name"]
            })
            if source_company_name is None:
                collection_company.update_one(
                    {"_id": record["_id"]},
                    {'$addToSet': {
                        "source_company_name": item
                    }})
    finally:
        # Always release the client, even when a lookup or update raises.
        mongo.close()
def save_source_company_name(source_company_id, name, type, nflag):
    """Persist a company name in SQL (for new companies) and in Mongo.

    Blank or ``None`` names are ignored.  When ``nflag == "new"`` the name is
    inserted into ``source_company_name`` unless a row with the same
    ``sourceCompanyId`` and ``name`` already exists.  Regardless of
    ``nflag``, the name is then handed to
    ``parser_mongo_util.save_mongo_source_company_name``.

    :param source_company_id: id of the owning source company.
    :param name: the name to store.
    :param type: name type code stored alongside the name.
    :param nflag: ``"new"`` enables the SQL insert path.
    """
    if name is None or name.strip() == "":
        return

    chinese = 'Y' if hz.is_chinese_string(name) else 'N'

    # new company
    if nflag == "new":
        conn = db.connect_torndb()
        try:
            n = conn.get(
                "select * from source_company_name where sourceCompanyId=%s and name=%s",
                source_company_id, name)
            if n is None:
                conn.insert(
                    "insert source_company_name(sourceCompanyId,name,type,chinese,createTime,modifyTime) values( \
                    %s,%s,%s,%s,now(),now())",
                    source_company_id, name, type, chinese)
        finally:
            # Close the connection even if the query or insert fails.
            conn.close()

    # Save all into mongo
    scname = {"name": name, "type": type, "chinese": chinese}
    parser_mongo_util.save_mongo_source_company_name(source_company_id, scname)
def get_short_name(unicode_name):
    """Return the leading part of a name, cut at the first separator.

    The stripped name is cut at each dash-like separator in turn.  If the
    result is a Chinese string (per ``hz.is_chinese_string``) it is further
    cut at a space, ``|`` and ``·``.  A separator at index 0 never triggers
    a cut, so the result is not emptied by a leading separator.
    """

    def _cut(text, sep):
        # Keep what precedes the first `sep`, unless `sep` is absent or leads.
        if text.find(sep) > 0:
            return text.split(sep)[0].strip()
        return text

    short = unicode_name.strip()
    # NOTE(review): the third separator is identical to the first as written;
    # possibly a different dash character was intended - confirm.
    for sep in (u"-", u"—", u"-"):
        short = _cut(short, sep)

    if hz.is_chinese_string(short):
        for sep in (u" ", u"|", u"·"):
            short = _cut(short, sep)
    return short
def save_source_company_name(source_company_id, name, type):
    """Insert a company name row unless it already exists.

    Blank or ``None`` names are ignored.  A row is inserted into
    ``source_company_name`` only when no row with the same
    ``sourceCompanyId`` and ``name`` is present.

    :param source_company_id: id of the owning source company.
    :param name: the name to store.
    :param type: name type code stored alongside the name.
    """
    if name is None or name.strip() == "":
        return

    conn = db.connect_torndb()
    try:
        n = conn.get(
            "select * from source_company_name where sourceCompanyId=%s and name=%s",
            source_company_id, name)
        if n is None:
            chinese = 'Y' if hz.is_chinese_string(name) else 'N'
            conn.insert(
                "insert source_company_name(sourceCompanyId,name,type,chinese,createTime,modifyTime) values( \
                %s,%s,%s,%s,now(),now())",
                source_company_id, name, type, chinese)
    finally:
        # Close the connection even if the query or insert fails.
        conn.close()
def english_name_check(unicode_name):
    """Classify a name as non-Chinese and, if so, as a company name or not.

    :returns: ``(english, company)``.  For a Chinese string (per
        ``hz.is_chinese_string``) this is ``(False, None)``.  Otherwise
        ``english`` is ``True`` and ``company`` tells whether the stripped,
        lower-cased name ends with a known corporate suffix.
    """
    if hz.is_chinese_string(unicode_name):
        return False, None

    corporate_suffixes = (
        "ltd",
        "ltd.",
        "inc",
        "inc.",
        "llc",
        "llc.",
        "limited",
        "corporation",
        "company",
    )
    lowered = unicode_name.strip().lower()
    return True, lowered.endswith(corporate_suffixes)
def name_check(unicode_name):
    """Classify a name as Chinese and, if so, as a company name or not.

    :returns: ``(chinese, company)``.  ``chinese`` is whatever
        ``hz.is_chinese_string`` returns.  ``company`` is ``None`` for
        non-Chinese names; otherwise it is ``True`` when the name contains
        one of the organisation keywords past its first character, or ends
        with one of the organisation suffix characters, else ``False``.
    """
    chinese = hz.is_chinese_string(unicode_name)
    if not chinese:
        return chinese, None

    # Keywords must appear after index 0 (find() > 0) to count.
    keywords = (u"公司", u"企业", u"中心", u"事务所", u"研究院", u"会社", u"合伙")
    suffixes = (u"所", u"部", u"会", u"院", u"社", u"店", u"馆", u"室", u"厂", u"楼")

    has_keyword = any(unicode_name.find(word) > 0 for word in keywords)
    company = has_keyword or unicode_name.endswith(suffixes)
    return chinese, company
def check_desc(content, length=5):
    """Heuristically decide whether a description looks meaningful.

    Returns ``False`` for ``None``/blank content, for content no longer than
    ``length`` characters, and for content whose share of distinct
    characters is too low; ``True`` otherwise.

    NOTE(review): the source layout was flattened; the ``else`` branch is
    assumed to pair with the Chinese/non-Chinese test - confirm.
    """
    if content is None or content.strip() == "":
        return False

    total = len(content)
    if total <= length:
        return False

    distinct = len(set(content))
    diversity = float(distinct) / float(total)

    if hz.is_chinese_string(content):
        # Very repetitive text whose variety comes mostly from the Chinese
        # character count is rejected.
        if diversity <= 0.1 and distinct - count_chinese(content) <= 20:
            return False
        return True

    if total <= 10:
        return False
    if distinct <= 20 and diversity <= 0.28:
        return False
    return True
def find_company_by_short_name(source_company, test=False):
    """Find an existing company matching ``source_company`` by short name.

    Candidates are companies whose name or alias equals one of the source
    company's names (its ``source_company_name`` rows of type 12020 plus its
    own ``name``).  A candidate is accepted as soon as one of these holds,
    checked in order:

    1. same (positive) ``locationId``;
    2. same establish year and month;
    3. a Chinese-named member shared with the source company;
    4. a funding investor shared with the source company.

    :param source_company: row/dict with at least ``id``, ``name``,
        ``locationId`` and ``establishDate``.
    :param test: forwarded to ``helper.get_table_names`` to pick table names.
    :returns: the id of the first matching company, or ``None``.
    """
    table_names = helper.get_table_names(test)
    logger.info("find_company_by_short_name")

    conn = db.connect_torndb()
    try:
        source_members = list(conn.query(
            "select m.* from source_company_member_rel r join source_member m on m.id=r.sourceMemberId where r.sourceCompanyId=%s",
            source_company["id"]))
        source_member_names = set(m["name"] for m in source_members)

        # Investor ids that took part in any funding of the source company.
        source_investor_ids = set()
        source_fundings = list(conn.query(
            "select * from source_funding where sourceCompanyId=%s",
            source_company["id"]))
        for sf in source_fundings:
            rels = list(conn.query(
                "select * from source_funding_investor_rel where sourceFundingId=%s",
                sf["id"]))
            for rel in rels:
                source_investor = conn.get(
                    "select * from source_investor where id=%s",
                    rel["sourceInvestorId"])
                if source_investor is None:
                    # Relation points at a missing investor row; skip it.
                    continue
                if source_investor["investorId"] is not None:
                    source_investor_ids.add(source_investor["investorId"])

        short_names = list(conn.query(
            "select * from source_company_name where type=12020 and sourceCompanyId=%s",
            source_company["id"]))
        sns = [s["name"] for s in short_names]
        if source_company["name"] not in sns:
            sns.append(source_company["name"])

        for short_name in sns:
            if short_name is None or short_name.strip() == "":
                continue
            short_name = short_name.strip()
            logger.info("short_name: %s", short_name)

            cs = list(conn.query(
                "select * from " + table_names["company"] +
                " where name=%s and (active is null or active !='N')",
                short_name))
            aliases = list(conn.query(
                "select a.companyId from " + table_names["company_alias"] +
                " a join " + table_names["company"] +
                " c on c.id=a.companyId " +
                "where (c.active is null or c.active!='N') and a.name=%s",
                short_name))

            # Name matches first, then alias matches; each id checked once.
            candidate_company_ids = []
            for company_id in ([c["id"] for c in cs] +
                               [a["companyId"] for a in aliases]):
                if company_id not in candidate_company_ids:
                    candidate_company_ids.append(company_id)

            for company_id in candidate_company_ids:
                company = conn.get(
                    "select * from " + table_names["company"] +
                    " where id=%s and (active is null or active='Y')",
                    company_id)
                if company is None:
                    continue

                # location
                location1 = source_company["locationId"]
                location2 = company["locationId"]
                if location1 is not None and location1 > 0 and \
                        location1 == location2:
                    logger.info("find_company_by_short_name, location")
                    return company_id

                # establish date
                date1 = source_company["establishDate"]
                date2 = company["establishDate"]
                if date1 is not None and date2 is not None and \
                        date1.year == date2.year and \
                        date1.month == date2.month:
                    logger.info("find_company_by_short_name, establish date")
                    return company_id

                # member
                members = list(conn.query(
                    "select m.* from " + table_names["company_member_rel"] +
                    " r join " + table_names["member"] +
                    " m on m.id=r.memberId where r.companyId=%s",
                    company_id))
                for member in members:
                    member_name = member["name"]
                    logger.info("member_name: %s", member_name)
                    if member_name is None or member_name == "":
                        continue
                    if not hz.is_chinese_string(member_name):
                        continue
                    if member_name in source_member_names:
                        logger.info("find_company_by_short_name, member")
                        return company_id

                # gongshang member
                # TODO

                # funding
                fundings = list(conn.query(
                    "select * from " + table_names["funding"] +
                    " where companyId=%s", company_id))
                for f in fundings:
                    rels = list(conn.query(
                        "select * from " +
                        table_names["funding_investor_rel"] +
                        " where fundingId=%s", f["id"]))
                    for rel in rels:
                        if rel["investorId"] in source_investor_ids:
                            logger.info("find_company_by_short_name, funding")
                            return company_id

        return None
    finally:
        # Runs on every exit path, including early returns and exceptions.
        conn.close()
def find_company_by_short_name(company):
    """Find another company that matches ``company`` by short name.

    Candidates are other companies whose name equals one of this company's
    names; when the module-level flag ``caflag`` is ``True``, this company's
    ``company_alias`` rows of type 12020 and candidates found via alias are
    included as well.  Candidates are examined from highest id to lowest and
    accepted as soon as one of these holds, checked in order:

    1. same (positive) ``locationId``;
    2. same establish year and month;
    3. a Chinese-named member shared with this company;
    4. a funding investor shared with this company.

    :param company: row/dict with at least ``id``, ``name``, ``locationId``
        and ``establishDate``.
    :returns: the id of the first matching company, or ``None``.
    """
    global caflag
    logger.info("find_company_by_short_name")

    conn = db.connect_torndb()
    try:
        members = list(conn.query(
            "select m.* from company_member_rel r join member m on m.id=r.memberId where r.companyId=%s",
            company["id"]))
        own_member_names = set(m["name"] for m in members)

        # Investor ids that took part in any funding of this company.
        investor_ids = set()
        fundings = list(conn.query(
            "select * from funding where companyId=%s", company["id"]))
        for f in fundings:
            rels = list(conn.query(
                "select * from funding_investor_rel where fundingId=%s",
                f["id"]))
            for rel in rels:
                investor_ids.add(rel["investorId"])

        sns = []
        # add company_alias into checking list
        if caflag is True:
            short_names = list(conn.query(
                "select * from company_alias where type=12020 and companyId=%s",
                company["id"]))
            for s in short_names:
                sns.append(s["name"])
        if company["name"] not in sns:
            sns.append(company["name"])

        for short_name in sns:
            if short_name is None or short_name.strip() == "":
                continue
            short_name = short_name.strip()
            logger.info("short_name: %s", short_name)

            candidate_company_ids = []
            cs = list(conn.query(
                "select * from company where name=%s and (active is null or active !='N') and id!=%s order by id desc ",
                short_name, company["id"]))
            for c in cs:
                candidate_company_ids.append(c["id"])

            # add caflag into checking list
            if caflag is True:
                aliases = list(conn.query(
                    "select a.companyId from company_alias a join company c on c.id=a.companyId where (c.active is null or c.active!='N') and a.name=%s and c.id!=%s",
                    short_name, company["id"]))
                for alias in aliases:
                    if alias["companyId"] not in candidate_company_ids:
                        candidate_company_ids.append(alias["companyId"])

            # sort id
            candidate_company_ids.sort(reverse=True)

            for company_id in candidate_company_ids:
                company_candidate = conn.get(
                    "select * from company where id=%s and (active is null or active='Y')",
                    company_id)
                if company_candidate is None:
                    continue

                # location
                location1 = company["locationId"]
                location2 = company_candidate["locationId"]
                if location1 is not None and location1 > 0 and \
                        location1 == location2:
                    logger.info("find_company_by_short_name, location")
                    return company_id

                # establish date
                date1 = company["establishDate"]
                date2 = company_candidate["establishDate"]
                if date1 is not None and date2 is not None and \
                        date1.year == date2.year and \
                        date1.month == date2.month:
                    logger.info("find_company_by_short_name, establish date")
                    return company_id

                # member
                members_candidate = list(conn.query(
                    "select m.* from company_member_rel r join member m on m.id=r.memberId where r.companyId=%s",
                    company_id))
                for member_candidate in members_candidate:
                    member_name = member_candidate["name"]
                    if member_name is None or member_name == "":
                        continue
                    if not hz.is_chinese_string(member_name):
                        continue
                    if member_name in own_member_names:
                        logger.info("find_company_by_short_name, member")
                        return company_id

                # gongshang member
                # TODO

                # funding
                fundings_candidate = list(conn.query(
                    "select * from funding where companyId=%s", company_id))
                for fc in fundings_candidate:
                    rels = list(conn.query(
                        "select * from funding_investor_rel where fundingId=%s",
                        fc["id"]))
                    for rel in rels:
                        if rel["investorId"] in investor_ids:
                            logger.info("find_company_by_short_name, funding")
                            return company_id

        return None
    finally:
        # Runs on every exit path, including early returns and exceptions.
        conn.close()
def process(url, key, content):
    """Parse one app detail page and store the extracted record.

    Pages that do not contain the marker text '360安全中心' are ignored.
    Otherwise the embedded ``var detail`` object and several DOM nodes are
    read to build an ``item`` dict, which is logged, passed to
    ``android.save`` and ``android.merge``, and the module-level ``LATEST``
    is raised to ``key`` when ``key`` is larger.

    :param url: page URL, stored as ``item["link"]``.
    :param key: page key, stored as ``key_int`` (and as a string in ``key``)
        and compared with ``LATEST``.
    :param content: raw page content; it is searched as text and decoded as
        UTF-8 before HTML parsing.
    """
    global LATEST
    if content.find('360安全中心') == -1:
        return
    #logger.info(content)

    # Extract the object literal returned by the page's `var detail` function.
    r = "var detail = \(function \(\) \{\s*?return\s*?(.*?);\s*?\}\)"
    result = util.re_get_result(r, content)
    # NOTE(review): this unpacking fails if the pattern did not match
    # (result is presumably None/empty then) - confirm.
    (b, ) = result
    # The literal uses single quotes; swap them so it parses as JSON.
    base = json.loads(b.replace("'", '"'), strict=False)
    name = base["sname"]
    type = base["type"]
    package = base["pname"].strip()
    #logger.info("%s, %s, %s" % (type, name, package))

    d = pq(html.fromstring(content.decode("utf-8")))

    # Description: text of div.breif up to the "【基本信息】" marker, falling
    # back to div#html-brief when that yields nothing.
    desc = ""
    try:
        # desc = d('div.breif').contents()[0].strip()
        desc = d('div.breif').text().strip()
        ts = desc.split("【基本信息】")
        desc = ts[0].strip()
    except:
        pass
    if desc == "":
        try:
            desc = d('div#html-brief').text().strip()
        except:
            pass
    #logger.info(desc)

    # The base-info table cells are read by position: 0 author, 1 modify
    # date, 2 version, 3 compatibility, 4 language.
    author = d('div.base-info> table> tbody> tr> td').eq(
        0).contents()[1].strip()
    chinese, is_company = name_helper.name_check(author)
    if chinese and is_company:
        author = name_helper.company_name_normalize(author)
    # NOTE(review): author is reset to None here, discarding the value parsed
    # above. The original indentation was lost, so this may have belonged
    # inside the `if` - confirm intent.
    author = None
    #logger.info(author)

    modify_date_str = d('div.base-info> table> tbody> tr> td').eq(
        1).contents()[1].strip()
    #logger.info(modify_date_str)
    modify_date = datetime.datetime.strptime(modify_date_str, "%Y-%m-%d")
    #logger.info(modify_date)

    # Version is optional; every "V" in it is removed when it starts with "V".
    versionname = None
    try:
        versionname = d('div.base-info> table> tbody> tr> td').eq(
            2).contents()[1].strip()
        if versionname.startswith("V"):
            versionname = versionname.replace("V", "")
    except:
        pass
    #logger.info(versionname)

    compatibility = d('div.base-info> table> tbody> tr> td').eq(
        3).contents()[1].strip()
    language = d('div.base-info> table> tbody> tr> td').eq(
        4).contents()[1].strip()
    # When the page says "其他" but the description is Chinese, record "中文".
    if language == "其他":
        if hz.is_chinese_string(desc):
            language = "中文"
    #logger.info(language)

    icon = d('div#app-info-panel> div> dl> dt >img').attr("src").strip()
    #logger.info(icon)

    # Screenshots come from a comma-separated attribute; absent -> empty list.
    screenshots = []
    try:
        screenshots = d('div#scrollbar').attr("data-snaps").split(",")
    except:
        pass

    commentbyeditor = None
    r = "<p><strong>【小编点评】</strong>(.*?)</p>"
    result = util.re_get_result(r, content)
    if result:
        (commentbyeditor, ) = result

    updates = None
    r = "<br/><b>【更新内容】</b><br/>(.*?)</div>"
    result = util.re_get_result(r, content)
    if result:
        (updates, ) = result
        updates = updates.replace("<br />", "\n").strip()

    # Tag link texts are joined by spaces; turn them into a comma list.
    tags = d("div.app-tags> a").text().replace(" ", ",")

    size = None
    r = "'size':'(.*?)'"
    result = util.re_get_result(r, content)
    if result:
        (size, ) = result
        size = int(size)

    # Download count: strip the label/suffix, then scale by the unit suffix
    # (千 = 1e3, 万 = 1e4, 亿 = 1e8).
    downloadstr = d("span.s-3").eq(0).text().replace("下载:", "").replace(
        "次", "").replace("+", "").strip()
    download = None
    try:
        if downloadstr.endswith("千"):
            download = float(downloadstr.replace("千", "")) * 1000
        elif downloadstr.endswith("万"):
            download = float(downloadstr.replace("万", "")) * 10000
        elif downloadstr.endswith("亿"):
            download = float(downloadstr.replace("亿", "")) * 10000 * 10000
        else:
            download = int(downloadstr)
        # NOTE(review): score is computed but never stored in `item`.
        score = float(d("span.s-1").text().replace("分", "").strip()) * 0.5
    except:
        traceback.print_exc()

    item = {
        "link": url,
        "apkname": package,
        "appmarket": APPMARKET,
        "name": name,
        "brief": None,
        "website": None,
        "description": desc,
        "commentbyeditor": commentbyeditor,
        "updateDate": modify_date,
        "language": language,
        "tags": tags,
        "version": versionname,
        "updates": updates,
        "size": size,
        "compatibility": compatibility,
        "icon": icon,
        "author": author,
        "screenshots": screenshots,
        "type": type,
        "key": str(key),
        "key_int": key,
        "download": download,
    }
    logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))
    android.save(collection, APPMARKET, item)
    android.merge(item)

    # Track the highest key processed so far.
    if LATEST < key:
        LATEST = key