def clear_items(self):
    global logger_sourcing
    file_path = os.path.join(
        os.path.split(os.path.realpath(__file__))[0], u'dumps/saoanzi.csv')
    data = []
    for anzi in dbutil.get_daily_saoanzi_sources(self.db, self.today):
        cactive = dbutil.get_company_active(self.db, anzi.companyId)
        need_verify = self.tcg.need_verify(anzi.companyId)
        if need_verify or (cactive != 'Y'):
            self.tcg.generate_tc(
                json.dumps({
                    'id': anzi.companyId,
                    'source': 'track_saoanzi'
                }))
            dbutil.update_saoanzi_item_status(self.db, anzi.saoanziItemId, 'P')
        elif not self.__valid_message(anzi):
            dbutil.update_saoanzi_item_status(self.db, anzi.saoanziItemId, 'N')
        else:
            dbutil.update_saoanzi_item_status(self.db, anzi.saoanziItemId, 'Y')
        url = "http://pro.xiniudata.com/validator/#/company/%s/overview" \
              % dbutil.get_company_code(self.db, anzi.companyId)
        # sources = ';'.join([s.name for s in dbutil.get_saoanzi_item_sources(self.db, anzi.id)])
        source = anzi.source
        need_verify = u'需要检查' if (need_verify or (cactive != 'Y')) else u'不需要检查'
        data.append([
            dbutil.get_company_name(self.db, anzi.companyId), url, need_verify,
            anzi.createTime, source
        ])
    if not data:
        return
    # send email
    data = pandas.DataFrame(data)
    data.to_csv(file_path, encoding='utf_8_sig')
    # stat_verify = {title: len(set(detail[0])) for title, detail in data.groupby(3)}
    stat_verify = '<br/>'.join([
        '%s\t%s' % (title, len(set(detail[0])))
        for title, detail in data.groupby(2)
    ])
    # stat_source = {title: len(detail) for title, detail in data.groupby(5)}
    stat_source = '<br/>'.join([
        '%s\t%s' % (title, len(detail)) for title, detail in data.groupby(4)
    ])
    stat = u'去重公司数<br/>%s<br/>每个源下的公司数<br/>%s\n' % (stat_verify, stat_source)
    receivers = ['victor', 'erin', 'weiguangxiao', 'gewei']
    receivers = ';'.join(['*****@*****.**' % r for r in receivers])
    title = u'扫案子项目列表 %s' % self.current_check_time.strftime('%Y-%m-%d %H')
    content = u'%s检查,今天共有%s个扫案子条目<br/>%s' % \
              (self.current_check_time.strftime('%Y-%m-%d %H:%M'), len(data), stat)
    send_mail_file(u'烯牛扫案子后台', u'烯牛扫案子后台', "*****@*****.**", receivers,
                   title, content, file_path)
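# A minimal sketch (made-up rows) of the two groupby calls above: the report
# DataFrame is built from plain lists, so its columns are the integers 0..4,
# meaning data.groupby(2) groups on the "need verify" label and data.groupby(4)
# groups on the source column.
def _demo_saoanzi_stats():
    import pandas
    data = pandas.DataFrame([
        [u'CompanyA', 'url-a', u'需要检查', '2019-01-01', 'src1'],
        [u'CompanyB', 'url-b', u'不需要检查', '2019-01-01', 'src1'],
        [u'CompanyA', 'url-c', u'需要检查', '2019-01-02', 'src2'],
    ])
    # distinct companies per verify bucket (column 0 holds the company name)
    stat_verify = {k: len(set(g[0])) for k, g in data.groupby(2)}
    # raw row count per source
    stat_source = {k: len(g) for k, g in data.groupby(4)}
    return stat_verify, stat_source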
def send_qmp_email():
    print('this time:%s to send email' % datetime.datetime.now())
    hour = time.localtime()[3]
    mongo = db.connect_mongo()
    collection = mongo.raw.qmp_rz_incr
    if hour == 8:
        items = list(collection.find().sort('createtime', -1).limit(50))
    else:
        date = datetime.date.today().strftime('%Y-%m-%d')
        items = list(collection.find({'date': date}))
    mongo.close()
    cnt = len(items)
    from_alias = 'Hush'
    reply_alias = 'Hush'
    reply_email = '*****@*****.**'
    # to = '*****@*****.**'
    to = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    print('*******')
    subject = '企名片日常融资事件'
    content = '<html>共<b>%d</b>起融资事件,请查看附件</html>' % cnt
    file = 'qmp_rz_day.xls'
    wb = xlwt.Workbook()
    ws = wb.add_sheet('A Work Sheet')
    ws.write(0, 0, 'Product')
    ws.write(0, 1, 'Lunci')
    ws.write(0, 2, 'Date')
    ws.write(0, 3, 'Source')
    ws.write(0, 4, 'Jianjie')
    i = 1
    for item in items:
        product = item.get('product')
        lunci = item.get('lunci')
        # Date = item.get('Date')
        date = item.get('news_time')
        # date = Date + ' ' + date
        jianjie = item.get('weiyu').decode('utf-8')
        source = item.get('qmp_url').decode('utf-8')
        # very long URLs are written as plain text (HYPERLINK formulas appear
        # to be length-limited)
        if len(source) > 255:
            sources = source
        else:
            n = "HYPERLINK"
            sources = xlwt.Formula(n + '("%s";"%s")' % (source, source))
        ws.write(i, 0, product)
        ws.write(i, 1, lunci)
        ws.write(i, 2, date)
        ws.write(i, 3, sources)
        ws.write(i, 4, jianjie)
        i += 1
    wb.save(file)
    email_helper.send_mail_file(from_alias, reply_alias, reply_email, to,
                                subject, content, file)
    print('done')
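# A hedged helper capturing the repeated pattern above (here and again in
# send_tzj_email below): emit a HYPERLINK formula for short URLs and fall back
# to the raw string past 255 characters. make_link_cell is a new name, not
# part of the original module.
def make_link_cell(url):
    if len(url) > 255:
        return url
    return xlwt.Formula('HYPERLINK("%s";"%s")' % (url, url))

# usage sketch: ws.write(i, 3, make_link_cell(source))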
def corp_merge3():
    tline = ""
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    n5 = 0
    n6 = 0
    n7 = 0
    conn = db.connect_torndb()
    cnames = conn.query(
        "select name,count(*) as cnt from corporate_alias where (active is null or active !='N') "
        "and name is not null and name!='' group by name having cnt>1")
    # cnames = conn.query("select fullName,count(*) as cnt from corporate where (active is null or active !='N') "
    #                     "and fullName='上海中慎网络科技有限公司' group by fullName having cnt>1")
    logger.info("total names: %s", len(cnames))
    for cname in cnames:
        pnames = []
        fundingFlag = False
        cfullFlag = True
        full_name = cname["name"]
        corporate_ids = []
        corporate_ids_f = []
        stockFlag = False
        if full_name is None or full_name.strip() == "" or full_name.strip() == "-" \
                or full_name.strip() == "个人" or full_name.strip() == "扎堆":
            continue
        corporate_aliases = conn.query(
            "select * from corporate_alias where name=%s and (active is null or active !='N')",
            full_name)
        for caa in corporate_aliases:
            ca = conn.get(
                "select * from corporate where (active is null or active !='N') and id=%s",
                caa["corporateId"])
            if ca is None:
                continue
            # if ca["fullName"] != full_name: continue
            c_stock = conn.get(
                "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                ca["id"])
            if c_stock is not None:
                stockFlag = True
                continue
            company = conn.get(
                "select * from company where corporateId=%s and (active is null or active='Y') limit 1",
                ca["id"])
            if company is not None:
                if ca["id"] not in corporate_ids:
                    corporate_ids.append(int(ca["id"]))
                if ca["fullName"] != full_name:
                    cfullFlag = False
            else:
                if ca["id"] not in corporate_ids_f:
                    corporate_ids_f.append(int(ca["id"]))
            funding = conn.get(
                "select * from funding where corporateId=%s and (active is null or active='Y') "
                "order by fundingDate desc limit 1", caa["corporateId"])
            if fundingFlag is False and funding is not None:
                fundingFlag = True
            if company is not None:  # guard: company can be None here
                pnames.append(company["name"])
        if len(corporate_ids) > 1 and stockFlag is False:
            if len(pnames) >= 2:
                vv = compare(pnames)
            else:
                vv = 0
            (chinese, company) = name_helper.name_check(full_name)
            if chinese is True:
                chinese_type = "Y"
                n5 += 1
                if fundingFlag is True:
                    n3 += 1
                if cfullFlag is True:
                    n4 += 1
                if vv <= 0.75:
                    n7 += 1
            else:
                chinese_type = "N"
                n6 += 1
            # do merge
            n += 1
            logger.info("merge:%s %s-> %s", full_name, corporate_ids, chinese_type)
            mflag = corporate_util.autoMerge(corporate_ids, full_name)
            # if mflag is None:
            #     logger.info("wrong")
            #     exit()
            if mflag == 1:
                n1 += 1
            else:
                n2 += 1
            # elif mflag == 2:
            #     n2 += 1
            # elif mflag == 3:
            #     n3 += 1
            # elif mflag == 4:
            #     n4 += 1
            # line = "%s+++%s+++%s\n" % (
            #     full_name, ";".join([str(id) for id in corporate_ids]), get_links(corporate_ids))
            # fp2.write(line)
            # else:
            c1 = "否"
            c2 = "否"
            c3 = "否"
            if len(corporate_ids_f) == 1:
                c1 = "是"
            if len(corporate_ids_f) == len(corporate_ids):
                c2 = "是"
            if len(corporate_ids_f) == 0:
                c3 = "是"
            line = "%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
                full_name, ";".join([str(id) for id in corporate_ids]),
                get_links(corporate_ids),
                "中文名" if chinese_type == 'Y' else "英文名",
                "有融资" if fundingFlag is True else "无融资",
                "公司主要名称一致" if cfullFlag is True else "公司别名一致",
                "短名高度相似" if vv <= 0.75 else "短名不相似",
                "可以根据verify自动聚合" if mflag == 1 else " ",
                c1, c2, c3)
            # fp2.write(line)
            tline += line
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    logger.info("merge num %s/%s/%s/%s/%s/%s/%s/%s", n, n1, n2, n3, n4, n5, n6, n7)
    content = '''<div>Dears, <br /><br />
    附件是目前系统中存在重复的公司,请在后台搜索
    </div>
    '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复公司检索--人工审查", content, path)
    conn.close()
def corp_merge2():
    fp2 = open("me.txt", "w")
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    conn = db.connect_torndb()
    cnames = conn.query(
        "select fullName,count(*) as cnt from corporate where (active is null or active !='N') "
        "and fullName is not null and fullName!='' group by fullName having cnt>1")
    # cnames = conn.query("select fullName,count(*) as cnt from corporate where (active is null or active !='N') "
    #                     "and fullName='上海中慎网络科技有限公司' group by fullName having cnt>1")
    for cname in cnames:
        full_name = cname["fullName"]
        corporate_ids = []
        stockFlag = False
        if full_name is None or full_name.strip() == "" or full_name.strip() == "-" \
                or full_name.strip() == "个人" or full_name.strip() == "扎堆":
            continue
        corporate_aliases = conn.query(
            "select * from corporate_alias where name=%s and (active is null or active !='N')",
            full_name)
        for caa in corporate_aliases:
            ca = conn.get(
                "select * from corporate where (active is null or active !='N') and id=%s",
                caa["corporateId"])
            if ca is None:
                continue
            if ca["fullName"] != full_name:
                continue
            c_stock = conn.get(
                "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                ca["id"])
            if c_stock is not None:
                stockFlag = True
                continue
            company = conn.get(
                "select * from company where corporateId=%s and (active is null or active!='N') limit 1",
                ca["id"])
            if company is not None:
                if ca["id"] not in corporate_ids:
                    corporate_ids.append(int(ca["id"]))
        if len(corporate_ids) > 1 and stockFlag is False:
            logger.info("merge:%s-> %s", full_name, corporate_ids)
            # do merge
            n += 1
            mflag = corporate_util.autoMerge(corporate_ids, full_name)
            if mflag is None:
                logger.info("wrong")
                exit()
            if mflag == 1:
                n1 += 1
            elif mflag == 2:
                n2 += 1
            elif mflag == 3:
                n3 += 1
            elif mflag == 4:
                n4 += 1
            line = "%s+++%s+++%s\n" % (full_name, ";".join(
                [str(id) for id in corporate_ids]), get_links(corporate_ids))
            fp2.write(line)
        else:
            line = "%s+++%s+++%s\n" % (full_name, ";".join(
                [str(id) for id in corporate_ids]), get_links(corporate_ids))
            fp2.write(line)
    logger.info("merge num %s/%s/%s/%s/%s", n4, n3, n2, n1, n)
    content = '''<div>Dears, <br /><br />
    附件是目前系统中存在重复的公司,请在后台搜索
    </div>
    '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["celine", "zhlong", "bamy"]]),
        "重复公司检索--人工审查", content, path)
    conn.close()
def run_week():
    mongo = db.connect_mongo()
    conn = db.connect_torndb()
    # boundary just after last Sunday (this Monday 00:00)
    endDate = (datetime.datetime.today() -
               datetime.timedelta(days=time.localtime().tm_wday))
    endDate = datetime.datetime(endDate.year, endDate.month, endDate.day)
    # last Monday 00:00
    startDate = (datetime.datetime.today() -
                 datetime.timedelta(days=time.localtime().tm_wday + 7))
    startDate = datetime.datetime(startDate.year, startDate.month, startDate.day)
    # query logged-in and anonymous users separately
    # ($regex takes a bare pattern in pymongo, not a JS-style /.../ literal)
    result_login = list(
        mongo.log.user_log.find(
            {
                '$and': [{'url_type': 'front'},
                         {'requestURL': {'$regex': '^/search'}},
                         {'user': {'$exists': False}},
                         {'time': {'$gt': startDate}},
                         {'time': {'$lt': endDate}}]
            }, {'_id': 0}))  # TODO
    result_tourist = list(
        mongo.log.user_log.find(
            {
                '$and': [{'url_type': 'front'},
                         {'requestURL': {'$regex': '^/search'}},
                         {'user': {'$exists': True}},
                         {'time': {'$gt': startDate}},
                         {'time': {'$lt': endDate}}]
            }, {'_id': 0}))  # TODO
    # combine both cohorts for the report
    result = result_login + result_tourist
    import pandas as pd
    df = pd.DataFrame(result)
    uids = [i['userId'] for i in result]
    result = conn.query(
        '''select u.id userId,u.username userName,o.name orgName
        from user u
        left join user_organization_rel r on r.userId=u.id
        left join organization o on r.organizationId=o.id
        where (r.active='Y' or r.active is null) and u.id in %s''', uids)
    df2 = pd.DataFrame(result)
    df3 = pd.merge(df, df2, on='userId', how='left')

    def keyword(x):
        if x.visitURL.find('open/') >= 0:
            keyword = x.visitURL.split('open/')[-1].strip()
        else:
            keyword = ''
        keyword = unquote(keyword.encode())
        return keyword.decode()

    df3['keyword'] = df3.apply(keyword, axis=1)
    df3['specialOrg'] = df3.apply(
        lambda x: ','.join(re.findall(u'烯牛|以太', x.orgName)), axis=1)
    df3 = df3[df3.specialOrg != '烯牛']
    fileName = 'search_weekly_report.xlsx'
    df3.to_excel(
        fileName,
        index=0,
        columns=['visitURL', 'userName', 'orgName', 'ip', 'time', 'keyword'])
    df3 = df3[df3.specialOrg == '']
    content = df3.orgName.value_counts().to_frame()[:10].to_html()
    content = '''<div>Dears, <br /><br />
    附件是上周的用户搜索记录,搜索量前10的机构为:
    </div>
    ''' + content
    content2 = df3.keyword.value_counts().to_frame()[:100].to_html()
    content2 = '''
    <div> <br />
    前100名的搜索词为(统计已过滤掉以太和烯牛成员的搜索数据):
    </div>
    ''' + content2
    content = content + content2
    # send_mail_file(from_alias, reply_alias, reply_email, to, subject, content, file)
    # '[email protected];[email protected]',
    recieveList = [
        'zhlong', 'jiaojunpeng', 'avery', 'arthur', 'bamy', 'celine', 'marchy',
        'haiming'
    ]
    # recieveList = ['zhlong']
    path = os.path.join(sys.path[0], fileName)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in recieveList]),
        "上周搜索周报(%s ~ %s)" % (startDate.strftime('%Y-%m-%d'),
                              (endDate + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')),
        content, path)
    mongo.close()
    conn.close()
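# The tm_wday arithmetic above picks out last week's [Monday 00:00, this
# Monday 00:00) window; a self-contained equivalent for reference (Monday is
# weekday 0, matching time.localtime().tm_wday):
def last_week_range(today=None):
    import datetime
    today = today or datetime.datetime.today()
    end = today - datetime.timedelta(days=today.weekday())
    end = datetime.datetime(end.year, end.month, end.day)    # this Monday 00:00
    start = end - datetime.timedelta(days=7)                 # last Monday 00:00
    return start, end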
def send_tzj_email():
    print('this time:%s to send email' % datetime.datetime.now())
    hour = time.localtime()[3]
    mongo = db.connect_mongo()
    collection = mongo.raw.tzj_rz_incr
    if hour == 8:
        items = list(collection.find().sort('createtime', -1).limit(50))
    else:
        date = datetime.date.today().strftime('%Y-%m-%d')
        items = list(collection.find({'date': date}))
    mongo.close()
    cnt = len(items)
    from_alias = 'Hush'
    reply_alias = 'Hush'
    reply_email = '*****@*****.**'
    to = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    # to = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    print('*******')
    subject = '投资界日常融资事件'
    content = '<html>共<b>%d</b>起融资事件,请查看附件</html>' % cnt
    file = 'tzj_rz_day.xls'
    wb = xlwt.Workbook()
    ws = wb.add_sheet('A Work Sheet', cell_overwrite_ok=True)
    ws.write(0, 0, 'Product')
    ws.write(0, 1, 'Lunci')
    ws.write(0, 2, 'Date')
    ws.write(0, 3, 'Pro_Source')
    ws.write(0, 4, 'Invest_Source')
    ws.write(0, 5, 'Investment')
    i = 1
    for item in items:
        product = item.get('product')
        lunci = item.get('lunci')
        date = item.get('date')
        pro_source = item.get('project_url').decode('utf-8')
        invest_source = item.get('invest_url').decode('utf-8')
        investr = item.get('investr')
        if len(pro_source) > 255:
            sources1 = pro_source
        else:
            n = "HYPERLINK"
            sources1 = xlwt.Formula(n + '("%s";"%s")' % (pro_source, pro_source))
        if len(invest_source) > 255:
            sources2 = invest_source
        else:
            n = "HYPERLINK"
            sources2 = xlwt.Formula(n + '("%s";"%s")' % (invest_source, invest_source))
        ws.write(i, 0, product)
        ws.write(i, 1, lunci)
        ws.write(i, 2, date)
        ws.write(i, 3, sources1)
        ws.write(i, 4, sources2)
        ws.write(i, 5, investr)
        i += 1
    wb.save(file)
    email_helper.send_mail_file(from_alias, reply_alias, reply_email, to,
                                subject, content, file)
    print('done')
def extract_data(investorId):
    tline = ""
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    collection_gongshang = mongo.info.gongshang
    oaliases = conn.query(
        "select * from investor_alias where (active is null or active='Y') and "
        "(verify is null or verify !='N') and investorId=%s", investorId)
    oanames = [
        alias["name"] for alias in oaliases
        if alias["name"] is not None and alias["type"] == 12010
    ]
    anames = []
    investorfs = conn.query(
        "select * from investor_fund where (active is null or active='Y') and "
        "(verify is null or verify !='N') and investorId=%s;", investorId)
    for investorf in investorfs:
        amacf = "是" if investorf["amacFundId"] is not None else "否"
        of = "是" if investorf["fullName"] in oanames else "否"
        item = collection_gongshang.find_one({'name': investorf["fullName"]})
        if item is not None and "invests" in item and len(item["invests"]) > 0:
            numiv = len(item["invests"])
        else:
            numiv = "0"
        line = "%s+++%s+++%s+++%s+++%s\n" % (
            investorf["fullName"], investorf["memo"], amacf, of, numiv)
        tline += line
        if investorf["fullName"] not in anames:
            anames.append(investorf["fullName"])
    investorgs = conn.query(
        "select * from investor_gp where (active is null or active='Y') and "
        "(verify is null or verify !='N') and investorId=%s;", investorId)
    for investorg in investorgs:
        amacf = "是" if investorg["amacManagerId"] is not None else "否"
        of = "是" if investorg["fullName"] in oanames else "否"
        item = collection_gongshang.find_one({'name': investorg["fullName"]})
        if item is not None and "invests" in item and len(item["invests"]) > 0:
            numiv = len(item["invests"])
        else:
            numiv = "0"
        line = "%s+++%s+++%s+++%s+++%s\n" % (
            investorg["fullName"], investorg["memo"], amacf, of, numiv)
        tline += line
        if investorg["fullName"] not in anames:
            anames.append(investorg["fullName"])
    tline += "\n\n"
    for oal in oaliases:
        if oal["name"] is None:
            continue
        if oal["name"] in anames:
            continue
        if oal["type"] != 12010:
            continue
        if amac_util.find_amac_manager(oal["name"]) is not None or \
                amac_util.find_amac_fund(oal["name"]) is not None:
            amacf = "是"
        else:
            amacf = "否"
        createUser = oal["createUser"] if oal["createUser"] is not None else " "
        item = collection_gongshang.find_one({'name': oal["name"]})
        if item is not None and "legalPersonName" in item and \
                item["legalPersonName"].strip() not in ["", "-", "—"]:
            lp = item["legalPersonName"]
        else:
            lp = " "
        if item is not None and "invests" in item and len(item["invests"]) > 0:
            numiv = len(item["invests"])
            ivnames = [inv["name"] for inv in item["invests"]]
            ivnamesstr = ";".join(ivnames)
            ivnamesnn = [
                inv["name"] for inv in item["invests"] if inv["name"] in anames
            ]
            ivnamesnnstr = ";".join(ivnamesnn) if len(ivnamesnn) > 0 else "无"
        else:
            numiv = "0"
            ivnamesstr = "无"
            ivnamesnnstr = "无"
        line = "%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
            oal["name"], createUser, amacf, lp, numiv, ivnamesstr, ivnamesnnstr)
        tline += line
    mongo.close()
    logger.info("%s - %s - %s", investorId, len(oanames),
                len(investorfs) + len(investorgs))
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    content = '''<div>Dears, <br /><br />
    附件是目前系统中存在重复的公司,请在后台搜索
    </div>
    '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复机构检索--人工审查", content, path)
    conn.close()
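# me.txt above is a '+++'-separated flat file; a minimal reader for it, so
# reviewers can load the report back into Python (read_report is a new helper
# name, not part of the original module):
def read_report(path="me.txt"):
    rows = []
    with open(path) as fp:
        for raw in fp:
            raw = raw.strip()
            if raw:
                rows.append(raw.split("+++"))
    return rows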
def dup_alias():
    tline = ""
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    conn = db.connect_torndb()
    cnames = conn.query(
        "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
        "and name is not null and name!='' and type=12010 group by name having cnt>1")
    logger.info(len(cnames))
    for cname in cnames:
        investor_ids = []
        investor_ids_un = []
        investor_aids_ver = []
        investor_as = conn.query(
            "select * from investor_alias where name=%s and (active is null or active !='N') and type=12010",
            cname["name"])
        for ia in investor_as:
            investor = conn.get(
                "select * from investor where (active is null or active !='N') and id=%s",
                ia["investorId"])
            if investor is not None:
                investor_ids.append(investor["id"])
                if investor["id"] not in investor_ids_un:
                    investor_ids_un.append(investor["id"])
                if ia["verify"] == "Y":
                    investor_aids_ver.append(ia["id"])
        if len(investor_ids) > 1:
            n += 1
            logger.info("dup:%s -> %s", cname["name"], investor_ids)
            aa = "否"
            ab = "否"
            ac = "否"
            # line = "%s+++%s+++%s\n" % (cname["name"], ";".join([str(id) for id in investor_ids]), get_links(investor_ids))
            # tline += line
            if len(investor_ids_un) == 1:
                logger.info("dup:%s -> %s -- %s", cname["name"], investor_ids,
                            "for same investor")
                aa = "是"
                ssinv = conn.get(
                    "select * from investor_alias where investorId=%s and name=%s and (active is null or active !='N') limit 1",
                    investor_ids_un[0], cname["name"])
                logger.info("here we want to save: %s", ssinv["id"])
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where id!=%s and type=12010 and name=%s",
                    ssinv["id"], cname["name"])
                # exit()
                n1 += 1
            if len(investor_aids_ver) == 1:
                logger.info("dup:%s -> %s -- %s %s", cname["name"], investor_ids,
                            "for one verify", investor_aids_ver[0])
                ab = "是"
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where id!=%s and type=12010 and name=%s",
                    investor_aids_ver[0], cname["name"])
                # exit()
                n2 += 1
            if len(investor_aids_ver) == 0:
                logger.info("dup:%s -> %s -- %s", cname["name"], investor_ids,
                            "for None verify")
                ac = "是"
                sid = investor_ids[0]
                f = 0
                for iid in investor_ids:
                    iinv = conn.get(
                        "select * from investor where (active is null or active !='N') and id=%s",
                        iid)
                    if iinv["fundingCntFrom2017"] > f:
                        f = iinv["fundingCntFrom2017"]
                        sid = iid
                logger.info("here we want to save: %s", sid)
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where investorId!=%s and type=12010 and name=%s",
                    sid, cname["name"])
                # exit()
                n3 += 1
            line = "%s+++%s+++%s+++%s+++%s+++%s\n" % (
                cname["name"], ";".join([str(id) for id in investor_ids]),
                get_links(investor_ids), aa, ab, ac)
            tline += line
    logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    content = '''<div>Dears, <br /><br />
    附件是目前系统中存在重复的公司,请在后台搜索
    </div>
    '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复机构检索--人工审查", content, path)
    conn.close()
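# Survivor choice used above when no alias is verified: keep the investor with
# the most fundings since 2017. Extracted as a pure function for clarity
# (pick_survivor is a new name; rows are dicts shaped like the investor table):
def pick_survivor(investors):
    best = investors[0]
    for inv in investors[1:]:
        if inv["fundingCntFrom2017"] > best["fundingCntFrom2017"]:
            best = inv
    return best["id"]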
def kuohao_alias():
    tline = ""
    conn = db.connect_torndb()
    n = 0
    n1 = 0
    n2 = 0
    n3 = 0
    n4 = 0
    # cnames = conn.query("select * from investor_alias where (active is null or active !='N') and name like %s", '%(%')
    cnames = conn.query(
        "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
        "and (name like %s or name like %s) group by name", '%(%', '%)%')
    for cname in cnames:
        wname = cname["name"]
        investors = conn.query(
            "select * from investor_alias where (active is null or active !='N') and name=%s",
            wname)
        for inv in investors:
            if inv["type"] != 12010:
                continue
            wid = inv["investorId"]
            investor = conn.get(
                "select * from investor where (active is null or active !='N') and id=%s",
                wid)
            if investor is None:
                continue
            n1 += 1
            # logger.info("*****************name:%s", inv["name"])
            mnames = [wname.replace("(", "(").replace(")", ")").strip()]
            # csameiid = ""
            investor_ids = []
            for mname in mnames:
                # i0 = conn.get("select * from investor_alias where name=%s and (active is null or active !='N') and "
                #               "investorId=%s limit 1", mname, wid)
                i0 = None
                if i0 is None:
                    i1s = conn.query(
                        "select * from investor_alias where name=%s and (active is null or active !='N')",
                        mname)
                    for i1 in i1s:
                        iv1 = conn.get(
                            "select * from investor where (active is null or active !='N') and id=%s",
                            i1["investorId"])
                        if iv1 is not None and iv1["id"] not in investor_ids:
                            investor_ids.append(iv1["id"])
                else:
                    if wid not in investor_ids:
                        investor_ids.append(wid)
            if len(investor_ids) > 0:
                if wid in investor_ids and len(investor_ids) == 1:
                    csameiid = "同一机构"
                    n2 += 1
                    conn.update(
                        "update investor_alias set active='N',modifyUser=-561 where id=%s",
                        inv["id"])
                else:
                    csameiid = "多个机构"
                    n3 += 1
                    line = "%s+++%s+++%s\n" % (cname["name"], ";".join([
                        str(id) for id in [str(wid)] + investor_ids
                    ]), get_links([str(wid)] + investor_ids))
                    tline += line
                logger.info("%s - %s - %s - %s", wname, str(wid),
                            ";".join([str(id) for id in investor_ids]), csameiid)
                n += 1
            else:
                (chinese, cccompany) = name_helper.name_check(mnames[0])
                if chinese is True:
                    n4 += 1
                    logger.info("update!!!!!")
                    conn.update(
                        "update investor_alias set name=%s,modifyUser=-561 where id=%s",
                        mnames[0], inv["id"])
    logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    fp2 = open("me.txt", "w")
    fp2.write(tline)
    content = '''<div>Dears, <br /><br />
    附件是目前系统中存在重复的公司,请在后台搜索
    </div>
    '''
    fp2.close()
    path = os.path.join(sys.path[0], "me.txt")
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]),
        "重复机构检索--人工审查", content, path)
    conn.close()
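# The dedup above hinges on mapping full-width parentheses to their ASCII
# forms before comparing alias names; the same normalization as a reusable
# helper (normalize_parens is a new name, not part of the original module):
def normalize_parens(name):
    return name.replace(u"(", u"(").replace(u")", u")").strip()

# e.g. normalize_parens(u"红杉资本(中国)") == u"红杉资本(中国)"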
def run_week():
    mongo = db.connect_mongo()
    conn = db.connect_torndb()
    # boundary just after last Sunday (this Monday 00:00)
    endDate = (datetime.datetime.today() -
               datetime.timedelta(days=time.localtime().tm_wday))
    endDate = datetime.datetime(endDate.year, endDate.month, endDate.day)
    # last Monday 00:00
    startDate = (datetime.datetime.today() -
                 datetime.timedelta(days=time.localtime().tm_wday + 7))
    startDate = datetime.datetime(startDate.year, startDate.month, startDate.day)
    # the -8h shift converts the local (UTC+8) window to the UTC timestamps
    # stored in Mongo
    result = list(
        mongo.log.page_view.find(
            {
                '$and': [{'router': 'search'},
                         {'time': {'$gt': startDate - datetime.timedelta(hours=8)}},
                         {'time': {'$lt': endDate - datetime.timedelta(hours=8)}}]
            }, {'_id': 0}))
    import pandas as pd
    df = pd.DataFrame(result)
    df['time2'] = df.apply(lambda x: x.time + datetime.timedelta(hours=8), axis=1)
    uids = [i['userId'] for i in result]
    result = conn.query(
        '''select u.id userId,u.username userName,o.name orgName
        from user u
        left join user_organization_rel r on r.userId=u.id
        left join organization o on r.organizationId=o.id
        where (r.active='Y' or r.active is null) and u.id in %s''', uids)
    df2 = pd.DataFrame(result)
    df3 = pd.merge(df, df2, on='userId', how='left')

    def keyword(x):
        if x.visitURL.find('open/') >= 0:
            keyword = x.visitURL.split('open/')[-1].strip()
        else:
            keyword = ''
        keyword = unquote(keyword.encode())
        return keyword.decode()

    df3['keyword'] = df3.apply(keyword, axis=1)
    df3['specialOrg'] = df3.apply(
        lambda x: ','.join(re.findall(u'烯牛|以太', x.orgName))
        if pd.notnull(x.orgName) else '',
        axis=1)
    df3 = df3[df3.specialOrg != '烯牛']
    # scrub ASCII control characters that openpyxl cannot write
    for c in df3.columns:

        def illegal(row):
            import re
            content = row[c]
            if content is not None:
                ILLEGAL_CHARACTERS_RE = re.compile(
                    r'[\000-\010]|[\013-\014]|[\016-\037]')
                # print 'content:', c, content
                try:
                    content = ILLEGAL_CHARACTERS_RE.sub(r'', content)
                except:
                    pass
            return content

        # print 'c:', c
        df3[c] = df3.apply(illegal, axis=1)
    fileName = 'search_weekly_report.xlsx'
    df3.to_excel(
        fileName,
        index=0,
        columns=['visitURL', 'userName', 'orgName', 'ip', 'time2', 'keyword'])
    df3 = df3[df3.specialOrg == '']
    content = df3.orgName.value_counts().to_frame()[:10].to_html()
    content = '''<div>Dears, <br /><br />
    附件是上周的用户搜索记录,搜索量前10的机构为:
    </div>
    ''' + content
    content2 = df3.keyword.value_counts().to_frame()[:100].to_html()
    content2 = '''
    <div> <br />
    前100名的搜索词为(统计已过滤掉以太和烯牛成员的搜索数据):
    </div>
    ''' + content2
    content = content + content2
    # send_mail_file(from_alias, reply_alias, reply_email, to, subject, content, file)
    # '[email protected];[email protected]',
    recieveList = [
        'avery', 'arthur', 'marchy', 'weiguangxiao', 'jiaojunpeng', 'charlotte',
        'erin', 'jinglei', 'zhlong', 'bamy'
    ]
    # recieveList = ['zhlong','jiaojunpeng']
    path = os.path.join(sys.path[0], fileName)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in recieveList]),
        "机构版(pro)上周搜索周报(%s ~ %s)" % (startDate.strftime('%Y-%m-%d'),
                                       (endDate + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')),
        content, path)
    mongo.close()
    conn.close()
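# openpyxl refuses cells containing ASCII control characters, which is what
# the per-column scrubbing above works around; the same idea as a standalone
# helper with the regex compiled once instead of once per cell (strip_illegal
# is a new name, not part of the original module):
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

def strip_illegal(value):
    if isinstance(value, basestring):  # Python 2; use str under Python 3
        return ILLEGAL_CHARACTERS_RE.sub('', value)
    return value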
def run_week():
    mongo = db.connect_mongo()
    conn = db.connect_torndb()
    # boundary just after last Sunday (this Monday 00:00)
    endDate = (datetime.datetime.today() -
               datetime.timedelta(days=time.localtime().tm_wday))
    endDate = datetime.datetime(endDate.year, endDate.month, endDate.day)
    # last Monday 00:00
    startDate = (datetime.datetime.today() -
                 datetime.timedelta(days=time.localtime().tm_wday + 7))
    startDate = datetime.datetime(startDate.year, startDate.month, startDate.day)
    result = list(
        mongo.log.user_log.find(
            {'$and': [{'url_type': 'front'},
                      {'requestURL': {'$regex': '/search'}},
                      {'time': {'$gt': startDate - datetime.timedelta(hours=8)}},
                      {'time': {'$lt': endDate - datetime.timedelta(hours=8)}}]},
            {'_id': 0}))
    import pandas as pd
    df = pd.DataFrame(result)
    df['time2'] = df.apply(lambda x: x.time + datetime.timedelta(hours=8), axis=1)
    uids = [i.get('userId') for i in result]
    result = conn.query(
        '''select u.id userId,u.username userName,o.name orgName
        from user u
        left join user_organization_rel r on r.userId=u.id
        left join organization o on r.organizationId=o.id
        where (r.active='Y' or r.active is null) and u.id in %s''', uids)
    df2 = pd.DataFrame(result)
    df3 = pd.merge(df, df2, on='userId', how='left')

    def keyword(x):
        if x.requestURL.find('search') >= 0:
            # keyword = x.requestURL.split('search/')[-1].split('&&name=')[-1].strip()
            keyword = x.requestURL.split('search/')[-1].split('&&name=')[-1] \
                .split('/search?name=')[-1].strip()
        else:
            keyword = ''
        keyword = unquote(keyword.encode())
        try:
            keyword = keyword.decode()
        except:
            keyword = ''
        return keyword

    df3['keyword'] = df3.apply(keyword, axis=1)
    # df3['keyword'] = df3.apply(lambda x: '(空搜索)' if pd.isnull(x.keyword) or x.keyword in ['/search', ''] else x.keyword,
    #                            axis=1)
    df3['specialOrg'] = df3.apply(
        lambda x: ','.join(re.findall(u'烯牛|以太', x.orgName))
        if pd.notnull(x.orgName) else '',
        axis=1)
    # df3 = df3[df3.specialOrg != '烯牛']
    # scrub ASCII control characters that openpyxl cannot write
    for c in df3.columns:

        def illegal(row):
            import re
            content = row[c]
            if content is not None:
                ILLEGAL_CHARACTERS_RE = re.compile(
                    r'[\000-\010]|[\013-\014]|[\016-\037]')
                # print 'content:',c,content
                try:
                    content = ILLEGAL_CHARACTERS_RE.sub(r'', content)
                except:
                    pass
            return content

        # print 'c:',c
        df3[c] = df3.apply(illegal, axis=1)
    fileName = 'personal_search_weekly_report.xlsx'
    df3.to_excel(
        fileName,
        index=0,
        columns=['requestURL', 'userName', 'orgName', 'ip', 'time2', 'keyword'])
    hs = conn.query('''select * from hot_search limit 10''')
    hsString, updateTime = ','.join([i['name'] for i in hs]), hs[0]['modifyTime']
    # df3 = df3[df3.specialOrg == '']
    content2 = df3.keyword.value_counts().to_frame()[:100].to_html()
    content2 = '''
    <div>
    <div>Dears, <br /><br />
    附件是上周个人版的用户搜索记录: <br /><br />
    1、上周个人版用户总计搜索了 <b>%s</b> 次 <br /><br />
    2、这一周的热门搜索词是:<b>%s</b>;更新时间:%s <br /><br />
    3、前100名的搜索词为:
    </div>
    ''' % (df3.count()['code'], hsString, updateTime) + content2
    content = content2
    # send_mail_file(from_alias, reply_alias, reply_email, to, subject, content, file)
    # '[email protected];[email protected]',
    # Avery, Arthur, Marchy, 广肖, 小娇, Charlotte, 刘林, 荆雷
    recieveList = [
        'avery', 'arthur', 'marchy', 'weiguangxiao', 'jiaojunpeng', 'charlotte',
        'erin', 'jinglei', 'zhlong', 'bamy'
    ]
    # recieveList = ['zhlong']  # todo
    path = os.path.join(sys.path[0], fileName)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in recieveList]),
        "个人版上周搜索周报(%s ~ %s)" % (startDate.strftime('%Y-%m-%d'),
                                 (endDate + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')),
        content, path)
    mongo.close()
    conn.close()
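# A quick standalone check of the keyword extraction above (the sample URL is
# made up; %E7%83%AF%E7%89%9B is the UTF-8 percent-encoding of 烯牛):
def _demo_keyword():
    from urllib import unquote  # Python 2 location
    url = '/search?name=%E7%83%AF%E7%89%9B'
    kw = url.split('search/')[-1].split('&&name=')[-1].split('/search?name=')[-1].strip()
    return unquote(kw).decode('utf-8')  # -> u'烯牛'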
def process_one(org_id, thedate=None, test=True):
    if thedate is None:
        today = datetime.datetime.now()
    else:
        today = thedate
    # last Saturday 21:00
    startDate = (today - datetime.timedelta(days=time.localtime().tm_wday + 2))
    start_time = datetime.datetime(startDate.year, startDate.month,
                                   startDate.day, 21)
    # this Saturday 21:00
    endDate = (today - datetime.timedelta(days=time.localtime().tm_wday - 5))
    end_time = datetime.datetime(endDate.year, endDate.month, endDate.day, 21)
    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    df, _ = data_code.run(conn, mongo, start_time.strftime("%Y-%m-%d"),
                          end_time.strftime("%Y-%m-%d"))
    df = df[(df.publishDateMerge >= start_time) & (df.publishDateMerge < end_time)]
    # map the English column names (second line) to the Chinese headers (first line)
    nameMap = {}
    string = u'''首次披露时间 项目名称 领域 是否国内 一句话简介 完整简介 融资详情
publishDateMerge companyName sector location brief description investmentDetail'''
    stringrows = string.split('\n')
    index = 0
    for column in stringrows[1].split():
        nameMap[column] = stringrows[0].split()[index]
        index += 1
    df = df.rename(columns=nameMap)
    title = "烯牛数据融资事件表(%s ~ %s)" % (start_time.strftime("%m-%d"),
                                     end_time.strftime("%m-%d"))
    fileName = "funding (%s ~ %s).xlsx" % (start_time.strftime("%m-%d"),
                                           end_time.strftime("%m-%d"))
    from openpyxl import load_workbook
    import pandas as pd
    writer = pd.ExcelWriter(fileName, engine='openpyxl')  # keyword is 'engine', not 'engin'
    book = load_workbook('template/template.xlsx')
    ws = book.active
    ws['b9'] = u'数据包含了%s至%s一周的国内外融资事件。' % (start_time.strftime("%Y年%m月%d日"),
                                          end_time.strftime("%Y年%m月%d日"))
    writer.book = book
    df.to_excel(excel_writer=writer, sheet_name=u"数据明细", index=0,
                columns=stringrows[0].split())
    writer.save()
    writer.close()
    # path = '/data/task-201606/spider2/aggregator/funding'
    path = sys.path[0]
    path = os.path.join(path, fileName)
    # plain-text body; immediately replaced by the full HTML template below
    content = '''Hello,<br /><br />
    以下是本周(%s ~ %s)披露的国内外投融资事件列表,请查收!''' % (start_time.strftime("%m-%d"),
                                              end_time.strftime("%m-%d"))
    content = '''<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office"><head> <title></title> <!--[if !mso]><!-- --> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <!--<![endif]--><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><style type="text/css"> #outlook a { padding: 0; } .ReadMsgBody { width: 100%; } .ExternalClass { width: 100%; } .ExternalClass * { line-height:100%; } body { margin: 0; padding: 0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; } table, td { border-collapse:collapse; mso-table-lspace: 0pt; mso-table-rspace: 0pt; } img { border: 0; height: auto; line-height: 100%; outline: none; text-decoration: none; -ms-interpolation-mode: bicubic; } p { display: block; margin: 13px 0; }</style><!--[if !mso]><!--><style type="text/css"> @media only screen and (max-width:480px) { @-ms-viewport { width:320px; } @viewport { width:320px; } }</style><!--<![endif]--><!--[if mso]><xml> <o:OfficeDocumentSettings> <o:AllowPNG/> <o:PixelsPerInch>96</o:PixelsPerInch> </o:OfficeDocumentSettings></xml><![endif]--><!--[if lte mso 11]><style type="text/css"> .outlook-group-fix { width:100% !important; }</style><![endif]--><!--[if !mso]><!--> <link href="https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700" rel="stylesheet" type="text/css"> <style type="text/css"> @import url(https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700); </style> <!--<![endif]--><style type="text/css"> @media only screen and (min-width:480px) { .mj-column-per-100 { width:100%!important; } }</style></head><body style="background: #FFFFFF;"> <div class="mj-container" style="background-color:#FFFFFF;"><!--[if mso | IE]> <table role="presentation" border="0" cellpadding="0" cellspacing="0" width="600" align="center" style="width:600px;"> <tr> <td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;"> <![endif]--><div style="margin:0px auto;max-width:600px;background:以上为本次追踪内容。 如有疑问,欢迎联系我们:) 烯牛数据团队 www.xiniudata.com;"><table role="presentation" cellpadding="0" cellspacing="0" style="font-size:0px;width:100%;background:以上为本次追踪内容。 如有疑问,欢迎联系我们:) 烯牛数据团队 www.xiniudata.com;" align="center" border="0"><tbody><tr><td style="text-align:center;vertical-align:top;direction:ltr;font-size:0px;padding:9px 0px 9px 0px;"><!--[if mso | IE]> <table role="presentation" border="0" cellpadding="0" cellspacing="0"> <tr> <td style="vertical-align:top;width:600px;"> <![endif]--><div class="mj-column-per-100 outlook-group-fix" style="vertical-align:top;display:inline-block;direction:ltr;font-size:13px;text-align:left;width:100%;"><table role="presentation" cellpadding="0" cellspacing="0" width="100%" border="0"><tbody><tr><td style="word-wrap:break-word;font-size:0px;padding:0px 20px 0px 20px;" align="center"><div style="cursor:auto;color:#000000;font-family:Ubuntu, Helvetica, Arial, sans-serif;font-size:11px;line-height:22px;text-align:center;"><p>Hi ,</p><p>附件是本周披露的国内外投融资事件列表,请查收!</p><p></p><p></p><p>如有疑问,欢迎联系我们:)</p><p>烯牛数据团队</p><p><a href="http://sctrack.sc.gg/track/click/eyJtYWlsbGlzdF9pZCI6IDAsICJ0YXNrX2lkIjogIiIsICJlbWFpbF9pZCI6ICIxNTMwMTgzNjU0NDk1XzYwMTE0XzgxNjRfNDczOS5zYy0xMF85XzRfNDAtaW5ib3VuZDAkYXJ0aHVyQHhpbml1ZGF0YS5jb20iLCAic2lnbiI6ICJkNWQ5MjZhM2I3YWM3M2E2NDQwMTMwYzRlZjUzYTg1NiIsICJ1c2VyX2hlYWRlcnMiOiB7fSwgImxhYmVsIjogMCwgImxpbmsiOiAiaHR0cCUzQS8vd3d3Lnhpbml1ZGF0YS5jb20iLCAidXNlcl9pZCI6IDYwMTE0LCAiY2F0ZWdvcnlfaWQiOiAxMTI1OTh9.html" target="_blank">www.xiniudata.com</a></p><p><img src="http://www.xiniudata.com/resources/image/icon-system/verify/ios-normal.jpeg"></p><p></p></div></td></tr></tbody></table></div><!--[if mso | IE]> </td></tr></table> <![endif]--></td></tr></tbody></table></div><!--[if mso | IE]> </td></tr></table> <![endif]--></div></body></html>
    '''
    users = conn.query(
        "select * from org_track_user "
        "where active='Y' and orgId=%s", org_id)
    for user in users:
        if user["email"] is None or user["email"].strip() == "":
            continue
        if test is True:
            if user["email"] not in ["*****@*****.**"]:
                continue
        logger.info("%s", user["email"])
        # email_helper.send_mail("烯牛数据", "烯牛数据", "*****@*****.**", user["email"], title, content)
        email_helper.send_mail_file("烯牛数据", "烯牛数据", "*****@*****.**",
                                    user["email"], title, content, path)
        # pass
    conn.close()
    mongo.close()
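# Typical invocation, under the assumption that org ids come from the
# organization table (the id below is a placeholder); with test=True only the
# whitelisted address receives mail:
if __name__ == '__main__':
    process_one(1, test=True)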