def process():
    # Connect to the old and new bsppr MySQL instances and force utf8 on each session.
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old, 'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new, 'set names utf8')
    cinfos_old = get_cinfos(conn_old)
    #print cinfos_old
    cinfos_new = get_cinfos(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    # Pull up to 50 raw weixin records from MongoDB.
    tmpdatas = mongo.find(mongo_conn, tablename, {}, 50)
    rawdatas = []
    for raw in tmpdatas:
        date = raw['pubtime']
        now = datetime.datetime.now()
        diff = now - date
        print diff.days
        rawdatas.append(raw)
    if len(rawdatas) == 0:
        time.sleep(10)
    # Filter against each platform's object info and feed the matches into xpost.
    raw_old_qualified = filter(cinfos_old, rawdatas)
    old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified, 'old')
    raw_new_qualified = filter(cinfos_new, rawdatas)
    new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified, 'new')
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old, 'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new, 'set names utf8')
    cinfos_old = get_cinfos_moa(conn_old)
    cinfos_new = get_cinfos_moa(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    while True:
        # Pull up to 1000 raw weixin records, deleting each from MongoDB as it is consumed.
        tmpdatas = mongo.find(mongo_conn, tablename, {}, 1000)
        rawdatas = []
        for raw in tmpdatas:
            url = raw['url']
            mongo.delete(mongo_conn, tablename, {'url': url})
            date = raw['pubtime']
            now = datetime.datetime.now()
            diff = now - date
            # Skip anything older than two days.
            if diff.days > 2:
                continue
            rawdatas.append(raw)
        if len(rawdatas) == 0:
            print 'wait datas...'
            time.sleep(300)
        raw_old_qualified = filter(cinfos_old, rawdatas)
        if raw_old_qualified:
            old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified, 'old')
            print 'old_insert_num: ', old_insert_num
        raw_new_qualified = filter(cinfos_new, rawdatas)
        if raw_new_qualified:
            new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified, 'new')
            print 'new_insert_num: ', new_insert_num
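# A minimal entrypoint sketch (an assumption, not part of the original module):
# the looping variant of process() above runs forever, so it would typically be
# started directly as a script.
if __name__ == '__main__':
    process()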
def get_site_id(conn, url):
    domain = urlparse.urlparse(url).netloc
    sql = "select siteid,name from xsite where url='%s';" % (domain)
    data = mysql.query_one(conn, sql)
    if not data:
        isql = "insert into xsite(name,url) values('%s','%s');" % (domain, domain)
        mysql.insert(conn, isql)
        mysql.commit(conn)
        data = mysql.query_one(conn, sql)
    return data
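# A minimal usage sketch for get_site_id (illustrative only: the connection
# parameters simply mirror the pattern used elsewhere in this file, and it assumes
# mysql.query_one returns a (siteid, name) row tuple, as the [0]-indexing in the
# other functions here suggests).
def _demo_get_site_id():
    conn = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn, 'set names utf8')
    siteid, name = get_site_id(conn, 'http://example.com/some/post')
    print siteid, name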
def process(terrace):
    if terrace == 'new':
        conn = mysql.connect('bsppr', '192.168.241.32')
    else:
        conn = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn, 'set names utf8')
    mongo_conn = get_mongo_conn()
    tablename = 'comgeneral'
    while True:
        total = 0
        before_total = 0
        raw_total = 0
        tmpdatas = mongo.find(mongo_conn, tablename, {'terrace': terrace}, 200)
        print '...%s get data ...' % (terrace)
        rawdatas = []
        for raw in tmpdatas:
            raw_total += 1
            url = raw['url']
            mongo.delete(mongo_conn, tablename, {'url': url})
            datestr = raw['pubtime']
            if not datestr:
                continue
            # Fall back to the current timestamp when the record has no updatetime.
            updatetime = raw.get('updatetime', time.time())
            try:
                date = format_time(datestr, updatetime)
                #open(self.terrace,'a+').write('%s\n'%str(date))
                now = datetime.datetime.now()
                diff = now - date
            except:
                open('date_error.dat', 'a+').write('%s\t%s\n' % (datestr, url))
                continue
            # Skip anything older than two days.
            if diff.days > 2:
                continue
            raw.update({'pubtime': date})
            rawdatas.append(raw)
            before_total += 1
        if tmpdatas.count() == 0:
            break
        insert_num = feed_xpost.feed_data_to_xpost(conn, rawdatas, terrace)
        total += insert_num
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        open('total_%s.dat' % (terrace), 'a+').write('%s\t%s\n' % (now_str, total))
        open('raw_total_%s.dat' % (terrace), 'a+').write('%s\t%s\n' % (now_str, raw_total))
        open('before_total_%s.dat' % (terrace), 'a+').write('%s\t%s\n' % (now_str, before_total))
def fetch_query_results(self, query_string, num=200):
    index_path = '/disk1/kol_search_index/index'
    query_index = QueryEnvironment()
    query_index.addIndex(index_path)
    # Run query_string against the index and fetch up to num documents.
    #print query_string
    docs = query_index.runQuery(query_string, num)
    # Parse the returned documents.
    results = get_query_results(query_index, docs)
    datas = {}
    conn = ''
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    flag = 0
    for result in results:
        # Cap the cached result set at 200 entries.
        if flag >= 200:
            break
        data = {}
        userid = result['userid']
        site = result['site']
        relevance = process_relevance(result['relevance'])
        data.update({'userid': userid})
        data.update({'site': site})
        data.update({'relevance': relevance})
        datas.update({flag: data})
        flag += 1
    if datas:
        # Cache the serialized results keyed by "<keyword>#$#<site>".
        conn = mysql.connect('kol_search')
        results = json.dumps(datas)
        results = conn.escape_string(results)
        query = str(self.keyword) + '#$#' + str(self.site)
        sql = "insert into search_result_cache(query,result,update_time) values('%s','%s','%s');" % (
            query, results, now)
        #print sql
        mysql.insert(conn, sql)
        mysql.commit(conn)
        mysql.close(conn)
    query_index.close()
    return datas
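# A hedged companion sketch: reading a cached row back out of search_result_cache.
# It assumes the same project-local mysql wrapper used above (connect/query_one/close)
# and that query_one returns a row tuple; the helper name itself is illustrative only.
def _load_cached_results(keyword, site):
    conn = mysql.connect('kol_search')
    query = str(keyword) + '#$#' + str(site)
    sql = "select result from search_result_cache where query='%s';" % (conn.escape_string(query))
    row = mysql.query_one(conn, sql)
    mysql.close(conn)
    if not row:
        return {}
    # The result column holds the json.dumps() payload written by fetch_query_results.
    return json.loads(row[0])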
def process_item(self, item, spider):
    self.total_cnt += 1
    if isinstance(item, PageMetaItem):
        http_code = item['http_code']
        self.file.write(item['url'] + '\n')
        self.redis.add_url(item['url'])
        if http_code >= 200 and http_code < 300:
            self.total += 1
            try:
                # Roll over to a fresh DB file every nums_in_eachDBFile successful pages.
                if self.total % self.nums_in_eachDBFile == 0:
                    self.db.closeDb()
                    if os.path.exists(self.db_file):
                        shutil.move(self.db_file, self.dbfile_move_target)
                    else:
                        err = '+++no_db_file:', self.db_file
                        print err
                        log.msg(err, level=log.ERROR)
                    self._createNewDBFile()
                if item['url'] and item['content']:
                    self._writeDBFile(item)
            except:
                print '=URL=', item['url'], '=body=', item['content']
                info = sys.exc_info()
                print info[0], ":", info[1]
    elif isinstance(item, ReplycountItem):
        # Interaction counters go straight into the container table.
        url = item['url']
        appnameid = item['appnameid']
        replyCount = item.get('replyCount', 0)
        readnum = item.get('readnum', 0)
        likenum = item.get('likenum', 0)
        unlikenum = item.get('unlikenum', 0)
        playnum = item.get('playnum', 0)
        repostsnum = item.get('repostsnum', 0)
        updatetime = item.get('updatetime', 0)
        sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
        sql = sql % (url, appnameid, replyCount, readnum, likenum, unlikenum, playnum, repostsnum, updatetime)
        mysql.insert(self.conn, sql)
        #cursor = self.conn.cursor()
        #cursor.execute(sql)
        mysql.commit(self.conn)
    return oids


def get_cinfo_obejctid(conn_new, conn_old, terrace, objectid):
    sql = 'select objectid,name,limiter,synonyms,exclude_limiter from object where objectid=%s' % (objectid)
    if terrace == 'new':
        data = mysql.query_one(conn_new, sql)
    else:
        data = mysql.query_one(conn_old, sql)
    return data


mongo_conn = mongo.connect('192.168.241.12', 'stream')
conn_new, conn_old = get_conn()
mysql.insert(conn_new, 'set names utf8')
mysql.insert(conn_old, 'set names utf8')
tablename = 'general'
rawdatas = mongo.find(mongo_conn, tablename, {}, 30000)
csvwriter = csv.writer(open('general_filter.csv', 'w'))
# CSV header row.
csvwriter.writerow(['title', 'url', 'terrace_info', 'terrace'])
htmlwriter = open('general_filter.html', 'w')
html = '''
<html>
<head>
<title>平台过滤信息</title>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
</head>
<body>
'''
htmlwriter.write('%s\n' % (html))
        return -1
    insert_num = 0
    for qualified_data in qualified_datas:
        objectid = qualified_data['objectid']
        date = qualified_data['pubtime'].strftime('%Y-%m-%d')
        facet_sql = 'select id from xfacet where objectid=%s and type=1;' % (objectid)
        try:
            facetid = mysql.query_one(conn, facet_sql)[0]
        except Exception as e:
            #print e
            continue
        xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";' % (facetid, date)
        xentryid = mysql.query_one(conn, xentry_sql)
        if not xentryid:
            # Create the per-day entry for this facet if it does not exist yet.
            xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");' % (facetid, date)
            mysql.insert(conn, xentry_insert_sql)
            mysql.commit(conn)
            xentryid = mysql.query_one(conn, xentry_sql)
        try:
            xentryid = xentryid[0]
        except Exception as e:
            print e
            return -1
        # Posts are sharded across xpost0..xpost7 by entryid.
        xpostnum = xentryid % 8
        title, abstract, posttime, url, author, comment_count, click_count, template_type = (
            qualified_data['title'], '', qualified_data['pubtime'], qualified_data['url'],
            qualified_data['author'], 0, 0, qualified_data['type'])
        # Skip records whose url already exists under this entry.
        duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";' % (
            xpostnum, xentryid, conn.escape_string(url))
        cursor.execute(duplicate_sql)
        postid = cursor.fetchall()
        if postid:
            #print xpostnum, postid
            continue