Пример #1
0
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old, 'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new, 'set names utf8')
    cinfos_old = get_cinfos(conn_old)
    #print cinfos_old
    cinfos_new = get_cinfos(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    tmpdatas = mongo.find(mongo_conn, tablename, {}, 50)
    rawdatas = []
    for raw in tmpdatas:
        date = raw['pubtime']
        now = datetime.datetime.now()
        diff = now - date
        print diff.days
        rawdatas.append(raw)
    if len(rawdatas) == 0:
        time.sleep(10)
    raw_old_qualified = filter(cinfos_old, rawdatas)
    old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified,
                                                   'old')
    raw_new_qualified = filter(cinfos_new, rawdatas)
    new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified,
                                                   'new')
Пример #2
0
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old,'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new,'set names utf8')
    cinfos_old = get_cinfos_moa(conn_old)
    cinfos_new = get_cinfos_moa(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    while True:
	tmpdatas = mongo.find(mongo_conn, tablename, {},1000)
        rawdatas = []
        for raw in tmpdatas:
	    url = raw['url']
            mongo.delete(mongo_conn, tablename, {'url':url})
            date = raw['pubtime']
            now = datetime.datetime.now()
            diff = now - date
            if diff.days>2:
                continue
            rawdatas.append(raw)
        if len(rawdatas)==0:
	    print 'wait datas...'
            time.sleep(300)
        raw_old_qualified = filter(cinfos_old,rawdatas)
	if raw_old_qualified:
            old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified,'old')
	    print 'old_insert_num: ',old_insert_num
        raw_new_qualified = filter(cinfos_new,rawdatas)
	if raw_new_qualified:
            new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified,'new')
	    print 'new_insert_num: ',new_insert_num
Пример #3
0
def get_site_id(conn, url):
    """Return the (siteid, name) row for *url*'s domain, inserting a new
    xsite row the first time the domain is seen.

    FIX: the netloc comes from untrusted URLs but was interpolated into
    SQL unescaped; it is now escaped with conn.escape_string, matching
    how the rest of this module guards interpolated strings.
    """
    domain = urlparse.urlparse(url).netloc
    # Escape before interpolating -- the domain is attacker-controlled.
    safe_domain = conn.escape_string(domain)
    sql = "select siteid,name from xsite where url='%s';" % (safe_domain)
    data = mysql.query_one(conn, sql)
    if not data:
        # First sighting: register the domain (name == url == domain).
        isql = "insert into xsite(name,url) values('%s','%s');" % (safe_domain, safe_domain)
        mysql.insert(conn, isql)
        mysql.commit(conn)
        data = mysql.query_one(conn, sql)
    return data
Пример #4
0
def get_site_id(conn, url):
    """Return the (siteid, name) row for *url*'s domain, inserting a new
    xsite row the first time the domain is seen.

    FIX: the netloc comes from untrusted URLs but was interpolated into
    SQL unescaped; it is now escaped with conn.escape_string, matching
    how the rest of this module guards interpolated strings.
    """
    domain = urlparse.urlparse(url).netloc
    # Escape before interpolating -- the domain is attacker-controlled.
    safe_domain = conn.escape_string(domain)
    sql = "select siteid,name from xsite where url='%s';" % (safe_domain)
    data = mysql.query_one(conn, sql)
    if not data:
        # First sighting: register the domain (name == url == domain).
        isql = "insert into xsite(name,url) values('%s','%s');" % (safe_domain,
                                                                   safe_domain)
        mysql.insert(conn, isql)
        mysql.commit(conn)
        data = mysql.query_one(conn, sql)
    return data
Пример #5
0
def process(terrace):
    if terrace=='new':
        conn = mysql.connect('bsppr', '192.168.241.32')
    else:
        conn = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn,'set names utf8')
    mongo_conn = get_mongo_conn()
    tablename = 'comgeneral'
    while True:
        total = 0
	before_total = 0
        raw_total = 0
        tmpdatas = mongo.find(mongo_conn, tablename, {'terrace':terrace},200)
        print '...%s get data ...'%(terrace)
        rawdatas = []
        for raw in tmpdatas:
            raw_total += 1
            url = raw['url']
            mongo.delete(mongo_conn, tablename, {'url':url})
            datestr = raw['pubtime']
            if not datestr:
                continue
            updatetime = raw.get('updatetime',time.time)
            try:
                date = format_time(datestr,updatetime)
                #open(self.terrace,'a+').write('%s\n'%str(date))
                now = datetime.datetime.now()
                diff = now - date
            except:
                open('date_error.dat','a+').write('%s\t%s\n'%(datestr,url))
                continue
            if diff.days>2:
                continue
            raw.update({'pubtime':date})
            rawdatas.append(raw)
	    before_total += 1
        if tmpdatas.count()==0:
            break
        insert_num = feed_xpost.feed_data_to_xpost(conn, rawdatas,terrace)
        total += insert_num

        open('total_%s.dat'%(terrace),'a+').write('%s\t%s\n'%(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),total))
        open('raw_total_%s.dat'%(terrace),'a+').write('%s\t%s\n'%(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),raw_total))
        open('before_total_%s.dat'%(terrace),'a+').write('%s\t%s\n'%(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),before_total))
Пример #6
0
 def fetch_query_results(self, query_string, num=200):
     """Run *query_string* against the KOL search index and return up to
     200 hits as {rank: {'userid', 'site', 'relevance'}}.

     Non-empty result sets are cached in the search_result_cache MySQL
     table under the key "<keyword>#$#<site>".

     FIX: query_index.close() was only reached when results were cached,
     leaking the index handle whenever the query came back empty; it now
     runs unconditionally via try/finally. Also dropped the duplicate
     `flag = 0` and the dead `conn = ""` initialization.
     """
     index_path = "/disk1/kol_search_index/index"
     query_index = QueryEnvironment()
     query_index.addIndex(index_path)
     try:
         # Run the query and parse the raw hits.
         docs = query_index.runQuery(query_string, num)
         results = get_query_results(query_index, docs)
         datas = {}
         now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         for rank, result in enumerate(results):
             # Hard cap at 200 entries regardless of `num`.
             if rank >= 200:
                 break
             datas[rank] = {
                 "userid": result["userid"],
                 "site": result["site"],
                 "relevance": process_relevance(result["relevance"]),
             }
         if datas:
             conn = mysql.connect("kol_search")
             payload = conn.escape_string(json.dumps(datas))
             query = str(self.keyword) + "#$#" + str(self.site)
             sql = "insert into search_result_cache(query,result,update_time) values('%s','%s','%s');" % (
                 query,
                 payload,
                 now,
             )
             mysql.insert(conn, sql)
             mysql.commit(conn)
             mysql.close(conn)
     finally:
         # BUG FIX: always release the index handle.
         query_index.close()
     return datas
Пример #7
0
    def process_item(self, item, spider):
        """Scrapy pipeline hook: persist crawled items.

        PageMetaItem: record the URL, mark it seen in redis, and -- for
        2xx responses -- append the page body to the current DB file,
        rotating to a fresh file every ``nums_in_eachDBFile`` pages.
        ReplycountItem: insert the item's engagement counters into the
        MySQL ``container`` table.
        """
        self.total_cnt += 1
        if isinstance(item, PageMetaItem):
            http_code = item['http_code']
            # Log every crawled URL and remember it for deduplication.
            self.file.write(item['url'] + '\n')
            self.redis.add_url(item['url'])
            if http_code >= 200 and http_code < 300:
                self.total += 1
                try:
                    # Rotate the DB file once it holds enough pages.
                    if self.total % self.nums_in_eachDBFile == 0:
                        self.db.closeDb()
                        if os.path.exists(self.db_file):
                            shutil.move(self.db_file, self.dbfile_move_target)
                        else:
                            # NOTE: this builds a tuple, so `print err`
                            # shows the tuple repr -- presumably intended
                            # as a quick diagnostic.
                            err = '+++no_db_file:', self.db_file
                            print err
                            log.msg(err, level=log.ERROR)
                        self._createNewDBFile()

                    if item['url'] and item['content']:
                        self._writeDBFile(item)
                except:
                    # Best-effort logging; storage errors are swallowed so
                    # the crawl keeps running.
                    print '=URL=', item['url'], '=body=', item['content']
                    info = sys.exc_info()
                    print info[0], ":", info[1]
        elif isinstance(item, ReplycountItem):
            url = item['url']
            appnameid = item['appnameid']
            # Missing counters default to 0.
            replyCount = item.get('replyCount', 0)
            readnum = item.get('readnum', 0)
            likenum = item.get('likenum', 0)
            unlikenum = item.get('unlikenum', 0)
            playnum = item.get('playnum', 0)
            repostsnum = item.get('repostsnum', 0)
            updatetime = item.get('updatetime', 0)
            # NOTE(review): values are interpolated into SQL unescaped --
            # confirm upstream sanitizes `url`.
            sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
            sql = sql % (url, appnameid, replyCount, readnum, likenum,
                         unlikenum, playnum, repostsnum, updatetime)
            mysql.insert(self.conn, sql)
            #cursor = self.conn.cursor()
            #cursor.execute(sql)
            mysql.commit(self.conn)
Пример #8
0
    def process_item(self, item, spider):
        self.total_cnt += 1
        if isinstance(item, PageMetaItem):
            http_code = item['http_code']
            self.file.write(item['url']+'\n')
            self.redis.add_url(item['url'])
            if http_code >= 200 and http_code < 300:
                self.total += 1
                try:
                    if self.total % self.nums_in_eachDBFile == 0:
                        self.db.closeDb()
                        if os.path.exists(self.db_file):
                            shutil.move(self.db_file,self.dbfile_move_target)
                        else:
                            err = '+++no_db_file:',self.db_file
                            print err
                            log.msg(err,level=log.ERROR)
                        self._createNewDBFile()
 
                    if item['url'] and item['content']:
                        self._writeDBFile(item)
                except:
                    print '=URL=',item['url'],'=body=',item['content']
                    info=sys.exc_info()
                    print info[0],":",info[1]
        elif isinstance(item,ReplycountItem):
            url = item['url']
            appnameid = item['appnameid']
            replyCount = item.get('replyCount',0)
            readnum = item.get('readnum',0) 
            likenum = item.get('likenum',0) 
            unlikenum = item.get('unlikenum',0) 
            playnum = item.get('playnum',0) 
            repostsnum = item.get('repostsnum',0) 
            updatetime =  item.get('updatetime',0)
            sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
            sql = sql%(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime)
            mysql.insert(self.conn, sql)
	    #cursor = self.conn.cursor()
	    #cursor.execute(sql)
            mysql.commit(self.conn)
Пример #9
0
 def fetch_query_results(self, query_string, num=200):
     """Run *query_string* against the KOL search index and return up to
     200 hits as {rank: {'userid', 'site', 'relevance'}}.

     Non-empty result sets are cached in the search_result_cache MySQL
     table under the key "<keyword>#$#<site>".

     FIX: query_index.close() was only reached when results were cached,
     leaking the index handle whenever the query came back empty; it now
     runs unconditionally via try/finally. Also dropped the duplicate
     `flag = 0` and the dead `conn = ''` initialization.
     """
     index_path = '/disk1/kol_search_index/index'
     query_index = QueryEnvironment()
     query_index.addIndex(index_path)
     try:
         # Run the query and parse the raw hits.
         docs = query_index.runQuery(query_string, num)
         results = get_query_results(query_index, docs)
         datas = {}
         now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         for rank, result in enumerate(results):
             # Hard cap at 200 entries regardless of `num`.
             if rank >= 200:
                 break
             datas[rank] = {
                 'userid': result['userid'],
                 'site': result['site'],
                 'relevance': process_relevance(result['relevance']),
             }
         if datas:
             conn = mysql.connect('kol_search')
             payload = conn.escape_string(json.dumps(datas))
             query = str(self.keyword) + '#$#' + str(self.site)
             sql = "insert into search_result_cache(query,result,update_time) values('%s','%s','%s');" % (
                 query, payload, now)
             mysql.insert(conn, sql)
             mysql.commit(conn)
             mysql.close(conn)
     finally:
         # BUG FIX: always release the index handle.
         query_index.close()
     return datas
Пример #10
0
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old,'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new,'set names utf8')
    cinfos_old = get_cinfos(conn_old)
    #print cinfos_old
    cinfos_new = get_cinfos(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    tmpdatas = mongo.find(mongo_conn, tablename, {},50)
    rawdatas = []
    for raw in tmpdatas:
	date = raw['pubtime']
	now = datetime.datetime.now()
	diff = now - date
	print diff.days
	rawdatas.append(raw)
    if len(rawdatas)==0:
        time.sleep(10)
    raw_old_qualified = filter(cinfos_old,rawdatas)
    old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified,'old')
    raw_new_qualified = filter(cinfos_new,rawdatas)
    new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified,'new')
Пример #11
0
    return oids


def get_cinfo_obejctid(conn_new, conn_old, terrace, objectid):
    """Fetch the object row (objectid, name, limiter, synonyms,
    exclude_limiter) for *objectid* from the terrace's database.

    'new' selects conn_new; anything else falls back to conn_old.
    (The misspelled public name is kept for callers.)
    """
    sql = 'select objectid,name,limiter,synonyms,exclude_limiter from object where objectid=%s' % (
        objectid)
    source = conn_new if terrace == 'new' else conn_old
    return mysql.query_one(source, sql)


# Script setup: dump filter information for recent `general` stream
# documents into CSV and HTML reports.
mongo_conn = mongo.connect('192.168.241.12', 'stream')
conn_new, conn_old = get_conn()
# Force utf8 on both MySQL sessions before any queries.
mysql.insert(conn_new, 'set names utf8')
mysql.insert(conn_old, 'set names utf8')
tablename = 'general'
# Pull up to 30000 raw documents to report on.
rawdatas = mongo.find(mongo_conn, tablename, {}, 30000)
csvwriter = csv.writer(open('general_filter.csv', 'w'))
# NOTE(review): this writes ONE cell containing the comma-joined header,
# not four columns -- confirm that is intended.
csvwriter.writerow(['title,url,terrace_info,terrace'])
htmlwriter = open('general_filter.html', 'w')
# Static HTML header (the title is "platform filter info" in Chinese).
html = '''
<html>
    <head>
	<title>平台过滤信息</title>
        <meta http-equiv="content-type" content="text/html;charset=utf-8">
    </head>
    <body>
'''
htmlwriter.write('%s\n' % (html))
Пример #12
0
        return -1
    insert_num = 0
    for qualified_data in qualified_datas:
        objectid = qualified_data['objectid']
        date = qualified_data['pubtime'].strftime('%Y-%m-%d')
        facet_sql = 'select id from xfacet where objectid=%s and type=1;'%(objectid)
	try:
            facetid = mysql.query_one(conn, facet_sql)[0]
	except Exception,e:
	    #print e
	    continue
        xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";'%(facetid,date)
        xentryid = mysql.query_one(conn, xentry_sql)
        if not xentryid:
            xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");'%(facetid,date)
            mysql.insert(conn, xentry_insert_sql)
            mysql.commit(conn)
            xentryid = mysql.query_one(conn, xentry_sql)
        try:
            xentryid = xentryid[0]
        except Exception,e:
            print e
            return -1
        xpostnum = xentryid%8
        title,abstract,posttime,url,author,comment_count,click_count,template_type = qualified_data['title'],'',qualified_data['pubtime'],qualified_data['url'],qualified_data['author'],0,0,qualified_data['type']
        duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";'%(xpostnum,xentryid,conn.escape_string(url))
        cursor.execute(duplicate_sql)
        postid = cursor.fetchall()
        if postid:
	    #print xpostnum,postid
            continue
Пример #13
0
 objectid = qualified_data['objectid']
 date = qualified_data['pubtime'].strftime('%Y-%m-%d')
 facet_sql = 'select id from xfacet where objectid=%s and type=1;' % (
     objectid)
 try:
     facetid = mysql.query_one(conn, facet_sql)[0]
 except Exception, e:
     #print e
     continue
 xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";' % (
     facetid, date)
 xentryid = mysql.query_one(conn, xentry_sql)
 if not xentryid:
     xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");' % (
         facetid, date)
     mysql.insert(conn, xentry_insert_sql)
     mysql.commit(conn)
     xentryid = mysql.query_one(conn, xentry_sql)
 try:
     xentryid = xentryid[0]
 except Exception, e:
     print e
     return -1
 xpostnum = xentryid % 8
 title, abstract, posttime, url, author, comment_count, click_count, template_type = qualified_data[
     'title'], '', qualified_data['pubtime'], qualified_data[
         'url'], qualified_data['author'], 0, 0, qualified_data['type']
 duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";' % (
     xpostnum, xentryid, conn.escape_string(url))
 cursor.execute(duplicate_sql)
 postid = cursor.fetchall()