import hashlib
import json
import time

import gevent
import pymongo

# Project helpers assumed to be importable from elsewhere in the codebase:
# unicode2hash, str2hash, url2tpl, time2id, update_index, web_content, sim.


def save_words(handler, article, words, ext):
    redis_word = handler.redis_word
    mongo = handler.mongo
    # If the article was indexed before, roll back its old word counts.
    if 'word' in article and article['word']:
        row = mongo.word_file.get(article['word'])
        if row:
            row = json.loads(row)
            if row['sim'] == True:
                row['words'] = json.loads(row['words'])
                for word, cnt in row['words']['all'].iteritems():
                    word = word.lower()
                    hkey = unicode2hash(word)
                    key = hkey % 500
                    redis_word.hincrby(key, hkey, -1)
                    redis_word.incr('total', -1)
    # Count the new words into the sharded document-frequency hashes.
    if article['sim'] == True:
        for word, cnt in words['all'].iteritems():
            word = word.lower()
            hkey = unicode2hash(word)
            key = hkey % 500
            redis_word.hincrby(key, hkey, 1)
            redis_word.incr('total', 1)
    if 'id' not in article or not article['id']:
        article['pubdate'], article['id'] = time2id(handler, article['pubtime'])
    else:
        article['pubdate'] = time.strftime('%Y%m%d',
                                           time.localtime(article['pubtime']))
    article['words'] = article['_id']
    words = {'words': words, 'sim': article['sim']}
    words, article['tags'] = update_index(handler, article, words)
    mongo.word_file.put(article['_id'], json.dumps(words))
    web_article = {
        '_id': article['_id'],
        'id': article['id'],
        'long': article['long'],
        'title': article['title'],
        'domain': article['domain'],
        'src_name': article['src_name'],
        'src_link': article['src_link'],
        'tags': article['tags'],
        'icons': article['icons'],
        'url': article['url'],
        'sim': article['sim'],
        'pubtime': article['pubtime'],
        'last': article['last'],
    }
    content = web_content(handler, article, words, ext)
    web_article['content'] = mongo.text_file.put('web_%s' % article['_id'],
                                                 content.encode('utf-8'), 'txt')
    # Retry until the write succeeds; OperationFailure here is transient.
    while True:
        try:
            mongo.article.save(web_article)
            break
        except pymongo.errors.OperationFailure:
            time.sleep(1)
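# ---------------------------------------------------------------------------
# Sketch (not part of the original module): both passes in save_words above
# repeat one pattern -- shard a word's document-frequency counter across 500
# Redis hashes keyed by unicode2hash(word) % 500, plus a global 'total'.
# Factored out here for clarity, assuming the same redis_word client and
# unicode2hash helper used above.

WORD_SHARDS = 500


def adjust_word_counts(redis_word, all_words, delta):
    # all_words is the words['all'] mapping of word -> count; delta is +1/-1.
    for word, cnt in all_words.iteritems():
        word = word.lower()
        hkey = unicode2hash(word)
        redis_word.hincrby(hkey % WORD_SHARDS, hkey, delta)
        redis_word.incr('total', delta)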
def word2num(self, word):
    word = word.lower()
    hkey = unicode2hash(word)
    key = hkey % 500
    num = self.redis.hget(key, hkey)
    num = int(num) if num is not None else 0
    return float(max(1, num))
def score(self, word):
    # A word only scores once its document frequency reaches 5.
    word = word.lower()
    hkey = unicode2hash(word)
    key = hkey % 500
    df = self.redis.hget(key, hkey)
    df = int(df) if df is not None else 0
    return df >= 5
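# ---------------------------------------------------------------------------
# Sketch (assumption, not in the original source): word2num and score above
# read per-word document frequencies, so an IDF-style weight could be built
# from them together with the 'total' counter that save_words maintains.
# Illustrative only; the project's real scoring code is not shown here.
import math


def word_idf(index, word):
    # index is an instance of the class that owns word2num/score.
    total = float(index.redis.get('total') or 1)
    return math.log(max(total, 1.0) / index.word2num(word))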
def find(self, word, last=None, limit=20, fields=None):
    word = word.lower()
    whash = unicode2hash(word)
    count = self.keys.find({'word': whash}).count()
    if last is None:
        return count, list(
            self.keys.find({'word': whash}, fields=fields)
                .sort([('pubtime', pymongo.DESCENDING)])
                .limit(limit))
    last = self.keys.find_one({'word': whash, 'article': last}, {'pubtime': 1})
    if last is None:
        return count, []
    pubtime = last['pubtime']
    topic = self.keys.find(
        {'word': whash, 'pubtime': {'$lt': pubtime}},
        fields=fields).sort([('pubtime', pymongo.DESCENDING)])
    return count, list(topic.limit(limit))
def find_page(self, word, page, limit=20, fields=None):
    # page is 1-based: page 1 -> skip 0.
    skip = page * limit - limit
    word = word.lower()
    whash = unicode2hash(word)
    topic = self.keys.find({'word': whash}, fields=fields) \
        .sort([('pubtime', pymongo.DESCENDING)])
    return topic.count(), list(topic.skip(skip).limit(limit))
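# Usage sketch for the two query paths above ('index' stands for an instance
# of the owning class; word and field values are illustrative):
#
#   # cursor style: first batch, then continue after the last seen article id
#   count, rows = index.find(u'python', limit=20)
#   count, more = index.find(u'python', last=rows[-1]['article'])
#
#   # offset style: 1-based page numbers
#   count, rows = index.find_page(u'python', page=1, limit=20)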
def new(self, url, src_type, src, task, last=0):
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    xlong = unicode2hash(url)
    tpl = url2tpl(url)
    # Skip URLs with an unknown template, already-queued URLs, and URLs
    # the per-domain url set already contains.
    if tpl not in self.domain.tpls \
            or key in self \
            or self.domains.add_url(xlong, self.domain.id()) == 0:
        return 0
    article = {
        '_id': key,
        'id': '',
        'long': xlong,
        'url': url,
        'domain': self.domain.id(),
        'tpl': tpl,
        'src_type': src_type,
        'src': src,
        'html': '',
        'title': '',
        'pages': {},
        'imgs': {},
        'icons': {},
        'tags': [],
        'sim': False,
        'f': False,
        'version': 0,
        'v': self.articles.new_version(),
        'created': time.time(),
        'last': time.time(),
    }
    if src_type == 'cate':
        article['src_link'] = task['url']
        article['src_name'] = task['name']
        # Fall back to a 60-day-old pubtime when the listing gives none.
        if last > 0:
            article['pubtime'] = last
        else:
            article['pubtime'] = time.time() - 86400 * 60
    else:
        article['src_link'] = self.domain.domain['link']
        article['src_name'] = self.domain.domain['name']
        article['pubtime'] = task['pubtime'] - 86400 * 15
    article['pubtime'] = self.get_pubtime(article)
    if self.next < article['pubtime']:
        self.next = article['pubtime']
    self.updates.add(article['_id'])
    self.queue.put(article)
    return 1
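# Usage sketch (hypothetical values): enqueue an article URL discovered on a
# category listing page. For src_type 'cate', task carries the listing page's
# url/name; for other source types, task must carry a 'pubtime'.
#
#   task = {'url': u'http://example.com/news/', 'name': u'Example News'}
#   spider.new(u'http://example.com/news/2013/04/01/123.html',
#              'cate', src='rss', task=task, last=0)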
def add(self, word, id, imgs, pubtime, icons):
    word = word.lower()
    whash = unicode2hash(word)
    res = self.index.find_one({'_id': word})
    row = {
        '_id': str2hash('%d-%s' % (whash, id)),
        'article': id,
        'word': whash,
        'imgs': imgs,
        'pubtime': pubtime,
        'rank': self.rank(imgs, pubtime),
    }
    if res is None:
        res = {
            '_id': word,
            'word': whash,
            'rank': 0,
            'count': 0,
            'icon': '',
            'icon_time': 0,
            'auto': True,
        }
    if res['count'] >= 1500:
        # Trim the word's index back to its 1000 best-ranked rows; the
        # 1000th row's rank becomes the floor a new row must beat.
        words = list(self.keys.find({'word': whash})
                     .sort([('rank', -1)]).skip(999).limit(1))
        if words:
            self.keys.remove({'word': whash,
                              'rank': {'$lt': words[0]['rank']}})
            res['rank'] = words[0]['rank']
            res['count'] = 1000
    if row['rank'] > res['rank']:
        # Refresh the word's icon at most once every three days.
        if icons and res['auto'] == True and pubtime - res['icon_time'] > 3 * 86400:
            res['icon'] = icons.pop()
            res['icon_time'] = pubtime
        self.keys.save(row)
        res['count'] += 1
    self.index.save(res)
def _upgrade_word(row):
    # Migration helper: re-store a word file and rebuild its Redis counters.
    row['words'] = json.loads(row['words'])
    spider.word_file.put(row['_id'], json.dumps({
        'sim': row['sim'],
        'words': row['words'],
    }))
    if row['sim'] == True:
        words = row['words']
        for word, cnt in words['all'].iteritems():
            word = word.lower()
            hkey = unicode2hash(word)
            key = hkey % 500
            redis_word.hincrby(key, hkey, 1)
            redis_word.incr('total', 1)
def merger_pages(self, article, pages):
    # Rebuild article['pages'] from the freshly crawled page list: pages we
    # already have keep their crawl state, new pages start in 'wait', and
    # stored HTML is deleted for pages that disappeared.
    tmp_pages = article['pages']
    article['pages'] = {}
    for page in pages:
        md5 = hashlib.md5(page.encode('utf-8')).hexdigest()
        xlong = unicode2hash(page)
        self.domains.add_url(xlong, article['domain'])
        if md5 in tmp_pages:
            article['pages'][md5] = tmp_pages[md5]
        else:
            article['pages'][md5] = {
                'url': page,
                'path': '',
                'status': 'wait',
                'last': time.time(),
            }
    for md5, page in tmp_pages.iteritems():
        if md5 not in article['pages'] and page['status'] == 'done':
            self.articles.html_file.remove(page['path'])
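# Usage sketch: merging a re-crawled page list (URLs are illustrative).
# Pages already present keep their state; vanished 'done' pages have their
# stored HTML removed.
#
#   spider.merger_pages(article, [u'http://example.com/a?page=2',
#                                 u'http://example.com/a?page=3'])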
def add(self, word, id, num, imgs, pubtime, icons):
    word = word.lower()
    whash = unicode2hash(word)
    res = self.index.find_one({'_id': word})
    row = {
        '_id': str2hash('%d-%s' % (whash, id)),
        'article': id,
        'word': whash,
        'num': num,
        'imgs': imgs,
        'pubtime': pubtime,
        'rank': self.rank(num, imgs, pubtime),
    }
    if res is None:
        res = {
            '_id': word,
            'word': whash,
            'rank': 0,
            'count': 0,
            'icon': '',
            'icon_time': 0,
            'auto': True,
        }
    if res['count'] >= 2:
        if res['count'] >= 1500:
            # Trim the word's index back to its 1000 best-ranked rows; the
            # 1000th row's rank becomes the floor a new row must beat.
            words = list(self.keys.find({'word': whash})
                         .sort([('rank', -1)]).skip(999).limit(1))
            if words:
                self.keys.remove({'word': whash,
                                  'rank': {'$lt': words[0]['rank']}})
                res['rank'] = words[0]['rank']
    if row['rank'] > res['rank']:
        # Refresh the word's icon at most once every three days.
        if icons and res['auto'] == True and pubtime - res['icon_time'] > 3 * 86400:
            res['icon'] = icons.pop()
            res['icon_time'] = pubtime
        self.keys.save(row)
        res['count'] += 1
    self.index.save(res)
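# Note on the trim above: once a word's index passes 1500 rows, everything
# ranked below the 1000th row is removed and that row's rank becomes the
# floor a new row must beat before it is saved. Usage sketch (illustrative
# values from an already-segmented article):
#
#   index.add(u'redis', article['_id'], num=3,
#             imgs=article['imgs'].keys(),
#             pubtime=article['pubtime'],
#             icons=list(article['icons']))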
def _upgrade_article(row):
    # Migration helper: rebuild an article record from an old spider row.
    article = dict((x, row[x]) for x in keys)
    xlong = unicode2hash(article['url'])
    redis_url.sadd(article['domain'], xlong)
    article['long'] = xlong
    article['pubdate'], article['id'] = '', ''
    del article['v']['tag']
    article['icons'] = row['icons']
    if 'content' in row and row['content']:
        article['content'] = iweb.text_file.put('spider_%s' % article['_id'],
                                                row['content'].encode('utf-8'),
                                                'txt')
    else:
        article['content'] = ''
    if article['v']['sim'] > 0:
        article['sim'] = sim(article, row['content'])
    else:
        article['sim'] = False
    if row['v']['seg'] > 0:
        # The re-indexing step was disabled; the original code is kept below
        # for reference.
        # article['pubdate'], article['id'] = time2id(article['pubtime'])
        # article['words'] = article['_id']
        # words = iweb.word_file.get(article['words'])
        # if words is not None:
        #     words = json.loads(words)
        #     words['sim'] = article['sim']
        #     words, article['tags'] = update_index(article, words)
        #     iweb.word_file.put(row['_id'], json.dumps(words))
        #     web_article = {
        #         '_id': article['_id'],
        #         'id': article['id'],
        #         'long': article['long'],
        #         'title': article['title'],
        #         'domain': article['domain'],
        #         'src_name': article['src_name'],
        #         'src_link': article['src_link'],
        #         'tags': article['tags'],
        #         'icons': article['icons'],
        #         'url': article['url'],
        #         'sim': article['sim'],
        #         'pubtime': article['pubtime'],
        #         'last': article['last'],
        #     }
        #     content = web_content(row, article, words)
        #     web_article['content'] = iweb.text_file.put(
        #         'web_%s' % article['_id'], content.encode('utf-8'), 'txt')
        #     while True:
        #         try:
        #             iweb.article.save(web_article)
        #             break
        #         except pymongo.errors.OperationFailure as e:
        #             print str(e)
        #             gevent.sleep(1)
        #     if len(article['tags']) >= 3 and article['icons']:
        #         topics.add(web_article)
        # else:
        #     row['exc'] = 'ValueError'
        pass
    else:
        article['words'] = ''
        article['tag'] = []
    if row['exc']:
        article['exc'] = row['exc']
    # Retry until the write succeeds; OperationFailure here is transient.
    while True:
        try:
            iweb.spider_exc.save(article)
            break
        except pymongo.errors.OperationFailure as e:
            print str(e)
            gevent.sleep(1)