def find_page1(keys):
    """Look up every keyword in collection_url_index and merge the hits per URL.

    A URL matched by several keywords gets +100 weight for each extra match,
    so pages covering more of the query rank higher."""

    def update_pagelist(i, page_list):
        url = i['url']
        for page1 in page_list:
            if url == page1['url']:
                # URL already collected for another keyword: boost it.
                page1['weight'] += 100
                return
        page = {}
        page['url'] = url
        page['weight'] = i.get('weight', 0)
        page['userchose'] = i.get('userchose', 0)
        page_list.append(page)

    # Lower-case the keywords and drop duplicates before querying.
    key1 = []
    for k in keys:
        k = doclex.tolower(k)
        if k not in key1:
            key1.append(k)
    keys = key1

    page_list = []
    for k in keys:
        c = collection_url_index.find({"key": k})
        for i in c:
            update_pagelist(i, page_list)
    return page_list
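
# Hedged usage sketch (not part of the original source): find_page1 returns one
# dict per distinct URL, with 'weight' increased by 100 for every additional
# matching keyword, so sorting by weight ranks multi-keyword hits first.
# The helper name 'top_pages' and the sample keywords below are made up.
def top_pages(keys, limit=10):
    pages = find_page1(keys)
    pages.sort(key=lambda p: p['weight'], reverse=True)
    return pages[:limit]

# for p in top_pages(['python', 'crawler']):
#     print p['url'], p['weight']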
def process_url(urlinfo):
    # Recursive variant: crawl a page, index it, then immediately follow
    # every link it discovers.
    url = urlinfo['url']
    if url in processed_url_list:
        return "url has already been processed"
    processed_url_list.append(url)
    print url, "process url"

    info = get_page(url)
    if info is None:
        print url, "error"
        return
    data, headers = info

    try:
        htmlp = htmlprocess(urlinfo)
        htmlp.feed(data)
        urlinfo = htmlp.urlinfo

        # Decode the Date header with whatever encoding chardet guesses.
        encodingdate = chardet.detect(headers['date'])
        date = unicode(headers['date'], encodingdate['encoding'])

        # Prefer the explicit title, fall back to the generated one.
        title = urlinfo['title']
        if title == "":
            if len(urlinfo['titlegen']) > 0:
                title = urlinfo['titlegen']

        # Pick the first non-empty profile candidate.
        profile = urlinfo['profile']['0']
        if profile == "":
            profile = urlinfo['profile']['1']
        if profile == "":
            if len(urlinfo['profile']['2']) > 0:
                profile = urlinfo['profile']['2'][0]

        if title != "" and profile != "":
            encodingdate = chardet.detect(title)
            title = unicode(title, encodingdate['encoding'])
            encodingdate = chardet.detect(profile)
            profile = unicode(profile, encodingdate['encoding'])
            # Upsert the page profile document.
            collection_url_profile.update(
                {'key': url},
                {'key': url,
                 'urlprofile': profile.encode('utf-8'),
                 'timetmp': time.time(),
                 'date': date,
                 'title': title.encode('utf-8')},
                True)
            # Upsert one index entry per (keyword, url) pair; the tier
            # number ('1'..'3') doubles as the stored weight.
            for w in ['1', '2', '3']:
                keywords = urlinfo['keys'][w]
                for key1 in keywords:
                    key = doclex.tolower(key1)
                    collection_url_index.update(
                        {'key': key, 'url': url},
                        {'$set': {'key': key, 'url': url,
                                  'timetmp': time.time(),
                                  'weight': int(w), 'userchose': 0}},
                        True)

        # Recurse into every discovered link.
        urlinfolist = htmlp.urllist
        for key, info in urlinfolist.iteritems():
            process_url(info)
            urlinfolist[key] = {}
    except:
        import traceback
        traceback.print_exc()
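
# Hedged inspection sketch (assumption, not in the original source): dump what
# process_url stored for a single page, using the two collections written
# above. Field names match the update() calls; 'dump_indexed_page' is made up.
def dump_indexed_page(url):
    for prof in collection_url_profile.find({'key': url}):
        print prof['title'], prof['date']
        print prof['urlprofile']
    for entry in collection_url_index.find({'url': url}):
        print entry['key'], entry['weight']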
def process_url(urlinfo):
    # Non-recursive variant: crawl and index one page, then hand the
    # discovered links back to the caller instead of following them here.
    url = urlinfo['url']
    # Strip a trailing slash so the same page is not processed twice.
    if url.endswith('/'):
        url = url[:-1]
        urlinfo['url'] = url
    if url in processed_url_list:
        return "url has already been processed"
    processed_url_list.append(url)
    print url, "process url"

    info = get_page(url)
    if info is None:
        print url, "error"
        return
    urllist.processurl(url)
    data, headers = info

    try:
        htmlp = htmlprocess(urlinfo)
        htmlp.feed(data)
        try:
            urlinfo = htmlp.urlinfo

            # Decode the Date header with whatever encoding chardet guesses.
            encodingdate = chardet.detect(headers['date'])
            date = unicode(headers['date'], encodingdate['encoding'])

            title = infosort.gettitle(urlinfo['title'])
            if len(title) > 16:
                title = title[0:16] + u'...'
            profile = infosort.getprofile(urlinfo['profile'])

            if title != u"" and profile != u"":
                print "update url", url
                # Upsert the page profile document.
                collection_url_profile.update(
                    {'key': url},
                    {'$set': {'key': url,
                              'urlprofile': profile.encode('utf-8', 'ignore'),
                              'timetmp': time.time(),
                              'date': date.encode('utf-8', 'ignore'),
                              'title': title.encode('utf-8', 'ignore')}},
                    True)

                # Index the keywords in three tiers; higher tiers add a larger
                # bonus to the page weight, and a key already indexed at a
                # higher tier is not overwritten by a lower one.
                updatekeywords = []
                for level, bonus in (('1', 500), ('2', 300), ('3', 0)):
                    tierkeys = []
                    for key in urlinfo['keys'][level]:
                        if not doclex.inviald_key(key):
                            key = doclex.tolower(key)
                            key = key.encode('utf-8', 'ignore')
                            if key not in tierkeys:
                                tierkeys.append(key)
                    for key in tierkeys:
                        if key not in updatekeywords:
                            updatekeywords.append(key)
                            collection_url_index.update(
                                {'key': key, 'url': url},
                                {'$set': {'url': url, 'key': key,
                                          'weight': htmlp.weight + bonus}},
                                True)
        except:
            import traceback
            traceback.print_exc()

        # Return the links found on this page so the caller can queue them.
        urlinfolist = htmlp.urllist
        return urlinfolist
    except:
        import traceback
        traceback.print_exc()
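
# Hedged sketch of a crawl driver (assumption, not in the original source):
# this version of process_url returns htmlp.urllist, assumed here to be a dict
# whose values are urlinfo dicts that can be fed back into process_url.
# 'crawl', 'seed_urlinfo' and 'max_pages' are made-up names.
def crawl(seed_urlinfo, max_pages=100):
    pending = [seed_urlinfo]
    visited = 0
    while pending and visited < max_pages:
        urlinfo = pending.pop(0)          # breadth-first order
        children = process_url(urlinfo)   # None / message string when skipped
        visited += 1
        if isinstance(children, dict):
            pending.extend(children.values())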
def find_page(input, index):
    # Return one result page (up to 10 hits) for the query string 'input';
    # 'index' is the 0-based page number.
    keys = doclex.splityspace(input)
    # Record every query term.
    for k in keys:
        collection_key.insert({"key": k})
    # (The cache_page / key_page result caches are disabled in this version.)

    def collect(keys):
        page_list = []
        for k in keys:
            k = doclex.tolower(k)
            c = collection_page.find({"key": k}).limit(10).skip(index * 10)
            for i in c:
                page = {}
                page['url'] = i['url']
                page['timetmp'] = i['timetmp']
                page['key'] = i['key']
                page_list.append(page)
        # Join each hit with its stored profile; drop hits that have none.
        remove_list = []
        for url in page_list:
            c = gethtml.collection_url_profile.find({'key': url['url']})
            if c.count() <= 0:
                remove_list.append(url)
                continue
            for i in c:
                url["profile"] = i['urlprofile']
                url["date"] = i['date']
                url["title"] = i['title']
        for url in remove_list:
            page_list.remove(url)
        return page_list

    # First split the query on whitespace; if that finds nothing,
    # fall back to the full lexer.
    page_list = collect(keys)
    if len(page_list) == 0:
        page_list = collect(doclex.lex(input.encode('utf-8')))

    return page_list[0:10]
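
# Hedged usage sketch (assumption, not in the original source): print one page
# of search results; each hit carries url/title/profile/date joined from
# collection_url_profile. 'print_results' and the sample query are made up.
def print_results(query, page=0):
    for hit in find_page(query, page):
        print hit['title'], hit['url']
        print ' ', hit['date']
        print ' ', hit['profile']

# print_results(u'python crawler', 0)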