def handle_data(self, data):
    """HTMLParser text-node hook: harvest keywords/title from <title>,
    per-keyword outbound links from <a>, and raw text from <p>/<div>.

    Relies on state set by the start-tag handler: self.current_tag,
    self.link_url, self.url.
    """
    if self.current_tag == 'title':
        keys = doclex.lex(data)
        if isinstance(keys, list) and len(keys) > 0:
            # extend() replaces the original one-at-a-time append loop
            self.keywords.extend(keys)
        data = doclex.delspace(data)
        if len(data) > 0:
            self.title = data
    elif self.current_tag == 'a':
        keys = doclex.simplesplit(data)
        if isinstance(keys, list) and len(keys) > 0:
            # Hoisted out of the loop: the link is the same for every key.
            # Only index links that leave the current page and look like
            # real URLs (judged_url is defined elsewhere in this module).
            link_ok = self.link_url != self.url and judged_url(self.link_url)
            for key in keys:
                if key not in self.key_url:  # was has_key(): removed in Py3
                    self.key_url[key] = []
                if link_ok:
                    self.key_url[key].append(self.link_url)
    elif self.current_tag == 'p' or self.current_tag == 'div':
        # Body text is buffered for later processing by process_page().
        self.data.append(data)
def process_page(url, data):
    """Parse a fetched HTML page and extract index data.

    Returns a tuple (links, url_profile, keywords, profile, key_url, title)
    on success, or None when data is missing, the encoding cannot be
    detected, or parsing fails (failures are swallowed by design:
    best-effort crawling).
    """
    if data is None:
        return
    try:
        key_url = {}
        url_profile = ""
        htmlp = htmlprocess(url)
        encoding = chardet.detect(data)
        if encoding['encoding'] is None:
            return
        # Normalize everything to UTF-8 before feeding the parser.
        udata = unicode(data, encoding['encoding'])
        htmlp.feed(udata.encode('utf-8'))
        keywords = htmlp.keywords
        key_url.update(htmlp.key_url)
        # Deduplicate each key's URL list, preserving first-seen order.
        # BUG FIX: the original comprehension tested membership against a
        # freshly-emptied list (always true, so nothing was deduped) and
        # its loop variable leaked (Python 2), clobbering the `url`
        # parameter used below.
        for key, value in key_url.iteritems():
            if len(value) > 0:
                deduped = []
                for link in value:
                    if link not in deduped:
                        deduped.append(link)
                key_url[key] = deduped
        for text in htmlp.data:
            text = doclex.delspace(text)
            if len(text) < 32:
                # Short fragments go verbatim into the page profile.
                url_profile += text
                keys = doclex.simplesplit(text)
                keywords.extend(keys)
                if isinstance(keys, list) and len(keys) > 0:
                    for key in keys:
                        if key not in key_url:
                            key_url[key] = []
                        if url not in key_url[key]:
                            key_url[key].append(url)
            elif len(text) > 100:
                # NOTE: fragments of length 32..100 are deliberately
                # skipped, matching the original control flow.
                # (The original slice bound `len(text) if len(text) < 100
                # else 100` is always 100 on this branch.)
                url_profile += text[0:100] + "..."
                keys1 = doclex.lex(text)
                keywords.extend(keys1)
                for key1 in keys1:
                    if key1 not in key_url:
                        key_url[key1] = []
                    if url not in key_url[key1]:
                        key_url[key1].append(url)
        return htmlp.link, url_profile, keywords, htmlp.profile, key_url, htmlp.title
    except Exception:
        # Best-effort: any parse/decode failure yields None for this page.
        pass
def handle_data(self, data):
    """HTMLParser text-node hook: fills self.urlinfo (current page) and
    self.urllist[self.sub_url] (per-link info) depending on the open tag.

    Key buckets '1'/'2'/'3' appear to rank keyword confidence
    (meta > title/anchor > body) — inferred from usage, confirm against
    the search side.
    """
    if self.current_tag == 'title':
        try:
            data = doclex.delspace(data)
            keys = doclex.lex(data)
            if isinstance(keys, list) and len(keys) > 0:
                for key in keys:
                    # Title keywords go into the mid-priority bucket.
                    self.urlinfo['keys']['2'].append(key)
            if len(data) > 0:
                self.urlinfo['title'] = data
        except:
            import traceback
            traceback.print_exc()
    elif self.current_tag == 'a':
        try:
            # self.sub_url is set by the start-tag handler only for links
            # that passed its filters; empty means "ignore this anchor".
            if self.sub_url != "":
                keys = doclex.simplesplit(data)
                if isinstance(keys, list) and len(keys) > 0:
                    for key in keys:
                        # Promote anchor-text keys: drop them from the
                        # low-priority bucket and add to bucket '2'
                        # unless already ranked higher.
                        if key in self.urllist[self.sub_url]['keys']['3']:
                            self.urllist[self.sub_url]['keys']['3'].remove(
                                key)
                        if key not in self.urllist[self.sub_url]['keys'][
                                '1'] and key not in self.urllist[
                                self.sub_url]['keys']['2']:
                            self.urllist[self.sub_url]['keys']['2'].append(
                                key)
                encodingdate = chardet.detect(data)
                if encodingdate['encoding']:
                    udata = unicode(data, encodingdate['encoding'])
                    # First 16 chars serve as a generated title candidate.
                    tlen = 16
                    if len(udata) < 16:
                        tlen = len(udata)
                    self.urllist[self.sub_url]['titlegen'].append(
                        udata[0:tlen].encode('utf-8'))
                    if len(udata) > 16:
                        # Longer anchors double as a short profile blurb.
                        self.urllist[self.sub_url]['profile']['1'] = udata[
                            0:32].encode('utf-8')
        except:
            import traceback
            traceback.print_exc()
    else:
        if self.current_tag == 'p' or self.current_tag == 'div':
            try:
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)
                    encodingdate = chardet.detect(data)
                    # NOTE(review): no None-check on the detected encoding
                    # here (unlike the 'a' branch) — unicode() will raise
                    # on undetectable input and land in the except below.
                    udata = unicode(data, encodingdate['encoding'])
                    tlen = 16
                    if len(udata) < 16:
                        tlen = len(udata)
                    self.urlinfo['titlegen'].append(
                        udata[0:tlen].encode('utf-8'))
                    if len(udata) > 32:
                        self.urlinfo['profile']['2'].append(
                            (udata[0:32] + u"...").encode('utf-8'))
                    keys1 = doclex.lex(data)
                    for key in keys1:
                        # Body-text keys are lowest priority.
                        self.urlinfo['keys']['3'].append(key)
            except:
                import traceback
                traceback.print_exc()
def _query_pages(keys, index):
    """Look up pages matching each key (10 per key, paged by `index`),
    attach stored profile/date/title, and drop pages with no profile."""
    page_list = []
    for k in keys:
        k = doclex.tolower(k)
        cursor = collection_page.find({"key": k}).limit(10).skip(index * 10)
        for row in cursor:
            page_list.append({
                'url': row['url'],
                'timetmp': row['timetmp'],
                'key': row['key'],
            })
    # Enrich with the crawled profile; pages without one are dropped
    # (collected first — never mutate a list while iterating it).
    remove_list = []
    for page in page_list:
        cursor = gethtml.collection_url_profile.find({'key': page['url']})
        if cursor.count() <= 0:
            remove_list.append(page)
            continue
        for row in cursor:
            page["profile"] = row['urlprofile']
            page["date"] = row['date']
            page["title"] = row['title']
    for page in remove_list:
        page_list.remove(page)
    return page_list


def find_page(input, index):
    """Search indexed pages for `input` (page `index`, 10 results max).

    Tries a whitespace split of the query first; if that yields nothing,
    retries with the full lexer. Every query term is also recorded in
    collection_key. Returns a list of page dicts with url/timetmp/key/
    profile/date/title.
    """
    keys = doclex.splityspace(input)
    for k in keys:
        collection_key.insert({"key": k})
    # The two lookup passes were duplicated verbatim in the original;
    # factored into _query_pages with identical behavior.
    page_list = _query_pages(keys, index)
    if len(page_list) == 0:
        page_list = _query_pages(doclex.lex(input.encode('utf-8')), index)
    return page_list[0:10]
def handle_starttag(self, tag, attrs):
    """HTMLParser start-tag hook.

    For <meta>: pulls keywords/description into self.urlinfo.
    For <a>: resolves and filters the href; accepted links get an entry
    in self.urllist and become self.sub_url so handle_data can attach
    anchor text to them. Rejected links leave self.sub_url == "".
    """
    self.current_tag = tag
    self.style = 'None'
    self.sub_url = ""
    if tag == 'meta':
        # First pass: learn what this meta tag declares.
        for name, value in attrs:
            if name == 'name':
                if value == 'keywords' or value == 'metaKeywords':
                    self.style = 'keywords'
                elif value == 'description' or value == 'metaDescription':
                    self.style = 'profile'
        # Second pass: consume the content attribute accordingly.
        for name, value in attrs:
            if name == 'content':
                try:
                    if isinstance(value, str):
                        # Raw bytes: decode via detected charset first.
                        encodingdate = chardet.detect(value)
                        if encodingdate['encoding']:
                            value = unicode(value, encodingdate['encoding'])
                    if self.style == 'keywords':
                        keywords = doclex.simplesplit(value)
                        if isinstance(keywords, list):
                            for key in keywords:
                                # Meta keywords = highest-priority bucket.
                                self.urlinfo['keys']['1'].append(key)
                    elif self.style == 'profile':
                        self.urlinfo['profile'].append(value)
                        keys1 = doclex.lex(value)
                        for key in keys1:
                            self.urlinfo['keys']['2'].append(key)
                        keys1 = doclex.vaguesplit(value)
                        for key in keys1:
                            self.urlinfo['keys']['3'].append(key)
                        # Description prefix doubles as a title candidate.
                        tlen = 16
                        if len(value) < 16:
                            tlen = len(value)
                        self.urlinfo['title'].append(value[0:tlen])
                except:
                    import traceback
                    traceback.print_exc()
    if tag == 'a' or tag == 'A':
        self.sub_url = ""
        for name, value in attrs:
            if name == 'href':
                if len(value) == 0:
                    return
                # Relative link: make absolute against the current URL.
                if not judged_url(value):
                    if self.current_url[len(self.current_url) - 1] != '/' and value[0] != '/':
                        value = self.current_url + '/' + value
                    else:
                        value = self.current_url + value
                # Drop javascript/void pseudo-links.
                if value.find('void') != -1:
                    return
                if value.find('javascript') != -1:
                    return
                if value.find('javaScript') != -1:
                    return
                # Hard-coded per-site noise filters.
                # NOTE(review): these site-specific URLs would be better
                # kept in a config/blacklist table.
                if self.current_url.find("apple") != -1:
                    if value.find("http://www.apple.com/cn/mac#ac-gn-menustate") != -1:
                        return
                if self.current_url.find("cnblogs") != -1:
                    if value.find("http://msg.cnblogs.com/send?recipient=itwriter") != -1:
                        return
                    elif value.find("http://i.cnblogs.com/EditPosts.aspx?opt=1") != -1:
                        return
                    elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=1935371") != -1:
                        return
                    elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/") != -1:
                        return
                    elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx") != -1:
                        return
                    elif value.find("/EnterMyBlog.aspx?NewArticle=1") != -1:
                        return
                    elif value.find("GetUsername") != -1:
                        return
                    elif value.find("GetMyPassword") != -1:
                        return
                    elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=") != -1:
                        return
                    elif value[len(value) - 1] == '#':
                        # cnblogs only: strip trailing '#', and skip
                        # self-referencing fragments.
                        value = value[0:-1]
                        if self.current_url.find(value) != -1:
                            return
                # Strip trailing '#' for every site.
                if value[len(value) - 1] == '#':
                    value = value[0:-1]
                # Register the link for anchor-text collection unless it
                # is the page itself, overly long, or blacklisted.
                if value != self.current_url and len(value) < 64 and not ingoreurl(value):
                    self.urllist[value] = {'url': value, 'keys': {'1': [], '2': [], '3': []}, 'title': [], 'profile': []}
                    self.sub_url = value
def handle_data(self, data):
    """HTMLParser text-node hook (variant that also tracks self.weight).

    Routes text by the currently open tag into self.urlinfo /
    self.urllist[self.sub_url]; buckets '1'/'2'/'3' appear to rank
    keyword confidence — inferred from usage, confirm with the reader.
    """
    if self.current_tag == 'title':
        try:
            encodingdate = chardet.detect(data)
            if encodingdate['encoding']:
                data = unicode(data, encodingdate['encoding'])
            if not doclex.invialddata(data):
                if len(data) > 0:
                    self.urlinfo['title'].append(data)
                    keys = doclex.lex(data)
                    if isinstance(keys, list) and len(keys) > 0:
                        for key in keys:
                            self.urlinfo['keys']['2'].append(key)
                    keys = doclex.vaguesplit(data)
                    if isinstance(keys, list) and len(keys) > 0:
                        for key in keys:
                            self.urlinfo['keys']['3'].append(key)
        except:
            import traceback
            traceback.print_exc()
    elif self.current_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        # Headings are treated exactly like the title.
        try:
            encodingdate = chardet.detect(data)
            if encodingdate['encoding']:
                data = unicode(data, encodingdate['encoding'])
            if not doclex.invialddata(data):
                if len(data) > 0:
                    self.urlinfo['title'].append(data)
                    keys = doclex.lex(data)
                    if isinstance(keys, list) and len(keys) > 0:
                        for key in keys:
                            self.urlinfo['keys']['2'].append(key)
                    keys = doclex.vaguesplit(data)
                    if isinstance(keys, list) and len(keys) > 0:
                        for key in keys:
                            self.urlinfo['keys']['3'].append(key)
        except:
            import traceback
            traceback.print_exc()
    elif self.current_tag == 'a' or self.current_tag == 'A':
        try:
            # self.sub_url is only set for links the start-tag handler
            # accepted; attach this anchor text to that link's record.
            if self.sub_url != "":
                encodingdate = chardet.detect(data)
                if encodingdate['encoding']:
                    data = unicode(data, encodingdate['encoding'])
                keys = doclex.simplesplit(data)
                if isinstance(keys, list) and len(keys) > 0:
                    for key in keys:
                        # Promote anchor-text keys out of the low bucket.
                        if key in self.urllist[self.sub_url]['keys']['3']:
                            self.urllist[self.sub_url]['keys']['3'].remove(key)
                        if key not in self.urllist[self.sub_url]['keys']['1'] and key not in self.urllist[self.sub_url]['keys']['2']:
                            self.urllist[self.sub_url]['keys']['1'].append(key)
                keys1 = doclex.lex(data)
                for key in keys1:
                    self.urllist[self.sub_url]['keys']['2'].append(key)
                keys1 = doclex.vaguesplit(data)
                for key in keys1:
                    self.urllist[self.sub_url]['keys']['3'].append(key)
                # Anchor-text prefix becomes a title candidate.
                tlen = 16
                if len(data) < 16:
                    tlen = len(data)
                self.urllist[self.sub_url]['title'].append(data[0:tlen])
                if len(data) > 32:
                    self.urllist[self.sub_url]['profile'].append(data[0:32])
        except:
            import traceback
            traceback.print_exc()
    else:
        if self.current_tag == 'div' or self.current_tag == 'p':
            try:
                encodingdate = chardet.detect(data)
                if encodingdate['encoding']:
                    data = unicode(data, encodingdate['encoding'])
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)
                    # NOTE(review): raises IndexError (silently printed
                    # below) when delspace() returns an empty string.
                    if data[0] == u'<':
                        return
                    # Only substantial text blocks (>100 chars) are
                    # indexed from the body.
                    if len(data) > 100:
                        tlen = 16
                        if len(data) < 16:
                            # Dead branch: len(data) > 100 here.
                            tlen = len(data)
                        self.urlinfo['title'].append(data[0:tlen])
                        if len(data) > 32:
                            self.urlinfo['profile'].append(data[0:32] + u"...")
                        keys1 = doclex.lex(data)
                        for key in keys1:
                            self.urlinfo['keys']['2'].append(key)
                        keys1 = doclex.vaguesplit(data)
                        for key in keys1:
                            self.urlinfo['keys']['3'].append(key)
                        # Weight presumably scores page richness —
                        # TODO confirm with its consumer.
                        self.weight += 200
            except:
                import traceback
                traceback.print_exc()
def handle_data(self, data):
    """HTMLParser text-node hook: fills self.urlinfo (current page) and
    self.urllist[self.sub_url] (per-link record) depending on which tag
    is currently open.

    Key buckets '1'/'2'/'3' appear to rank keyword confidence
    (meta > title/anchor > body) — inferred from usage.
    """
    if self.current_tag == 'title':
        try:
            data = doclex.delspace(data)
            keys = doclex.lex(data)
            if isinstance(keys, list) and len(keys) > 0:
                for key in keys:
                    self.urlinfo['keys']['2'].append(key)
            if len(data) > 0:
                self.urlinfo['title'] = data
        except Exception:
            import traceback
            traceback.print_exc()
    elif self.current_tag == 'a':
        try:
            # sub_url is set only for links the start-tag handler accepted.
            if self.sub_url != "":
                info = self.urllist[self.sub_url]
                keys = doclex.simplesplit(data)
                if isinstance(keys, list) and len(keys) > 0:
                    for key in keys:
                        # Promote anchor-text keys: drop from the low
                        # bucket, add to '2' unless already ranked.
                        if key in info['keys']['3']:
                            info['keys']['3'].remove(key)
                        if key not in info['keys']['1'] and key not in info['keys']['2']:
                            info['keys']['2'].append(key)
                encodingdate = chardet.detect(data)
                if encodingdate['encoding']:
                    udata = unicode(data, encodingdate['encoding'])
                    # First 16 chars become a generated-title candidate.
                    tlen = min(len(udata), 16)
                    info['titlegen'].append(udata[0:tlen].encode('utf-8'))
                    if len(udata) > 16:
                        info['profile']['1'] = udata[0:32].encode('utf-8')
        except Exception:
            import traceback
            traceback.print_exc()
    elif self.current_tag == 'p' or self.current_tag == 'div':
        try:
            if not doclex.invialddata(data):
                data = doclex.delspace(data)
                encodingdate = chardet.detect(data)
                # BUG FIX: the original decoded without checking that
                # detection succeeded; chardet reports None for
                # undetectable input, which made unicode() raise and
                # print a traceback for every such text node. Now such
                # nodes are skipped, matching the 'a' branch above.
                if encodingdate['encoding']:
                    udata = unicode(data, encodingdate['encoding'])
                    tlen = min(len(udata), 16)
                    self.urlinfo['titlegen'].append(udata[0:tlen].encode('utf-8'))
                    if len(udata) > 32:
                        self.urlinfo['profile']['2'].append((udata[0:32] + u"...").encode('utf-8'))
                    keys1 = doclex.lex(data)
                    for key in keys1:
                        self.urlinfo['keys']['3'].append(key)
        except Exception:
            import traceback
            traceback.print_exc()