def processinput(input): keys = doclex.splityspace(input) for k in keys: collection_key.update({"key":k}, {"key":k}, True) keys1 = doclex.simplesplit(input) findkey = [] for key in keys: if key not in findkey: findkey.append(key) for key in keys1: if key not in findkey: findkey.append(key) addkey = [] for keywords in thesaurus: for key in findkey: if key in keywords: addkey.extend(keywords) for key in addkey: if key not in findkey: findkey.append(key) return findkey
def processinput(input): keys = doclex.splityspace(input) for k in keys: collection_key.update({"key": k}, {"key": k}, True) keys1 = doclex.simplesplit(input) findkey = [] for key in keys: if key not in findkey: findkey.append(key) for key in keys1: if key not in findkey: findkey.append(key) addkey = [] for keywords in thesaurus: for key in findkey: if key in keywords: addkey.extend(keywords) for key in addkey: if key not in findkey: findkey.append(key) return findkey
def handle_data(self, data): if self.current_tag == 'title': keys = doclex.lex(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: #if not self.key_url.has_key(key): #self.key_url[key] = [] #print key self.keywords.append(key) #self.key_url[key].append(self.url) data = doclex.delspace(data) if len(data) > 0: self.title = data #collection_url_title.insert({'key':self.url, 'title':data, 'timetmp':time.time()}) elif self.current_tag == 'a': #if not judged_url(self.link_url): # self.link_url = self.url + self.link_url keys = doclex.simplesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: if not self.key_url.has_key(key): self.key_url[key] = [] if self.link_url != self.url and judged_url(self.link_url): self.key_url[key].append(self.link_url) #print key, self.link_url else: if self.current_tag == 'p' or self.current_tag == 'div': self.data.append(data)
def __init__(self, urlinfo): HTMLParser.HTMLParser.__init__(self) self.urllist = {} self.sub_url = "" self.urlinfo = urlinfo self.current_url = urlinfo['url'] if self.current_url == "https://www.tmall.com" or self.current_url == "http://www.tmall.com" or self.current_url.find("tmall") != -1: self.urlinfo['profile'].append(u"天猫(英文:Tmall,亦称淘宝商城、天猫商城)原名淘宝商城,是一个综合性购物网站。2012年1月11日上午,淘宝商城正式宣布更名为“天猫”。") self.weight = 0 self.weight = urllist.countweight(self.current_url) encodingdate = chardet.detect(self.current_url) if self.current_url.count('/') == 2: if encodingdate['encoding']: uurl = unicode(self.current_url, encodingdate['encoding']) keywords = doclex.simplesplit(uurl) for k in keywords: if k not in [u'http', u'https', u'www', u'com', u'cn', u'net', u'org', u'edu', u'gov', u'int', u'mil', u'ad', u'ae', u'af', u'ag', u'ai', u'al', u'am', u'an', u'ao', u'aq', u'ar', u'as', u'at', u'au', u'aw', u'az', u'ba', u'bb', u'bd', u'be', u'bf', u'bg', u'bh', u'bi', u'bm', u'bj', u'bn', u'bo', u'br', u'bs', u'bt', u'bv', u'bw', u'by', u'bz', u'ca', u'cc', u'cf']: self.urlinfo['keys']['1'].append(k) self.current_tag = "" self.style = ""
def process_page(url, data): if data is None: return try: key_url = {} url_profile = "" htmlp = htmlprocess(url) encoding = chardet.detect(data) if encoding['encoding'] is None: return udata = unicode(data, encoding['encoding']) htmlp.feed(udata.encode('utf-8')) keywords = htmlp.keywords key_url.update(htmlp.key_url) if len(key_url) > 0: for key, value in key_url.iteritems(): if len(value) > 0: urllist = [] urllist = [url for url in value if urllist.count(url) == 0] if url not in key_url[key]: key_url[key] = urllist for data in htmlp.data: data = doclex.delspace(data) if len(data) < 32: url_profile += data keys = doclex.simplesplit(data) keywords.extend(keys) if isinstance(keys, list) and len(keys) > 0: for key in keys: if not key_url.has_key(key): key_url[key] = [] if url not in key_url[key]: key_url[key].append(url) else: if len(data) > 100: url_profile += data[0:len(data) if len(data) < 100 else 100] + "..." keys1 = doclex.lex(data) keywords.extend(keys1) for key1 in keys1: if not key_url.has_key(key1): key_url[key1] = [] if url not in key_url[key1]: key_url[key1].append(url) return htmlp.link, url_profile, keywords, htmlp.profile, key_url, htmlp.title except: #import traceback #traceback.print_exc() pass
def handle_starttag(self, tag, attrs): self.current_tag = tag self.style = 'None' if tag == 'a': for name,value in attrs: if name == 'href': #print value, 'link value' if self.url.find("cnblogs") != -1: if value.find("http://msg.cnblogs.com/send?recipient=itwriter") != -1: return elif value.find("http://i.cnblogs.com/EditPosts.aspx?opt=1") != -1: return elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=1935371") != -1: return elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/") != -1: return elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx") != -1: return elif value.find("/EnterMyBlog.aspx?NewArticle=1") != -1: return elif value[len(value) - 1] == '#': value = value[0:-1] if value.find('javascript') != -1: return if self.url.find(value) != -1: return if not judged_url(value): value = self.url + value self.link_url = value if self.link_url != self.url: self.link.append(value) elif tag == 'meta': for name,value in attrs: if name == 'name': if value == 'keywords' or value == 'metaKeywords': self.style = 'keywords' elif value == 'description' or value == 'metaDescription': self.style = 'profile' for name,value in attrs: if name == 'content': if self.style == 'keywords': self.keywords = doclex.simplesplit(value) elif self.style == 'profile': self.profile = value
def __init__(self, urlinfo): HTMLParser.HTMLParser.__init__(self) self.urllist = {} self.sub_url = "" self.urlinfo = urlinfo self.current_url = urlinfo['url'] keywords = doclex.simplesplit(self.current_url) for key in keywords: self.urlinfo['keys']['1'].append(key) self.current_tag = "" self.style = ""
def countweight(url): if url.count('/') > 2: return 0 encodingdate = chardet.detect(url) if encodingdate['encoding']: uurl = unicode(url, encodingdate['encoding']) keywords = doclex.simplesplit(uurl) removelist = [] for k in keywords: if k in prefixes: removelist.append(k) if k in postfix: removelist.append(k) for k in removelist: keywords.remove(k) if len(keywords) == 1: return 500 return 400
def handle_starttag(self, tag, attrs): self.current_tag = tag self.style = 'None' self.sub_url = "" if tag == 'meta': for name, value in attrs: if name == 'name': if value == 'keywords' or value == 'metaKeywords': self.style = 'keywords' elif value == 'description' or value == 'metaDescription': self.style = 'profile' for name, value in attrs: if name == 'content': if self.style == 'keywords': keywords = doclex.simplesplit(value) if isinstance(keywords, list): for key in keywords: self.urlinfo['keys']['1'].append(key) elif self.style == 'profile': self.urlinfo['profile']['0'] = value encodingdate = chardet.detect(value) if encodingdate['encoding']: udata = unicode(value, encodingdate['encoding']) tlen = 16 if len(udata) < 16: tlen = len(udata) self.urlinfo['titlegen'].append( udata[0:tlen].encode('utf-8')) else: self.urlinfo['titlegen'].append(value) if tag == 'a': self.sub_url = "" for name, value in attrs: if name == 'href': if len(value) == 0: return if not judged_url(value): if self.current_url[len(self.current_url) - 1] != '/' and value[0] != '/': value = self.current_url + '/' + value else: value = self.current_url + value if value.find('javascript') != -1: return if value.find('javaScript') != -1: return if self.current_url.find("apple") != -1: if value.find( "http://www.apple.com/cn/mac#ac-gn-menustate" ) != -1: return if self.current_url.find("cnblogs") != -1: if value.find( "http://msg.cnblogs.com/send?recipient=itwriter" ) != -1: return elif value.find( "http://i.cnblogs.com/EditPosts.aspx?opt=1" ) != -1: return elif value.find( "http://i.cnblogs.com/EditPosts.aspx?postid=1935371" ) != -1: return elif value.find( "http://msg.cnblogs.com/send?recipient=itwriter/" ) != -1: return elif value.find( "http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx" ) != -1: return elif value.find( "/EnterMyBlog.aspx?NewArticle=1") != -1: return elif value.find("GetUsername") != -1: return elif value.find("GetMyPassword") != -1: return elif value.find( "http://i.cnblogs.com/EditPosts.aspx?postid=" ) != -1: return elif value[len(value) - 1] == '#': value = value[0:-1] if self.current_url.find(value) != -1: return if value[len(value) - 1] == '#': value = value[0:-1] if value != self.current_url and len( value) < 64 and not ingoreurl(value): self.urllist[value] = { 'url': value, 'keys': { '1': [], '2': [], '3': [] }, 'title': '', 'titlegen': [], 'profile': { '0': '', '1': '', '2': [] } } self.sub_url = value
def handle_data(self, data): if self.current_tag == 'title': try: data = doclex.delspace(data) keys = doclex.lex(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['2'].append(key) if len(data) > 0: self.urlinfo['title'] = data except: import traceback traceback.print_exc() elif self.current_tag == 'a': try: if self.sub_url != "": keys = doclex.simplesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: if key in self.urllist[self.sub_url]['keys']['3']: self.urllist[self.sub_url]['keys']['3'].remove( key) if key not in self.urllist[self.sub_url]['keys'][ '1'] and key not in self.urllist[ self.sub_url]['keys']['2']: self.urllist[self.sub_url]['keys']['2'].append( key) encodingdate = chardet.detect(data) if encodingdate['encoding']: udata = unicode(data, encodingdate['encoding']) tlen = 16 if len(udata) < 16: tlen = len(udata) self.urllist[self.sub_url]['titlegen'].append( udata[0:tlen].encode('utf-8')) if len(udata) > 16: self.urllist[self.sub_url]['profile']['1'] = udata[ 0:32].encode('utf-8') except: import traceback traceback.print_exc() else: if self.current_tag == 'p' or self.current_tag == 'div': try: if not doclex.invialddata(data): data = doclex.delspace(data) encodingdate = chardet.detect(data) udata = unicode(data, encodingdate['encoding']) tlen = 16 if len(udata) < 16: tlen = len(udata) self.urlinfo['titlegen'].append( udata[0:tlen].encode('utf-8')) if len(udata) > 32: self.urlinfo['profile']['2'].append( (udata[0:32] + u"...").encode('utf-8')) keys1 = doclex.lex(data) for key in keys1: self.urlinfo['keys']['3'].append(key) except: import traceback traceback.print_exc()
def handle_starttag(self, tag, attrs): self.current_tag = tag self.style = 'None' self.sub_url = "" if tag == 'meta': for name,value in attrs: if name == 'name': if value == 'keywords' or value == 'metaKeywords': self.style = 'keywords' elif value == 'description' or value == 'metaDescription': self.style = 'profile' for name,value in attrs: if name == 'content': try: if isinstance(value, str): encodingdate = chardet.detect(value) if encodingdate['encoding']: value = unicode(value, encodingdate['encoding']) if self.style == 'keywords': keywords = doclex.simplesplit(value) if isinstance(keywords, list): for key in keywords: self.urlinfo['keys']['1'].append(key) elif self.style == 'profile': self.urlinfo['profile'].append(value) keys1 = doclex.lex(value) for key in keys1: self.urlinfo['keys']['2'].append(key) keys1 = doclex.vaguesplit(value) for key in keys1: self.urlinfo['keys']['3'].append(key) tlen = 16 if len(value) < 16: tlen = len(value) self.urlinfo['title'].append(value[0:tlen]) except: import traceback traceback.print_exc() if tag == 'a' or tag == 'A': self.sub_url = "" for name,value in attrs: if name == 'href': if len(value) == 0: return if not judged_url(value): if self.current_url[len(self.current_url) - 1] != '/' and value[0] != '/': value = self.current_url + '/' + value else: value = self.current_url + value if value.find('void') != -1: return if value.find('javascript') != -1: return if value.find('javaScript') != -1: return if self.current_url.find("apple") != -1: if value.find("http://www.apple.com/cn/mac#ac-gn-menustate") !=-1: return if self.current_url.find("cnblogs") != -1: if value.find("http://msg.cnblogs.com/send?recipient=itwriter") != -1: return elif value.find("http://i.cnblogs.com/EditPosts.aspx?opt=1") != -1: return elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=1935371") != -1: return elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/") != -1: return elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx") != -1: return elif value.find("/EnterMyBlog.aspx?NewArticle=1") != -1: return elif value.find("GetUsername") != -1: return elif value.find("GetMyPassword") != -1: return elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=") != -1: return elif value[len(value) - 1] == '#': value = value[0:-1] if self.current_url.find(value) != -1: return if value[len(value) - 1] == '#': value = value[0:-1] if value != self.current_url and len(value) < 64 and not ingoreurl(value): self.urllist[value] = {'url':value, 'keys':{'1':[], '2':[], '3':[]}, 'title':[], 'profile':[]} self.sub_url = value
def handle_data(self, data): if self.current_tag == 'title': try: encodingdate = chardet.detect(data) if encodingdate['encoding']: data = unicode(data, encodingdate['encoding']) if not doclex.invialddata(data): if len(data) > 0: self.urlinfo['title'].append(data) keys = doclex.lex(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['2'].append(key) keys = doclex.vaguesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['3'].append(key) except: import traceback traceback.print_exc() elif self.current_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: try: encodingdate = chardet.detect(data) if encodingdate['encoding']: data = unicode(data, encodingdate['encoding']) if not doclex.invialddata(data): if len(data) > 0: self.urlinfo['title'].append(data) keys = doclex.lex(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['2'].append(key) keys = doclex.vaguesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['3'].append(key) except: import traceback traceback.print_exc() elif self.current_tag == 'a' or self.current_tag == 'A': try: if self.sub_url != "": encodingdate = chardet.detect(data) if encodingdate['encoding']: data = unicode(data, encodingdate['encoding']) keys = doclex.simplesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: if key in self.urllist[self.sub_url]['keys']['3']: self.urllist[self.sub_url]['keys']['3'].remove(key) if key not in self.urllist[self.sub_url]['keys']['1'] and key not in self.urllist[self.sub_url]['keys']['2']: self.urllist[self.sub_url]['keys']['1'].append(key) keys1 = doclex.lex(data) for key in keys1: self.urllist[self.sub_url]['keys']['2'].append(key) keys1 = doclex.vaguesplit(data) for key in keys1: self.urllist[self.sub_url]['keys']['3'].append(key) tlen = 16 if len(data) < 16: tlen = len(data) self.urllist[self.sub_url]['title'].append(data[0:tlen]) if len(data) > 32: self.urllist[self.sub_url]['profile'].append(data[0:32]) except: import traceback traceback.print_exc() else: if self.current_tag == 'div' or self.current_tag == 'p': try: encodingdate = chardet.detect(data) if encodingdate['encoding']: data = unicode(data, encodingdate['encoding']) if not doclex.invialddata(data): data = doclex.delspace(data) if data[0] == u'<': return if len(data) > 100: tlen = 16 if len(data) < 16: tlen = len(data) self.urlinfo['title'].append(data[0:tlen]) if len(data) > 32: self.urlinfo['profile'].append(data[0:32] + u"...") keys1 = doclex.lex(data) for key in keys1: self.urlinfo['keys']['2'].append(key) keys1 = doclex.vaguesplit(data) for key in keys1: self.urlinfo['keys']['3'].append(key) self.weight += 200 except: import traceback traceback.print_exc()
def handle_data(self, data): if self.current_tag == 'title': try: data = doclex.delspace(data) keys = doclex.lex(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: self.urlinfo['keys']['2'].append(key) if len(data) > 0: self.urlinfo['title'] = data except: import traceback traceback.print_exc() elif self.current_tag == 'a': try: if self.sub_url != "": keys = doclex.simplesplit(data) if isinstance(keys, list) and len(keys) > 0: for key in keys: if key in self.urllist[self.sub_url]['keys']['3']: self.urllist[self.sub_url]['keys']['3'].remove(key) if key not in self.urllist[self.sub_url]['keys']['1'] and key not in self.urllist[self.sub_url]['keys']['2']: self.urllist[self.sub_url]['keys']['2'].append(key) encodingdate = chardet.detect(data) if encodingdate['encoding']: udata = unicode(data, encodingdate['encoding']) tlen = 16 if len(udata) < 16: tlen = len(udata) self.urllist[self.sub_url]['titlegen'].append(udata[0:tlen].encode('utf-8')) if len(udata) > 16: self.urllist[self.sub_url]['profile']['1'] = udata[0:32].encode('utf-8') except: import traceback traceback.print_exc() else: if self.current_tag == 'p' or self.current_tag == 'div': try: if not doclex.invialddata(data): data = doclex.delspace(data) encodingdate = chardet.detect(data) udata = unicode(data, encodingdate['encoding']) tlen = 16 if len(udata) < 16: tlen = len(udata) self.urlinfo['titlegen'].append(udata[0:tlen].encode('utf-8')) if len(udata) > 32: self.urlinfo['profile']['2'].append((udata[0:32] + u"...").encode('utf-8')) keys1 = doclex.lex(data) for key in keys1: self.urlinfo['keys']['3'].append(key) except: import traceback traceback.print_exc()