示例#1
0
 def handle_data(self, data):
     """Route a chunk of text to the store matching the enclosing tag.

     * ``title``: the text is lexed into ``self.keywords``; the
       whitespace-stripped text becomes ``self.title`` when non-empty.
     * ``a``: the text is split into keys.  Each key gets an entry in
       ``self.key_url`` and, when the pending link differs from the page
       URL and passes ``judged_url``, that link is appended to the entry.
     * ``p`` / ``div``: the raw text is queued on ``self.data``.

     Text inside any other tag is ignored.
     """
     tag = self.current_tag

     if tag == 'title':
         words = doclex.lex(data)
         if isinstance(words, list):
             for word in words:
                 self.keywords.append(word)
         stripped = doclex.delspace(data)
         if len(stripped) > 0:
             self.title = stripped
         return

     if tag == 'a':
         words = doclex.simplesplit(data)
         if not isinstance(words, list):
             return
         for word in words:
             # The key is registered even when the link itself is rejected.
             if word not in self.key_url:
                 self.key_url[word] = []
             if self.link_url != self.url and judged_url(self.link_url):
                 self.key_url[word].append(self.link_url)
         return

     if tag in ('p', 'div'):
         self.data.append(data)
示例#2
0
def _unique_in_order(items):
    """Return the elements of *items* without duplicates, first occurrence kept."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def process_page(url, data):
    """Parse one fetched page and build its keyword -> URL index.

    The body is decoded with the encoding chardet detects, re-encoded as
    UTF-8 and fed to an ``htmlprocess`` parser.  The parser's anchor index
    is copied with each URL list de-duplicated, then every text chunk the
    parser collected is indexed under the page's own URL:

    * chunks shorter than 32 characters are added whole to the profile and
      split with ``doclex.simplesplit``;
    * longer chunks are lexed with ``doclex.lex``; only chunks longer than
      100 characters contribute to the profile, as their first 100
      characters followed by ``"..."``.

    Args:
        url: Address the page was fetched from.
        data: Raw page body (byte string), or None.

    Returns:
        ``(htmlp.link, url_profile, keywords, htmlp.profile, key_url,
        htmlp.title)``, or None when *data* is None, no encoding could be
        detected, or processing raised an exception.
    """
    if data is None:
        return None

    try:
        htmlp = htmlprocess(url)
        encoding = chardet.detect(data)
        if encoding['encoding'] is None:
            return None
        udata = unicode(data, encoding['encoding'])
        # The parser is always fed UTF-8, whatever the source encoding was.
        htmlp.feed(udata.encode('utf-8'))

        keywords = htmlp.keywords

        # De-duplicate the link list of every anchor key.  The loop variable
        # is deliberately not called ``url``: in Python 2 a list
        # comprehension variable leaks into the function scope and would
        # overwrite the page URL used below.
        key_url = {}
        for key, links in htmlp.key_url.items():
            key_url[key] = _unique_in_order(links)

        profile_parts = []
        for text in htmlp.data:
            text = doclex.delspace(text)
            if len(text) < 32:
                profile_parts.append(text)
                keys = doclex.simplesplit(text)
            else:
                if len(text) > 100:
                    profile_parts.append(text[:100] + "...")
                keys = doclex.lex(text)

            # Validate before use; a non-list result means "no keys".
            if not isinstance(keys, list):
                continue
            keywords.extend(keys)
            for key in keys:
                page_urls = key_url.setdefault(key, [])
                if url not in page_urls:
                    page_urls.append(url)

        url_profile = "".join(profile_parts)

        return htmlp.link, url_profile, keywords, htmlp.profile, key_url, htmlp.title

    except Exception:
        # Best effort: a page that cannot be processed yields None, but the
        # failure is printed instead of being discarded silently.
        import traceback
        traceback.print_exc()
        return None
示例#3
0
    def handle_data(self, data):
        """Index a chunk of text according to the tag that encloses it.

        * ``title``: the whitespace-stripped text is lexed into
          ``self.urlinfo['keys']['2']`` and stored as
          ``self.urlinfo['title']`` when non-empty.
        * ``a`` (only while ``self.sub_url`` is set): each key from
          ``doclex.simplesplit`` is removed from the link's level-'3' keys
          and added to level '2' unless already at level '1' or '2'.  When
          chardet detects an encoding, the first 16 characters are appended
          to the link's ``titlegen`` and, for text longer than 16
          characters, the first 32 become ``profile['1']``.
        * ``p`` / ``div``: text not rejected by ``doclex.invialddata`` is
          stripped; its first 16 characters go to ``titlegen``, the first
          32 plus ``"..."`` go to ``profile['2']`` when it is longer than
          32 characters, and its lexed keys go to level '3'.

        Exceptions raised while handling a chunk are printed and swallowed.
        """
        tag = self.current_tag
        try:
            if tag == 'title':
                data = doclex.delspace(data)
                keys = doclex.lex(data)
                if isinstance(keys, list) and keys:
                    self.urlinfo['keys']['2'].extend(keys)
                if len(data) > 0:
                    self.urlinfo['title'] = data

            elif tag == 'a':
                if self.sub_url == "":
                    return
                link = self.urllist[self.sub_url]
                link_keys = link['keys']

                keys = doclex.simplesplit(data)
                if isinstance(keys, list):
                    for key in keys:
                        if key in link_keys['3']:
                            link_keys['3'].remove(key)
                        if key not in link_keys['1'] and key not in link_keys['2']:
                            link_keys['2'].append(key)

                detected = chardet.detect(data)
                if detected['encoding']:
                    udata = unicode(data, detected['encoding'])
                    # Slicing already clamps to the string length.
                    link['titlegen'].append(udata[:16].encode('utf-8'))
                    if len(udata) > 16:
                        link['profile']['1'] = udata[:32].encode('utf-8')

            elif tag in ('p', 'div'):
                if doclex.invialddata(data):
                    return
                data = doclex.delspace(data)

                detected = chardet.detect(data)
                if not detected['encoding']:
                    # Same guard as the anchor branch: without a detected
                    # encoding the text cannot be decoded, so skip it
                    # instead of raising inside unicode().
                    return
                udata = unicode(data, detected['encoding'])
                self.urlinfo['titlegen'].append(udata[:16].encode('utf-8'))

                if len(udata) > 32:
                    self.urlinfo['profile']['2'].append(
                        (udata[:32] + u"...").encode('utf-8'))

                for key in doclex.lex(data):
                    self.urlinfo['keys']['3'].append(key)

        except Exception:
            # Report and carry on; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()
示例#4
0
def _pages_for_keys(keys, index):
    """Collect result page *index* of profiled documents for each key.

    Each key is lower-cased with ``doclex.tolower`` and looked up in
    ``collection_page`` (10 documents per key, skipping ``index * 10``).
    A hit is kept only when ``gethtml.collection_url_profile`` has at least
    one document for its URL; the last such document supplies the
    ``profile``, ``date`` and ``title`` fields.
    """
    pages = []
    for raw_key in keys:
        key = doclex.tolower(raw_key)
        cursor = collection_page.find({"key": key}).limit(10).skip(index * 10)
        for doc in cursor:
            page = {
                'url': doc['url'],
                'timetmp': doc['timetmp'],
                'key': doc['key'],
            }
            # Materialising the cursor replaces the separate count() query.
            profiles = list(
                gethtml.collection_url_profile.find({'key': page['url']}))
            if not profiles:
                # No stored profile for this URL: drop the hit.
                continue
            latest = profiles[-1]
            page["profile"] = latest['urlprofile']
            page["date"] = latest['date']
            page["title"] = latest['title']
            pages.append(page)
    return pages


def find_page(input, index):
    """Search the page index for *input* and return at most ten results.

    Every whitespace-separated term of *input* is first recorded in
    ``collection_key``.  The terms are then looked up; if that yields
    nothing, the query is retried with the keys produced by
    ``doclex.lex`` on the UTF-8 encoded input.

    Args:
        input: Query string (it is UTF-8 encoded for the fallback pass).
        index: Zero-based result page number.

    Returns:
        A list of at most ten dicts with the keys ``url``, ``timetmp``,
        ``key``, ``profile``, ``date`` and ``title``.
    """
    keys = doclex.splityspace(input)
    for k in keys:
        collection_key.insert({"key": k})

    # Both passes share one helper.  The former copy-pasted fallback reset
    # its removal list only inside the cursor loop, so it could hit an
    # unbound or stale list and raise NameError / ValueError.
    page_list = _pages_for_keys(keys, index)
    if not page_list:
        page_list = _pages_for_keys(doclex.lex(input.encode('utf-8')), index)

    return page_list[:10]
示例#5
0
    def handle_starttag(self, tag, attrs):
        """Record the tag being opened and harvest ``meta`` content and links.

        For ``meta`` tags the ``name`` attribute selects a style
        (``keywords`` or ``profile``) and the ``content`` attribute is then
        indexed into ``self.urlinfo`` accordingly.  For anchors the ``href``
        is made absolute against ``self.current_url`` when ``judged_url``
        rejects it, filtered, and, if accepted, registered in
        ``self.urllist`` and remembered as ``self.sub_url``.

        Args:
            tag: Name of the tag being opened.
            attrs: Sequence of ``(name, value)`` pairs; a value is None for
                an attribute written without a value.
        """
        self.current_tag = tag
        self.style = 'None'
        self.sub_url = ""

        if tag == 'meta':
            for name, value in attrs:
                if name == 'name':
                    if value in ('keywords', 'metaKeywords'):
                        self.style = 'keywords'
                    elif value in ('description', 'metaDescription'):
                        self.style = 'profile'

            # Second pass so the style is known whatever the attribute order.
            for name, value in attrs:
                if name != 'content' or value is None:
                    continue
                try:
                    if isinstance(value, str):
                        detected = chardet.detect(value)
                        if detected['encoding']:
                            value = unicode(value, detected['encoding'])

                    if self.style == 'keywords':
                        keywords = doclex.simplesplit(value)
                        if isinstance(keywords, list):
                            for key in keywords:
                                self.urlinfo['keys']['1'].append(key)

                    elif self.style == 'profile':
                        self.urlinfo['profile'].append(value)

                        for key in doclex.lex(value):
                            self.urlinfo['keys']['2'].append(key)

                        for key in doclex.vaguesplit(value):
                            self.urlinfo['keys']['3'].append(key)

                        # Slicing already clamps to the string length.
                        self.urlinfo['title'].append(value[:16])

                except Exception:
                    import traceback
                    traceback.print_exc()

        if tag == 'a' or tag == 'A':
            for name, value in attrs:
                if name != 'href':
                    continue

                # Covers both href="" and a valueless href (value is None),
                # which used to crash on len(None).
                if not value:
                    return

                if not judged_url(value):
                    # endswith() is safe even when current_url is empty.
                    if not self.current_url.endswith('/') and value[0] != '/':
                        value = self.current_url + '/' + value
                    else:
                        value = self.current_url + value

                if 'void' in value or 'javascript' in value or 'javaScript' in value:
                    return

                if "apple" in self.current_url:
                    if "http://www.apple.com/cn/mac#ac-gn-menustate" in value:
                        return

                if "cnblogs" in self.current_url:
                    # Substrings that mark cnblogs links to skip.  Patterns
                    # already covered by a shorter entry here were dropped.
                    blocked = (
                        "http://msg.cnblogs.com/send?recipient=itwriter",
                        "http://i.cnblogs.com/EditPosts.aspx?opt=1",
                        "http://i.cnblogs.com/EditPosts.aspx?postid=",
                        "/EnterMyBlog.aspx?NewArticle=1",
                        "GetUsername",
                        "GetMyPassword",
                    )
                    if any(part in value for part in blocked):
                        return
                    # On cnblogs the trailing '#' is stripped *before* the
                    # containment check below.
                    if value.endswith('#'):
                        value = value[:-1]

                # Skip links that are a substring of the current URL.
                if value in self.current_url:
                    return

                if value.endswith('#'):
                    value = value[:-1]

                if value != self.current_url and len(value) < 64 and not ingoreurl(value):
                    self.urllist[value] = {'url':value, 'keys':{'1':[], '2':[], '3':[]}, 'title':[], 'profile':[]}
                    self.sub_url = value
示例#6
0
    def handle_data(self, data):
        """Index a chunk of text according to the tag that encloses it.

        In every branch the text is first decoded with the encoding chardet
        detects; text with no detected encoding is skipped.

        * ``title`` and ``h1``-``h6``: text not rejected by
          ``doclex.invialddata`` is appended to ``self.urlinfo['title']``
          when non-empty; its ``doclex.lex`` keys go to level '2' and its
          ``doclex.vaguesplit`` keys to level '3'.
        * ``a`` (only while ``self.sub_url`` is set): ``simplesplit`` keys
          are removed from the link's level '3' and added to level '1'
          unless already at level '1' or '2'; lexed keys go to level '2',
          vague keys to level '3'; the first 16 characters are appended to
          the link's ``title`` and, for text longer than 32 characters, the
          first 32 to its ``profile``.
        * ``div`` / ``p``: only stripped text longer than 100 characters
          that does not start with ``<`` is used.  It contributes a
          16-character title, a 32-character profile with ``"..."``
          appended, level-'2' and level-'3' keys, and adds 200 to
          ``self.weight``.

        Exceptions raised while handling a chunk are printed and swallowed.
        """
        tag = self.current_tag
        try:
            if tag in ('title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                detected = chardet.detect(data)
                if not detected['encoding']:
                    return
                data = unicode(data, detected['encoding'])
                if doclex.invialddata(data):
                    return

                if len(data) > 0:
                    self.urlinfo['title'].append(data)

                keys = doclex.lex(data)
                if isinstance(keys, list):
                    for key in keys:
                        self.urlinfo['keys']['2'].append(key)

                keys = doclex.vaguesplit(data)
                if isinstance(keys, list):
                    for key in keys:
                        self.urlinfo['keys']['3'].append(key)

            elif tag == 'a' or tag == 'A':
                if self.sub_url == "":
                    return
                detected = chardet.detect(data)
                if not detected['encoding']:
                    return
                data = unicode(data, detected['encoding'])

                link = self.urllist[self.sub_url]
                link_keys = link['keys']

                keys = doclex.simplesplit(data)
                if isinstance(keys, list):
                    for key in keys:
                        if key in link_keys['3']:
                            link_keys['3'].remove(key)
                        if key not in link_keys['1'] and key not in link_keys['2']:
                            link_keys['1'].append(key)

                for key in doclex.lex(data):
                    link_keys['2'].append(key)

                for key in doclex.vaguesplit(data):
                    link_keys['3'].append(key)

                # Slicing already clamps to the string length.
                link['title'].append(data[:16])

                if len(data) > 32:
                    link['profile'].append(data[:32])

            elif tag in ('div', 'p'):
                detected = chardet.detect(data)
                if not detected['encoding']:
                    return
                data = unicode(data, detected['encoding'])
                if doclex.invialddata(data):
                    return
                data = doclex.delspace(data)

                # startswith() is safe on empty text, unlike data[0].
                if data.startswith(u'<'):
                    return

                if len(data) <= 100:
                    return

                # The text is known to exceed 100 characters here, so both
                # slices below are always full length.
                self.urlinfo['title'].append(data[:16])
                self.urlinfo['profile'].append(data[:32] + u"...")

                for key in doclex.lex(data):
                    self.urlinfo['keys']['2'].append(key)

                for key in doclex.vaguesplit(data):
                    self.urlinfo['keys']['3'].append(key)

                self.weight += 200

        except Exception:
            # Report and carry on; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()
示例#7
0
    def handle_data(self, data):
        """Index a chunk of text according to the tag that encloses it.

        * ``title``: the whitespace-stripped text is lexed into
          ``self.urlinfo['keys']['2']`` and stored as
          ``self.urlinfo['title']`` when non-empty.
        * ``a`` (only while ``self.sub_url`` is set): each key from
          ``doclex.simplesplit`` is removed from the link's level-'3' keys
          and added to level '2' unless already at level '1' or '2'.  When
          chardet detects an encoding, the first 16 characters are appended
          to the link's ``titlegen`` and, for text longer than 16
          characters, the first 32 become ``profile['1']``.
        * ``p`` / ``div``: text not rejected by ``doclex.invialddata`` is
          stripped; its first 16 characters go to ``titlegen``, the first
          32 plus ``"..."`` go to ``profile['2']`` when it is longer than
          32 characters, and its lexed keys go to level '3'.

        Exceptions raised while handling a chunk are printed and swallowed.
        """
        tag = self.current_tag
        try:
            if tag == 'title':
                data = doclex.delspace(data)
                keys = doclex.lex(data)
                if isinstance(keys, list) and keys:
                    self.urlinfo['keys']['2'].extend(keys)
                if len(data) > 0:
                    self.urlinfo['title'] = data

            elif tag == 'a':
                if self.sub_url == "":
                    return
                entry = self.urllist[self.sub_url]
                entry_keys = entry['keys']

                keys = doclex.simplesplit(data)
                if isinstance(keys, list):
                    for key in keys:
                        if key in entry_keys['3']:
                            entry_keys['3'].remove(key)
                        if key not in entry_keys['1'] and key not in entry_keys['2']:
                            entry_keys['2'].append(key)

                detected = chardet.detect(data)
                if detected['encoding']:
                    udata = unicode(data, detected['encoding'])
                    # Slicing already clamps to the string length.
                    entry['titlegen'].append(udata[:16].encode('utf-8'))
                    if len(udata) > 16:
                        entry['profile']['1'] = udata[:32].encode('utf-8')

            elif tag in ('p', 'div'):
                if doclex.invialddata(data):
                    return
                data = doclex.delspace(data)

                detected = chardet.detect(data)
                if not detected['encoding']:
                    # Same guard as the anchor branch: without a detected
                    # encoding the text cannot be decoded, so skip it
                    # instead of raising inside unicode().
                    return
                udata = unicode(data, detected['encoding'])
                self.urlinfo['titlegen'].append(udata[:16].encode('utf-8'))

                if len(udata) > 32:
                    self.urlinfo['profile']['2'].append(
                        (udata[:32] + u"...").encode('utf-8'))

                for key in doclex.lex(data):
                    self.urlinfo['keys']['3'].append(key)

        except Exception:
            # Report and carry on; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()