Exemplo n.º 1
0
 def has_similar(self, url):
     '''
     '''
     if url == "":
         return (False, [], [])
     if self.has_node(url):
         return (True, [], self.search_url_index(url))
     else:
         max_rate = 0
         url_split_list, host_po = up.url_split(url)
         max_list = []
         max_url = ""
         rate = 0
         for my_url in self.treeContent:
             if fuzz.ratio(
                     os.path.splitext(my_url)[1],
                     os.path.splitext(url)[1]) < SIMILAR_THRESHOLD:
                 # if not has the same expend name
                 continue
             if (fuzz.ratio(url, my_url) / 100.0) < self.SIMILAR_THRESHOLD:
                 continue
             my_url_list, my_host_po = up.url_split(my_url)
             rate, dismatch_list = up.url_list_compare(
                 url_split_list, my_url_list)
             rate = max(fuzz.ratio(url, my_url) / 100.0, rate)
             if max_rate < rate:
                 max_rate = rate
                 max_list = dismatch_list
                 max_url = my_url
         if max_rate > self.SIMILAR_THRESHOLD and max_url != "":
             print "SIMILAR URL(Tree.has_similar):\n", max_url, "\n", url
             return (True, dismatch_list, self.search_url_index(max_url))
         return (False, max_list, [])
Exemplo n.º 2
0
 def has_similar(self, url):
     '''
     '''
     if url == "":
         return (False,[], [])
     if self.has_node( url):
         return (True,[], self.search_url_index(url))
     else:
         max_rate = 0
         url_split_list, host_po = up.url_split( url)
         max_list =[]
         max_url = ""
         rate = 0
         for my_url in self.treeContent:
             if fuzz.ratio( os.path.splitext( my_url)[1], os.path.splitext( url)[1])<SIMILAR_THRESHOLD:
             # if not has the same expend name
                 continue
             if (fuzz.ratio(url, my_url)/100.0)< self.SIMILAR_THRESHOLD:
                 continue
             my_url_list, my_host_po = up.url_split(my_url)
             rate, dismatch_list = up.url_list_compare(url_split_list, my_url_list)
             rate = max( fuzz.ratio(url, my_url)/100.0, rate)
             if max_rate < rate:
                 max_rate = rate
                 max_list = dismatch_list
                 max_url = my_url
         if max_rate > self.SIMILAR_THRESHOLD and max_url != "":
             print "SIMILAR URL(Tree.has_similar):\n",max_url,"\n",url
             return (True, dismatch_list, self.search_url_index(max_url))
         return (False, max_list, [])
Exemplo n.º 3
0
    def find_similar_and_replace(self,
                                 url,
                                 indSet,
                                 context={
                                     'front': ' ',
                                     'back': ' '
                                 }):
        if url == "":
            return ""
        else:
            max_rate = 0
            max_ind = -2
            url_split_list, url_host_po = up.url_split(url)
            for ind in indSet:
                my_url = self.treeContent[ind]
                if fuzz.ratio(
                        os.path.splitext(my_url)[1],
                        os.path.splitext(url)[1]) < SIMILAR_THRESHOLD:
                    # if not has the same expend name
                    continue
                DIYrate, dis_list = up.similar_ratio(url, my_url)
                rate = max(
                    fuzz.ratio(url, my_url) / 100.0 + 0.001, DIYrate + 0.001)
                if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
                    continue
                front, back, start = self.get_surround_text(ind)
                context_similar_ratio = (fuzz.ratio(front, context['front'])/100.0+0.001) \
                * (fuzz.ratio(back, context['back'])/100.0+0.001)
                rate = rate * context_similar_ratio
                if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
                    continue
                else:
                    max_rate = rate
                    max_ind = ind

            # max_ind is the one that most similar to the given url
            # replace
            if max_rate == 0 or max_rate < self.SIMILAR_THRESHOLD or max_ind < 0:
                print "No similar"
                return ""
            url_split_list, host_po = up.url_split(url)
            simi_url_list, simi_url_host_po = up.url_split(
                self.treeContent[max_ind])
            DIYrate, dismatch_list = up.url_list_compare(
                url_split_list, simi_url_list)
            tmp_url = url
            for item in dismatch_list:
                tmp_url = up.replace_url(tmp_url, item)
            print "SIMILAR URL(Tree.find_similar_and_replace):\n", self.treeContent[
                max_ind], "\n", url
            print "replace:", tmp_url
            #            if len(dismatch_list)>=2:
            #                print "Multi-Replacement:\n",url
            #                print self.treeContent[ max_ind]
            #                input('Multi-Replacement:\n')
            return tmp_url
Exemplo n.º 4
0
    def find_similar_and_replace(self, url, indSet, context={'front':' ', 'back':' '}):
        if url == "":
            return ""
        else:
            max_rate = 0
            max_ind = -2
            url_split_list, url_host_po = up.url_split( url)
            for ind in indSet:
                my_url = self.treeContent[ ind]
                if fuzz.ratio( os.path.splitext( my_url)[1], os.path.splitext( url)[1])<SIMILAR_THRESHOLD:
                    # if not has the same expend name
                    continue
                DIYrate, dis_list = up.similar_ratio( url, my_url)
                rate = max( fuzz.ratio(url, my_url)/100.0 +0.001, DIYrate+0.001)
                if rate < self.SIMILAR_THRESHOLD or rate <=max_rate:
                    continue
                front, back, start = self.get_surround_text( ind)
                context_similar_ratio = (fuzz.ratio(front, context['front'])/100.0+0.001) \
                * (fuzz.ratio(back, context['back'])/100.0+0.001)
                rate = rate* context_similar_ratio
                if rate < self.SIMILAR_THRESHOLD or rate <=max_rate:
                    continue
                else:
                    max_rate = rate
                    max_ind = ind
                    
            # max_ind is the one that most similar to the given url
            # replace
            if max_rate == 0 or max_rate< self.SIMILAR_THRESHOLD or max_ind<0:
                print "No similar"
                return ""
            url_split_list, host_po = up.url_split( url)
            simi_url_list, simi_url_host_po = up.url_split( self.treeContent[ max_ind])
            DIYrate, dismatch_list = up.url_list_compare(url_split_list, simi_url_list)
            tmp_url = url
            for item in dismatch_list:
                tmp_url = up.replace_url(tmp_url, item)
            print "SIMILAR URL(Tree.find_similar_and_replace):\n",self.treeContent[ max_ind],"\n",url
            print "replace:",tmp_url
#            if len(dismatch_list)>=2:
#                print "Multi-Replacement:\n",url
#                print self.treeContent[ max_ind]
#                input('Multi-Replacement:\n')
            return tmp_url