def has_similar(self, url):
    '''Check whether ``url``, or a URL close to it, is already stored.

    Returns a 3-tuple ``(found, dismatch_list, index)``:

    * ``(False, [], [])`` when ``url`` is the empty string;
    * ``(True, [], self.search_url_index(url))`` when
      ``self.has_node(url)`` is true;
    * otherwise every entry of ``self.treeContent`` is scored against
      ``url`` and the best-scoring entry is remembered.  If its score is
      strictly greater than ``self.SIMILAR_THRESHOLD`` the result is
      ``(True, <dismatch list of that entry>,
      self.search_url_index(<that entry>))``; if not, the result is
      ``(False, <dismatch list of the best entry seen, or []>, [])``.

    The score of an entry is the larger of
    ``fuzz.ratio(url, entry) / 100.0`` and the rate returned by
    ``up.url_list_compare`` for the two split URLs.
    '''
    if url == "":
        return (False, [], [])
    if self.has_node(url):
        return (True, [], self.search_url_index(url))

    url_split_list, _ = up.url_split(url)
    # The extension of ``url`` does not change while scanning candidates.
    url_ext = os.path.splitext(url)[1]

    max_rate = 0
    max_list = []
    max_url = ""
    for my_url in self.treeContent:
        # NOTE(review): this compares the raw fuzz.ratio value with the
        # module-level SIMILAR_THRESHOLD, while the checks below divide
        # by 100.0 and use self.SIMILAR_THRESHOLD -- confirm that both
        # thresholds are on the intended scale.
        if fuzz.ratio(os.path.splitext(my_url)[1], url_ext) < SIMILAR_THRESHOLD:
            # File extensions are too different: not a candidate.
            continue

        # Computed once and reused for both the filter and the score.
        fuzzy_rate = fuzz.ratio(url, my_url) / 100.0
        if fuzzy_rate < self.SIMILAR_THRESHOLD:
            continue

        my_url_list, _ = up.url_split(my_url)
        rate, dismatch_list = up.url_list_compare(url_split_list, my_url_list)
        rate = max(fuzzy_rate, rate)
        if max_rate < rate:
            max_rate = rate
            max_list = dismatch_list
            max_url = my_url

    if max_rate > self.SIMILAR_THRESHOLD and max_url != "":
        # Single-argument print(...) calls produce the same text as the
        # former print statement and also parse under Python 3.
        print("SIMILAR URL(Tree.has_similar):")
        print("%s " % (max_url,))
        print(url)
        # Return the mismatch list that belongs to the best match
        # (max_url), not the one left over from the last candidate
        # examined by the loop.
        return (True, max_list, self.search_url_index(max_url))
    return (False, max_list, [])
def has_similar(self, url):
    '''Check whether ``url``, or a URL close to it, is already stored.

    Returns a 3-tuple ``(found, dismatch_list, index)``:

    * ``(False, [], [])`` when ``url`` is the empty string;
    * ``(True, [], self.search_url_index(url))`` when
      ``self.has_node(url)`` is true;
    * otherwise every entry of ``self.treeContent`` is scored against
      ``url`` and the best-scoring entry is remembered.  If its score is
      strictly greater than ``self.SIMILAR_THRESHOLD`` the result is
      ``(True, <dismatch list of that entry>,
      self.search_url_index(<that entry>))``; if not, the result is
      ``(False, <dismatch list of the best entry seen, or []>, [])``.

    The score of an entry is the larger of
    ``fuzz.ratio(url, entry) / 100.0`` and the rate returned by
    ``up.url_list_compare`` for the two split URLs.
    '''
    if url == "":
        return (False, [], [])
    if self.has_node(url):
        return (True, [], self.search_url_index(url))

    url_split_list, _ = up.url_split(url)
    # The extension of ``url`` does not change while scanning candidates.
    url_ext = os.path.splitext(url)[1]

    max_rate = 0
    max_list = []
    max_url = ""
    for my_url in self.treeContent:
        # NOTE(review): this compares the raw fuzz.ratio value with the
        # module-level SIMILAR_THRESHOLD, while the checks below divide
        # by 100.0 and use self.SIMILAR_THRESHOLD -- confirm that both
        # thresholds are on the intended scale.
        if fuzz.ratio(os.path.splitext(my_url)[1], url_ext) < SIMILAR_THRESHOLD:
            # File extensions are too different: not a candidate.
            continue

        # Computed once and reused for both the filter and the score.
        fuzzy_rate = fuzz.ratio(url, my_url) / 100.0
        if fuzzy_rate < self.SIMILAR_THRESHOLD:
            continue

        my_url_list, _ = up.url_split(my_url)
        rate, dismatch_list = up.url_list_compare(url_split_list, my_url_list)
        rate = max(fuzzy_rate, rate)
        if max_rate < rate:
            max_rate = rate
            max_list = dismatch_list
            max_url = my_url

    if max_rate > self.SIMILAR_THRESHOLD and max_url != "":
        # Single-argument print(...) calls produce the same text as the
        # former print statement and also parse under Python 3.
        print("SIMILAR URL(Tree.has_similar):")
        print("%s " % (max_url,))
        print(url)
        # Return the mismatch list that belongs to the best match
        # (max_url), not the one left over from the last candidate
        # examined by the loop.
        return (True, max_list, self.search_url_index(max_url))
    return (False, max_list, [])
def find_similar_and_replace(self, url, indSet, context=None):
    '''Rewrite ``url`` against its most similar stored URL among ``indSet``.

    ``indSet`` is an iterable of indices into ``self.treeContent``.
    ``context`` is a mapping with ``'front'`` and ``'back'`` entries that
    are compared with the surrounding text of each candidate; when it is
    omitted, a single space is used for both.

    Each candidate is scored with the larger of
    ``fuzz.ratio(url, candidate) / 100.0`` and the rate from
    ``up.similar_ratio`` (each plus 0.001), multiplied by the product of
    the front and back context ratios (each ``ratio / 100.0 + 0.001``).
    A candidate becomes the current best only if its score is at least
    ``self.SIMILAR_THRESHOLD`` and strictly above the best so far.

    Returns ``""`` when ``url`` is empty or no candidate qualifies.
    Otherwise returns ``url`` after ``up.replace_url`` has been applied
    for every item of the dismatch list that ``up.url_list_compare``
    reports between ``url`` and the best candidate.
    '''
    if url == "":
        return ""
    if context is None:
        # Built per call so no dict object is shared between calls.
        context = {'front': ' ', 'back': ' '}

    max_rate = 0
    max_ind = -2
    # Split once; the result is reused after the candidate scan.
    url_split_list, _ = up.url_split(url)
    url_ext = os.path.splitext(url)[1]

    for ind in indSet:
        my_url = self.treeContent[ind]
        # NOTE(review): raw fuzz.ratio value compared with the
        # module-level SIMILAR_THRESHOLD, unlike the checks below that
        # use self.SIMILAR_THRESHOLD on a /100.0 scale -- confirm.
        if fuzz.ratio(os.path.splitext(my_url)[1], url_ext) < SIMILAR_THRESHOLD:
            # File extensions are too different: not a candidate.
            continue

        DIYrate, _ = up.similar_ratio(url, my_url)
        rate = max(fuzz.ratio(url, my_url) / 100.0 + 0.001, DIYrate + 0.001)
        # Skip early so the surrounding text is only fetched for
        # candidates that can still beat the current best.
        if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
            continue

        front, back, _ = self.get_surround_text(ind)
        context_similar_ratio = (
            (fuzz.ratio(front, context['front']) / 100.0 + 0.001)
            * (fuzz.ratio(back, context['back']) / 100.0 + 0.001)
        )
        rate = rate * context_similar_ratio
        if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
            continue
        max_rate = rate
        max_ind = ind

    # max_ind now refers to the candidate most similar to the given url.
    if max_rate == 0 or max_rate < self.SIMILAR_THRESHOLD or max_ind < 0:
        print("No similar")
        return ""

    simi_url_list, _ = up.url_split(self.treeContent[max_ind])
    _, dismatch_list = up.url_list_compare(url_split_list, simi_url_list)
    tmp_url = url
    for item in dismatch_list:
        tmp_url = up.replace_url(tmp_url, item)

    # Single-argument print(...) calls produce the same text as the
    # former print statements and also parse under Python 3.
    print("SIMILAR URL(Tree.find_similar_and_replace):")
    print("%s " % (self.treeContent[max_ind],))
    print(url)
    print("replace: %s" % (tmp_url,))
    return tmp_url
def find_similar_and_replace(self, url, indSet, context=None):
    '''Rewrite ``url`` against its most similar stored URL among ``indSet``.

    ``indSet`` is an iterable of indices into ``self.treeContent``.
    ``context`` is a mapping with ``'front'`` and ``'back'`` entries that
    are compared with the surrounding text of each candidate; when it is
    omitted, a single space is used for both.

    Each candidate is scored with the larger of
    ``fuzz.ratio(url, candidate) / 100.0`` and the rate from
    ``up.similar_ratio`` (each plus 0.001), multiplied by the product of
    the front and back context ratios (each ``ratio / 100.0 + 0.001``).
    A candidate becomes the current best only if its score is at least
    ``self.SIMILAR_THRESHOLD`` and strictly above the best so far.

    Returns ``""`` when ``url`` is empty or no candidate qualifies.
    Otherwise returns ``url`` after ``up.replace_url`` has been applied
    for every item of the dismatch list that ``up.url_list_compare``
    reports between ``url`` and the best candidate.
    '''
    if url == "":
        return ""
    if context is None:
        # Built per call so no dict object is shared between calls.
        context = {'front': ' ', 'back': ' '}

    max_rate = 0
    max_ind = -2
    # Split once; the result is reused after the candidate scan.
    url_split_list, _ = up.url_split(url)
    url_ext = os.path.splitext(url)[1]

    for ind in indSet:
        my_url = self.treeContent[ind]
        # NOTE(review): raw fuzz.ratio value compared with the
        # module-level SIMILAR_THRESHOLD, unlike the checks below that
        # use self.SIMILAR_THRESHOLD on a /100.0 scale -- confirm.
        if fuzz.ratio(os.path.splitext(my_url)[1], url_ext) < SIMILAR_THRESHOLD:
            # File extensions are too different: not a candidate.
            continue

        DIYrate, _ = up.similar_ratio(url, my_url)
        rate = max(fuzz.ratio(url, my_url) / 100.0 + 0.001, DIYrate + 0.001)
        # Skip early so the surrounding text is only fetched for
        # candidates that can still beat the current best.
        if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
            continue

        front, back, _ = self.get_surround_text(ind)
        context_similar_ratio = (
            (fuzz.ratio(front, context['front']) / 100.0 + 0.001)
            * (fuzz.ratio(back, context['back']) / 100.0 + 0.001)
        )
        rate = rate * context_similar_ratio
        if rate < self.SIMILAR_THRESHOLD or rate <= max_rate:
            continue
        max_rate = rate
        max_ind = ind

    # max_ind now refers to the candidate most similar to the given url.
    if max_rate == 0 or max_rate < self.SIMILAR_THRESHOLD or max_ind < 0:
        print("No similar")
        return ""

    simi_url_list, _ = up.url_split(self.treeContent[max_ind])
    _, dismatch_list = up.url_list_compare(url_split_list, simi_url_list)
    tmp_url = url
    for item in dismatch_list:
        tmp_url = up.replace_url(tmp_url, item)

    # Single-argument print(...) calls produce the same text as the
    # former print statements and also parse under Python 3.
    print("SIMILAR URL(Tree.find_similar_and_replace):")
    print("%s " % (self.treeContent[max_ind],))
    print(url)
    print("replace: %s" % (tmp_url,))
    return tmp_url