def single_magic(self, method='justext'):
    """Extract title, body, images, publish date and entities from one page.

    Parses ``self.html`` / ``self.url`` into a tree and fills in the
    following attributes on ``self``: ``title``, ``body``, ``img_links``,
    ``body_blob``, ``publish_date`` and ``entities`` (plus ``jt`` when the
    justext path is taken).

    Args:
        method: ``'justext'`` extracts title and body through
            ``justy`` / ``justyTitle`` / ``justyBody``; any other value
            uses ``getBody`` / ``getTitle`` on the parsed tree.
    """
    tree = makeTree(self.html, self.url)

    if method == 'justext':
        # Language detection is best-effort here: fall back to English so
        # that justy() always receives a language.
        try:
            self.get_language(tree)
        except Exception:
            self.lang = 'en'
        self.jt = justy(self.html, self.lang)
        self.title = justyTitle(self.jt)
        # The body must be stored in ``self.body``; assigning it to
        # ``self.title`` would clobber the title and leave ``self.body``
        # unset for the TextBlob construction below.
        self.body = justyBody(self.jt)
    else:
        self.body = normalize(normalize(getBody(tree)))
        # Language detection is only attempted when a body was extracted.
        if self.body:
            self.get_language(tree)
        self.title = getTitle(tree)

    # Skip image sources whose path contains any of these substrings.
    wrong_imgs = ['icon', 'logo', 'advert', 'toolbar', 'footer', 'layout']
    self.img_links = list({
        urljoin(self.source_name, src)
        for src in tree.xpath('//img/@src')
        if not any(marker in src for marker in wrong_imgs)
    })

    self.body_blob = TextBlob(self.body)

    # No early exit: the last non-empty date group wins.
    self.publish_date = ''
    for date_group in get_dates(tree, self.lang):
        if date_group:
            self.publish_date = str(date_group[0])

    self.entities = extract_entities(self.body_blob)
def multi_magic(self, most_similar_tree):
    """Extract title, body and publish date after pruning against a sibling.

    Builds a tree from ``self.html`` / ``self.url``, passes it together
    with ``most_similar_tree`` to ``prune_first``, and stores the results
    on ``self`` as ``multiTitle``, ``multiBody``, ``multiBodyBlob`` and
    ``multi_publish_date``.

    Args:
        most_similar_tree: Second argument handed to ``prune_first``.
    """
    import textblob

    own_tree = makeTree(self.html, self.url)
    pruned = prune_first(own_tree, most_similar_tree)

    self.multiTitle = getTitle(pruned)
    self.multiBody = get_multi_body(pruned)
    self.multiBodyBlob = textblob.TextBlob(self.multiBody)

    # A falsy result from the meta lookup is replaced by an empty string.
    meta_date = get_publish_from_meta(pruned)
    self.multi_publish_date = meta_date if meta_date else ''
#def fn(num): num = 5 it = 0 s = 0 fscores = [] fouten = 0 printing = True wrongs = [] for x in filtered_domains: for case in filtered_domains[x]: try: t = case[2] realy = normalize(''.join( [x for x in t.xpath('//span[@class="x-nc-sel1"]/text()')])) #ys = [x['text'] for x in getTitle(t,False)] ys = [getTitle(t)] #if any([z in y for z in x.split('.')]): if any(['-' in y or '|' in y for y in ys]): fscores.append(1) it += 1 continue f = f1(set(re.sub('[^a-zA-Z0-9]+', ' ', ys[0].lower()).split()), set(re.sub('[^a-zA-Z0-9]+', ' ', realy.lower()).split())) fscores.append(f) y = " ".join(re.sub('[^a-zA-Z0-9]+', ' ', ys[0].lower()).split()) realy = " ".join( re.sub('[^a-zA-Z0-9]+', ' ', realy.lower()).split()) if f < 1: fouten += 1 wrongs.append(it)