Exemplo n.º 1
0
    def single_magic(self, method='justext'):
        """Populate this object's extracted fields from ``self.html``.

        Builds a tree from ``self.html`` / ``self.url`` and then sets
        ``self.title``, ``self.body``, ``self.img_links``, ``self.body_blob``,
        ``self.publish_date`` and ``self.entities``.

        Args:
            method: ``'justext'`` (default) extracts title and body through
                the ``justy*`` helpers; any other value uses the tree-based
                ``getTitle`` / ``getBody`` helpers instead.
        """
        tree = makeTree(self.html, self.url)
        if method == 'justext':
            try:
                self.get_language(tree)
            except Exception:
                # Language detection is best-effort; fall back to English.
                # NOTE(review): presumably get_language sets self.lang on
                # success -- confirm against its definition.
                self.lang = 'en'
            self.jt = justy(self.html, self.lang)
            self.title = justyTitle(self.jt)
            # Fix: the body was previously assigned to self.title, which
            # clobbered the title and left self.body unset for the
            # TextBlob(self.body) call below.
            self.body = justyBody(self.jt)
        else:
            # The double normalize() call is kept exactly as in the original.
            self.body = normalize(normalize(getBody(tree)))

            # Language is only detected when a non-empty body was found.
            if self.body:
                self.get_language(tree)

            self.title = getTitle(tree)

        # Skip images whose src contains any of these substrings.
        wrong_imgs = ('icon', 'logo', 'advert', 'toolbar', 'footer', 'layout')
        # Resolve each remaining src against self.source_name and de-duplicate.
        self.img_links = list({
            urljoin(self.source_name, src)
            for src in tree.xpath('//img/@src')
            if not any(marker in src for marker in wrong_imgs)
        })
        self.body_blob = TextBlob(self.body)

        # The first entry of the LAST non-empty date group wins, as before.
        self.publish_date = ''
        for date_group in get_dates(tree, self.lang):
            if date_group:
                self.publish_date = str(date_group[0])

        self.entities = extract_entities(self.body_blob)
Exemplo n.º 2
0
 def multi_magic(self, most_similar_tree):
     """Fill the ``multi*`` fields from this page after pruning its tree.

     The tree built from ``self.html`` / ``self.url`` is passed through
     ``prune_first`` together with ``most_similar_tree``. The result is used
     to set ``self.multiTitle``, ``self.multiBody``, ``self.multiBodyBlob``
     and ``self.multi_publish_date``.

     Args:
         most_similar_tree: second argument handed to ``prune_first``.
     """
     import textblob

     full_tree = makeTree(self.html, self.url)
     pruned = prune_first(full_tree, most_similar_tree)

     self.multiTitle = getTitle(pruned)
     self.multiBody = get_multi_body(pruned)
     self.multiBodyBlob = textblob.TextBlob(self.multiBody)

     # A falsy meta lookup result is stored as an empty string.
     meta_date = get_publish_from_meta(pruned)
     self.multi_publish_date = meta_date if meta_date else ''
Exemplo n.º 3
0
#def fn(num):
num = 5
it = 0
s = 0
fscores = []
fouten = 0
printing = True
wrongs = []
for x in filtered_domains:
    for case in filtered_domains[x]:
        try:
            t = case[2]
            realy = normalize(''.join(
                [x for x in t.xpath('//span[@class="x-nc-sel1"]/text()')]))
            #ys = [x['text'] for x in getTitle(t,False)]
            ys = [getTitle(t)]
            #if any([z in y for z in x.split('.')]):
            if any(['-' in y or '|' in y for y in ys]):
                fscores.append(1)
                it += 1
                continue

            f = f1(set(re.sub('[^a-zA-Z0-9]+', ' ', ys[0].lower()).split()),
                   set(re.sub('[^a-zA-Z0-9]+', ' ', realy.lower()).split()))
            fscores.append(f)
            y = " ".join(re.sub('[^a-zA-Z0-9]+', ' ', ys[0].lower()).split())
            realy = " ".join(
                re.sub('[^a-zA-Z0-9]+', ' ', realy.lower()).split())
            if f < 1:
                fouten += 1
                wrongs.append(it)