Example #1
 def process_pages(self):
     skipped = []
     pbar = ProgressBar(widgets=['Processing pages: ', SimpleProgress()], maxval=len(self.urls)).start()
     i = 0
     
     for (num, url) in self.urls:
         pbar.update(int(num))
         if (num and url):
             html = helpers.get_html(num, url)
             if html is not None:
                 self.urls_with_nums[url] = num
                 soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
                 page = Page(title=soup.title.string, num=num, html=soup.prettify(), url=url, text=soup.body.get_text())
                 page.index = i
                 self.indices_with_pages[i] = page
                 if page.ID not in self.pages_with_ids.keys():
                     self.pages_with_ids[page.ID] = page
                 else:
                     raise RuntimeError('COLLISION: %s collides with %s with hash %s.' % (page.num, self.pages_with_ids[page.ID].num, page.ID))
                 for link in soup.find_all('a'):
                     if link.get('href') and 'mailto:' != link.get('href').strip()[0:7]:
                         page.a.append(link)
                 self.pages.append(page)
                 i += 1
             else:
                 skipped.append(num)
         else:
             skipped.append(num)
     pbar.finish()
     if skipped:
         print "Skipped page(s) %s because of an error." % (', '.join(str(n) for n in skipped))
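
Example 1 constructs Page objects, and Example 2 later reads page.ID, page.a, page.index and page.rank and calls page.normalize_url. The Page class itself is not shown; below is a minimal sketch of what it might look like, assuming the ID is the content hash produced by helpers.page_hash (the real implementation may differ).

# Hypothetical sketch of the Page class assumed by Examples 1 and 2.
from urlparse import urljoin  # Python 2, matching Examples 1 and 2 (urllib.parse on Python 3)

import helpers


class Page(object):
    def __init__(self, title, num, html, url, text):
        self.title = title
        self.num = num
        self.html = html
        self.url = url
        self.text = text
        self.ID = helpers.page_hash(html)  # content hash, used as page identity in Example 2
        self.a = []        # outgoing <a> tags collected by process_pages
        self.index = None  # row/column index into the adjacency matrix
        self.rank = 0.0    # PageRank score filled in by calc_page_ranks

    def normalize_url(self, href):
        # Example 2 calls page.normalize_url(href); resolving the link against
        # the page's own URL is one plausible implementation.
        return urljoin(self.url, href)
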
Example #2
 def calc_page_ranks(self, d=0.85):
     self.adj = numpy.zeros( (len(self.pages_with_ids),len(self.pages_with_ids)) )
     pbar = ProgressBar(widgets=['Processing links: ', SimpleProgress()], maxval=len(self.pages_with_ids.keys())).start()
     progress = 1
     for (ID, page) in self.pages_with_ids.iteritems():
         pbar.update(progress)
         # Build the citation adjacency matrix: adj[i][j] = 1.0 when page i links to page j
         for a in page.a:
             href = a.get('href')
             # normalize URLs
             url = page.normalize_url(href)
             if url in self.S:
                 soup = BeautifulSoup(helpers.get_html(self.urls_with_nums[url]).encode('utf-8', 'ignore'), 'lxml')
                 target_id = helpers.page_hash(soup.prettify())  # hash of the cited page's content
                 if target_id in self.pages_with_ids.keys():
                     #print "%s (#%d) cites %s (#%d)" % (page.num, page.index, self.pages_with_ids[target_id].num, self.pages_with_ids[target_id].index)
                     #print self.urls[int(self.pages_with_ids[target_id].num)-1]
                     self.adj[page.index][self.pages_with_ids[target_id].index] = 1.0
         progress += 1
     # Row-normalize the adjacency matrix so each page's outgoing links sum to 1
     pbar = ProgressBar(widgets=['Normalizing adjacencies: ', SimpleProgress()], maxval=len(self.pages_with_ids.keys())).start()
     progress = 1
     row_sums = numpy.sum(self.adj, axis=1)  # out-degree weight of each page (sum over each row)
     for (ID, page) in self.pages_with_ids.iteritems():
         pbar.update(progress)
         for k in xrange(len(self.adj[page.index])):
             if row_sums[page.index] != 0:
                 self.adj[page.index][k] = self.adj[page.index][k] / row_sums[page.index]
             else:
                 self.adj[page.index][k] = 0.0
         progress += 1  
     pbar.finish()
     numpy.savetxt("adj.txt", self.adj)
     # Run the damped PageRank iteration to a fixed point of ranks = d*adj*ranks + (1-d)
     self.ranks = numpy.ones(len(self.pages_with_ids.keys()))
     z = numpy.ones(len(self.pages_with_ids.keys()))
     b = 1.0 - d
     pbar = ProgressBar(widgets=['Running PageRank: ', SimpleProgress()], maxval=1000).start()
     for m in xrange(1000):
         pbar.update(m)
         u = numpy.dot(self.adj, self.ranks)
         e = d*u
         f = b*z
         self.ranks = e+f
     pbar.finish()
     # Updating ranks of the pages
     pbar = ProgressBar(widgets=['Updating pages with new ranks: ', SimpleProgress()], maxval=len(self.pages_with_ids.keys())).start()
     progress = 1
     for (ID, page) in self.pages_with_ids.iteritems():
         pbar.update(progress)
         page.rank = self.ranks[page.index]
         progress += 1
     pbar.finish()
     numpy.savetxt("page_ranks.txt", self.ranks)
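
The loop above repeatedly applies ranks = d*adj.dot(ranks) + (1-d)*ones until it settles. Below is a small standalone illustration of that same update (not from the source) on a three-page graph; a page with no out-links, whose adjacency row is zeroed as in the code above, ends up with only the 1-d baseline score.

import numpy

d = 0.85
# Row-normalized citation matrix, as built above: adj[i][j] > 0 means page i cites page j.
# Page 0 cites pages 1 and 2, page 1 cites page 0, page 2 cites nothing (zero row).
adj = numpy.array([[0.0, 0.5, 0.5],
                   [1.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0]])
ranks = numpy.ones(3)
for _ in range(1000):
    ranks = d * numpy.dot(adj, ranks) + (1.0 - d) * numpy.ones(3)
print(ranks)  # page 2 converges to exactly 1 - d = 0.15
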
Example #3
def extract_data(url):
    """Given a news article URL, return a dict with its data organised."""

    # author link?

    parsed_url = urlparse(url)
    html = helpers.get_html(url)

    # Defaults so data_dict below is always defined, even if no scraper
    # matches the domain or one of the per-site blocks raises.
    text = headline = subheadline = date = ""
    author = 'Anónimo'
    n_comments = 0
    categories = ['', '']
    labels = []

    if parsed_url.netloc == 'www.canarias7.es':
        try:
            text = html.find(attrs={'itemprop':'articleBody'}).get_text().strip()\
                if html.find(attrs={'itemprop':'articleBody'}) else ""
            headline = html.find(attrs={'itemprop':'headline'}).get_text().strip()\
                if html.find(attrs={'itemprop':'headline'}) else ""
            subheadline = html.find(attrs={'class':'subheadline'}).get_text().strip()\
                if html.find(attrs={'class':'subheadline'}) else ""
            date = html.find(attrs={'class':'datefrom'}).get_text().strip()\
                if html.find(attrs={'class':'datefrom'}) else ""
            author = html.find(attrs={'itemprop':'author'}).get_text().strip().title()\
                if html.find(attrs={'itemprop':'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class':'numComments'}).get_text().strip()\
                if html.find(attrs={'class':'numComments'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [topic.find('a').get_text() if topic.find('a') else ""\
                for topic in html.find_all(attrs={'class':'topic'})]
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'www.laprovincia.es':
        try:
            text = html.find(attrs={'itemprop':'articleBody'}).get_text().strip()\
                if html.find(attrs={'itemprop':'articleBody'}) else ""
            headline = html.find(attrs={'itemprop':'headline'}).get_text().strip()\
                if html.find(attrs={'itemprop':'headline'}) else ""
            subheadline = html.find(attrs={'itemprop':'description'}).get_text().strip()\
                if html.find(attrs={'itemprop':'description'}) else ""
            date = html.find(attrs={'itemprop':'dateCreated'}).get_text().split('|')[0].strip()\
                if html.find(attrs={'itemprop':'dateCreated'}) else ""
            author = html.find(attrs={'itemprop':'author'}).get_text().strip().title()\
                if html.find(attrs={'itemprop':'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class':'textveces'}).get_text().strip()\
                if html.find(attrs={'class':'textveces'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [x.get_text() for x in html.find(attrs={'id':'listaTags'}).findChildren('a')[1:]]\
                if html.find(attrs={'id':'listaTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'www.eldia.es':
        try:
            text = html.find(attrs={'itemprop':'articleBody'}).get_text().strip()\
                if html.find(attrs={'itemprop':'articleBody'}) else ""
            headline = html.find(attrs={'itemprop':'headline'}).get_text().strip()\
                if html.find(attrs={'itemprop':'headline'}) else ""
            subheadline = html.find(attrs={'itemprop':'description'}).get_text().strip()\
                if html.find(attrs={'itemprop':'description'}) else ""
            date = html.find(attrs={'itemprop':'dateCreated'}).get_text().split('|')[0].strip()\
                if html.find(attrs={'itemprop':'dateCreated'}) else ""
            author = html.find(attrs={'itemprop':'author'}).get_text().strip().title()\
                if html.find(attrs={'itemprop':'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class':'textveces'}).get_text().strip()\
                if html.find(attrs={'class':'textveces'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [x.get_text() for x in html.find(attrs={'id':'listaTags'}).findChildren('a')[1:]]\
                if html.find(attrs={'id':'listaTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'www.noticanarias.com':
        try:
            text = ' '.join([x.get_text().strip() for x in html.find(attrs={'itemprop':'articleBody'}).findChildren('p')])\
                if html.find(attrs={'itemprop':'articleBody'}) else ""
            headline = html.find(attrs={'itemprop':'headline'}).get_text().strip()\
                if html.find(attrs={'itemprop':'headline'}) else ""
            subheadline = ""
            date = html.find(attrs={'class':'vw-post-date updated'}).findChildren('time')[0]['datetime'].split('T')[0].strip()\
                if html.find(attrs={'class':'vw-post-date updated'}) else ""
            author = html.find(attrs={'itemprop':'name'}).get_text().strip().title()\
                if html.find(attrs={'itemprop':'name'}) else 'Anónimo'
            n_comments = ""
            categories = ['','']
            labels = [x.get_text() for x in html.find_all('a',attrs={'rel':'tag'})]\
                if html.find_all('a',attrs={'rel':'tag'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'www.canarias24horas.com':
        try:
            text = html.find(attrs={'class':'itemFullText'}).get_text().strip()\
                if html.find(attrs={'class':'itemFullText'}) else ""
            headline = html.find(attrs={'class':'itemTitle'}).get_text().strip()\
                if html.find(attrs={'class':'itemTitle'}) else ""
            subheadline = html.find(attrs={'class':'itemIntroText'}).get_text().strip()\
                if html.find(attrs={'class':'itemIntroText'}) else ""
            date = html.find(attrs={'class':'gkDate'}).get_text().strip()\
                if html.find(attrs={'class':'gkDate'}) else ""
            author = html.find(attrs={'class':'itemAuthor'}).get_text().strip().title().split()[-1]\
                if html.find(attrs={'class':'itemAuthor'}) else 'Anónimo'
            n_comments = ""
            categories = parsed_url.path.split('/')[1:3]
            labels = [topic.get_text() for topic in html.find(attrs={'class':'itemTags'}).findChildren('a')]\
                if html.find(attrs={'class':'itemTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'canariasnoticias.es':
        try:
            text = ' '.join([x.get_text().strip() for x in html.find(attrs={'class':'noticia-body'}).findChildren('p')])\
                if html.find(attrs={'class':'noticia-body'}) else ""
            headline = html.find('h1', attrs={'class':'title'}).get_text().strip()\
                if html.find('h1', attrs={'class':'title'}) else ""
            subheadline = html.find('h3', attrs={'class':'subtitle'}).get_text().strip()\
                if html.find('h3', attrs={'class':'subtitle'}) else ""
            date = html.find(attrs={'class':'date'}).get_text().strip()\
                if html.find(attrs={'class':'date'}) else ""
            author = html.find(attrs={'class':'author'}).get_text().strip().title()\
                if html.find(attrs={'class':'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class':'comment-count'}).get_text().strip()\
                if html.find(attrs={'class':'comment-count'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = []
        except Exception as e:
            print(e, parsed_url.path[1:])

    if parsed_url.netloc == 'tribunadecanarias.es':
        try:
            text = ' '.join([x.get_text().strip() for x in html.find(attrs={'itemprop':'articleBody'}).findChildren('p')])\
                if html.find(attrs={'itemprop':'articleBody'}) else ""
            headline = html.find(attrs={'itemprop':'headline'}).get_text().strip()\
                if html.find(attrs={'itemprop':'headline'}) else ""
            subheadline = html.find(attrs={'class':'subheadline'}).get_text().strip()\
                if html.find(attrs={'class':'subheadline'}) else ""
            date = html.find(attrs={'id':'t1'}).get_text().strip()\
                if html.find(attrs={'id':'t1'}) else datetime.datetime.now().strftime('%Y-%m-%d')
            author = html.find(attrs={'itemprop':'author'}).get_text().strip().title()\
                if html.find(attrs={'itemprop':'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class':'numComments'}).get_text().strip()\
                if html.find(attrs={'class':'numComments'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = []
        except Exception as e:
            print(e, parsed_url.path[1:])

    
    
    # save categories in one
    # extracted data in dict form
    data_dict = {   
                    'newspaper':parsed_url.netloc,
                    'news_link':parsed_url.path[1:],
                    'headline':headline,
                    'subhead':subheadline,
                    'author':author,
                    'date':date,
                    'raw_text':text,
                    'n_comments':n_comments,
                    'main_cat':categories[1],
                    'sub_cat':categories[0],
                    'labels':labels
                }
    
    return data_dict
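
A hypothetical call to extract_data follows; the article URL is made up for illustration, and the keys match data_dict above.

# Hypothetical usage; the article URL below is invented for illustration only.
article = extract_data('https://www.eldia.es/canarias/2020/01/01/ejemplo-de-noticia.html')
print(article['newspaper'], article['headline'], article['author'], article['date'])
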
Example #4
 def test_get_html(self):
     self.assertIsNotNone(helpers.get_html(self.url))
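
Example 4 is a single assertion from a unittest.TestCase, so self.url presumably comes from a setUp fixture. A minimal sketch of the surrounding test class, using a hypothetical fixture URL:

import unittest

import helpers


class HelpersTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture URL; the real suite presumably points at a known-good page.
        self.url = 'https://example.com/'

    def test_get_html(self):
        self.assertIsNotNone(helpers.get_html(self.url))


if __name__ == '__main__':
    unittest.main()
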
Example #5
def get_links(url, n_links=5):  # note: n_links is currently unused
    """Given a news outlet's main website, return its news links. Where the links live depends on the outlet."""

    # Doesn't get content that loads with JS - a solution would be to query those URLs directly
    # https://gohighbrow.com/scraping-javascript-heavy-websites/ (not implemented)

    parsed_url = urlparse(url)
    html = helpers.get_html(url) #get bs4 object

    links = [] #store news links

    # give parameters to scrape website depending on link
    if parsed_url.netloc == 'www.canarias7.es':
        for link in html.find_all(['h2','h3','div'], attrs={'class':'headline'}):  # include div (more news but more noise)
            if link.parent.has_attr('href'):  # skip entries whose parent has no href (normally voting polls etc.)
                links.append(link.parent['href'])
            

    if parsed_url.netloc == 'www.laprovincia.es':
        for link in html.find_all('a', attrs={'data-tipo':'noticia'}):
            links.append(link['href'])

    if parsed_url.netloc == 'www.eldia.es':
        for link in html.find_all('a', attrs={'data-tipo':'noticia'}):
            links.append(link['href'])
    
    if parsed_url.netloc == 'www.noticanarias.com':
        for link in html.find_all('a', attrs={'itemprop':'url'}):
            links.append(link['href'])

    if parsed_url.netloc == 'www.canarias24horas.com':
        for link in html.find_all('h4', attrs={'class':'nspHeader'}):
            links.append(link.findChildren('a')[0]['href'])
    
    # add data scrapers
    if parsed_url.netloc == 'canariasnoticias.es':
        for link in html.find_all(attrs={'class':'title'}):
            links.append(link.find('a')['href'])

    if parsed_url.netloc == 'www.sanborondon.info':
        for link in html.find_all(attrs={'class':'nspHeader'}):
            links.append(link.find('a')['href'])

    if parsed_url.netloc == 'tribunadecanarias.es':
        for link in html.find_all(attrs={'class':'ns2-title'}):
            links.append(link.find('a')['href'])

    if parsed_url.netloc == 'www.canariasdiario.com':
        for link in html.find_all(attrs={'itemprop':'mainEntityOfPage'}):
            links.append(link['href'])

    if parsed_url.netloc == 'www.europapress.es':
        for link in html.find_all(attrs={'itemprop':'headline'}):
            links.append(link.find('a')['href'])

    if parsed_url.netloc == 'www.efe.com':
        for link in html.find_all('a', attrs={'itemprop':'url'}):
            links.append(link['href'])

    
    return links
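
Examples 5 and 3 compose naturally: collect an outlet's front-page links, then extract structured data from each article. A hedged end-to-end sketch follows; the outlet URL is illustrative, and hrefs are resolved against the front page since some outlets return relative links.

# Hypothetical usage combining get_links (Example 5) and extract_data (Example 3).
from urllib.parse import urljoin  # Examples 3 and 5 are Python 3 style

front_page = 'https://www.eldia.es/'  # one of the outlets handled above
# Some outlets return relative hrefs, so resolve them against the front page first.
articles = [extract_data(urljoin(front_page, link)) for link in get_links(front_page)]
print(len(articles), 'articles scraped from', front_page)
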