Example #1
async def post(self):
    # 'query' holds either a URL or a title, depending on the 'type' argument
    query = self.get_argument('query')
    if self.get_argument('type') == 'url':
        doc_id = await indexer.index('user-stories', url=query, refresh=True)
    else:
        doc_id = await indexer.index('user-stories', title=query, refresh=True)
    if doc_id is not None:
        # Send the user to the interactive search page for the new document
        url = '/interactive/search/' + doc_id['index'] + '/' + doc_id['id']
        self.redirect(url, permanent=False)
    else:
        self.send_error(
            500,
            reason='Failed to index remote site. (Remote site probably returned an error)'
        )
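
For context, a minimal sketch of how a handler like this might be mounted in a Tornado application. The IndexHandler name and the /interactive/index route are assumptions for illustration, not taken from the example:

import tornado.ioloop
import tornado.web

class IndexHandler(tornado.web.RequestHandler):  # hypothetical wrapper class
    async def post(self):
        ...  # body as in Example #1

if __name__ == '__main__':
    app = tornado.web.Application([
        (r'/interactive/index', IndexHandler),  # assumed route
    ])
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()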
Example #2
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        h = s.find_all('a', {'class': 'gs-c-promo-heading'})
        if len(h) > 0:
            # A nested tag's repr starts with '<'; a plain string's does not,
            # so pick whichever child actually holds the headline text
            if not str(h[0].contents[0].contents[0]).startswith('<'):
                title = h[0].contents[0].contents[0]
            else:
                title = h[0].contents[1].contents[0]
            d = s.find_all('p', {'class': 'gs-c-promo-summary'})
            if len(d) > 0:
                desc = d[0].contents[0]
            url = h[0]['href']
            # Skip site navigation links and in-page anchors
            if url == '/radio/player/bbc_world_service' or url == '/news/world_radio_and_tv':
                continue
            elif url.startswith('#'):
                continue
            elif url.startswith('/'):
                url = urljoin('https://www.bbc.com/news', url)
            doc_id = loop.run_until_complete(
                indexer.index('stories',
                              url=url,
                              title=title,
                              description=desc,
                              check_all=False))
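
The class filters here pass a dict as find_all's second (attrs) argument, which is valid BeautifulSoup; the keyword forms are equivalent. A self-contained check with made-up markup:

from bs4 import BeautifulSoup

html = '<div><a class="gs-c-promo-heading" href="/news/x">Headline</a></div>'
soup = BeautifulSoup(html, 'html.parser')

links = soup.find_all('a', {'class': 'gs-c-promo-heading'})  # attrs dict, as above
links_kw = soup.find_all('a', class_='gs-c-promo-heading')   # keyword equivalent
assert links == links_kw
print(links[0]['href'])  # -> /news/x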
Example #3
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''
        links = s.find_all('a', {'class': 'article-icon'})
        # Filter by class only: pass the dict as 'attrs', not as the tag name
        titles = s.find_all(attrs={'class': 'headline'})
        descriptions = s.find_all(attrs={'class': 'article-intro'})
        if len(links) > 0:
            url = links[0]['href']
        if len(titles) > 0:
            # get_text() flattens the tag to a string; .contents would be a list
            title = titles[0].get_text(strip=True)
        if len(descriptions) > 0:
            desc = descriptions[0].find(text=True, recursive=False)  # direct text child only
        # Skip in-page anchors and empty links; resolve relative URLs
        if url.startswith('#'):
            continue
        elif url == '':
            continue
        elif url.startswith('/'):
            url = urljoin('http://www.spiegel.de/international/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
Example #4
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''
        links = s.find_all('a')
        # Filter by class only: pass the dict as 'attrs', not as the tag name
        titles = s.find_all(attrs={'class': 'esl82me2'})
        descriptions = s.find_all(attrs={'class': 'e1n8kpyg0'})
        bulletpoints = s.find_all(attrs={'class': 'e1n8kpyg1'})
        if len(links) > 0:
            url = links[0]['href']
        if len(titles) > 0:
            title = titles[0].get_text(strip=True)
        if len(descriptions) > 0:
            desc = descriptions[0].get_text(strip=True)
        elif len(bulletpoints) > 0:
            # Flatten the bullet list into one string; the original
            # 'desc += b.contents' would raise TypeError with desc = None
            desc = bulletpoints[0].get_text(' ', strip=True)
        if url.startswith('#'):
            continue
        elif url == '':
            continue
        elif url.startswith('/'):
            url = urljoin('https://nytimes.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
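
A recurring pitfall in these scrapers is that tag.contents is a list of child nodes, not a string; get_text() flattens nested markup to plain text. A quick illustration with made-up markup:

from bs4 import BeautifulSoup

tag = BeautifulSoup('<h2 class="esl82me2"><span>Big</span> news</h2>',
                    'html.parser').h2
print(tag.contents)    # -> [<span>Big</span>, ' news'] (a list, not a string)
print(tag.get_text())  # -> 'Big news'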
Example #5
def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        title = ''
        try:
            # Skip menu, overlay, and social-share links
            if ('menu__link' not in h['class']
                    and 'hide_overlay' not in h['class']):
                if not h['class'][0].startswith('social-share-'):
                    try:
                        title = h.contents[0]
                    except IndexError:  # empty tag
                        pass
        except KeyError:  # no 'class' attribute at all
            try:
                title = h.contents[0]
            except IndexError:
                pass
        try:
            url = h['href']
        except KeyError:  # not a real link
            continue
        if url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('http://thehill.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
Example #6
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''  # initialise so a story without links can't hit an unbound name
        h = s.find_all('a')
        # Filter by class only: pass the dict as 'attrs', not as the tag name
        t = s.find_all(attrs={'class': 'cd__headline-text'})
        if len(h) > 0:
            url = h[0]['href']
        if len(t) > 0:
            title = t[0].get_text(strip=True)
        if url == '' or url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://cnn.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
Example #7
def __isolate_content(self, links):
    for h in links:
        desc = None
        title = ''
        url = ''  # reset each pass so an empty <a> can't reuse the previous URL
        try:
            # A nested tag's repr starts with '<'; a plain string's does not
            if str(h.contents[0]).startswith('<'):
                if not str(h.contents[0].contents[0]).startswith('<'):
                    title = h.contents[0].contents[0]
                    try:
                        # The summary, when present, sits next to the link
                        if 'top-sec-desc' in h.parent.contents[3]['class']:
                            desc = h.parent.contents[3].contents[0]
                    except KeyError:
                        pass
                    except TypeError:
                        pass
                elif not h.contents[0].contents[0].contents[0].startswith(
                        '<'):
                    title = h.contents[0].contents[0].contents[0]
        except IndexError:  # malformed HTML
            pass
        except TypeError:
            pass
        try:
            url = h['href']
            if url.startswith('/topics'):
                continue
            elif url.startswith('#'):
                continue
            elif url.startswith('/'):
                url = urljoin('https://www.aljazeera.com/', url)
        except KeyError:  # Yes, here at Al Jazeera we use empty <a> tags!
            pass
        if not url.startswith('http') or title == '' or title == '\n':
            continue
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
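
All of these scrapers resolve site-relative links with urllib.parse.urljoin; its behaviour for the cases checked above:

from urllib.parse import urljoin

base = 'https://www.aljazeera.com/'
print(urljoin(base, '/news/example'))            # -> https://www.aljazeera.com/news/example
print(urljoin(base, 'news/example'))             # -> https://www.aljazeera.com/news/example
print(urljoin(base, 'https://other.example/x'))  # absolute URLs pass through unchanged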
Example #8
def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        # A nested tag's repr starts with '<'; a plain string's does not.
        # (The original try/except TypeError could never fire here, since
        # indexing h.contents raises IndexError, not TypeError.)
        if not str(h.contents[0]).startswith('<'):
            title = h.contents[0]
        elif str(h.contents[0].contents[0]).startswith('<'):
            title = h.contents[1].contents[0]
        else:
            title = h.contents[0].contents[0]
        url = h['href']
        if url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://www.theguardian.com', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
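
The repeated str(...).startswith('<') tests ask whether a child node is a nested tag or a plain string; isinstance checks against bs4's element types express the same thing directly. A sketch with made-up markup:

from bs4 import BeautifulSoup, NavigableString, Tag

a = BeautifulSoup('<a href="/x"><span>Nested</span> text</a>',
                  'html.parser').a
for child in a.contents:
    if isinstance(child, Tag):              # repr starts with '<'
        print('tag:', child.get_text())     # -> tag: Nested
    elif isinstance(child, NavigableString):
        print('string:', child)             # -> string:  text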
Example #9
def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        title = ''
        url = ''  # initialise so unexpected tags are skipped below
        if h.name == 'a':
            title = h.contents[0]
            url = h['href']
        elif h.name == 'div':
            # The div wraps the link; the blurb (if any) sits beside it
            title = h.contents[0].contents[0]
            url = h.contents[0]['href']
            # Filter by class only: pass the dict as 'attrs', not as the tag name
            b = h.parent.find_all(attrs={'class': 'blurb'})
            if len(b) == 1:
                desc = b[0].contents[0]
        if url == '' or url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://www.washingtonpost.com', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories',
                          url=url,
                          title=title,
                          description=desc,
                          check_all=False))
Example #10
def clean_url(url):
    # Normalise a URL for comparison: drop the scheme, query string,
    # fragment, and a leading 'www.' host prefix
    if ':' in url:
        url = ':'.join(url.split(':')[1:])
    if '?' in url:
        url = '?'.join(url.split('?')[:1])
    if '#' in url:
        url = '#'.join(url.split('#')[:1])
    if url.startswith('//www.'):
        url = '//' + '.'.join(url.split('.')[1:])
    return url
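
Given the function above, a quick trace of what gets stripped:

print(clean_url('https://www.example.com/story?id=1#top'))
# -> //example.com/story  (scheme, query string, fragment, and 'www.' removed)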


loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Walk the hot submissions in /r/worldnews and index any that point at a known site
for s in reddit.subreddit('worldnews').hot(limit=30):
    if test_in_sites(s.url):
        doc_id = loop.run_until_complete(
            indexer.index('user-stories', url=s.url, refresh=True))
        if doc_id is None:
            continue
        # Fetch the freshly indexed document back out of Elasticsearch
        query = {"query": {"match": {"_id": doc_id['id']}}}
        queries = es.search(index=doc_id['index'], body=query)
        if queries['hits']['total'] > 0:
            # Use its content to search the story indices for related coverage
            query = get_query(queries['hits']['hits'][0]['_source'])
            results = es.search(index="stories*", body=query)
            stories = []
            opinions = []
            urls = [clean_url(s.url)]
            for r in results['hits']['hits']:
                cleaned_url = clean_url(r['_source']['url'])
                # Route opinion and blog pieces into their own bucket
                if ('/opinion/' in r['_source']['url']
                        or '/opinions/' in r['_source']['url']
                        or '/blogs/' in r['_source']['url']