async def post(self):
    query = self.get_argument('query')
    if self.get_argument('type') == 'url':
        doc_id = await indexer.index('user-stories', url=query, refresh=True)
    else:
        doc_id = await indexer.index('user-stories', title=query, refresh=True)
    if doc_id is not None:
        url = '/interactive/search/' + doc_id['index'] + '/' + doc_id['id']
        self.redirect(url, permanent=False)
    else:
        self.send_error(
            500,
            reason='Failed to index remote site. '
                   '(Remote site probably returned an error)')

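# Hedged sketch, not project code: the handler above only relies on
# indexer.index returning a mapping with 'index' and 'id' keys on success,
# or None on failure. This stub fakes that contract so the handler can be
# exercised without a live search backend; the real indexer module is not
# shown here.
import uuid


async def index(index_name, url=None, title=None, description=None,
                check_all=True, refresh=False):
    if url is None and title is None:
        return None  # nothing to index; mirrors the handler's error path
    return {'index': index_name, 'id': uuid.uuid4().hex}
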
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        h = s.find_all('a', {'class': 'gs-c-promo-heading'})
        if len(h) > 0:
            # The headline text is sometimes wrapped in an extra tag; if the
            # first child renders as markup, fall back to the second child.
            if not str(h[0].contents[0].contents[0]).startswith('<'):
                title = h[0].contents[0].contents[0]
            else:
                title = h[0].contents[1].contents[0]
            d = s.find_all('p', {'class': 'gs-c-promo-summary'})
            if len(d) > 0:
                desc = d[0].contents[0]
            url = h[0]['href']
            # Skip the World Service player and the world radio/TV page.
            if (url == '/radio/player/bbc_world_service'
                    or url == '/news/world_radio_and_tv'):
                continue
            elif url.startswith('#'):
                continue
            elif url.startswith('/'):
                url = urljoin('https://www.bbc.com/news', url)
            doc_id = loop.run_until_complete(
                indexer.index('stories', url=url, title=title,
                              description=desc, check_all=False))

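# Hedged illustration of the BeautifulSoup pattern the scrapers here share:
# select promo blocks, pull out the headline text and href, and absolutize
# relative links. The HTML snippet is invented for demonstration; only the
# class names match the BBC scraper above.
from urllib.parse import urljoin

from bs4 import BeautifulSoup

html = '''
<div class="gs-c-promo">
  <a class="gs-c-promo-heading" href="/news/world-12345"><h3>Example headline</h3></a>
  <p class="gs-c-promo-summary">Example summary.</p>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
for promo in soup.find_all('div', {'class': 'gs-c-promo'}):
    heading = promo.find('a', {'class': 'gs-c-promo-heading'})
    summary = promo.find('p', {'class': 'gs-c-promo-summary'})
    title = heading.get_text(strip=True)
    url = urljoin('https://www.bbc.com/news', heading['href'])
    desc = summary.get_text(strip=True) if summary else None
    print(title, url, desc)
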
def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''
        links = s.find_all('a', {'class': 'article-icon'})
        # find_all matches tag names, not attributes, when given a bare
        # dict; class lookups need the attrs keyword.
        titles = s.find_all(attrs={'class': 'headline'})
        descriptions = s.find_all(attrs={'class': 'article-intro'})
        if len(links) > 0:
            url = links[0]['href']
        if len(titles) > 0:
            # .contents is the element's child list, passed through as-is.
            title = titles[0].contents
        if len(descriptions) > 0:
            # Only the element's direct text, not text of nested tags.
            desc = descriptions[0].find(text=True, recursive=False)
        if url.startswith('#'):
            continue
        elif url == '':
            continue
        elif url.startswith('/'):
            url = urljoin('http://www.spiegel.de/international/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''
        links = s.find_all('a')
        titles = s.find_all(attrs={'class': 'esl82me2'})
        descriptions = s.find_all(attrs={'class': 'e1n8kpyg0'})
        bulletpoints = s.find_all(attrs={'class': 'e1n8kpyg1'})
        if len(links) > 0:
            url = links[0]['href']
        if len(titles) > 0:
            title = titles[0].contents
        if len(descriptions) > 0:
            desc = descriptions[0].contents
        elif len(bulletpoints) > 0:
            # Accumulate each bullet's contents; desc must start as an
            # empty list here, not None, or += raises TypeError.
            desc = []
            for b in bulletpoints[0]:
                desc += b.contents
        if url.startswith('#'):
            continue
        elif url == '':
            continue
        elif url.startswith('/'):
            url = urljoin('https://nytimes.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        title = ''
        try:
            # Skip navigation, overlay, and social-share links.
            if ('menu__link' not in h['class']
                    and 'hide_overlay' not in h['class']):
                if not h['class'][0].startswith('social-share-'):
                    try:
                        title = h.contents[0]
                    except IndexError:
                        pass
        except KeyError:
            # Anchors without a class attribute are plain story links.
            try:
                title = h.contents[0]
            except IndexError:
                pass
        try:
            url = h['href']
        except KeyError:
            continue
        if url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('http://thehill.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def __isolate_content(self, stories):
    self._content = []
    for s in stories:
        desc = None
        title = ''
        url = ''
        h = s.find_all('a')
        t = s.find_all(attrs={'class': 'cd__headline-text'})
        if len(h) > 0:
            url = h[0]['href']
        if len(t) > 0:
            title = t[0].contents
        # Skip cards with no link at all, in-page anchors, and make
        # site-relative links absolute.
        if url == '':
            continue
        elif url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://cnn.com/', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def __isolate_content(self, links):
    for h in links:
        desc = None
        title = ''
        # Reset url each pass so a link without an href cannot reuse the
        # previous iteration's URL.
        url = ''
        try:
            if str(h.contents[0]).startswith('<'):
                # The first child is a tag: take its text child if it has
                # one, otherwise descend one more level.
                if not str(h.contents[0].contents[0]).startswith('<'):
                    title = h.contents[0].contents[0]
                    try:
                        if 'top-sec-desc' in h.parent.contents[3]['class']:
                            desc = h.parent.contents[3].contents[0]
                    except KeyError:
                        pass
                    except TypeError:
                        pass
                elif not h.contents[0].contents[0].contents[0].startswith(
                        '<'):
                    title = h.contents[0].contents[0].contents[0]
        except IndexError:
            # malformed HTML
            pass
        except TypeError:
            pass
        try:
            url = h['href']
            if url.startswith('/topics'):
                continue
            elif url.startswith('#'):
                continue
            elif url.startswith('/'):
                url = urljoin('https://www.aljazeera.com/', url)
        except KeyError:
            # Yes, here at Al Jazeera we use empty <a> tags!
            pass
        if not url.startswith('http') or title == '' or title == '\n':
            continue
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

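# Hedged alternative to the .contents chains above: get_text() flattens
# arbitrarily nested markup, and .get() returns None for a missing href,
# so empty <a> tags can be skipped without exception handling. Sketch
# only; the HTML here is invented.
from bs4 import BeautifulSoup

html = '<a href="/news/1"><span><b>Nested headline</b></span></a><a></a>'
soup = BeautifulSoup(html, 'html.parser')
for a in soup.find_all('a'):
    url = a.get('href')              # None for the empty <a> tag
    title = a.get_text(strip=True)   # '' when there is no text at all
    if not url or not title:
        continue
    print(url, title)
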
def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        try:
            title = h.contents[0]
        except TypeError:
            if str(h.contents[0].contents[0]).startswith('<'):
                title = h.contents[1].contents[0]
            else:
                title = h.contents[0].contents[0]
        url = h['href']
        if url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://www.theguardian.com', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def __isolate_content(self, links):
    self._content = []
    for h in links:
        desc = None
        if h.name == 'a':
            title = h.contents[0]
            url = h['href']
        elif h.name == 'div':
            title = h.contents[0].contents[0]
            url = h.contents[0]['href']
        b = h.parent.find_all(attrs={'class': 'blurb'})
        if len(b) == 1:
            desc = b[0].contents[0]
        if url.startswith('#'):
            continue
        elif url.startswith('/'):
            url = urljoin('https://www.washingtonpost.com', url)
        doc_id = loop.run_until_complete(
            indexer.index('stories', url=url, title=title,
                          description=desc, check_all=False))

def clean_url(url):
    # Normalize a URL for comparison: drop the scheme, any query string
    # or fragment, and a leading 'www.' host prefix.
    url = ':'.join(url.split(':')[1:])
    if '?' in url:
        url = '?'.join(url.split('?')[:1])
    if '#' in url:
        url = '#'.join(url.split('#')[:1])
    if url.startswith('//www.'):
        url = '//' + '.'.join(url.split('.')[1:])
    return url


loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
for s in reddit.subreddit('worldnews').hot(limit=30):
    if test_in_sites(s.url):
        doc_id = loop.run_until_complete(
            indexer.index('user-stories', url=s.url, refresh=True))
        if doc_id is None:
            continue
        # Fetch the freshly indexed story back so a related-stories query
        # can be built from its source document.
        query = {"query": {"match": {"_id": doc_id['id']}}}
        queries = es.search(index=doc_id['index'], body=query)
        if queries['hits']['total'] > 0:
            query = get_query(queries['hits']['hits'][0]['_source'])
            results = es.search(index="stories*", body=query)
            stories = []
            opinions = []
            urls = [clean_url(s.url)]
            for r in results['hits']['hits']:
                cleaned_url = clean_url(r['_source']['url'])
                if ('/opinion/' in r['_source']['url']
                        or '/opinions/' in r['_source']['url']
                        or '/blogs/' in r['_source']['url']