def __init__(self, url, include_text='', full_text=False, connections=None):
    self.url = url
    self.full_text = full_text
    self.include_text = include_text
    self.source = detect_news_source(url)
    self.connections = connections
    self.count = 0
def _data_produce(self, items):
    # Record the filter keyword (include_text) on every item.
    items = data_inserter(self.include_text, "keyword", items)
    if __debug__ and not items and self.source == 'any':
        print("please debug this source: {} | {}".format(
            self.url, detect_news_source(self.url)))
    # Derive a hash over title/published/source; it is used later for
    # duplicate detection when the items are archived.
    items = data_hasher("hash", ["title", "published", "source"], items)
    return items
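# Illustrative sketch (not part of the original module): data_hasher is assumed
# to write a digest of the named fields into each item under the given key, so
# that archive_feed_by_filter can deduplicate on the "hash" field. The helper
# name and the md5 choice below are assumptions about that contract only.
import hashlib

def data_hasher_sketch(key, fields, items):
    for item in items:
        digest_input = "|".join(str(item.get(field, "")) for field in fields)
        item[key] = hashlib.md5(digest_input.encode("utf-8")).hexdigest()
    return items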
async def archive_feed_by_filter(url, include_text, ap=None, osp=None, connections=None):
    from newsfeed.filter import NewsFeedFilter
    if not ap:
        from db.providers import ArchiveProvider
        ap = ArchiveProvider()
    if not osp:
        from db.providers import ObserverStatProvider
        osp = ObserverStatProvider()
    nff = NewsFeedFilter(url, include_text, full_text=True, connections=connections)
    items = await nff.as_output()
    count = nff.feedCount()
    total = len(items)
    # Drop items whose hash already exists in the archive (duplicates).
    items = await ap.as_find_distinct_items_by("hash", items)
    ids = list(await ap.as_save_all(items))
    acceptances = len(ids)
    rejects = total - acceptances
    await osp.as_save({
        'count': count,
        'total': total,
        'acceptances': acceptances,
        'rejects': rejects
    })
    return dict_cleaner(None, {
        'source': detect_news_source(url),
        'url': url,
        'include': include_text,
        'count': count,
        'total': total,
        'acceptances': acceptances,
        'rejects': rejects,
        'items': ids,
        'info': '(%d/%d)' % (acceptances, total),
        'infomation': '(%d/%d) %d successfully created, %d duplicates found.' %
                      (acceptances, total, acceptances, rejects)
    })
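# Illustrative usage sketch (not part of the original module): the feed URL and
# keyword are placeholders, and ap/osp fall back to the default ArchiveProvider
# and ObserverStatProvider when not supplied.
async def _example_archive_run():
    summary = await archive_feed_by_filter("https://example.com/feed.rss", "python")
    # summary['info'] has the form "(acceptances/total)", e.g. "(3/5)".
    return summary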
def __init__(self, url, html, source=None):
    self.data = {}
    self.soup = BeautifulSoup(html, "html.parser")
    self.url = normalize_link(url)
    self.html = html
    if not source:
        self.source = detect_news_source(self.url)
    else:
        self.source = source
    self.context = load_context(self.source)
    self.trimtext = load_trimtext(self.source)
    self.dummy = {'pass': False, 'link': '', 'source': 'any'}
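# Illustrative usage sketch (not part of the original module): mirrors how
# fetch_news_all drives the processor -- fetch a page, clean the markup, then
# let the processor detect the source and extract the structured fields. The
# URL is a placeholder, and the sketch relies on the module's existing
# requests and clean_html imports.
def _example_process_single(url="https://example.com/news/article-1"):
    resp = requests.get(url, timeout=60)
    resp.encoding = 'utf-8'
    html = clean_html(resp.text)
    news = NewsDataProcessor(resp.url, html)
    return news.output()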
def fetch_news_all(urls, encoding='utf-8', timeout=60, limit=5, remedy=0, source=None):
    from concurrent.futures import ThreadPoolExecutor
    from requests_futures.sessions import FuturesSession
    import threading

    sem = threading.Semaphore(limit)
    collect = []
    responses = []
    with FuturesSession(session=requests.Session(),
                        executor=ThreadPoolExecutor(max_workers=os.cpu_count())) as session:
        connection = 0
        failed_urls = []
        futures = ((url, session.get(url, timeout=timeout)) for url in urls)
        for url, future in futures:
            connection = connection + 1
            target_source = None
            if not source:
                # No default source was given: detect it from the URL and keep
                # it as the default for the remaining URLs.
                source = detect_news_source(url)
                target_source = source
            else:
                url_source = detect_news_source(url)
                if 'youtube' == url_source:
                    # YouTube links inherit the caller-supplied source.
                    target_source = source
                else:
                    target_source = url_source
            if __debug__:
                if 'any' == target_source:
                    print(f"[*skip*:{connection}] ({url})")
                else:
                    print(f"[{target_source}:{connection}] ({url})")
            if remedy:
                log.error(f"[{__name__}] Retry: {url}")
            try:
                with sem:
                    if target_source != 'any':
                        responses.append((target_source, future.result()))
                    else:
                        # Unrecognized sources are not fetched but still yield
                        # an (empty) entry in the result list.
                        responses.append((target_source, None))
            except requests.exceptions.RequestException as e:
                failed_urls.append(url)
                log.error(f"[{__name__}] Failure when trying to fetch {url}")
                log.info(e, exc_info=True)
                continue

    for (target_source, resp) in responses:
        if resp:
            resp.encoding = encoding
            html = clean_html(resp.text)
            news = NewsDataProcessor(resp.url, html, target_source)
            output = news.output()
        else:
            output = {}
        collect.append(output)

    if failed_urls and remedy < limit:
        # Retry only the failed URLs, counting this attempt against the limit.
        remedy = remedy + 1
        return collect + fetch_news_all(failed_urls, encoding, timeout, limit, remedy, source)
    else:
        return collect
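# Illustrative usage sketch (not part of the original module): the URLs are
# placeholders. Unrecognized sources come back as empty dicts, and failed
# requests are retried on the failed URLs only, up to `limit` attempts.
def _example_fetch_batch():
    urls = [
        "https://example.com/news/article-1",
        "https://example.com/news/article-2",
    ]
    return fetch_news_all(urls, encoding='utf-8', timeout=30)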
def test_detect_news_source(self):
    for source, urls in self.urls.items():
        for url in urls:
            self.assertEqual(detect_news_source(url), source)
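# Illustrative fixture sketch (not part of the original test case): the test
# above expects self.urls to map an expected source name to URLs that
# detect_news_source should resolve to that name. The URLs below are
# placeholders; 'youtube' and the 'any' fallback are the only source names
# visible in this module.
def setUp(self):
    self.urls = {
        'youtube': ['https://www.youtube.com/watch?v=XXXXXXXXXXX'],
        'any': ['https://example.com/some/unrecognized/page'],
    }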