import datetime
from typing import Iterable, Optional, Text

import googlesearch


def QuickSearchUrls(
    query: Text,
    from_date: datetime.date,
    to_date: Optional[datetime.date] = None,
    lang: Text = 'en',
    country: Text = 'usa',
    number_of_results: int = 100,
) -> Iterable[Text]:
  """Performs a Google News search using a query.

  Args:
    query: search query.
    from_date: search news from this date.
    to_date: search news until this date; defaults to today.
    lang: search news in this language.
    country: search news in this country.
    number_of_results: number of results to return.

  Returns:
    An iterable of search result URLs.
  """
  if not to_date:
    to_date = datetime.date.today()
  # Build the tbs parameter restricting results to the requested date range,
  # then run a News-type (tpe="nws") search.
  tbs = googlesearch.get_tbs(from_date, to_date)
  return googlesearch.search(
      query,
      lang=lang,
      tbs=tbs,
      country=country,
      tpe="nws",
      stop=number_of_results)
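# Example usage (a minimal sketch; assumes the `googlesearch` package that
# provides get_tbs/search is installed and Google does not rate-limit):
#
#   for url in QuickSearchUrls('renewable energy',
#                              from_date=datetime.date(2021, 1, 1),
#                              number_of_results=10):
#     print(url)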
import pandas as pd
from googlesearch import get_tbs, search


def scrape(self, job, number_of_urls=10):
    # `job` is expected to hold query, from_date, to_date (in that order).
    query, from_date, to_date = job.values()
    urls = []
    for d in pd.date_range(from_date, to_date):
        # Restrict the search to a single day so each URL can be dated.
        tbs = get_tbs(from_date=d, to_date=d)
        results = search(query, tbs=tbs, pause=2, stop=number_of_urls)
        for url in results:
            urls.append({"date": d.date(), "url": url})
    return pd.DataFrame(urls, columns=["date", "url"])
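# Example usage (a sketch; assumes `scrape` is a method of a scraper class
# and `job` is a dict whose insertion order is query, from_date, to_date,
# since `job.values()` is unpacked positionally):
#
#   job = {"query": "earthquake", "from_date": "2021-03-01",
#          "to_date": "2021-03-03"}
#   df = scraper.scrape(job, number_of_urls=5)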
import random
import uuid
from datetime import datetime, timedelta

import pandas as pd


def crawl_denied_news(start, end, context_word='', interval=10,
                      date_format='%Y%m%d', use_api=False, callback=None):
    # Input:  start, end - date strings formatted with `date_format`.
    # Output: DataFrame(d_id, d_title, d_link, d_snippet, d_date).
    start_date = datetime.strptime(start, date_format)
    end_date = datetime.strptime(end, date_format)
    rows = []
    while start_date <= end_date:
        end_interval = start_date + timedelta(days=interval)
        print('From {}, Until {}'.format(start_date.strftime('%Y-%m-%d'),
                                         end_interval.strftime('%Y-%m-%d')))
        if use_api:
            # Restrict API results to the current window via the
            # Custom Search date-range sort expression.
            sort = 'date:r:{}:{}'.format(start_date.strftime(date_format),
                                         end_interval.strftime(date_format))
            results = mine_links_api(CRAWLER_QUERY_WORDS.format(context_word),
                                     num=10, sort=sort, gl='ir',
                                     exactTerms=EXACT_TERM)
            if 'items' in results:
                for res in results['items']:
                    metatags = res['pagemap']['metatags'][0]
                    if 'date' in metatags:
                        # Normalize 'YYYY-MM-DD...' metadata to 'YYYYMMDD'.
                        d_date = ''.join(metatags['date'].split('-'))[:8]
                    else:
                        # No date metadata: fall back to the window midpoint.
                        d_date = (start_date + timedelta(
                            days=interval / 2)).strftime('%Y%m%d')
                    item = {
                        'd_id': uuid.uuid1().hex,
                        'd_title': res['title'] or '-',
                        'd_link': res['link'],
                        'd_snippet': res['snippet'] or '-',
                        'd_date': d_date,
                    }
                    if callback:
                        callback(item)
                    rows.append(item)
        else:
            results = mine_google_links(
                CRAWLER_QUERY_WORDS.format(context_word), num=30, stop=30,
                pause=random.randint(2, 8),
                tbs=google.get_tbs(start_date,
                                   end_interval + timedelta(days=interval)))
            for (title, link, snippet, date) in results:
                # Keep only results whose title contains a denial word.
                if any(dword in title for dword in DENIAL_WORDS):
                    # Collapse runs of whitespace.
                    title = ' '.join(title.split())
                    snippet = ' '.join(snippet.split())
                    if date != '':
                        date = ' '.join(date.split())
                        date = cleaner.date_persian2english(
                            date, delimiter=' ', persian_month=True)
                    item = {
                        'd_id': uuid.uuid1().hex,
                        'd_title': title or '-',
                        'd_link': link,
                        'd_snippet': snippet or '-',
                        'd_date': date,
                    }
                    if callback:
                        callback(item)
                    rows.append(item)
        # Advance to the next window; the half-day offset avoids
        # re-crawling the boundary day.
        start_date = end_interval + timedelta(hours=12)
    denied_news = pd.DataFrame(
        rows, columns=['d_id', 'd_title', 'd_link', 'd_snippet', 'd_date'])
    denied_news.to_excel("denied_news.xlsx", index=False)
    return denied_news
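# Example usage (a sketch; assumes the helpers referenced above --
# mine_links_api, mine_google_links, cleaner, CRAWLER_QUERY_WORDS,
# EXACT_TERM, DENIAL_WORDS -- are defined elsewhere in the project):
#
#   df = crawl_denied_news('20200101', '20200301', context_word='flood',
#                          interval=10, use_api=False)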
import datetime

from newsplease import NewsPlease


def download_articles(search_term, n_articles, start, end=None):
    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
    end_date = start_date if end is None else datetime.datetime.strptime(
        end, "%Y-%m-%d")
    tbs = get_tbs(start_date, end_date)
    urls = find_urls(search_term, tbs, 10)
    valid_articles = []
    while len(valid_articles) < n_articles and len(urls) > 0:
        # Fetch only as many URLs as articles are still missing.
        articles_left = n_articles - len(valid_articles)
        articles = NewsPlease.from_urls(urls[:articles_left])
        # Drop articles whose text could not be extracted.
        empty, articles = detect_empty_articles(articles)
        for new in articles:
            valid_articles.append(articles.get(new))
        urls = urls[articles_left:]
    return {
        "search_term": search_term,
        "start": start,
        "end": start if end is None else end,
        "articles": valid_articles,
    }
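# Example usage (a sketch; assumes `find_urls` and `detect_empty_articles`
# are defined elsewhere and the news-please package is installed):
#
#   result = download_articles("election", n_articles=3,
#                              start="2021-01-01", end="2021-01-07")
#   print(len(result["articles"]))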
import datetime

from googlesearch import get_tbs


def createTBS():
    startDate = datetime.datetime.now()
    endPeriod = inputDate("Insert end period (press enter for the current day):\n")
    # Store the computed date-range tbs parameter in the shared settings.
    tbs = get_tbs(startDate, endPeriod)
    settings.tbs = tbs
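# A minimal sketch of the `inputDate` helper assumed above (hypothetical;
# the project's real implementation may differ). It prompts for a date and
# falls back to the current day when the user just presses enter:
def inputDate(prompt):
    raw = input(prompt).strip()
    if not raw:
        # Empty input: default to now, matching the prompt's hint.
        return datetime.datetime.now()
    return datetime.datetime.strptime(raw, "%Y-%m-%d")  # assumed format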