def googleNewsCrawler(self):
    """Crawl Google News for the configured keywords, one search window per day.

    For each of ``self.__numDays`` consecutive days, counted from
    ``self.__dateTime``, a search is run over a window that opens on that day
    and closes ``self.__daysSpan`` days later. Result pages 1 through
    ``self.__pagsEveryDay`` are loaded for each window. The gathered results
    are passed to ``self.toJson`` after every tenth day, and whatever is left
    is passed once more when the loop finishes.
    """

    def as_mdy(moment):
        # month/day/year joined with slashes, no zero padding.
        return '/'.join(str(part) for part in (moment.month, moment.day, moment.year))

    client = GoogleNews()
    pending = []
    for offset in range(self.__numDays):
        window_start = self.__dateTime + timedelta(days=offset)
        window_end = self.__dateTime + timedelta(days=offset + self.__daysSpan)
        client.setTimeRange(start=as_mdy(window_start), end=as_mdy(window_end))

        # search() is followed by explicit requests for pages 2..N.
        client.search(self.__keyWords)
        for page in range(2, self.__pagsEveryDay + 1):
            client.getpage(page)

        message = (self.__keyWords + '__' + str(window_start.date()) + " append "
                   + str(int(self.__pagsEveryDay * 10)) + " items")
        logging.info(str(message))

        pending.extend(client.result())
        client.clear()

        # Hand off a batch every tenth day and start a fresh one.
        if (offset + 1) % 10 == 0:
            self.toJson(pending)
            pending = []

    self.toJson(pending)
def _resolve_date_range(startDate, endDate):
    """Turn optional ``YYYY-MM-DD`` bounds into ``MM/DD/YYYY`` strings.

    An empty ``endDate`` means today; an empty ``startDate`` means seven days
    before the resolved end date.
    """
    iso_format = '%Y-%m-%d'
    if endDate:
        end = datetime.datetime.strptime(endDate, iso_format).date()
    else:
        end = datetime.datetime.today().date()
    if startDate:
        start = datetime.datetime.strptime(startDate, iso_format).date()
    else:
        start = end - datetime.timedelta(days=7)
    # Month-first, four-digit-year strings, matching the literal ranges used
    # with setTimeRange elsewhere in this code base (e.g. '02/28/2020').
    return start.strftime('%m/%d/%Y'), end.strftime('%m/%d/%Y')


def extract_google(query_terms, startDate, endDate):
    """Collect Google News articles for each query term within a date range.

    Args:
        query_terms: Iterable of search terms; each is searched as
            ``"India Technology " + term``.
        startDate: Range start as ``YYYY-MM-DD``, or empty for seven days
            before the end date.
        endDate: Range end as ``YYYY-MM-DD``, or empty for today.

    Returns:
        A list of dicts with the keys ``source``, ``url``, ``date``,
        ``title``, ``content`` (the downloaded article text) and ``img``.
        Results whose article cannot be downloaded or parsed are skipped.

    Raises:
        ValueError: If a non-empty date is not in ``YYYY-MM-DD`` form.
    """
    startDate, endDate = _resolve_date_range(startDate, endDate)
    final_articles = []
    print(startDate)
    print(endDate)
    print("Crawling Starting")

    # The client is created on first use so an empty query list does no work.
    googlenews = None
    for query in query_terms:
        if googlenews is None:
            googlenews = GoogleNews()
            googlenews.setTimeRange(startDate, endDate)
        # Drop the previous term's results before searching the next one.
        googlenews.clear()
        googlenews.search("India Technology " + query)

        for item in googlenews.result():
            url = item['link']
            try:
                article = Article(url)
                article.download()
                article.parse()
            except Exception:
                # Deliberately best-effort: one unreachable page must not
                # abort the whole crawl.
                print("Trouble downloading so skipping")
                continue

            date = item['date']
            if 'ago' in date:
                # Relative stamps ("3 hours ago") are replaced by a date.
                # NOTE(review): `current` is not defined in this function; it
                # is presumably a module-level datetime -- confirm.
                date = current.date()

            # NOTE(review): a two-sentence summary used to be computed here
            # but was never stored; 'content' carries the full article text.
            final_articles.append({
                'source': item['media'],
                'url': url,
                'date': date,
                'title': item['title'],
                'content': article.text,
                'img': item['img'],
            })
    return final_articles
def initalize_google_news(start_date, end_date):
    """Build a GoogleNews client configured for the given date range.

    The client is created with UTF-8 encoding, then set to language ``"en"``,
    period ``"d"`` and encoding ``"utf-8"``, and finally given
    ``start_date``/``end_date`` as its time range.

    Args:
        start_date: Start of the range, passed unchanged to ``setTimeRange``.
        end_date: End of the range, passed unchanged to ``setTimeRange``.

    Returns:
        The configured GoogleNews object.
    """
    print("initalize_google_news...")
    client = GoogleNews(encode="utf-8")

    # Settings are applied in a fixed order: language, period, encoding, range.
    client.setlang("en")
    client.setperiod("d")
    client.setencode("utf-8")
    client.setTimeRange(start_date, end_date)  # caller-supplied range
    return client
class GoogleNewsMethods():
    """Small wrapper around one GoogleNews client for per-stock lookups."""

    def __init__(self):
        # One English-language client is shared by every lookup on this object.
        self.googlenews = GoogleNews(lang="en")

    def newscollection(self, stock, date):
        """Return the list of news results for ``stock`` on ``date``.

        Args:
            stock: Search term, passed to ``GoogleNews.search``.
            date: The day to search. Either a ``MM/DD/YYYY`` string or an
                object with ``strftime`` (``date``/``datetime``). A falsy
                value leaves the client's current time range untouched.

        Returns:
            The list returned by ``GoogleNews.result()``; it is also stored
            on ``self.newsList``.
        """
        # Results from an earlier lookup must not leak into this one.
        self.googlenews.clear()
        if date:
            if hasattr(date, 'strftime'):
                day = date.strftime('%m/%d/%Y')
            else:
                day = str(date)
            # The range has to be set before search() for it to apply to
            # this request; a single day is expressed as start == end.
            self.googlenews.setTimeRange(day, day)
        self.googlenews.search(stock)
        self.newsList = self.googlenews.result()
        return self.newsList
def news(topic: str, start_date: str = None, end_date: str = None) -> dict:
    """Fetch Google News results for a topic and format them as HTML text.

    ``topic`` may carry a follow-up command after a ``~``:

    * ``"<topic>~n <page>"`` lists the page after ``<page>`` (a missing or
      non-numeric page is treated as 1).
    * ``"<topic>~<number>"`` shows the details of the article with that
      1-based serial number, as printed in the listing.

    Args:
        topic: Search topic, ``--help``, or a topic with a follow-up command.
        start_date: Optional range start, passed to ``setTimeRange``.
        end_date: Optional range end; the range is applied only when both
            bounds are given.

    Returns:
        ``{'response': <html>}`` for help, article details and error
        messages; ``{'response': <html>, 'followup': True}`` for a listing of
        up to three articles.
    """
    from html import escape  # local import keeps the block self-contained

    help_text = "news: use this to fetch news<br><br>"\
        "Usage: news topic<br>"\
        "options:<br>"\
        "--help: get help (this screen)<br><br>"\
        "Followup: After fetching a set of news articles, enter<br>"\
        "n: fetch the next set of articles<br>"\
        "number: fetch the details of the article"

    # Help needs no network client; an empty request is answered with help
    # instead of failing on the first-word lookup.
    words = topic.split()
    if not words or words[0] == '--help':
        return {'response': help_text}

    page_num = 1
    detail = None
    if topic.count('~') > 0:
        topic, followup = topic.split('~')[:2]
        command = followup.split()
        if command and command[0] == 'n':
            has_page = len(command) > 1 and command[1].isdecimal()
            page_num = (int(command[1]) if has_page else 1) + 1
            print(f"Page number: {page_num}")
        elif command and command[0].isdecimal():
            detail = int(command[0])
        if not topic.strip():
            return {'response': help_text}

    googlenews = GoogleNews()
    if start_date is not None and end_date is not None:
        googlenews.setTimeRange(start_date, end_date)
    googlenews.search(topic)
    # NOTE(review): search() may already load the first page -- confirm that
    # this extra request does not duplicate results.
    googlenews.getpage(1)
    news_results = googlenews.result()

    if detail is not None:
        # Serial numbers shown to the user start at 1.
        if not 1 <= detail <= len(news_results):
            return {'response': f'No article numbered {detail} was found.'}
        news_details = news_results[detail - 1]
        print(news_details)
        # Scraped text is escaped before being placed into HTML.
        title = escape(str(news_details["title"]))
        desc = escape(str(news_details["desc"]))
        link = escape(str(news_details["link"]), quote=True)
        details = f'{title}<br>{desc}<br>'\
            f'<a href="{link}" target="_blank">Read full article</a>'
        return {'response': details}

    # Three articles per page; the slice is simply shorter near the end.
    first = (page_num - 1) * 3
    articles = [
        ", ".join((
            str(first + position + 1),
            escape(f"{article['date']}, {article['media']}"),
            escape(str(article['title'])),
        ))
        for position, article in enumerate(news_results[first:first + 3])
    ]
    return {'response': "<br>".join(articles), 'followup': True}
def extract_links(dir_c, dir_k, lang):
    """Write a dated list of Google News links for every topic.

    For each entry of the module-level ``topics``, the keywords and the date
    range are looked up, Google News is searched in ``lang`` within that
    range, and result pages are requested until roughly ten articles per day
    have been collected or a page adds nothing new. The links are written to
    ``<lang>/links/<topic>_links.txt`` as a date line, a link line and a
    dashed separator per article.

    Args:
        dir_c: Path prefix of the per-topic text files (``dir_c + topic +
            '.txt'``), whose lines are handed to ``get_date_range``.
        dir_k: Location passed to ``get_keywords``.
        lang: Language code for the search; also the output directory name.
    """
    for t in topics:
        print('Current topic: ', t + '\n')
        kw = get_keywords(dir_k, t)
        print('Keywords: ', kw + '\n')

        # The input is only needed for its lines, so close it straight away.
        with open(dir_c + t + '.txt', 'r') as f_clean:
            fp = f_clean.readlines()
        min_d, max_d, num_d = get_date_range(fp)
        print('Date range: ', min_d, max_d + '\n')

        googlenews = GoogleNews()
        googlenews.setlang(lang)
        googlenews.setTimeRange(min_d, max_d)
        googlenews.search(quote(kw.encode('utf8')))
        result = googlenews.result()

        # Counts are compared as integers because result() may hand back the
        # same growing list on every call.
        page = 1
        collected = len(result)
        while collected < 10 * num_d:
            page += 1
            googlenews.getpage(page)
            result = googlenews.result()
            if len(result) <= collected:
                break  # the last page added nothing new
            collected = len(result)

        with open(lang + '/links/' + t + '_links.txt', 'w') as f_out:
            for item in result[:collected]:
                parsed = dateparser.parse(item['date'])
                # dateparser returns None for text it cannot interpret; keep
                # the raw text rather than losing the link.
                date = str(parsed.date()) if parsed is not None else str(item['date'])
                f_out.write(date + '\n' + item['link'])
                f_out.write('\n--------------------------------\n')
        print('--------------------------------\n')
if 0 < choice < 5: break else: print("That is not between 1 and 4! Try again:") print ("You entered: {} ") # Good to use format instead of string formatting with % mydict = {1:go_to_stackoverflow, 2:import_from_phone, 3:import_from_camcorder, 4:import_from_camcorder} mydict[choice]() print(askUser()) s_req = input("Enter the term you would like to search") st_date = input("Please enter your desired start date (MM-DD-YYY): ") en_date = input("Please enter your desired end date (MM-DD-YYY): ") googlenews = GoogleNews() googlenews.setlang('en') googlenews.setTimeRange(st_date,en_date) googlenews.search(s_req) googlenews.result() #create a least squares regression model using the variablles all_adj_close= all_data[['Adj Close']] all_returns = np.log(all_adj_close / all_adj_close.shift(1)) #isolate the returns you want to value for the OLS print("As a reminder, you have selected the following: " + input_string) sample_stocks = input("Please choose 2 of the stocks you have chosen to calculate a OLS regression: ") reg_choices = sample_stocks.split(",")
return dataset if __name__ == '__main__': import time import requests from bs4 import BeautifulSoup from GoogleNews import GoogleNews googlenews = GoogleNews() googlenews = GoogleNews(lang='en') googlenews = GoogleNews(period='d') googlenews = GoogleNews(start='02/01/2020',end='02/28/2020') googlenews.setlang('en') googlenews.setperiod('d') googlenews.setTimeRange('02/01/2020','02/28/2020') googlenews.search('APPL') googlenews.getpage(2) x = googlenews.result() for item in x: web_link = item['link'] start = time.time() page_source = requests.get(web_link) soup = BeautifulSoup(page_source.text, "lxml") print('s: ', time.time()-start) try: text = soup.find('article').text #print(text) except: continue
# Sidebar search form: a free-text query plus either a relative period or an
# explicit date range, followed by the Google News fetch and the Twitter
# query set-up.
searchInput = st.sidebar.text_input('search query')
val = len(searchInput)
if val > 0:
    agree = st.sidebar.checkbox('frequency')
    if agree:
        # The label now describes the choice actually being made.
        option = st.sidebar.selectbox('Search period',
                                      ('1h', '1d', '7d', '1y'))
        googlenews.setperiod(option)
    else:
        st.sidebar.markdown('Select the time range for the search')
        dt1 = st.sidebar.date_input('from date', datetime.date.today())
        dt2 = st.sidebar.date_input('till date', datetime.date.today())
        if dt1 > dt2:
            st.sidebar.error('SELECT A VALID "FROM" DATE')
        else:
            # The date widgets yield date objects; send the range as
            # MM/DD/YYYY text instead of the objects themselves.
            googlenews.setTimeRange(dt1.strftime('%m/%d/%Y'),
                                    dt2.strftime('%m/%d/%Y'))
    with st.spinner('Getting data...'):
        googlenews.search(searchInput)
        news_content = []
        # --- Google News ---
        for page in range(1, 1 + 1):
            googlenews.getpage(page)
        for item in googlenews.result():
            news_content.append(item['desc'])
        # Reset the client's stored results once they have been copied.
        googlenews.clear()
        # --- Twitter ---
        q = '%40' + '#' + searchInput + ' -filter:retweets -filter:replies'
        # count: number of tweets requested per call; the remaining
        # parameters follow the Twitter API.
        # NOTE(review): confirm the API accepts a count this large.
        params = {'q': q, 'count': 1000, 'lang': 'en', 'result_type': 'recent'}
# Build the list of days to search, formatted as MM/DD/YYYY.
for single_date in daterange(start_date, end_date):
    all_date.append(single_date.strftime("%m/%d/%Y"))

googlenews = GoogleNews()
googlenews.setlang('en')
webscraper = WebScraper()

# Every row of this run goes into one CSV named after the start time.
now = datetime.now()
save_file_name = now.strftime("googlenews_results_%H+%M_%m-%d-%Y")
save_file_name = save_file_name + '.csv'

count = 0
for cur_date in all_date:
    print("The current date searching ",cur_date)
    # A single day is searched by using it as both ends of the range.
    googlenews.setTimeRange(cur_date, cur_date)
    googlenews.search('tesla')
    webscraper.save_csv(save_file_name, googlenews)

    page_counter = 2
    while True:
        # Stop when a page adds nothing. Testing only for an empty result()
        # may never succeed if the client keeps earlier pages in its list.
        seen = len(googlenews.result())
        googlenews.getpage(page_counter)
        if len(googlenews.result()) <= seen:
            print("last page is ", str(page_counter - 1))
            break
        webscraper.save_csv(save_file_name, googlenews)
        page_counter += 1

    # NOTE(review): count is never changed in the visible code, so this
    # early exit cannot trigger -- confirm whether it is a leftover limiter.
    if count == 1:
        break