def indiaTodayNews(url, depth, CITY_LIST):
    """Recursively crawl an India Today URL, extracting city-tagged articles.

    Recursion stops when `depth` is exhausted or the global crawl budget
    (`TOTAL_TIME` seconds since `initial_time`) is spent.  Already-seen URLs
    (per `db.IsUrlExists`) are skipped.

    Args:
        url: page to fetch and scan.
        depth: remaining recursion depth; no work is done at depth <= 0.
        CITY_LIST: city name substrings used both to tag articles and to
            decide which outgoing links are worth following.
    """
    final_time = time.time()
    # Respect both the depth limit and the wall-clock crawl budget.
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        # Skip URLs the database has already recorded.
        if not db.IsUrlExists(url):
            pgsrc, index = wp.read_webpage(url)
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # og:type meta distinguishes article pages from index pages.
                url_type = url_soup.find('meta', {'property': 'og:type'})
                if url_type and url_type.get('content').lower() in ['story', 'article']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        # Tag the article with the first city that appears
                        # in its URL, if any.
                        location = None
                        for city in CITY_LIST:
                            if city in article.url:
                                location = city
                                break
                        if location:
                            extractToiNews(article.title, article.text,
                                           article.publish_date, article.url,
                                           location)
                    except Exception as e:
                        print(str(e))
                # Follow every anchor on the page whose URL mentions a city.
                # NOTE: the original also had an `elif 'indiatoday.in' in link`
                # branch that re-tested the same city condition after it had
                # already failed — it could never recurse, so it is removed.
                returned_url_list = url_soup.find_all('a')
                for return_url in returned_url_list:
                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links against the site base URL.
                        if 'http' not in link:
                            link = return_link(URL, link)
                        if any(item in link for item in CITY_LIST):
                            indiaTodayNews(link, depth - 1, CITY_LIST)
def theHinduNews(url, depth, CITY_LIST):
    """Recursively crawl a The Hindu URL, extracting city-tagged articles.

    Recursion stops when `depth` is exhausted or the global crawl budget
    (`TOTAL_TIME` seconds since `initial_time`) is spent.  Already-seen URLs
    (per `db.IsUrlExists`) are skipped.

    Args:
        url: page to fetch and scan.
        depth: remaining recursion depth; no work is done at depth <= 0.
        CITY_LIST: city name substrings used both to match the article's
            section and to decide which outgoing links to follow.
    """
    final_time = time.time()
    # Respect both the depth limit and the wall-clock crawl budget.
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        if not db.IsUrlExists(url):
            pgsrc, index = wp.read_webpage(url)
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # og:type meta distinguishes article pages from index pages.
                url_type = url_soup.find('meta', {'property': 'og:type'})
                if url_type and url_type.get('content').lower() in ['article', 'story']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        # Date and section come from site-specific meta tags.
                        publish_date = url_soup.find('meta', {'name': 'publish-date'})
                        if publish_date:
                            publish_date = publish_date.get('content')
                        location = url_soup.find('meta', {'property': 'article:section'})
                        # FIX: only match/extract when the section meta exists;
                        # the original called location.lower() on None when the
                        # tag was missing, raising an AttributeError that the
                        # broad except silently swallowed.
                        if location:
                            location = location.get('content')
                            if any(item.lower() in location.lower()
                                   for item in CITY_LIST):
                                extractToiNews(article.title, article.text,
                                               publish_date, article.url,
                                               location)
                    except Exception as e:
                        print(str(e))
                # Follow every anchor whose URL mentions a city.
                # NOTE: the original's `elif 'thehindu.com' in link` branch
                # re-tested the same already-failed city condition and could
                # never recurse — removed as dead code.
                for return_url in url_soup.find_all('a'):
                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links against the site base URL.
                        if 'http' not in link:
                            link = return_link(URL, link)
                        if any(item in link for item in CITY_LIST):
                            theHinduNews(link, depth - 1, CITY_LIST)
def HindustanTimesNewsSiteCrawler(url, depth, CITY_LIST):
    """Entry point for the Hindustan Times crawl.

    Reads the landing page, then hands every anchor on it to
    `hindustanTimesNews` for recursive processing.

    Args:
        url: landing page to open (typically the site home page).
        depth: recursion depth passed through to `hindustanTimesNews`.
        CITY_LIST: city name substrings passed through unchanged.
    """
    print("Opening the Website: ", url)
    pgsrc, index = wp.read_webpage(url)
    if pgsrc:
        soup = wp.html_parser(pgsrc)
        for tag in soup.find_all('a'):
            link = tag.get('href')
            # FIX: anchors without an href yield None; the original then
            # evaluated `'http' not in None` and crashed with a TypeError
            # (the sibling TheHinduNewsSiteCrawler already guards this).
            if not link:
                continue
            # Resolve relative links against the landing-page URL.
            if 'http' not in link:
                link = return_link(url, link)
            hindustanTimesNews(link, depth, CITY_LIST)
    else:
        print("can't read")
def TheHinduNewsSiteCrawler(url, depth, CITY_LIST):
    """Entry point for the The Hindu crawl.

    Reads the landing page, then hands every anchor on it to
    `theHinduNews` for recursive processing.

    Args:
        url: landing page to open (typically the site home page).
        depth: recursion depth passed through to `theHinduNews`.
        CITY_LIST: city name substrings passed through unchanged.
    """
    print("Opening the Website: ", url)
    pgsrc, index = wp.read_webpage(url)
    if pgsrc:
        soup = wp.html_parser(pgsrc)
        for tag in soup.find_all('a'):
            if not tag:
                continue
            link = tag.get('href')
            # FIX: the original still called theHinduNews(None, ...) when an
            # anchor had no href, pushing None into the recursive crawler;
            # skip href-less anchors instead.
            if not link:
                continue
            # Resolve relative links against the landing-page URL.
            if 'http' not in link:
                link = return_link(url, link)
            theHinduNews(link, depth, CITY_LIST)
def hindustanTimesNews(url, depth, CITY_LIST):
    """Recursively crawl a Hindustan Times URL, extracting city-tagged articles.

    Recursion stops when `depth` is exhausted or the global crawl budget
    (`TOTAL_TIME` seconds since `initial_time`) is spent.  Already-seen URLs
    (per `db.IsUrlExists`) are skipped.

    Args:
        url: page to fetch and scan.
        depth: remaining recursion depth; no work is done at depth <= 0.
        CITY_LIST: city name substrings used both to match the article's
            section and to decide which outgoing links to follow.
    """
    final_time = time.time()
    # Respect both the depth limit and the wall-clock crawl budget.
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        if not db.IsUrlExists(url):
            pgsrc, index = wp.read_webpage(url)
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # og:type meta distinguishes article pages from index pages.
                tag = url_soup.find('meta', {'property': 'og:type'})
                if tag and tag.get('content') == 'article':
                    print("Article")
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        location = url_soup.find('meta', {'name': 'section'})
                        # FIX: only extract when the section meta exists; the
                        # original called .get('content') on None when missing,
                        # raising an AttributeError the except silently ate.
                        if location:
                            # First token, stripped of any hyphen suffix,
                            # e.g. "mumbai-news ..." -> "mumbai".
                            location = location.get('content').split()[0].split('-')[0]
                            if any(item.lower() in location.lower()
                                   for item in CITY_LIST):
                                extract_details(article.title, article.text,
                                                article.publish_date,
                                                article.url, location)
                    except Exception as e:
                        print("Exception occurred: ", e)
                else:
                    print("Non Article")
                # Follow every anchor whose URL mentions a city.
                # NOTE: the original's `elif 'hindustantimes.com' in link`
                # branch re-tested the same already-failed city condition and
                # could never recurse — removed as dead code.
                for return_url in url_soup.find_all('a'):
                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links against the site base URL.
                        if 'http' not in link:
                            link = return_link(URL, link)
                        if any(item in link for item in CITY_LIST):
                            hindustanTimesNews(link, depth - 1, CITY_LIST)
def news18(url, depth, CITY_LIST):
    """Recursively crawl a News18 URL, extracting city-tagged articles.

    Recursion stops when `depth` is exhausted or the global crawl budget
    (`TOTAL_TIME` seconds since `initial_time`) is spent.  Already-seen URLs
    (per `db.IsUrlExists`) are skipped.

    Args:
        url: page to fetch and scan.
        depth: remaining recursion depth; no work is done at depth <= 0.
        CITY_LIST: city name substrings used both to tag articles and to
            decide which outgoing links to follow.
    """
    import json  # local import: used to parse ld+json safely (see FIX below)
    final_time = time.time()
    # Respect both the depth limit and the wall-clock crawl budget.
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        if not db.IsUrlExists(url):
            pgsrc, index = wp.read_webpage(url)
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # og:type meta distinguishes article pages from index pages.
                url_type = url_soup.find('meta', {'property': 'og:type'})
                if url_type and url_type.get('content').lower() in ['article', 'story']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        # Pull the publish date from the page's ld+json blocks.
                        # FIX 1: the original ran eval() on untrusted page
                        # content — arbitrary code execution; use json.loads.
                        # FIX 2: articleDate was unbound (NameError) when no
                        # block carried 'datePublished'; default it to None.
                        articleDate = None
                        for date_tag in url_soup.find_all(
                                'script', {'type': 'application/ld+json'}):
                            try:
                                contentDict = json.loads(date_tag.get_text())
                            except ValueError:
                                continue  # malformed/non-JSON block: skip it
                            if (isinstance(contentDict, dict)
                                    and 'datePublished' in contentDict):
                                articleDate = contentDict['datePublished']
                                break
                        # Tag the article with the first city in its URL.
                        location = None
                        for city in CITY_LIST:
                            if city in article.url:
                                location = city
                                break
                        if location:
                            extractToiNews(article.title, article.text,
                                           articleDate, article.url, location)
                    except Exception as e:
                        print(str(e))
                # Follow every anchor whose URL mentions a city.
                # NOTE: the original's `elif 'news18.com' in link` branch
                # re-tested the same already-failed city condition and could
                # never recurse — removed as dead code.
                for return_url in url_soup.find_all('a'):
                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links against the site base URL.
                        if 'http' not in link:
                            link = return_link(URL, link)
                        if any(item in link for item in CITY_LIST):
                            news18(link, depth - 1, CITY_LIST)