import json
from datetime import datetime, timedelta

from theguardian import theguardian_content


def obtain_articles(dir, kws, ak, dates):
    # widen the requested window by 10% of its duration on each side
    start_date = dates[0]
    end_date = dates[1]
    duration = end_date - start_date
    start_date = start_date - timedelta(days=int(duration.days * 0.1))
    end_date = end_date + timedelta(days=int(duration.days * 0.1))

    # build an "AND" query from the keywords
    query = kws[0]
    for kw in kws[1:]:
        query += ' AND ' + kw
    print(query)

    content = theguardian_content.Content(api=ak, q=query,
                                          from_date=str(start_date.date()),
                                          to_date=str(end_date.date()),
                                          page=1)
    headers = content.response_headers()
    pages = headers['pages']
    if pages <= 1:
        print(pages)
        return

    out_file = open(dir + '/articles.jsonl', 'w', encoding='UTF-8')
    index = 1
    for i in range(1, pages + 1):
        content = theguardian_content.Content(api=ak, q=query,
                                              from_date=str(start_date.date()),
                                              to_date=str(end_date.date()),
                                              page=i)
        json_content = content.get_content_response()
        try:
            all_results = content.get_results(json_content)
        except Exception:
            continue
        for r in all_results:
            if r['type'] != 'article':
                continue
            text = get_content(r['fields'])  # external helper that extracts the body text
            if len(text) < 10:
                continue
            publish_date = datetime.strptime(r['webPublicationDate'],
                                             '%Y-%m-%dT%H:%M:%SZ')
            art = {'id': r['id'], 'time': str(publish_date),
                   'text': text, 'title': r['webTitle']}
            json.dump(art, out_file)
            out_file.write('\n')
            index += 1
            if index > 1000:  # cap the output at 1000 articles
                break
        if index > 1000:
            break
    print(index)
    out_file.close()
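A minimal usage sketch for obtain_articles, with hypothetical values: the output directory, keyword list, and date range below are placeholders, and 'my-api-key' stands in for a real Guardian Open Platform key.

# hypothetical call; directory, keywords, key, and dates are all placeholders
from datetime import datetime

start = datetime(2020, 1, 1)
end = datetime(2020, 3, 1)
obtain_articles('out/brexit', ['brexit', 'trade'], 'my-api-key', (start, end))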
def get_current_news(self):
    headers = {"q": self.active_category}  # q = query/search parameter
    section = theguardian_section.Section(api='test', **headers)
    section_content = section.get_content_response()
    results = section.get_results(section_content)
    editions = results[0]['editions']
    articles = [edi["apiUrl"] for edi in editions
                if self.active_category in edi['id']
                and self.active_edition == edi['code']]
    if not articles:
        # there was only a default edition or the categories weren't found
        articles = editions[0]["apiUrl"]
    else:
        articles = articles[0]
    content = theguardian_content.Content(api='test', url=articles)
    content_response = content.get_content_response()
    results = content_response['response']['results']
    newest_result = results[0]
    title = get_article_title(newest_result['apiUrl'])
    return title, newest_result['webUrl']
def transformData(self):
    # STEP 1: get the total number of pages
    auxContent = self.content.response_headers()
    totalPages = auxContent['pages']
    headers = self.mainHeader
    anotherContent = theguardian_content.Content(api='test', **headers)

    # STEP 2: sweep through the results page by page
    for page in range(1, totalPages + 1):
        res = anotherContent.get_content_response(headers={'page': page})
        self.result = anotherContent.get_results(res)
        pageData = []
        for item in self.result:
            # keep only the date part of the ISO timestamp (before the 'T')
            refDate = item['webPublicationDate'].split('T')[0]
            element = [refDate, item['sectionName'],
                       item['webTitle'], item['webUrl']]
            pageData.append(element)
        self.newsData.append(pageData)
def test_contents_find_by_id_correct_url(self):
    api_key = "test"
    ids = "technology/2014/feb/17/flappy-bird-clones-apple-google"
    res = theguardian_content.Content(api_key).find_by_id(ids)
    self.assertIs(type(res), dict)
    self.assertEqual(res['response']['results'][0]["id"], ids)
def getGuardianArticles(self, topic):
    header = {"q": topic, "type": "article"}
    content_articles = theguardian_content.Content(GUAR_KEY, **header)
    content_articles_data = content_articles.get_content_response()
    results = content_articles.get_results(content_articles_data)
    webUrls = [result["webUrl"] for result in results]
    return webUrls
def test_contents_find_by_id_incorrect_url(self):
    api_key = "test"
    ids = "technology/2014/feb/17/flappy-bird-clones-apple"
    res = theguardian_content.Content(api_key).find_by_id(ids)
    self.assertEqual(res['response']['results'], [])
    self.assertEqual(res['response']['pages'], 0)
    self.assertEqual(res['response']['total'], 0)
def getData(self, fromDate, section):
    # create content with filters
    # for more filters refer
    # http://open-platform.theguardian.com/documentation/search
    self.mainHeader = {
        "section": section,
        "from-date": fromDate,
        "order-by": "relevance",
        "page-size": 200,
        "show-fields": "sectionName,webTitle,webUrl,short-url",
    }
    headers = self.mainHeader
    self.content = theguardian_content.Content(api='test', **headers)
def find_article_keyword(self, keyword):
    headers = {"q": keyword}
    content = theguardian_content.Content(
        api='68427e29-0cd0-4ba4-a7c9-92350224c80f', **headers)
    json_content = content.get_content_response()
    results = content.get_results(json_content)
    return (self.news_task.get_article_title(results[0]['apiUrl']),
            results[0]['webUrl'])
def test_content_results(self):
    api_key = "test"
    ids = "technology/2014/feb/17/flappy-bird-clones-apple-google"
    content = theguardian_content.Content(api_key)
    res = content.find_by_id(ids)
    result = content.get_results(res)
    self.assertIs(type(result), list)
    self.assertEqual(result[0]['id'], ids)
def test_content_response_header(self):
    api_key = "test"
    res = theguardian_content.Content(api_key).response_headers()
    self.assertIs(type(res), dict)
    self.assertIn("pageSize", res.keys())
    self.assertIn("currentPage", res.keys())
    self.assertIn("pages", res.keys())
    self.assertNotIn("results", res.keys())
def getContent(p):
    headers = {"show-fields": "body", "page-size": "200", "page": str(p)}

    # create content
    content = theguardian_content.Content(
        api='2e433c1b-7fd1-46fc-be75-3f00732ca28c', **headers)

    # get all results of a page
    json_content = content.get_content_response()
    all_results = content.get_results(json_content)
    for result in all_results:
        print(result["fields"]["body"])
def test_section_get_references_correct_pages(self):
    api_key = "test"
    content = theguardian_content.Content(api_key, **{
        "q": "apple",
        "section": "technology",
    })
    refs = content.get_references_in_page(page_number=1)
    refs2 = content.get_references_in_page()
    self.assertIs(type(refs), list)
    self.assertIs(type(refs2), list)
def test_content_with_headers(self):
    headers = {
        "q": "12 years a slave",
        "tag": "film/film,tone/reviews",
        "from-date": "2010-01-01",
        "order-by": "relevance",
        "show-fields": "starRating,headline,thumbnail,short-url",
    }
    content = theguardian_content.Content(api='test', **headers)
    res = content.get_content_response()
    result = content.get_results(res)
    self.assertIs(type(result), list)
def test_section_get_references_incorrect_pages(self):
    api_key = "test"
    content = theguardian_content.Content(api_key, **{
        "q": "apple",
        "section": "technology",
    })
    head = content.response_headers()
    self.assertRaises(ValueError, content.get_references_in_page,
                      page_number=head["pages"] + 1)
def _get_guardian_article_urls(self):
    response_articles = []
    headers = self.paper_dic['guardian']['url_headers']

    # create content
    content = theguardian_content.Content(
        api=self.paper_dic['guardian']['api_key'], **headers)
    response_headers = content.response_headers()
    print("request response: {}".format(response_headers))
    no_pages = response_headers['pages']
    self.no_results = response_headers['total']
    print("total results: {}".format(self.no_results))

    # get all results from all pages
    for page in range(1, no_pages + 1):
        json_content = content.get_content_response(headers={"page": page})
        all_results = content.get_results(json_content)
        for res in all_results:
            response_articles.append(res['webUrl'])
    return response_articles
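For reference, _get_guardian_article_urls reads its configuration from self.paper_dic; the sketch below shows an assumed shape for that dict, and the query values inside url_headers are purely illustrative.

# assumed shape of self.paper_dic; only 'api_key' and 'url_headers' are
# read by _get_guardian_article_urls, and the query values are examples
paper_dic = {
    'guardian': {
        'api_key': 'test',  # replace with a real Guardian Open Platform key
        'url_headers': {
            'q': 'climate change',
            'from-date': '2020-01-01',
            'page-size': 50,
        },
    },
}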
from theguardian import theguardian_content

# create content
content = theguardian_content.Content(api='test')

# get the raw response
raw_content = content.get_request_response()
print("Request Response status code {status}.".format(
    status=raw_content.status_code))
print("Request Response headers {header}.".format(header=raw_content.headers))

# content response headers
print("Content Response headers {}.".format(content.response_headers()))

# get all results of a page
json_content = content.get_content_response()
all_results = content.get_results(json_content)
print("All results {}.".format(all_results))

# actual response
print("Response {response}".format(response=json_content))
from theguardian import theguardian_content

# create content
headers = {
    "page-size": 5,
    "order-by": "newest",
}
content = theguardian_content.Content(api='test', **headers)

# looping through pages
response_headers = content.response_headers()
total_pages = response_headers["pages"]

# print apiUrls for all the results in the first 5 pages
# (or in every page, if fewer than 5 exist)
required_pages = min(total_pages, 5)
required_urls = []
for page in range(1, required_pages + 1):
    res = content.get_content_response(headers={"page": page})
    page_results = res['response']['results']
    for result in page_results:
        required_urls.append(result["apiUrl"])
print(required_urls)
def test_content_response_failure_incorrect_api_key(self):
    api_key = "tests"
    res = theguardian_content.Content(api_key).get_request_response()
    self.assertEqual(res.status_code, 403)
""" This example deals with returning content of specific tags. """ from theguardian import theguardian_tag from theguardian import theguardian_content # get the apple tags headers = { "q": "apple", "section": "technology", "show-references": "all", } tag = theguardian_tag.Tag(api='test', **headers) # get the results tag_content = tag.get_content_response() results = tag.get_results(tag_content) # get results for specific tag first_tag_apiUrl = results[0]["apiUrl"] # use this api url to content content = theguardian_content.Content(api='test', url=first_tag_apiUrl) # get content response content_response = content.get_content_response() print(content_response)
""" Print the web title of every tag a content item has for a single item. """ from theguardian import theguardian_content headers = { "ids": "environment/2014/sep/14/invest-in-monitoring-and-tagging-sharks-to-prevent-attacks", "show-tags": "all", } content = theguardian_content.Content(api="test", **headers) content_response = content.get_content_response() results = content.get_results(content_response) tags = results[0]["tags"] webTitles = [tag["webTitle"] for tag in tags] print("Title of tags {titles}" .format(titles=webTitles)) """ Print the web title of each content item in the editor's picks for the film tag. """ tag_headers = { "tag": "film/film" } tag_content = theguardian_content.Content(api="test", **tag_headers) tag_content_response = tag_content.get_content_response()
""" Query for a single content item and print its web title """ from theguardian import theguardian_content ids = "commentisfree/2013/jan/16/vegans-stomach-unpalatable-truth-quinoa" content = theguardian_content.Content(api="test") single_id_content = content.find_by_id(ids) results = content.get_results(single_id_content) print("web url for {id}: {url}\n".format(id=results[0]["id"], url=results[0]["webUrl"])) """ Print web title for a tag """ header = { "tag": "music/metal", } tag_content = theguardian_content.Content(api="test") tag_content_response = content.get_content_response(header) results = content.get_results(tag_content_response) print("web title for {id}: {url}\n".format(id=results[0]["id"], url=results[0]["webTitle"])) """ print web title for a section """
def test_content_get_result_with_exception(self):
    api_key = "test"
    section = theguardian_content.Content(api_key)
    self.assertRaises(TypeError, section.get_results, "some random text")
def get_article_title(apiUrl):
    content = theguardian_content.Content(api='test',
                                          url=apiUrl).get_content_response()
    response = content['response']
    return response['content']['webTitle']
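A quick usage sketch for get_article_title; the apiUrl below is assumed to be the single-item endpoint for the id used in the tests above, and the hard-coded 'test' key inside the function is assumed to be valid for it.

# hypothetical call; the apiUrl points at a single content item
url = "https://content.guardianapis.com/technology/2014/feb/17/flappy-bird-clones-apple-google"
print(get_article_title(url))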
""" This example deals with returning content of section. """ from theguardian import theguardian_section from theguardian import theguardian_content # get the sports sections headers = {"q": "sports"} # q=query parameter/search parameter section = theguardian_section.Section(api='test', **headers) # get the results section_content = section.get_content_response() results = section.get_results(section_content) # get different editions from the results editions = results[0]['editions'] # get uk/sports edition apiUrl uk_sports = [edi["apiUrl"] for edi in editions if edi["id"] == "uk/sport"][0] # use this api url to sports content content = theguardian_content.Content(api='test', url=uk_sports) # get section response content_response = content.get_content_response() print(content_response)
def transformDataByDate(self, fromDate, section):
    toDate = fromDate
    hoy = datetime.now().date()  # today's date
    endDate = datetime.strptime(fromDate, "%Y-%m-%d").date()

    # get the information day by day
    while endDate <= hoy:
        try:
            # create content with filters
            # for more filters refer
            # http://open-platform.theguardian.com/documentation/search
            self.mainHeader = {
                "section": section,
                "from-date": toDate,
                "to-date": toDate,
                "order-by": "relevance",
                "page-size": 20,
                "show-fields": "sectionName,webTitle,webUrl,short-url",
            }
            headers = self.mainHeader
            self.content = theguardian_content.Content(api='test', **headers)
            res = self.content.get_content_response()
            self.result = self.content.get_results(res)
            try:
                # convert each result into a row of fields
                for i in range(0, len(self.result)):
                    # keep only the date part of the ISO timestamp
                    newString = self.result[i]['webPublicationDate'].split('T')[0]
                    element = [newString,
                               self.result[i]['sectionName'],
                               self.result[i]['webTitle'],
                               self.result[i]['webUrl']]
                    self.newsData.append(element)
            except Exception:
                print("Problem loading the data.")
                print("Item: ", i)
                print("Last element in the matrix:")
                print(self.newsData[i - 1])
        except Exception:
            print("Problem receiving the data.")

        # move on to the next day
        endDate = endDate + timedelta(days=1)
        toDate = endDate.strftime("%Y-%m-%d")
def test_content_response_success_correct_details(self):
    api_key = "test"
    res = theguardian_content.Content(api_key).get_request_response()
    self.assertEqual(res.status_code, 200)