示例#1
0
def obtain_articles(dir, kws, ak, dates):
    """Download Guardian articles matching all keywords and write them as
    JSON lines to ``<dir>/articles.jsonl`` (capped at 1000 articles).

    Args:
        dir: output directory (must already exist).
        kws: list of keywords, combined with ' AND ' into one query.
        ak: Guardian API key.
        dates: (start, end) datetime pair; the window is widened by 10%
            of its duration on each side.

    Fixes vs. original: bare ``except``, file handle leaked on error
    (now a ``with`` block), dead ``strftime`` whose result was discarded.
    """
    start_date, end_date = dates
    # Widen the window by 10% of the requested duration on both sides.
    duration = end_date - start_date
    pad = timedelta(days=int(duration.days * 0.1))
    start_date = start_date - pad
    end_date = end_date + pad

    query = ' AND '.join(kws)
    print(query)

    content = theguardian_content.Content(api=ak, q=query, from_date=str(start_date.date()),
                                          to_date=str(end_date.date()), page=1)

    headers = content.response_headers()
    pages = headers['pages']
    if pages <= 1:
        # Too few results to be worth collecting; report and bail out.
        print(pages)
        return

    index = 1
    # Context manager guarantees the file is closed even if a page fails.
    with open(dir + '/articles.jsonl', 'w', encoding='UTF-8') as out_file:
        for i in range(1, pages + 1):
            content = theguardian_content.Content(api=ak, q=query, from_date=str(start_date.date()),
                                                  to_date=str(end_date.date()), page=i)
            json_content = content.get_content_response()
            try:
                all_results = content.get_results(json_content)
            except Exception:
                # Best-effort: skip pages whose payload cannot be parsed
                # (was a bare except in the original).
                continue
            for r in all_results:
                if r['type'] != 'article':
                    continue
                text = get_content(r['fields'])
                if len(text) < 10:  # skip near-empty bodies
                    continue
                publish_date = datetime.strptime(r['webPublicationDate'], '%Y-%m-%dT%H:%M:%SZ')

                art = {'id': r['id'],
                       'time': str(publish_date),
                       'text': text,
                       'title': r['webTitle']}
                json.dump(art, out_file)
                out_file.write('\n')
                index += 1
                if index > 1000:  # hard cap on collected articles
                    break
            if index > 1000:
                break

    print(index)
示例#2
0
    def get_current_news(self):
        """Fetch the newest article for the active category and edition.

        Returns:
            (title, web_url) tuple for the most recent matching result.
        """
        # q=query parameter/search parameter
        query = {"q": self.active_category}
        section = theguardian_section.Section(api='test', **query)

        section_response = section.get_content_response()
        section_results = section.get_results(section_response)

        editions = section_results[0]['editions']

        # Keep editions matching both the active category and edition code.
        matching_urls = []
        for edition in editions:
            if (self.active_category in edition['id']
                    and self.active_edition == edition['code']):
                matching_urls.append(edition["apiUrl"])

        if matching_urls:
            edition_url = matching_urls[0]
        else:
            # There was only a default edition or categories weren't found.
            edition_url = editions[0]["apiUrl"]

        content = theguardian_content.Content(api='test', url=edition_url)
        content_response = content.get_content_response()
        newest = content_response['response']['results'][0]

        return get_article_title(newest['apiUrl']), newest['webUrl']
示例#3
0
    def transforData(self):
        """Walk every result page of the prepared query and append one
        sub-list per page of [date, sectionName, webTitle, webUrl] rows
        to ``self.newsData``.

        Fixes vs. original: the 'T'-stripped date (``newString``) was
        computed and then discarded — the raw timestamp was stored
        instead. We now store the date part, matching the sibling
        ``transformDataByDate``.
        """
        # STEP 1: find out how many pages the query spans.
        total_pages = self.content.response_headers()['pages']
        pager = theguardian_content.Content(api='test', **self.mainHeader)

        # STEP 2: sweep page by page.
        for page in range(1, total_pages + 1):
            response = pager.get_content_response(headers={'pages': page})
            self.result = pager.get_results(response)
            page_rows = []
            for item in self.result:
                # Keep only the date portion (everything before the 'T').
                date_part = item['webPublicationDate'].partition('T')[0]
                page_rows.append([
                    date_part,
                    item['sectionName'],
                    item['webTitle'],
                    item['webUrl'],
                ])
            self.newsData.append(page_rows)
示例#4
0
    def test_contents_find_by_id_correct_url(self):
        """find_by_id with a valid content id returns a dict whose first
        result echoes the id back."""
        target = "technology/2014/feb/17/flappy-bird-clones-apple-google"
        client = theguardian_content.Content("test")
        res = client.find_by_id(target)

        self.assertIs(type(res), dict)
        self.assertEqual(res['response']['results'][0]["id"], target)
示例#5
0
    def getGuardianArticles(self, topic):
        """Return the webUrl of every Guardian article matching *topic*."""
        query = {"q": topic, "type": "article"}
        client = theguardian_content.Content(GUAR_KEY, **query)
        payload = client.get_content_response()

        urls = []
        for hit in client.get_results(payload):
            urls.append(hit["webUrl"])
        return urls
示例#6
0
    def test_contents_find_by_id_incorrect_url(self):
        """A truncated (nonexistent) id yields an empty, zero-page response."""
        bad_id = "technology/2014/feb/17/flappy-bird-clones-apple"
        res = theguardian_content.Content("test").find_by_id(bad_id)

        body = res['response']
        self.assertEqual(body['results'], [])
        self.assertEqual(body['pages'], 0)
        self.assertEqual(body['total'], 0)
示例#7
0
    def getData(self, fromDate, section):
        """Prepare ``self.mainHeader`` and ``self.content`` for a filtered
        Guardian query.

        Args:
            fromDate: 'YYYY-MM-DD' lower bound for results.
            section: Guardian section id to restrict the search to.

        Fixes vs. original: an unfiltered ``Content`` was built first and
        immediately overwritten; only the filtered client is kept now.
        """
        # For more filters refer to
        # http://open-platform.theguardian.com/documentation/search
        self.mainHeader = {
            "section": section,
            "from-date": fromDate,
            "order-by": "relevance",
            "page-size": 200,
            "show-fields": "sectionName,webTitle,webUrl,short-url",
        }
        self.content = theguardian_content.Content(api='test',
                                                   **self.mainHeader)
示例#8
0
    def find_article_keyword(self, keyword):
        """Search for *keyword* and return (title, webUrl) of the top hit."""
        search = {"q": keyword}
        client = theguardian_content.Content(
            api='68427e29-0cd0-4ba4-a7c9-92350224c80f', **search)

        payload = client.get_content_response()
        top_hit = client.get_results(payload)[0]

        title = self.news_task.get_article_title(top_hit['apiUrl'])
        return title, top_hit['webUrl']
示例#9
0
    def test_content_results(self):
        """get_results on a find_by_id response yields a list led by the id."""
        target = "technology/2014/feb/17/flappy-bird-clones-apple-google"
        client = theguardian_content.Content("test")
        response = client.find_by_id(target)
        result = client.get_results(response)

        self.assertIs(type(result), list)
        self.assertEqual(result[0]['id'], target)
示例#10
0
    def test_content_response_header(self):
        """response_headers() exposes paging metadata but not the results."""
        headers = theguardian_content.Content("test").response_headers()

        self.assertIs(type(headers), dict)
        for expected_key in ("pageSize", "currentPage", "pages"):
            self.assertIn(expected_key, headers.keys())
        self.assertNotIn("results", headers.keys())
示例#11
0
def getContent(p):
    """Print the body field of every result on page *p* (200 per page)."""
    query = {"show-fields": "body", "page-size": "200", "page": str(p)}

    # Create the content client with the page filters applied.
    client = theguardian_content.Content(
        api='2e433c1b-7fd1-46fc-be75-3f00732ca28c', **query)

    # Fetch and print every article body on this page.
    payload = client.get_content_response()
    for item in client.get_results(payload):
        print(item["fields"]["body"])
示例#12
0
    def test_section_get_references_correct_pages(self):
        """get_references_in_page returns a list, with or without an
        explicit page number."""
        filters = {
            "q": "apple",
            "section": "technology",
        }
        client = theguardian_content.Content("test", **filters)

        with_page = client.get_references_in_page(page_number=1)
        without_page = client.get_references_in_page()

        self.assertIs(type(with_page), list)
        self.assertIs(type(without_page), list)
示例#13
0
    def test_content_with_headers(self):
        """A heavily filtered query still produces a list of results."""
        filters = {
            "q": "12 years a slave",
            "tag": "film/film,tone/reviews",
            "from-date": "2010-01-01",
            "order-by": "relevance",
            "show-fields": "starRating,headline,thumbnail,short-url",
        }
        client = theguardian_content.Content(api='test', **filters)
        parsed = client.get_results(client.get_content_response())

        self.assertIs(type(parsed), list)
示例#14
0
    def test_section_get_references_incorrect_pages(self):
        """Asking for a page past the last one raises ValueError."""
        client = theguardian_content.Content(
            "test", **{
                "q": "apple",
                "section": "technology",
            })

        last_page = client.response_headers()["pages"]

        self.assertRaises(ValueError,
                          client.get_references_in_page,
                          page_number=last_page + 1)
示例#15
0
 def _get_guardian_article_urls(self):
     """Collect the webUrl of every Guardian result across all pages.

     Reads the API key and query headers from
     ``self.paper_dic['guardian']`` and records the total hit count on
     ``self.no_results``.

     Fixes vs. original: removed the per-result ``db_entity_dic`` dict
     that was created and never used.
     """
     headers = self.paper_dic['guardian']['url_headers']
     # create content
     content = theguardian_content.Content(
         api=self.paper_dic['guardian']['api_key'], **headers)

     response_headers = content.response_headers()
     print("request response: {}".format(response_headers))
     no_pages = response_headers['pages']
     self.no_results = response_headers['total']
     print("total results: {}".format(self.no_results))

     # Walk every result page and gather the article URLs.
     response_articles = []
     for page in range(1, no_pages + 1):
         json_content = content.get_content_response(headers={"page": page})
         for res in content.get_results(json_content):
             response_articles.append(res['webUrl'])
     return response_articles
示例#16
0
from theguardian import theguardian_content

# Build a default client against the public test key.
client = theguardian_content.Content(api='test')

# Raw HTTP-level response details.
raw = client.get_request_response()
print("Request Response status code {status}.".format(
    status=raw.status_code))
print("Request Response headers {header}.".format(header=raw.headers))

# Parsed Guardian response headers.
print("Content Response headers {}.".format(client.response_headers()))

# All results of the first page.
payload = client.get_content_response()
results = client.get_results(payload)
print("All results {}.".format(results))

# The full decoded response body.
print("Response {response}".format(response=payload))
from theguardian import theguardian_content


# Client that returns the five newest items per page.
headers = {
    "page-size": 5,
    "order-by": "newest",
}
content = theguardian_content.Content(api='test', **headers)

# Find out how many pages the query spans.
response_headers = content.response_headers()
total_pages = response_headers["pages"]

# Collect apiUrls for all the results in the first 5 pages.
required_pages = 5
required_urls = []

if total_pages > required_pages:
    # Reuse the same client — the original rebuilt an identical headers
    # dict and a second, redundant Content here.
    for page in range(1, required_pages + 1):
        res = content.get_content_response(headers={"page": page})
        for result in res['response']['results']:
            required_urls.append(result["apiUrl"])
示例#18
0
    def test_content_response_failure_incorrect_api_key(self):
        """A wrong API key is rejected with HTTP 403."""
        response = theguardian_content.Content("tests").get_request_response()
        self.assertEqual(response.status_code, 403)
"""
This example deals with returning content of specific tags.
"""
from theguardian import theguardian_tag
from theguardian import theguardian_content

# get the apple tags
headers = {
    "q": "apple",
    "section": "technology",
    "show-references": "all",
}
tag = theguardian_tag.Tag(api='test', **headers)

# get the results
tag_content = tag.get_content_response()
results = tag.get_results(tag_content)

# get results for specific tag
first_tag_apiUrl = results[0]["apiUrl"]

# use this api url to content
content = theguardian_content.Content(api='test', url=first_tag_apiUrl)

# get content response
content_response = content.get_content_response()
print(content_response)
"""
Print the web title of every tag a content item has
for a single item.
"""
from theguardian import theguardian_content


headers = {
    "ids": "environment/2014/sep/14/invest-in-monitoring-and-tagging-sharks-to-prevent-attacks",
    "show-tags": "all",
}

content = theguardian_content.Content(api="test", **headers)
content_response = content.get_content_response()
results = content.get_results(content_response)
tags = results[0]["tags"]
webTitles = [tag["webTitle"] for tag in tags]

print("Title of tags {titles}" .format(titles=webTitles))

"""
Print the web title of each content item in the
editor's picks for the film tag.
"""

tag_headers = {
    "tag": "film/film"
}

tag_content = theguardian_content.Content(api="test", **tag_headers)
tag_content_response = tag_content.get_content_response()
示例#21
0
"""
Query for a single content item and print its web title
"""
from theguardian import theguardian_content

ids = "commentisfree/2013/jan/16/vegans-stomach-unpalatable-truth-quinoa"
content = theguardian_content.Content(api="test")

single_id_content = content.find_by_id(ids)
results = content.get_results(single_id_content)

print("web url for {id}: {url}\n".format(id=results[0]["id"],
                                         url=results[0]["webUrl"]))
"""
Print web title for a tag
"""

header = {
    "tag": "music/metal",
}
tag_content = theguardian_content.Content(api="test")

tag_content_response = content.get_content_response(header)
results = content.get_results(tag_content_response)

print("web title for {id}: {url}\n".format(id=results[0]["id"],
                                           url=results[0]["webTitle"]))
"""
print web title for a section
"""
示例#22
0
    def test_content_get_result_with_exception(self):
        """Feeding get_results a non-response value raises TypeError."""
        client = theguardian_content.Content("test")
        self.assertRaises(TypeError, client.get_results, "some random text")
示例#23
0
def get_article_title(apiUrl):
    """Fetch the single content item at *apiUrl* and return its webTitle."""
    payload = theguardian_content.Content(
        api='test', url=apiUrl).get_content_response()
    return payload['response']['content']['webTitle']
示例#24
0
"""
This example deals with returning content of section.
"""
from theguardian import theguardian_section
from theguardian import theguardian_content

# get the sports sections
headers = {"q": "sports"}  # q=query parameter/search parameter
section = theguardian_section.Section(api='test', **headers)

# get the results
section_content = section.get_content_response()
results = section.get_results(section_content)

# get different editions from the results
editions = results[0]['editions']

# get uk/sports edition apiUrl
uk_sports = [edi["apiUrl"] for edi in editions if edi["id"] == "uk/sport"][0]

# use this api url to sports content
content = theguardian_content.Content(api='test', url=uk_sports)

# get section response
content_response = content.get_content_response()
print(content_response)
示例#25
0
    def transformDataByDate(self, fromDate, section):
        """Scrape results one day at a time from *fromDate* up to today.

        For each day, queries the given Guardian *section* (20 results,
        ordered by relevance) and appends one
        [date, sectionName, webTitle, webUrl] row per result to
        ``self.newsData``. Failures for a single day are reported
        (messages are in Spanish) and that day is skipped.
        """
        toDate = fromDate
        hoy = datetime.now().date()  # "hoy" = today
        endDate = datetime.strptime(fromDate, "%Y-%m-%d").date()

        # Fetch the information day by day until we reach today.
        while endDate <= hoy:
            try:
                # create content
                self.content = theguardian_content.Content(api='test')

                # create content with filters
                # for more filters refer
                # http://open-platform.theguardian.com/documentation/search

                self.mainHeader = {
                    "section": section,
                    "from-date": toDate,
                    "to-date": toDate,
                    "order-by": "relevance",
                    "page-size": 20,
                    "show-fields": "sectionName,webTitle,webUrl,short-url",
                }
                headers = self.mainHeader
                self.content = theguardian_content.Content(api='test',
                                                           **headers)
                res = self.content.get_content_response()
                self.result = self.content.get_results(res)
                try:
                    # Convert each result into a [date, section, title, url] row.
                    for i in range(0, len(self.result)):
                        element = []
                        refDate = self.result[i]['webPublicationDate']
                        # Keep only the date portion (characters before 'T').
                        newString = ""
                        for x in refDate:
                            if x != 'T':
                                newString = newString + x
                            else:
                                break
                        sectionName = self.result[i]['sectionName']
                        webTitle = self.result[i]['webTitle']
                        webUrl = self.result[i]['webUrl']
                        element.insert(0, newString)
                        element.insert(1, sectionName)
                        element.insert(2, webTitle)
                        element.insert(3, webUrl)
                        self.newsData.append(element)
                except:
                    # NOTE(review): bare except; also `i` may be unbound here
                    # if the failure happened before the loop started.
                    print("Problemas con la carga de datos.")
                    print("Dato: ", i)
                    print("Último elemento en la matriz: ")
                    print(self.newsData[i - 1])
            except:
                # NOTE(review): bare except hides the real request error.
                print("Problemas con la recepción de los datos.")
            # Advance to the next day.
            i = 0
            endDate = endDate + timedelta(days=1)
            toDate = endDate.strftime("%Y-%m-%d")
示例#26
0
    def test_content_response_success_correct_details(self):
        """The public test key yields an HTTP 200 response."""
        response = theguardian_content.Content("test").get_request_response()
        self.assertEqual(response.status_code, 200)