Пример #1
0
def second_level_topics():
    """Collect the children of every top-level topic into one flat list.

    Every entry of the global ``top_level_topics`` is fetched through
    ``parser.parse_url`` using its ``"api_url"`` value, and whatever sits
    under ``["links"]["children"]`` in the response is added to the result,
    preserving the order in which the topics and their children appear.
    """
    collected = []
    for parent in top_level_topics:
        parent_data = parser.parse_url(parent["api_url"])
        collected.extend(parent_data["links"]["children"])
    return collected
Пример #2
0
    "specialist_sectors": ["array of items"],
    "title": "the titles",
    "document_type": "document type"
}

Then go through all the links with https://www.gov.uk/api/content/#{link}
and collect content ids
'''

from api_paginator import ApiPaginator
import page_parser as parser

# Search query using filter_specialist_sectors=_MISSING.
# NOTE(review): not referenced in the visible part of this script; presumably
# it selects items that have no specialist sector tag -- confirm before use.
ROOT_URL_OF_ITEMS_WITHOUT_TAG = "https://www.gov.uk/api/search.json?filter_specialist_sectors=_MISSING"
# Search query using reject_specialist_sectors=_MISSING; this is the URL that
# is parsed and handed to the paginator below.
ROOT_URL_OF_ITEMS_WITH_TAG = "https://www.gov.uk/api/search.json?reject_specialist_sectors=_MISSING"

# Parse the root search URL once and build a paginator for the same URL.
parsed_root = parser.parse_url(ROOT_URL_OF_ITEMS_WITH_TAG)
paginator = ApiPaginator(ROOT_URL_OF_ITEMS_WITH_TAG)

# Item total as reported by the paginator for the parsed root response.
# TODO: items_total() should store the count on the paginator itself instead
# of returning it, so this module-level variable is no longer needed.
item_count = paginator.items_total(parsed_root)

# Hand the item count back to the paginator so it can calculate its pages.
# TODO: the paginator should do this on its own; move this call into one of
# the paginator's methods.
paginator.calculate_pages(item_count)

# Page URLs produced by the paginator; iterated further down in the script.
page_urls = paginator.page_urls()

import json
fo = open('data/items_with_tag.jsonl', "w+")
for page_url in page_urls:
Пример #3
0
'''
This script parses first- and second-level topic data from GOV.UK's API.
Current output:
some logging and the total number of first and second level topics.
'''

import page_parser as parser

# Content API URL of the root topic document; the script reads its
# ["links"]["children"] entries as the top-level topics.
ROOT_URL = "https://www.gov.uk/api/content/topic"

def second_level_topics():
    """Return the children of all top-level topics as a single flat list.

    Reads the module-level ``top_level_topics``; for each topic, its
    ``"api_url"`` is fetched with ``parser.parse_url`` and the entries found
    under ``["links"]["children"]`` are emitted in order.
    """
    return [
        child
        for parent in top_level_topics
        for child in parser.parse_url(parent["api_url"])["links"]["children"]
    ]

# Fetch the root topic document; its ["links"]["children"] entries are the
# top-level topics that second_level_topics() iterates over.
top_level_topics = parser.parse_url(ROOT_URL)["links"]["children"]
# NOTE(review): this rebinds the module-level name `second_level_topics` from
# the function to the list it returned, so the function cannot be called a
# second time after this line.
second_level_topics = second_level_topics()

# Report how many topics were found at each level.
print("\nTotal number of first level topics: %d" % len(top_level_topics))
print("\nTotal number of second level topics: %d" % len(second_level_topics))
Пример #4
0
redirects and the content is available on the regular site
'''

import page_parser as parser
import json
import time

# Base URL of the content API; an item's "link" value is appended to it.
ROOT_URL = "https://www.gov.uk/api/content"

# Stream the input one line at a time instead of loading the whole file into
# memory, and let the context manager close both files even when the run is
# interrupted part-way through.
with open('items_with_tag.jsonl') as source, \
        open('items_with_tag_and_content_id.jsonl', "w+") as fo:
    for raw_line in source:
        line = raw_line.rstrip('\n')

        # Label used to identify *this* item if processing fails. It starts
        # as a prefix of the raw line and is replaced by the item's link as
        # soon as the link is known, so the failure branch never reports a
        # value left over from a previous iteration.
        item_label = line[:50]

        try:
            json_line = json.loads(line)

            if 'content_id' in json_line:
                # Nothing to look up; such items are only reported, not
                # copied to the output file.
                print("key present in:" + line[:50])
                continue

            link = json_line['link']
            item_label = str(link)

            # Pause between requests so the API is not hit in a tight loop.
            time.sleep(1)
            parsed_item = parser.parse_url(ROOT_URL + link)
            item_content_id = parsed_item['content_id']

            json_line['content_id'] = item_content_id
            fo.write(json.dumps(json_line) + "\n")
            print("processed: " + item_content_id)
        except Exception as error:
            # Best effort: record the failed item and carry on with the next
            # one. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit stop the script.
            # NOTE: the marker written here is a bare identifier, not JSON,
            # so consumers of the output must tolerate such lines.
            fo.write(item_label + "\n")
            print("failed: " + item_label + " (" + repr(error) + ")")