"Associated Document Series": ""}, "statistics") dt.create_index(["Title", "Old URL"], "statistics", unique=True) for link in doc.xpath("//div[@class='wrapper']/ul/li/a"): series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"]) print series_title series_req = requests.get(series_url) series_doc = lxml.html.fromstring(series_req.text) for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"): file_pub_date = table_line.xpath("./td[3]")[0].text for file_node in table_line.xpath("./td[2]//a"): file_title = etree.tostring(file_node, method="text", encoding="utf8") file_link = file_node.attrib["href"] if not file_link.startswith("http"): file_link = urlparse.urljoin(URL, file_link) file_data = {"Old URL": series_url, "Title": file_title, "Body": file_title, "Publication date": datetool.parsedate(file_pub_date), "Attachment": file_link, "Attachment title": file_title, "Associated organisations": "Scotland Office", "Associated Document Series": series_title} dt.upsert(file_data, "statistics") dumptruck_to_csv(dt, "statistics", "/home/http/scotland/stats.csv")
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010 ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011 ] dt = DumpTruck(dbname="scotland.db") dt.create_table({"Title": "", "Publication date": "", "Old URL": "", "Summary": "", "Attachments": "", "Type": "", "Associated organisations": ""}, "publications") dt.create_index(["Title", "Old URL"], "publications", unique=True) for url, page_type in URLS: for publication in scrape_list_page(url): publication['Type'] = page_type dt.upsert(publication, "publications") dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
def scrape_main_article(url):
    """Fetch a news page and return the main article body as HTML.

    Strips the <h1> heading and any paragraph containing a <strong>
    element (the bolded intro/byline lines) from the wrapper div before
    serialising it.
    """
    response = requests.get(url)
    page = lxml.html.fromstring(response.text)
    wrapper = page.xpath("//*[@class='wrapper']")[0]
    wrapper.remove(wrapper.find("h1"))
    # findall() returns a materialised list, so removing children from
    # `wrapper` while looping here is safe.
    for paragraph in wrapper.findall("p"):
        if paragraph.find("strong") is not None:
            wrapper.remove(paragraph)
    return htmlize(etree.tostring(wrapper))


dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Body": "",
                 "Associated organisations": ""}, "news")
# Unique key keeps repeated runs from inserting duplicate news items.
dt.create_index(["Title", "Old URL"], "news", unique=True)

for url in URLS:
    for item in scrape_list_page(url):
        # The list page stores attachments as a JSON string; the first
        # attachment's link is the canonical article URL.
        attachment_list = json.loads(item.pop("Attachments"))
        article_url = attachment_list[0]["link"]
        item["Old URL"] = article_url
        item["Body"] = scrape_main_article(article_url)
        dt.upsert(item, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")