forked from frappe/data-gov-in
/
getdata.py
105 lines (89 loc) · 2.87 KB
/
getdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from __future__ import unicode_literals
import requests, re, os, xlrd, csv, cStringIO
import setproperties
sources = [
"http://data.gov.in/hackathon/sectors",
"http://data.gov.in/catalogs"
]
def download():
setproperties.load_properties()
for i in xrange(35):
print "page %s" % i
response = requests.get(sources[1] + "?page=%s" % i, verify=False)
properties = get_url_title_and_description_from_html(response.text)
for filename in properties:
print filename
filepath = os.path.join("data", "raw", filename)
if not os.path.exists(filepath):
with open(filepath, "wb") as datafile:
r = requests.get(properties[filename]["url"])
for chunk in r.iter_content(1024):
datafile.write(chunk)
setproperties.update_for_file(filename, properties[filename])
setproperties.save_properties()
def get_url_title_and_description_from_html(text):
properties = {}
for row in text.split("ds-list-item")[1:-1]:
row = row.replace("\n", "")
url = re.findall("url=([^&]+)", row)[0]
file_name = url.split("/")[-1]
properties[file_name] = {
"file_name": file_name,
"url": url,
"title": re.findall('title="([^"]+)"', row)[0],
"description": re.findall("<br />([^<]+)", row)[0]
}
return properties
def convert_to_csv():
import xlrd, shutil
raw_path = os.path.join("data", "raw")
for fname in os.listdir(raw_path):
fpath = os.path.join(raw_path, fname)
extn = fname.split(".")[-1].lower()
if extn == "xls":
print fpath
wb = xlrd.open_workbook(fpath)
sheets = wb.sheet_names()
for i in xrange(len(sheets)):
file_name = (len(sheets) > 1 and (".".join(fname.split(".")[:-1]) + \
"-" + sheets[i].lower().replace(" ", "_")) or \
".".join(fname.split(".")[:-1])) + ".csv"
sheet = wb.sheet_by_index(i)
write_csv(sheet, file_name)
elif extn == "csv":
print fpath
shutil.copy(fpath, os.path.join("data", "csv"))
def write_csv(sheet, file_name):
if os.path.exists(os.path.join("data", "csv", file_name)):
return
csv_writer = UnicodeWriter()
for row_no in xrange(sheet.nrows):
csv_writer.writerow(sheet.row_values(row_no))
with open(os.path.join("data", "csv", file_name), "w") as csv_file:
csv_file.write(csv_writer.getvalue())
class UnicodeWriter:
def __init__(self, encoding="utf-8"):
self.encoding = encoding
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, quoting=csv.QUOTE_NONNUMERIC)
def writerow(self, row):
row = self.encode(row, self.encoding)
self.writer.writerow(row)
def getvalue(self):
return self.queue.getvalue()
def encode(self, obj, encoding="utf-8"):
if isinstance(obj, list):
out = []
for o in obj:
if isinstance(o, unicode):
out.append(o.encode(encoding))
else:
out.append(o)
return out
elif isinstance(obj, unicode):
return obj.encode(encoding)
else:
return obj
if __name__=="__main__":
download()
#convert_to_csv()