def get_station_data(self, resp):
    """Yield one ``AppItem`` per site found in the XML body of ``resp``.

    Each ``Measurement`` element supplies a site name (``@SiteName``), a
    raw pollutant name (``DataSource/@Name``) and a raw value
    (``Data/E/I1``). Readings are grouped by site name, and every item
    carries the timestamp read from the first ``Measurement``.

    Raises:
        IndexError: if no timestamp text is found for the first
            ``Measurement``.
    """
    root = etree.fromstring(resp.body)
    date = root.xpath(u"//Measurement[1]/Data/E/T/text()")
    data_time = parse(date[0]).replace(tzinfo=timezone(self.tz))

    station = dict()
    for st in root.xpath(u"//Measurement"):
        name = st.xpath(u"@SiteName")
        pol_name = st.xpath(u"DataSource/@Name")
        pol_val = st.xpath(u"Data/E/I1/text()")
        # A measurement missing any of the three parts cannot be mapped;
        # skip it instead of aborting the whole response with IndexError.
        if not (name and pol_name and pol_val):
            continue
        readings = station.setdefault(name[0], dict())
        _tmp_dict = Kind(self.name).get_dict(r_key=pol_name[0], r_val=pol_val[0])
        if _tmp_dict:
            readings[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    for st_id, st_values in station.items():
        if st_id:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = st_values
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items
def get_st_data(self, resp):
    """Yield an ``AppItem`` built from the children of the first ``tname`` node.

    The station id is the part of ``resp.url`` between ``Data/`` and
    ``_Line.xml``. The timestamp is the ``DATE_TIME`` text passed through
    ``self.check_date``. Every child element's tag name and text are
    offered to ``Kind(self.name).get_dict``; truthy results are stored
    under their ``key``/``val`` entries.

    Raises:
        IndexError: if the URL does not match the expected pattern.
    """
    # Backslash doubled so the literal holds no invalid escape sequence;
    # the resulting regex text is unchanged.
    st_id = str(findall(u"Data/(.+)_Line\\.xml", resp.url)[0])
    row_data = resp.xpath(u"tname[1]/child::*")
    row_dt = resp.xpath(u"tname[1]/DATE_TIME/text()").extract_first()
    new_dt = self.check_date(row_dt)

    st_data = dict()
    for el in row_data:
        _tmp_dict = Kind(self.name).get_dict(
            r_key=el.xpath(u"name()").extract_first(),
            r_val=el.xpath(u"text()").extract_first(),
        )
        if _tmp_dict:
            st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` with the AQI and PM2.5 values from one table row.

    The row is located by a fixed absolute XPath. The station name in the
    first cell doubles as the station id, and the text of the fourth cell
    is converted by ``self.parse_date``. Nothing is yielded when neither
    value is accepted by ``Kind(self.name).get_dict``.
    """
    row = resp.xpath(
        u"/html/body/div[7]/div/div[1]/div[2]/div/div[3]/table/tbody/tr[2]"
    )
    station_id = row.xpath(u"td[1]/b/text()").extract_first()
    aqi = row.xpath(u"td[2]/p/text()").extract_first()
    pm25 = row.xpath(u"td[3]/p/text()").extract_first()
    data_time = self.parse_date(row.xpath(u"td[4]/span/text()").extract_first())

    station_data = dict()
    # Same order as before: pm25 first, then aqi.
    for raw_key, raw_val in ((u"pm25", pm25), (u"aqi", aqi)):
        _tmp_dict = Kind(self.name).get_dict(r_key=raw_key, r_val=raw_val)
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source_name
        items[u"source_id"] = str(station_id)
        yield items
def get_st_data(self, resp):
    """Yield an ``AppItem`` from the ``recentResults`` table of ``resp``.

    The station id is the URL part between ``Details/`` and ``?type=1``.
    The timestamp is the text inside ``Current (...)`` in the table
    header, converted by ``self.check_date``. For each body row the first
    two cells are offered to ``Kind(self.name).get_dict`` as name and
    value.

    Raises:
        IndexError: if the URL does not match the expected pattern.
    """
    # Regex backslashes are doubled so the literals contain no invalid
    # escape sequences; the pattern text is unchanged.
    st_id = str(findall(u"Details/(.+)\\?type=1", resp.url)[0])
    rows = resp.xpath(u'//*[@id="recentResults"]/tbody/tr')
    row_dt_hour = resp.xpath(
        u'//*[@id="recentResults"]/thead/tr[2]/th[1]/text()'
    ).re_first(u"Current \\((.+)\\)")
    new_dt = self.check_date(row_dt_hour)

    st_data = dict()
    for row in rows:
        # The pattern ends on a non-whitespace character, which trims
        # trailing whitespace from the cell text.
        _name = row.xpath(u"td[1]/text()").re_first(u"(.+[\\S])")
        _val = row.xpath(u"td[2]/text()").re_first(u"(.+[\\S])")
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from the ``tab1`` table of a station page.

    The station id is the ``site_id`` number in ``resp.url`` and the
    timestamp comes from ``self.get_date(resp)``. In each table row the
    first two ``td`` text nodes are treated as raw name and raw value.

    Raises:
        IndexError: if the URL carries no ``site_id=<digits>`` part.
    """
    # Backslash doubled to avoid an invalid escape sequence in the literal.
    station_id = Selector(text=resp.url).re(u"site_id=(\\d+)")[0]
    data_time = self.get_date(resp)

    station_data = dict()
    for row in resp.xpath(u'//*[@id="tab1"]/table/tr'):
        col = row.xpath(u"td/text()").extract()
        # Rows with fewer than two text cells hold no name/value pair;
        # skip them instead of failing the whole page with IndexError.
        if len(col) < 2:
            continue
        _tmp_dict = Kind(self.name).get_dict(r_key=col[0], r_val=col[1])
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from rows 1 and 2 of ``EnvitechGrid1_GridTable``.

    Row 1 supplies the raw pollutant names and row 2 the raw values; the
    first cell of each row is skipped. The first cell of row 2 supplies
    the timestamp. The station id is read from ``resp.meta["code"]``.
    """
    raw_time = resp.xpath(
        u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td[1]/span/text()'
    ).extract_first()
    parsed_time = parser.parse(raw_time)

    header_cells = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
    names = [cell.xpath(u"span/text()").extract_first() for cell in header_cells]
    value_cells = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')[1:]
    values = [cell.xpath(u"span/text()").extract_first() for cell in value_cells]

    station_data = dict()
    for raw_name, raw_value in zip(names, values):
        pollutant = Kind(self.name).get_dict(r_key=raw_name, r_val=raw_value)
        if pollutant:
            station_data[pollutant[u"key"]] = pollutant[u"val"]

    # Nothing recognised in the table: emit no item.
    if not station_data:
        return

    items = AppItem()
    items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
    items[u"data_time"] = parsed_time.replace(tzinfo=timezone(self.tz))
    items[u"data_value"] = station_data
    items[u"source"] = self.source
    items[u"source_id"] = resp.meta[u"code"]
    yield items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per object returned by ``self.get_page(resp)``.

    Each object's ``description`` is an HTML fragment with a table: row 1
    holds the station id, row 2 the timestamp text (with an optional
    trailing ``(IST)`` marker) and row 3 a nested table of name/value
    rows.
    """
    tz = pytz.timezone(self.tz)
    marker = u"(IST)"
    for obj in self.get_page(resp):
        body = Selector(text=obj[u"description"])
        rows = body.xpath(u"//html/body/table/tr[3]/td/table/tr")
        st_id = str(body.xpath(u"//html/body/table/tr[1]/td/text()").extract_first())

        raw_date = body.xpath(u"//html/body/table/tr[2]/td/text()").extract_first()
        # Remove the marker as a real suffix. ``rstrip(u" (IST)")`` treats
        # its argument as a set of characters and would also eat any other
        # trailing 'I', 'S', 'T', '(' or ')'.
        date_text = raw_date.strip()
        if date_text.endswith(marker):
            date_text = date_text[:-len(marker)].rstrip()
        parsed = parse(str(date_text))
        # pytz zones must be attached with localize(); replace(tzinfo=...)
        # uses the zone's first historical (LMT) offset instead of the
        # offset valid at that date.
        new_dt = tz.localize(parsed.replace(tzinfo=None))

        st_data = dict()
        for row in rows:
            col = row.xpath(u"td/text()").extract()
            # Rows without both a name and a value cell are ignored.
            if len(col) < 2:
                continue
            _tmp_dict = Kind(self.name).get_dict(r_key=col[0], r_val=col[1])
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(tz=pytz.timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per entry of ``Stations`` in the JSON body.

    The body is decoded as UTF-8 and stripped of ``\\ufeff`` characters
    before parsing. All items share the timestamp produced by
    ``self.get_date`` from the top-level ``DateTime`` value. Each
    station's ``ParameterValueList`` entries supply raw ``Id``/``Value``
    pairs.
    """
    text = resp.body.decode(u"utf-8").replace(u"\ufeff", u"")
    payload = js_loads(text)
    data_time = self.get_date(payload[u"DateTime"])

    for st in payload[u"Stations"]:
        name = st[u"Station"]
        st_data = dict()
        for param in st[u"ParameterValueList"]:
            _tmp_dict = Kind(self.name).get_dict(
                r_key=param[u"Id"], r_val=param[u"Value"])
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        # Stations without any recognised parameter produce no item.
        if not st_data:
            continue

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = name
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from the ``tabs-content-data`` table of ``resp``.

    The timestamp is the first ``dd/dd/dddd dd:dd`` match in the third
    paragraph of the header block, or ``None`` when there is no match.
    For each table row the raw name is assembled from the first cell's
    text (up to ``" ("``), its ``<sub>`` text and the fourth cell. The
    raw value is the first space-separated token of the third cell, with
    ``"No"`` mapped to ``None``. The station id is ``resp.meta["code"]``.
    """
    # Regex backslashes are doubled so the literal contains no invalid
    # escape sequences; the pattern text is unchanged.
    matches = resp.xpath(u'//*[@id="main"]/div[1]/div[2]/p[3]/text()').re(
        u"(\\d\\d\\/\\d\\d\\/\\d\\d\\d\\d\\s\\d\\d:\\d\\d)")
    if matches:
        data_time = parser.parse(matches[0]).replace(tzinfo=timezone(self.tz))
    else:
        data_time = None

    station_data = dict()
    for row in resp.xpath(u'//*[@id="tabs-content-data"]/table/tbody/tr'):
        # Each xpath is evaluated once instead of once for the test and
        # once more for the value.
        pollutant_index = row.xpath(u"td[1]/sub/text()").extract_first()
        if pollutant_index is None:
            pollutant_index = u""
        base_name = row.xpath(u"td[1]/text()").extract_first().split(u" (")[0]
        unit_text = row.xpath(u"td[4]/text()").extract_first()
        # NOTE(review): as written this replace() swaps a single space for
        # a single space, i.e. it is a no-op. It presumably was meant to
        # collapse the double space left by an empty index. Confirm
        # against the names Kind expects before changing it.
        pollutant_name = u" ".join(
            (base_name, pollutant_index, unit_text)).replace(u" ", u" ")

        first_token = row.xpath(u"td[3]/text()").extract_first().split(u" ")[0]
        pollutant_value = first_token if first_token != u"No" else None

        pollutant = Kind(self.name).get_dict(
            r_key=pollutant_name, r_val=pollutant_value)
        if pollutant:
            station_data[pollutant[u"key"]] = pollutant[u"val"]

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_st_data(self, resp):
    """Yield an ``AppItem`` from the ``title`` attributes of the Envitech grid.

    Row 1 spans carry the raw names and row 2 spans the raw values (the
    text after ``") "`` in each title). The first value is passed to
    ``self.check_date`` to obtain the timestamp, and the first name/value
    pair is then dropped. The station id is whatever follows ``ST_ID=``
    in ``resp.url``.

    Raises:
        IndexError: if the URL carries no ``ST_ID=`` part.
    """
    st_id = str(findall(u".*ST_ID=(.+)", resp.url)[0])
    table_name = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td/span')
    table_val = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td/span')

    names = list()
    for cell in table_name:
        found = findall(u"(.+)", cell.xpath(u"@title").extract_first())
        names.append(found[0] if found else None)

    values = list()
    for cell in table_val:
        # Backslash doubled to avoid an invalid escape sequence.
        found = findall(u"\\) (.+)", cell.xpath(u"@title").extract_first())
        values.append(found[0] if found else None)

    try:
        new_dt = self.check_date(values[0])
    except IndexError:
        new_dt = None

    # Materialise before slicing: zip() is an iterator on Python 3 and has
    # no pop(). Slicing also tolerates an empty table.
    data = list(zip(names, values))[1:]

    st_data = dict()
    for _name, _val in data:
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        yield items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per station object parsed from the response body.

    Newlines are removed from the body before it is handed to
    ``js_parse``. Each station's ``name`` is its id and its ``datetime``
    is converted by ``self.get_date``. Every attribute other than the
    bookkeeping ones listed below is offered to ``Kind`` as a raw
    name/value pair.
    """
    stations = js_parse(resp.body.replace(u"\n", u""))
    # Attributes that describe the station itself rather than a reading.
    skipped = (u"name", u"gps", u"date", u"datetime", u"time")

    for station in stations:
        station_name = station[u"name"]
        data_time = self.get_date(station[u"datetime"])

        station_data = dict()
        for attr_name in station:
            if attr_name in skipped:
                continue
            _tmp_dict = Kind(self.name).get_dict(
                r_key=attr_name, r_val=station[attr_name])
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if not station_data:
            continue

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_name
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from chart data embedded in the last head script.

    Every ``var data = ...;`` match is parsed as JSON (after converting
    single quotes to double quotes and ``],]`` to ``]]``) and only its
    last ``[date, value]`` point is kept. Series names and units come from
    the ``series: [{ name: '...'`` matches: the text before ``(`` is the
    raw name, the text inside the parentheses the raw units. Only points
    whose date equals the newest date across all series are reported. The
    station id is ``resp.meta["code"]``.

    Raises:
        ValueError: if the script contains no ``var data`` match, because
            ``max()`` then receives an empty list.
    """
    raw_text = resp.xpath(u'/html/head/script[last()]/text()')

    last_points = [
        ujson.loads(el.replace(u"'", u'"').replace(u"],]", u"]]"))[-1]
        for el in raw_text.re(u"var data = (.+);")
    ]
    pollutant_value = [point[1] for point in last_points]
    pollutant_date = [
        parser.parse(point[0]).replace(tzinfo=timezone(self.tz))
        for point in last_points
    ]
    # The newest point across all series is treated as the current time.
    current_data_time = max(pollutant_date)

    # Regex backslashes are doubled so the literals contain no invalid
    # escape sequences; the pattern text is unchanged.
    raw_pollution_name = raw_text.re(u"series: \\[\\s+\\{\\s+name: '(.+)',?")
    labels = [
        Selector(text=el).xpath(u"/html/body/p/text()")
        for el in raw_pollution_name
    ]
    pollutant_name = [label.re(u"(.+)\\(")[0] for label in labels]
    pollutant_units = [label.re(u"\\((.+)\\)")[0] for label in labels]

    station_data = dict()
    for raw_name, raw_value, raw_units, point_time in zip(
            pollutant_name, pollutant_value, pollutant_units, pollutant_date):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_units)
        if (pollutant.get_name() is not None
                and pollutant.get_value() is not None
                and point_time == current_data_time):
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data and current_data_time:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = current_data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from the first data row of an HTML table.

    The header row supplies the column names (markup stripped). All
    ``td`` cells are reshaped into a 2-D array with one row per table
    row, and the first row after the header is used. The ``u"Дата і час"``
    column is parsed day-first as the timestamp; every remaining column
    is a reading whose units are looked up in a fixed table. The station
    id is ``resp.meta["code"]``.

    Raises:
        KeyError: if a column name has no entry in the units table.
    """
    raw_col_names = resp.xpath(
        u"/html/body/div[1]/div[2]/table/tr[1]/td").extract()
    col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]

    table_data = []
    for cell in resp.xpath(u'/html/body/div[1]/div[2]/table//td'):
        # Backslash doubled to avoid an invalid escape sequence; the regex
        # is run once per cell instead of twice.
        found = cell.xpath(u".").re(u"<td>(.+)<\\/td>")
        table_data.append(found[0] if found else None)

    n_cols = len(col_names)
    # Floor division: reshape() needs integer dimensions, and "/" yields
    # a float under true division.
    table_data = np.asarray(table_data).reshape(
        len(table_data) // n_cols, n_cols)
    df = pd.DataFrame(table_data[1:, ], columns=col_names)

    raw_data = df.iloc[0].to_dict()
    raw_data_time = raw_data.pop(u"Дата і час", None)
    data_time = parser.parse(raw_data_time, dayfirst=True).replace(
        tzinfo=timezone(self.tz))

    units = {
        u"Температура повітря": u"degc",
        u"Опади": u"mm",
        u"Рівень №2": u"NA",
        u"Рівень №1": u"NA",
        u"Рівень": u"NA",
        u"Температура води": u"degc",
    }

    station_data = dict()
    for poll_name, poll_value in raw_data.items():
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(poll_name)
        pollutant.set_raw_value(poll_value)
        pollutant.set_raw_units(units[poll_name])
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` with the newest readings from a CSV response.

    The CSV has four header-less columns: station name, date string,
    pollutant name and pollutant value. Rows containing NaN are dropped.
    Only rows stamped with the latest date in the file are reported, and
    units are looked up per pollutant name (``None`` when unknown). The
    station id is ``resp.meta["code"]``.
    """
    columns = (u"station_name", u"str_date", u"pollutant_name",
               u"pollutant value")
    all_data = pd.read_csv(StringIO(resp.text), names=columns).dropna(axis=0)
    all_data[u"date_time"] = [parser.parse(x) for x in all_data[u'str_date']]

    current_data_time = all_data[u"date_time"].max()
    curr_all_data = all_data[all_data[u"date_time"] == current_data_time]
    # Keep, per pollutant, the rows carrying that pollutant's newest date.
    newest_per_pollutant = curr_all_data.groupby(
        by=u"pollutant_name")[u"date_time"].transform(max)
    idx = newest_per_pollutant == curr_all_data[u"date_time"]
    data = curr_all_data[idx].copy()

    units = {
        u"PM2.5": u"ug/m3",
        u"PM10-NEW": u"ug/m3",
        u"Sulfur Dioxide": u"ppb",
        u"Carbon Monoxide": u"ppb",
        u"Ozone 1 hour": u"ppb",
        u"Nitrogen Dioxide": u"ppb",
    }

    station_data = dict()
    for row in data.itertuples():
        # Position 0 is the index; 3 and 4 are pollutant name and value.
        raw_name = row[3]
        raw_value = row[4]
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(units.get(raw_name))
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if not station_data:
        return

    items = AppItem()
    items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
    items[u"data_time"] = pd.to_datetime(current_data_time).replace(
        tzinfo=timezone(self.tz))
    items[u"data_value"] = station_data
    items[u"source"] = self.source
    items[u"source_id"] = resp.meta[u"code"]
    yield items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per station table on an AQI summary page.

    Tables with the ``table table-alternate table-condensed`` class
    alternate: even positions hold station readings, odd positions hold
    the ``AQI at <hour> hrs`` line for the preceding one. The last
    station table is ignored. Each reading row's third cell is an AQI
    number, converted with ``Aqi().aqi_to_val`` for the pollutants listed
    below and then offered to ``Kind`` under the original pollutant name.
    """
    # Pollutant label on the page -> name expected by Aqi().aqi_to_val.
    val_names = {
        u"SO2": u"so2",
        u"NO2": u"no2",
        u"O3": u"o3",
        u"PM10": u"pm10",
        u"PM2.5": u"pm25",
        u"CO": u"co",
    }

    date = resp.xpath(
        u'//*[@id="MainContent"]/div[2]/div[2]/ul/li/span/a/text()'
    ).extract_first()
    date = date.replace(u"\t", u"").replace(u"\nAir Quality Index ", u"")

    all_tables = resp.xpath(
        u'//*[@class="table table-alternate table-condensed"]')
    tables = [x for i, x in enumerate(all_tables) if i % 2 == 0]
    tables_date = [x for i, x in enumerate(all_tables) if i % 2 != 0]
    tables = tables[:-1]

    for index, table in enumerate(tables):
        # Backslash doubled to avoid an invalid escape sequence.
        hour = tables_date[index].xpath(u"tbody/tr/td/text()").re_first(
            u"AQI at (\\d+) hrs")
        data_time = parse(date + u" " + hour).replace(tzinfo=timezone(self.tz))

        station_id = table.xpath(u"thead/tr/th[1]/text()").re_first(
            u"(.+) A.Q.M.S.")
        for suffix in (u" Particles", u" Mobile"):
            if suffix in station_id:
                station_id = station_id.replace(suffix, u"")

        station_data = dict()
        for row in table.xpath(u"tbody/tr"):
            name = row.xpath(u"td[1]/text()").extract_first()
            name = name.replace(u" ", u"")
            aqi = float(row.xpath(u"td[3]/text()").extract_first())
            val_name = val_names.get(name)
            if val_name:
                val = Aqi().aqi_to_val(aqi, val_name)
                _tmp_dict = Kind(self.name).get_dict(r_key=name, r_val=val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_station_data(self, resp):
    """Return an ``AppItem`` with the newest readings from ``record`` nodes.

    Every ``record`` has ``date`` and ``hour`` attributes, and its child
    nodes are readings (tag name, text value, ``unit`` attribute).
    Readings with any missing part are dropped, and only those stamped
    with the latest date/hour are reported. The station id is
    ``resp.meta["code"]``.

    Returns:
        The populated ``AppItem``, or ``None`` when no reading was
        recognised. Unlike its siblings, this callback returns rather
        than yields.
    """
    from datetime import timedelta

    table = list()
    for rec in resp.xpath(u"//record"):
        date = rec.xpath(u"./@date").extract_first()
        hour = rec.xpath(u"./@hour").extract_first()
        # Hour "2400" is taken to mean midnight at the end of `date`, so
        # it is parsed as 00:00 and moved to the next day. Without the
        # shift the record lands 24 hours early and can lose the max()
        # selection below to the 23:00 record.
        ends_at_midnight = hour == u"2400"
        if ends_at_midnight:
            hour = u"0000"
        data_date = parser.parse(u" ".join((date, hour)))
        if ends_at_midnight:
            data_date += timedelta(days=1)

        for val in rec.xpath(u"child::node()"):
            table.append({
                u"date": data_date,
                u"pollutant_name": val.xpath(u"name(.)").extract_first(),
                u"pollutant_value": val.xpath(u"./text()").extract_first(),
                u"pollutant_unit": val.xpath(u"./@unit").extract_first(),
            })

    data = pd.DataFrame(table).dropna(axis=0)
    current_data_time = data[u"date"].max()
    curr_data = data[data[u"date"] == current_data_time]

    station_data = dict()
    columns = [u"pollutant_name", u"pollutant_value", u"pollutant_unit"]
    for raw_name, raw_value, raw_units in curr_data[columns].itertuples(index=False):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        return items
def get_additional_data(self, resp):
    """Yield one ``AppItem`` per data line of a comma-separated text body.

    The body is split on ``\\r\\n``. Line 1 is handed to ``self.get_date``
    for the timestamp, line 8 (minus a leading ``#``) lists the column
    names, and lines 9 up to but excluding the last are data rows whose
    first field is the station id. Each station's readings are then
    merged with the entry for that station in ``resp.meta["data"]``.

    Raises:
        KeyError: if a station id is absent from ``resp.meta["data"]``.
    """
    weather_data = resp.meta[u"data"]
    lines = resp.body.split(u"\r\n")
    col_names = lines[8].lstrip(u"#").split(u", ")[1:]
    data_time = self.get_date(lines[1])
    # Field values that are replaced by None.
    missing_markers = (u"-99", u"-999", u"-9999")

    for row in lines[9:-1]:
        col_values = list()
        for field in row.split(u","):
            field = field.replace(u" ", u"")
            if u"/" in field or field in missing_markers:
                field = None
            col_values.append(field)

        station_id = col_values[0]
        station_data = dict()
        for _name, _val in zip(col_names, col_values[1:]):
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        # Append the data collected from the first response.
        station_data.update(weather_data[station_id])

        if not station_data:
            continue

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from the last row of the ``C1WebGrid1`` table.

    Row 1 holds the raw pollutant names, row 2 the raw units and the last
    row the values. The first cell of each row is skipped, except that
    the last row's first cell supplies the timestamp. Values containing
    a non-breaking space are treated as missing. The station id is
    ``resp.meta["code"]``.
    """
    from datetime import timedelta

    def _cell_label(html):
        # Strip the markup, then take the text between "\r\n\t" and "\r\n".
        return re.findall(u"\r\n\t(.+)\r\n", re.sub(u"<.+?>", u"", html))[0]

    raw_poll_name = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:].extract()
    poll_name = [_cell_label(el) for el in raw_poll_name]
    raw_poll_unit = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:].extract()
    poll_unit = [_cell_label(el) for el in raw_poll_unit]

    raw_data = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')
    raw_time = raw_data[0].xpath(u'.//div[1]/text()').extract_first()
    # "24:00" is taken to mean midnight at the end of the stated day, so
    # it is parsed as 00:00 and moved to the next day. Rewriting it to
    # "00:00" alone would put the reading 24 hours early.
    ends_at_midnight = u"24:00" in raw_time
    data_time = parser.parse(raw_time.replace(u"24:00", u"00:00"))
    if ends_at_midnight:
        data_time += timedelta(days=1)
    data_time = data_time.replace(tzinfo=timezone(self.tz))

    pollutant_value = list()
    for cell in raw_data[1:]:
        value = clean(cell.xpath(u'.//div/text()').extract_first())
        if u"\xa0" in value:
            value = None
        pollutant_value.append(value)

    station_data = dict()
    for raw_name, raw_value, raw_units in zip(poll_name, pollutant_value, poll_unit):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Yield one ``AppItem`` per station code found in a PDF response.

    The PDF body is converted to text and split into lines. Lines 17-24
    are paired with the fixed labels below: the first pair supplies the
    date (via ``self.get_date``) and each remaining pair is a station
    code with its ``PM10_24HR`` reading. Asterisks are removed from the
    readings.
    """
    row_names = ('Date', 'HOP', 'HGC', 'BSY', 'MEX', 'MTC', 'HEW', 'CLY')

    # Convert the PDF document to plain text.
    stream = StringIO.StringIO(resp.body)
    rsrcmgr = PDFResourceManager()
    retstr = StringIO.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    process_pdf(rsrcmgr, device, stream)
    device.close()
    doc_str = retstr.getvalue()
    retstr.close()

    # Split the text line by line and pair the rows of interest with
    # their labels.
    lines = doc_str.split('\n')
    pairs = list(zip(row_names, lines[17:25]))
    data_time = self.get_date(pairs[0][1])

    for station_id, raw_value in pairs[1:]:
        _tmp_dict = Kind(self.name).get_dict(
            r_key='PM10_24HR', r_val=raw_value.replace('*', ''))
        station_data = dict()
        if _tmp_dict:
            station_data[_tmp_dict['key']] = _tmp_dict['val']

        if not station_data:
            continue

        items = AppItem()
        items['scrap_time'] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items['data_time'] = data_time
        items['data_value'] = station_data
        items['source'] = 'http://superpit.com.au'
        items['source_id'] = station_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from the last row of the ``C1WebGrid1`` table.

    Row 1 holds the raw pollutant names, row 2 the raw units and the last
    row the values. The first cell of each row is skipped, except that
    the last row's first cell supplies the timestamp. The station id is
    ``resp.meta["code"]``.

    Raises:
        IndexError: if any inspected cell lacks text of the expected
            ``\\r\\n\\t...\\r\\n`` form.
    """
    def cell_text(cell):
        # Text of the cell's first <div>, between "\r\n\t" and "\r\n".
        return cell.xpath(u".//div[1]/text()").re(u"\r\n\t(.+)\r\n")[0]

    name_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:]
    poll_name = [cell_text(cell) for cell in name_cells]
    unit_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:]
    poll_units = [cell_text(cell) for cell in unit_cells]

    last_row = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')
    poll_value = [cell_text(cell) for cell in last_row[1:]]
    data_time = parser.parse(cell_text(last_row[0])).replace(
        tzinfo=timezone(self.tz))

    station_data = dict()
    for raw_name, raw_value, raw_units in zip(poll_name, poll_value, poll_units):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if not station_data:
        return

    items = AppItem()
    items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
    items[u"data_time"] = data_time
    items[u"data_value"] = station_data
    items[u"source"] = self.source
    items[u"source_id"] = resp.meta[u"code"]
    yield items
def get_st_data(self, resp):
    """Yield an ``AppItem`` from the first two rows of the ``apTable`` table.

    Header spans supply the raw names and the second row's cells the raw
    values. The first name/value pair is dropped before mapping. The
    timestamp comes from ``self.check_date(resp)`` and the station id is
    whatever follows ``AP=`` in ``resp.url``.

    Raises:
        IndexError: if the URL carries no ``AP=`` part.
    """
    st_id = str(findall(u"AP=(.+)", resp.url)[0])
    row_names = resp.xpath(u'//*[@id="apTable"]/table/tr[1]/th/span')
    row_data = resp.xpath(u'//*[@id="apTable"]/table/tr[2]/td')
    new_dt = self.check_date(resp)

    names = list()
    for cell in row_names:
        texts = cell.xpath(u"text()").extract()
        names.append(texts[0] if texts else None)

    vals = list()
    for cell in row_data:
        texts = cell.xpath(u"text()").extract()
        vals.append(texts[0] if texts else None)

    # Materialise before slicing: zip() is an iterator on Python 3 and has
    # no pop(). Slicing also tolerates an empty table.
    data = list(zip(names, vals))[1:]

    st_data = dict()
    for _name, _val in data:
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        yield items
def get_station_data(self, resp):
    """Yield one ``AppItem`` per station for the newest date in the data.

    ``self.get_clean_data(resp)`` is expected to return a DataFrame with
    ``date``, ``station_name``, ``pollutant_name``, ``pollutant_value``
    and ``unit`` columns, since those are the columns read here. Rows
    stamped with the latest ``date`` are grouped by station, and each
    group becomes one item.
    """
    all_data = self.get_clean_data(resp)
    current_data_time = all_data[u"date"].max()
    latest = all_data[all_data[u"date"] == current_data_time]
    latest = latest[[
        u"station_name", u"pollutant_name", u"pollutant_value", u"unit"
    ]]

    for _, group in latest.groupby(by=u"station_name"):
        station_data = dict()
        station_id = None
        for record in group.itertuples(index=False):
            # The station id is taken from the first record of the group.
            if station_id is None:
                station_id = record[0]
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(record[1])
            pollutant.set_raw_value(record[2])
            pollutant.set_raw_units(record[3])
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            continue

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_id
        yield items
def get_station_data(self, resp):
    """Yield an ``AppItem`` from rows 1 and 2 of ``EnvitechGrid1_GridTable``.

    Each header cell's span ``title`` holds the raw name and units
    separated by a blank line, and whitespace runs in the name are
    collapsed to single spaces. Row 2 holds the values, with its first
    cell supplying the timestamp. The station id is
    ``resp.meta["code"]``.

    Raises:
        IndexError: if a header title has no units part.
    """
    from datetime import timedelta

    header_cells = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
    title_parts = [
        cell.xpath(u".//span[1]/@title").extract_first().split(u"\n\n")
        for cell in header_cells
    ]
    pollutant_name = [u" ".join(parts[0].split()) for parts in title_parts]
    pollutant_units = [parts[1] for parts in title_parts]

    raw_data = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')
    raw_time = raw_data[0].xpath(u".//span[1]/text()").extract_first()
    # "24:00" is taken to mean midnight at the end of the stated day, so
    # it is parsed as 00:00 and moved to the next day. Rewriting it to
    # "00:00" alone would put the reading 24 hours early.
    ends_at_midnight = u"24:00" in raw_time
    data_time = parser.parse(raw_time.replace(u"24:00", u"00:00"))
    if ends_at_midnight:
        data_time += timedelta(days=1)
    data_time = data_time.replace(tzinfo=timezone(self.tz))

    pollutant_value = [
        cell.xpath(u'.//span[1]/text()').extract_first()
        for cell in raw_data[1:]
    ]

    station_data = dict()
    for raw_name, raw_value, raw_units in zip(
            pollutant_name, pollutant_value, pollutant_units):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_st_data(self, resp):
    """Return an ``AppItem`` built from row 5 of the ``C1WebGrid1`` table.

    Row 1 supplies the column names and row 5 the values. The first value
    is parsed as the timestamp, and the first name and first value are
    then dropped. The station id is whatever follows ``ST_ID=`` in
    ``resp.url``.

    Returns:
        The populated ``AppItem``, or ``None`` when no reading was
        recognised. This callback returns rather than yields.

    Raises:
        IndexError: if the URL carries no ``ST_ID=`` part or row 5 has no
            cells.
    """
    station_id = str(findall(u".*ST_ID=(.+)", resp.url)[0])

    # Build the pollutant names from the table header.
    row_col_names = resp.xpath(
        u'//*[@id="C1WebGrid1"]/tbody/tr[1]/td/div/text()').extract()
    col_names = [col_name.strip(u'\n\t') for col_name in row_col_names][1:]

    # Extract the pollutant values.
    data_values = list()
    for cell in resp.xpath(u'//*[@id="C1WebGrid1"]/tbody/tr[5]/td'):
        # Backslashes doubled to avoid invalid escape sequences. The
        # pattern trims surrounding whitespace.
        found = cell.xpath(u"div/text()").re(u"([\\S].+[\\S])")
        data_values.append(found[0] if found else None)

    # The first value is the date of the data.
    data_date = parse(data_values[0]).replace(tzinfo=timezone(self.tz))

    station_data = dict()
    for _name, _val in zip(col_names, data_values[1:]):
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_date
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_id
        return items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per row of the ``right_column`` table.

    Column names come from the header cells' ``abbr`` attributes. The
    date is parsed from the page heading after removing the
    ``Pollutant Concentrations for `` prefix. Each row's station id is
    the ``stationid`` value in the link of its first cell, and cells
    without text become ``None``.

    Raises:
        IndexError: if the heading is missing, or a row has no link
            carrying ``stationid=``.
    """
    prefix = u"Pollutant Concentrations for "

    table = resp.xpath(u'//*[@id="right_column"]/div/table/tbody/tr')
    col_names = resp.xpath(
        u'//*[@id="right_column"]/div/table/thead//th/@abbr').extract()

    heading = str(
        resp.xpath(u'//*[@id="right_column"]/div/h1/text()').extract()[0]
    ).strip()
    # Remove the prefix as a real prefix. ``lstrip(prefix)`` treats its
    # argument as a set of characters and can eat the start of the date.
    if heading.startswith(prefix):
        heading = heading[len(prefix):]
    new_dt = parse(heading).replace(tzinfo=timezone(self.tz))

    for row in table:
        cols = row.xpath(u"td")
        # The station id is taken from the link in the first cell.
        url = str(cols[0].xpath(u"div/a/@href").extract()[0])
        st_id = str(re.findall(u"stationid=(.+)", url)[0])

        row_data = []
        for col in cols:
            text = col.xpath(u"div[1]/text()").extract()
            row_data.append(text[0] if text else None)

        st_data = dict()
        for _key, _val in zip(col_names, row_data):
            _tmp_dict = Kind(self.name).get_dict(r_key=_key, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items
def get_st_data(self, resp):
    """Yield one ``AppItem`` per station returned by ``self.read_st_data``.

    ``read_st_data(resp)`` is indexed as ``(readings_by_station,
    timestamp)``. Each key of the first element is used as both
    ``source_id`` and ``st_name``.
    """
    data = self.read_st_data(resp)
    st_data = data[0]
    new_dt = data[1]

    for st_id in st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data[st_id]
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        items[u"st_name"] = st_id
        yield items
def push_data(self, resp):
    """Yield one ``AppItem`` per station from records in ``resp.meta["data"]``.

    The records are loaded into a DataFrame whose ``value`` column is
    converted to numeric, and rows with a null value are dropped. Only
    rows whose ``time`` equals ``self.get_max_valid_date(df)`` are
    reported, grouped by ``station_id``.
    """
    df = pd.DataFrame(resp.meta["data"])
    df["value"] = pd.to_numeric(df["value"])
    df = df[pd.notnull(df["value"])]

    current_time = self.get_max_valid_date(df)
    current_data = df[df["time"] == current_time]

    for station_id, gr in current_data.groupby(by="station_id"):
        station_data = dict()
        for raw_name, raw_value, raw_units in gr[["name", "value", "unit"]].itertuples(index=False):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(current_time).replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_st_data(self, resp):
    """Yield an ``AppItem`` for every page entry that carries values.

    The observation time is read once with ``self.get_date``; each entry
    produced by ``self.get_page`` contributes its ``u"data"`` payload,
    from which the station id (``get_st_name``) and the values
    (``get_value``) are derived.

    :param resp: response object forwarded to the helper methods.
    :return: generator of ``AppItem`` objects; entries whose values are
        falsy are skipped.
    """
    observed_at = self.get_date(resp)

    for entry in self.get_page(resp):
        payload = entry[u"data"]
        station_key = self.get_st_name(payload)
        readings = self.get_value(payload)
        if not readings:
            continue

        record = AppItem()
        record[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        record[u"data_time"] = observed_at
        record[u"data_value"] = readings
        record[u"source"] = self.source
        record[u"source_id"] = station_key
        yield record
def get_st_data(self, resp):
    """Build a single ``AppItem`` from a station's current-data page.

    The station id is assembled from the ``StationName``, ``StateId`` and
    ``CityId`` parameters of ``resp.url`` (with ``%20`` in the station
    name turned into spaces). Pollutant rows come from the
    ``lblReportCurrentData`` table and the timestamp from
    ``lblCurrentDateTime``.

    :param resp: response exposing ``url`` and ``xpath``.
    :return: the populated ``AppItem``, or ``None`` when no table row
        produced a recognised value.
    :raises IndexError: if any of the three URL parameters is missing.
    """
    # The local id is derived from the query string.
    page_url = resp.url
    state_matches = re.findall(u"StateId=(.+?)&", page_url)
    city_matches = re.findall(u"CityId=(.+)", page_url)
    station_matches = re.findall(u"StationName=(.+?)&", page_url)
    station_label = station_matches[0].replace(u"%20", u" ")
    st_id = u"".join((station_label, state_matches[0], city_matches[0]))

    rows = resp.xpath(u'//*[@id="lblReportCurrentData"]/table/child::*')
    raw_date = resp.xpath(
        u'//*[@id="lblCurrentDateTime"]/text()').extract_first()

    readings = {}
    for row in rows:
        cells = row.xpath(u"child::td")
        try:
            # First cell holds the pollutant name, fourth the value.
            label = cells[0].xpath(u"text()").extract()[0]
            amount = cells[3].xpath(u"span/text()").extract_first()
            mapped = Kind(self.name).get_dict(r_key=label, r_val=amount)
            if mapped:
                readings[mapped[u"key"]] = mapped[u"val"]
        except IndexError:
            # Rows lacking the expected cells are ignored.
            pass

    observed_at = None
    if raw_date:
        stamp = parse(raw_date.replace(u"Date Time : ", u""))
        observed_at = stamp.replace(tzinfo=timezone(self.tz))

    if not readings:
        return None

    record = AppItem()
    record[u"scrap_time"] = datetime.datetime.now(
        tz=timezone(SCRAPER_TIMEZONE))
    record[u"data_time"] = observed_at
    record[u"data_value"] = readings
    record[u"source"] = self.source
    record[u"source_id"] = st_id
    return record
def validate_station_data(self, meta):
    """Merge the three result sets in *meta* and yield one item per date.

    ``meta[u"results"]`` must hold ``res1``, ``res2`` and ``res3``, each
    convertible with ``DataFrame.from_dict`` and containing a ``date``
    column. The frames are sorted by date and merged on ``date``; every
    merged row is expected to expose ``co``, ``pm10`` and ``no2``.

    :param meta: mapping taken from the response META attribute; also
        provides ``station_code``, used as the item's ``source_id``.
    :return: generator of ``AppItem`` objects; rows for which ``Kind``
        accepts none of the three values are skipped.
    """
    results = meta.get(u"results")
    res1 = results.get(u"res1")
    res2 = results.get(u"res2")
    res3 = results.get(u"res3")

    df1 = DataFrame.from_dict(res1).sort_values(by=u"date")
    df2 = DataFrame.from_dict(res2).sort_values(by=u"date")
    df3 = DataFrame.from_dict(res3).sort_values(by=u"date")
    df = merge(df1, df2, on=u"date")
    df = merge(df, df3, on=u"date")

    source_code = meta.get(u"station_code")
    for obs in df.itertuples():
        st_data = dict()
        co = Kind(self.name).get_dict(r_key=u"co", r_val=obs.co)
        pm10 = Kind(self.name).get_dict(r_key=u"pm10", r_val=obs.pm10)
        no2 = Kind(self.name).get_dict(r_key=u"no2", r_val=obs.no2)
        if co:
            st_data[co[u"key"]] = co[u"val"]
        if pm10:
            st_data[pm10[u"key"]] = pm10[u"val"]
        if no2:
            st_data[no2[u"key"]] = no2[u"val"]

        # ``Timestamp.to_datetime()`` was deprecated and later removed
        # from pandas; ``to_pydatetime()`` is the equivalent conversion.
        data_time = obs.date.to_pydatetime()
        data_time = data_time.replace(tzinfo=timezone(self.tz))

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = source_code
            yield items