예제 #1
0
    def get_station_data(self, resp):
        """Parse the XML measurement feed and yield one AppItem per station.

        All measurements share the timestamp of the first Measurement node.
        """
        root = etree.fromstring(resp.body)
        measurements = root.xpath(u"//Measurement")

        # Timestamp of the first measurement is applied to the whole batch.
        raw_date = root.xpath(u"//Measurement[1]/Data/E/T/text()")
        data_time = parse(raw_date[0]).replace(tzinfo=timezone(self.tz))

        stations = dict()
        for measurement in measurements:
            site = measurement.xpath(u"@SiteName")
            src_name = measurement.xpath(u"DataSource/@Name")
            src_val = measurement.xpath(u"Data/E/I1/text()")
            src_time = measurement.xpath(u"Data/E/T/text()")
            print(site, src_name, src_val, src_time)

            stations.setdefault(site[0], dict())

            mapped = Kind(self.name).get_dict(r_key=src_name[0],
                                              r_val=src_val[0])
            if mapped:
                stations[site[0]][mapped[u"key"]] = mapped[u"val"]

        for site_name in stations:
            if not site_name:
                continue
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = stations[site_name]
            items[u"source"] = self.source
            items[u"source_id"] = site_name

            yield items
예제 #2
0
    def get_st_data(self, resp):
        """Extract the station id from the URL and yield its measurements."""
        station_id = str(findall(u"Data/(.+)_Line\.xml", resp.url)[0])

        nodes = resp.xpath(u"tname[1]/child::*")
        raw_dt = resp.xpath(u"tname[1]/DATE_TIME/text()").extract_first()
        data_time = self.check_date(raw_dt)

        station_data = dict()
        for node in nodes:
            key = node.xpath(u"name()").extract_first()
            value = node.xpath(u"text()").extract_first()

            mapped = Kind(self.name).get_dict(r_key=key, r_val=value)
            if mapped:
                station_data[mapped[u"key"]] = mapped[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
예제 #3
0
    def get_station_data(self, resp):
        """Scrape the single-station AQI/PM2.5 table row and yield an item."""
        row = resp.xpath(
            u"/html/body/div[7]/div/div[1]/div[2]/div/div[3]/table/tbody/tr[2]"
        )
        print(row)
        station_name = row.xpath(u"td[1]/b/text()").extract_first()
        aqi = row.xpath(u"td[2]/p/text()").extract_first()
        pm25 = row.xpath(u"td[3]/p/text()").extract_first()
        raw_time = row.xpath(u"td[4]/span/text()").extract_first()
        print(raw_time)
        data_time = self.parse_date(raw_time)

        station_data = dict()
        # Both readings go through the same Kind mapping.
        for key, raw_val in ((u"pm25", pm25), (u"aqi", aqi)):
            mapped = Kind(self.name).get_dict(r_key=key, r_val=raw_val)
            if mapped:
                station_data[mapped[u"key"]] = mapped[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source_name
            items[u"source_id"] = str(station_name)

            yield items
예제 #4
0
    def get_st_data(self, resp):
        """Parse the recent-results table and yield the station's item."""
        station_id = str(findall(u"Details/(.+)\?type=1", resp.url)[0])

        table_rows = resp.xpath(u'//*[@id="recentResults"]/tbody/tr')
        hour = resp.xpath(
            u'//*[@id="recentResults"]/thead/tr[2]/th[1]/text()').re_first(
                u"Current \((.+)\)")
        data_time = self.check_date(hour)

        station_data = dict()
        for table_row in table_rows:
            key = table_row.xpath(u"td[1]/text()").re_first(u"(.+[\S])")
            value = table_row.xpath(u"td[2]/text()").re_first(u"(.+[\S])")

            mapped = Kind(self.name).get_dict(r_key=key, r_val=value)
            if mapped:
                station_data[mapped[u"key"]] = mapped[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
예제 #5
0
    def get_station_data(self, resp):
        """Parse the station measurement table and yield an AppItem.

        The station id is taken from the ``site_id`` query parameter of the
        response URL. Each table row is expected to carry the pollutant
        name in its first cell and the value in its second.
        """
        _station_id = Selector(text=resp.url).re(u"site_id=(\d+)")
        station_id = _station_id[0]

        data_time = self.get_date(resp)
        table = resp.xpath(u'//*[@id="tab1"]/table/tr')

        station_data = dict()
        for row in table:
            col = row.xpath(u"td/text()").extract()
            col = col[:2]
            # Fix: header/separator rows can yield fewer than two cells;
            # skip them instead of raising IndexError (sibling parsers
            # guard against this the same way).
            if len(col) < 2:
                continue

            _name = col[0]
            _val = col[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
예제 #6
0
    def get_station_data(self, resp):
        """Read the Envitech grid (header row + first data row) and yield an item."""
        raw_time = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td[1]/span/text()'
        ).extract_first()
        data_time = parser.parse(raw_time)

        # Skip the leading timestamp column in both rows.
        header_cells = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
        value_cells = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')[1:]

        names = [c.xpath(u"span/text()").extract_first() for c in header_cells]
        values = [c.xpath(u"span/text()").extract_first() for c in value_cells]

        station_data = dict()
        for key, value in zip(names, values):
            mapped = Kind(self.name).get_dict(r_key=key, r_val=value)
            if mapped:
                station_data[mapped[u"key"]] = mapped[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time.replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #7
0
    def get_st_data(self, resp):
        """Parse per-station HTML tables embedded in a JSON feed.

        Each JSON object carries an HTML snippet in ``description`` with
        the station id in table row 1, the timestamp in row 2 and the
        pollutant rows in a nested table inside row 3.
        """
        json = self.get_page(resp)

        for obj in json:
            body = Selector(text=obj[u"description"])
            rows = body.xpath(u"//html/body/table/tr[3]/td/table/tr")
            st_id = str(body.xpath(u"//html/body/table/tr[1]/td/text()").extract_first())

            data_date = body.xpath(u"//html/body/table/tr[2]/td/text()").extract_first()
            # Fix: rstrip(u" (IST)") strips any trailing characters from the
            # SET { ,(,I,S,T,)} rather than the literal suffix, so it can eat
            # legitimate trailing characters of the date string. Remove the
            # exact suffix instead.
            data_date = str(data_date.replace(u" (IST)", u""))
            data_date = parse(data_date)
            new_dt = data_date.replace(tzinfo=pytz.timezone(self.tz))

            st_data = dict()
            for row in rows:
                col = row.xpath(u"td/text()").extract()
                try:
                    _name = col[0]
                    _val = col[1]
                    _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                    if _tmp_dict:
                        st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
                except IndexError:
                    # Rows with fewer than two cells carry no reading.
                    pass

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.datetime.now(tz=pytz.timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id
                yield items
예제 #8
0
    def get_st_data(self, resp):
        """Decode the stations JSON payload and yield an item per station."""
        raw = resp.body.decode(u"utf-8").replace(u"\ufeff", u"")
        payload = js_loads(raw)

        data_time = self.get_date(payload[u"DateTime"])

        for st in payload[u"Stations"]:
            station_name = st[u"Station"]

            station_data = dict()
            for param in st[u"ParameterValueList"]:
                mapped = Kind(self.name).get_dict(r_key=param[u"Id"],
                                                  r_val=param[u"Value"])
                if mapped:
                    station_data[mapped[u"key"]] = mapped[u"val"]

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_name

                yield items
예제 #9
0
    def get_station_data(self, resp):
        """Parse the station data tab and yield one AppItem.

        The pollutant key fed to ``Kind`` is assembled from the name cell,
        an optional subscript (the "2" of e.g. NO2) and the units cell.
        """
        data_time = resp.xpath(u'//*[@id="main"]/div[1]/div[2]/p[3]/text()').re(u"(\d\d\/\d\d\/\d\d\d\d\s\d\d:\d\d)")
        data_time = parser.parse(data_time[0]).replace(tzinfo=timezone(self.tz)) if data_time else None

        table = resp.xpath(u'//*[@id="tabs-content-data"]/table/tbody/tr')

        station_data = dict()
        for row in table:
            # Fix: the original evaluated each xpath twice per row and
            # compared with "!= None"; hoist the call and use "is not None".
            raw_index = row.xpath(u"td[1]/sub/text()").extract_first()
            pollutant_index = raw_index if raw_index is not None else u""
            pollutant_name = u" ".join((
                row.xpath(u"td[1]/text()").extract_first().split(u" (")[0],
                pollutant_index,
                row.xpath(u"td[4]/text()").extract_first()
            )).replace(u"  ", u" ")

            raw_value = row.xpath(u"td[3]/text()").extract_first().split(u" ")[0]
            # "No" marks a missing reading (e.g. "No data").
            pollutant_value = raw_value if raw_value != u"No" else None

            pollutant = Kind(self.name).get_dict(r_key=pollutant_name, r_val=pollutant_value)
            if pollutant:
                station_data[pollutant[u"key"]] = pollutant[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #10
0
    def get_st_data(self, resp):
        """Parse the Envitech grid for one station and yield its item.

        The station id comes from the ``ST_ID`` URL parameter. Pollutant
        names are read from the ``title`` attribute of the header-row
        spans, values from the ``title`` attribute of the data-row spans.
        """
        regex = u".*ST_ID=(.+)"
        st_id = findall(regex, resp.url)
        st_id = str(st_id[0])

        table_name = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td/span')
        table_val = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td/span')
        names = list()
        for row in table_name:
            row_name = row.xpath(u"@title").extract_first()
            regex_name = u"(.+)"
            _name = findall(regex_name, row_name)
            try:
                names.append(_name[0])
            except IndexError:
                names.append(None)

        values = list()
        for row in table_val:
            row_val = row.xpath(u"@title").extract_first()
            # The title looks like "(...) <value>"; keep only the value part.
            regex_val = u"\) (.+)"
            _val = findall(regex_val, row_val)
            try:
                values.append(_val[0])
            except IndexError:
                values.append(None)

        # The first value cell carries the timestamp.
        try:
            new_dt = self.check_date(values[0])
        except IndexError:
            new_dt = None

        # Fix: zip() returns an iterator on Python 3, which has no pop();
        # materialise it before dropping the leading timestamp column.
        data = list(zip(names, values))
        data.pop(0)

        st_data = dict()
        for val in data:
            _name = val[0]
            _val = val[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
예제 #11
0
    def get_st_data(self, resp):
        """Parse the stations JSON body and yield one item per station."""
        payload = js_parse(resp.body.replace(u"\n", u""))
        # Keys that describe the station itself rather than a pollutant.
        non_pollutant_keys = (u"name", u"gps", u"date", u"datetime", u"time")

        for station in payload:
            station_name = station[u"name"]
            data_time = self.get_date(station[u"datetime"])

            station_data = dict()
            for key in station:
                if key in non_pollutant_keys:
                    continue
                mapped = Kind(self.name).get_dict(r_key=key,
                                                  r_val=station[key])
                if mapped:
                    station_data[mapped[u"key"]] = mapped[u"val"]

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_name

                yield items
예제 #12
0
    def get_station_data(self, resp):
        """Parse pollutant time series embedded in the page's last <script>.

        Each ``var data = ...`` assignment holds a JS array of
        ``[timestamp, value]`` pairs. The last pair of every series is
        taken, and only series whose timestamp equals the newest timestamp
        across all series contribute to the yielded item.
        """
        raw_text = resp.xpath(u'/html/head/script[last()]/text()')
        raw_pollutant_data = raw_text.re(u"var data = (.+);")
        # Turn the JS literal into valid JSON: double quotes, no trailing comma.
        raw_pollutant_data = [
            el.replace(u"'", u'"').replace(u"],]", u"]]")
            for el in raw_pollutant_data
        ]
        pollutant_data = [ujson.loads(el) for el in raw_pollutant_data]
        # Keep only the most recent [date, value] pair of each series.
        pollutant_data = [el[-1] for el in pollutant_data]

        pollutant_value = [el[1] for el in pollutant_data]

        # pollutant_date = [el[0] for el in pollutant_data]
        pollutant_date = [
            parser.parse(el[0]).replace(tzinfo=timezone(self.tz))
            for el in pollutant_data
        ]

        # max value as current date
        current_data_time = max(pollutant_date)
        # print(current_data_time)
        # data_time = parser.parse(raw_data_time).replace(tzinfo=timezone(self.tz))

        # Series labels are captured from the chart config; each captured
        # string is itself HTML of the form "<name>(<units>)".
        raw_pollution_name = raw_text.re(u"series: \[\s+\{\s+name: '(.+)',?")
        pollutant_name = [
            Selector(text=el).xpath(u"/html/body/p/text()").re(u"(.+)\(")[0]
            for el in raw_pollution_name
        ]
        pollutant_units = [
            Selector(text=el).xpath(u"/html/body/p/text()").re(u"\((.+)\)")[0]
            for el in raw_pollution_name
        ]

        data = zip(pollutant_name, pollutant_value, pollutant_units,
                   pollutant_date)

        station_data = dict()
        for record in data:
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            # print("record", record)
            pollutant.set_raw_name(record[0])
            pollutant.set_raw_value(record[1])
            pollutant.set_raw_units(record[2])
            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            # Keep only readings Feature recognises, taken at the newest
            # timestamp (record[3] is the series' own timestamp).
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None and record[3] == current_data_time:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data and current_data_time:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = current_data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #13
0
    def get_station_data(self, resp):
        """Parse the observation HTML table into a DataFrame and yield an item.

        The first table row supplies the column names. The first data row
        becomes a dict; its "Дата і час" column is popped as the data
        timestamp and the remaining columns are mapped through Feature
        with hard-coded units per column name.
        """
        raw_col_names = resp.xpath(u"/html/body/div[1]/div[2]/table/tr[1]/td").extract()
        col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]

        table = resp.xpath(u'/html/body/div[1]/div[2]/table//td')
        table_data = [el.xpath(u".").re(u"<td>(.+)<\/td>")[0] if el.xpath(u".").re(u"<td>(.+)<\/td>") else None for el
                      in table]
        # Fix: use floor division — on Python 3 "/" yields a float and
        # numpy.reshape rejects non-integer dimensions.
        table_data = np.asarray(table_data).reshape(len(table_data) // len(col_names), len(col_names))

        df = pd.DataFrame(table_data[1:, ], columns=col_names)
        raw_data = df.iloc[0].to_dict()
        raw_data_time = raw_data.pop(u"Дата і час", None)

        data_time = parser.parse(raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))

        data = raw_data

        # Units keyed by the raw (Ukrainian) column name.
        units = {
            u"Температура повітря": u"degc",
            u"Опади": u"mm",
            u"Рівень №2": u"NA",
            u"Рівень №1": u"NA",
            u"Рівень": u"NA",
            u"Температура води": u"degc",
        }

        station_data = dict()
        for key, val in data.items():
            poll_name = key
            poll_value = val
            poll_units = units[key]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(poll_name)
            pollutant.set_raw_value(poll_value)
            pollutant.set_raw_units(poll_units)
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #14
0
    def get_station_data(self, resp):
        """Parse a headerless CSV of station readings and yield one item.

        Rows are ``(station_name, str_date, pollutant_name,
        pollutant value)``. Only readings at the most recent timestamp are
        kept, one row per pollutant, with units hard-coded per pollutant
        name.
        """
        # print(resp.text)
        all_data = pd.read_csv(StringIO(resp.text),
                               names=(u"station_name", u"str_date",
                                      u"pollutant_name",
                                      u"pollutant value")).dropna(axis=0)
        all_data[u"date_time"] = [
            parser.parse(x) for x in all_data[u'str_date']
        ]

        current_data_time = all_data[u"date_time"].max()
        # print(current_data_time)
        curr_all_data = all_data[all_data[u"date_time"] == current_data_time]

        # For each pollutant, mark the row(s) at that pollutant's latest
        # timestamp within the already-filtered frame.
        idx = curr_all_data.groupby(
            by=u"pollutant_name")[u"date_time"].transform(
                max) == curr_all_data[u"date_time"]

        # Units keyed by raw pollutant name; names absent here map to None.
        units = {
            u"PM2.5": u"ug/m3",
            u"PM10-NEW": u"ug/m3",
            u"Sulfur Dioxide": u"ppb",
            u"Carbon Monoxide": u"ppb",
            u"Ozone 1 hour": u"ppb",
            u"Nitrogen Dioxide": u"ppb",
        }

        data = curr_all_data[idx].copy()

        station_data = dict()
        for el in data.itertuples():
            # itertuples: el[0] is the index, so columns start at el[1].
            pollutant_name = el[3]
            pollutant_value = el[4]
            pollutant_units = units.get(pollutant_name)

            # print(pollutant_name, pollutant_value, pollutant_units)

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)
            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        # print(station_data)
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(current_data_time).replace(
                tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #15
0
    def get_st_data(self, resp):
        """Parse paired AQI tables (data table + hour table) and yield items."""
        date = resp.xpath(u'//*[@id="MainContent"]/div[2]/div[2]/ul/li/span/a/text()').extract_first()
        date = date.replace(u"\t", u"").replace(u"\nAir Quality Index ", u"")

        all_tables = resp.xpath(u'//*[@class="table table-alternate table-condensed"]')
        # Tables alternate: even indexes hold data, odd indexes hold the hour.
        tables = [x for i, x in enumerate(all_tables) if i % 2 == 0]
        tables_date = [x for i, x in enumerate(all_tables) if i % 2 != 0]
        tables = tables[:-1]

        # Raw pollutant label -> key understood by Aqi/Kind.
        aqi_names = {
            u"SO2": u"so2",
            u"NO2": u"no2",
            u"O3": u"o3",
            u"PM10": u"pm10",
            u"PM2.5": u"pm25",
            u"CO": u"co",
        }

        for index, table in enumerate(tables):
            hour = tables_date[index].xpath(u"tbody/tr/td/text()").re_first(u"AQI at (\d+) hrs")
            data_time = parse(date + u" " + hour).replace(tzinfo=timezone(self.tz))

            station_id = table.xpath(u"thead/tr/th[1]/text()").re_first(u"(.+) A.Q.M.S.")
            if u" Particles" in station_id:
                station_id = station_id.replace(u" Particles", u"")
            if u" Mobile" in station_id:
                station_id = station_id.replace(u" Mobile", u"")

            station_data = dict()
            for row in table.xpath(u"tbody/tr"):
                name = row.xpath(u"td[1]/text()").extract_first().replace(u" ", u"")
                aqi = float(row.xpath(u"td[3]/text()").extract_first())

                val_name = aqi_names.get(name)
                if val_name:
                    # Convert the AQI back to a concentration value.
                    val = Aqi().aqi_to_val(aqi, val_name)

                    mapped = Kind(self.name).get_dict(r_key=name, r_val=val)
                    if mapped:
                        station_data[mapped[u"key"]] = mapped[u"val"]

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items
예제 #16
0
    def get_station_data(self, resp):
        """Collect all <record> readings, keep the newest, return an item.

        NOTE(review): this method uses ``return items`` while the sibling
        callbacks use ``yield`` — confirm callers expect a plain AppItem
        here rather than a generator.
        """
        records = resp.xpath(u"//record")
        table = list()
        for rec in records:
            date = rec.xpath(u"./@date").extract_first()
            hour = rec.xpath(u"./@hour").extract_first()
            # "2400" is normalised to midnight so the parser accepts it.
            if hour == u"2400":
                hour = u"0000"

            raw_data_date = u" ".join((date, hour))
            data_date = parser.parse(raw_data_date)

            # Each child node of a <record> is one pollutant reading.
            values = rec.xpath(u"child::node()")
            for val in values:
                pollutant_name = val.xpath(u"name(.)").extract_first()
                pollutant_value = val.xpath(u"./text()").extract_first()
                pollutant_unit = val.xpath(u"./@unit").extract_first()
                row = {
                    u"date": data_date,
                    u"pollutant_name": pollutant_name,
                    u"pollutant_value": pollutant_value,
                    u"pollutant_unit": pollutant_unit,

                }
                table.append(row)

        data = pd.DataFrame(table)
        data = data.dropna(axis=0)

        # Keep only readings at the most recent timestamp.
        current_data_time = data[u"date"].max()
        curr_data = data[data[u"date"] == current_data_time]

        station_data = dict()
        for el in curr_data[[u"pollutant_name", u"pollutant_value", u"pollutant_unit"]].itertuples(index=False):
            pollutant_name = el[0]
            pollutant_value = el[1]
            pollutant_units = el[2]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)

            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(current_data_time).replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            return items
예제 #17
0
    def get_additional_data(self, resp):
        """Parse a line-oriented text body and merge it with earlier data.

        ``resp.meta["data"]`` carries per-station data collected from a
        previous response; each table row here is matched to it by the
        station id found in the first column.
        """
        weather_data = resp.meta[u"data"]
        body = resp.body
        # NOTE(review): assumes resp.body is text (Python 2 str); on
        # Python 3 it would be bytes and need decoding first — confirm.
        body = body.split(u"\r\n")
        # Line 9 carries the column names, prefixed with "#".
        col_names = body[8].lstrip(u"#")
        col_names = col_names.split(u", ")
        col_names = col_names[1:]
        # print(col_names)

        data_time = self.get_date(body[1])

        # Data rows start at line 10; the final line is dropped.
        table = body[9:len(body) - 1]
        for row in table:
            col = row.split(u",")

            col_values = list()
            for el in col:
                if u" " in el:
                    el = el.replace(u" ", u"")
                # Cells containing "/" and the -99/-999/-9999 sentinels
                # are nulled out.
                if u"/" in el:
                    el = None
                if u"-99" == el:
                    el = None
                if u"-999" == el:
                    el = None
                if u"-9999" == el:
                    el = None
                col_values.append(el)

            # First cell is the station id; the rest are measurements.
            station_id = col_values[0]
            col_values = col_values[1:]
            # print(col_values, station_id)

            data = zip(col_names, col_values)

            station_data = dict()
            for st in data:
                _name = st[0]
                _val = st[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            # Merge in the data from the first response.
            # NOTE(review): raises KeyError when the station id is absent
            # from weather_data — confirm that is intended.
            station_data.update(weather_data[station_id])

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
예제 #18
0
    def get_station_data(self, resp):
        """Parse the C1WebGrid (names, units, last data row) and yield an item."""
        raw_poll_name = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:].extract()
        poll_name = []
        for cell in raw_poll_name:
            stripped = re.sub(u"<.+?>", u"", cell)
            poll_name.append(re.findall(u"\r\n\t(.+)\r\n", stripped)[0])

        raw_poll_unit = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:].extract()
        poll_unit = []
        for cell in raw_poll_unit:
            stripped = re.sub(u"<.+?>", u"", cell)
            poll_unit.append(re.findall(u"\r\n\t(.+)\r\n", stripped)[0])

        raw_data = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')
        data_time = raw_data[0].xpath(u'.//div[1]/text()').extract_first()
        data_time = data_time.replace(u"24:00", u"00:00")
        data_time = parser.parse(data_time).replace(tzinfo=timezone(self.tz))

        pollutant_value = []
        for cell in raw_data[1:]:
            value = clean(cell.xpath(u'.//div/text()').extract_first())
            # A non-breaking space marks an empty cell.
            if u"\xa0" in value:
                value = None
            pollutant_value.append(value)

        station_data = dict()
        for name, value, unit in zip(poll_name, pollutant_value, poll_unit):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(name)
            pollutant.set_raw_value(value)
            pollutant.set_raw_units(unit)
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #19
0
    def get_station_data(self, resp):
        """Extract PM10 24-hour values from the PDF report.

        The PDF body is converted to plain text with pdfminer; lines 17-24
        of the text hold the date followed by one value per station code
        in ``row_names``.
        """
        row_names = ('Date', 'HOP', 'HGC', 'BSY', 'MEX', 'MTC', 'HEW', 'CLY')

        # Convert the PDF body to plain text.
        stream = StringIO.StringIO(resp.body)

        rsrcmgr = PDFResourceManager()
        retstr = StringIO.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)

        process_pdf(rsrcmgr, device, stream)
        device.close()

        doc_str = retstr.getvalue()
        retstr.close()

        # Split the extracted text into lines.
        row_data = doc_str.split('\n')

        # data_time = row_data[len(row_data)-7]
        # print(data_time)

        first_row = row_data[17:25]
        # Fix: zip() is an iterator on Python 3 and cannot be indexed or
        # sliced; materialise it first.
        data = list(zip(row_names, first_row))

        # The ('Date', <value>) pair carries the report timestamp.
        data_time = self.get_date(data[0][1])
        data = data[1:]

        for st in data:
            station_id = st[0]
            _name = 'PM10_24HR'
            _val = st[1]
            # Strip asterisk markers from the raw value.
            if '*' in _val:
                _val = _val.replace('*', '')

            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)

            station_data = dict()
            if _tmp_dict:
                station_data[_tmp_dict['key']] = _tmp_dict['val']

            if station_data:
                items = AppItem()
                items['scrap_time'] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items['data_time'] = data_time
                items['data_value'] = station_data
                items['source'] = 'http://superpit.com.au'
                items['source_id'] = station_id

                yield items
예제 #20
0
    def get_station_data(self, resp):
        """Read names, units and the latest values from the C1WebGrid table."""
        cell_re = u"\r\n\t(.+)\r\n"

        name_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:]
        poll_name = [c.xpath(u".//div[1]/text()").re(cell_re)[0]
                     for c in name_cells]

        unit_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:]
        poll_units = [c.xpath(u".//div[1]/text()").re(cell_re)[0]
                      for c in unit_cells]

        last_row = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')

        # First cell of the last row is the timestamp, the rest are values.
        poll_value = [c.xpath(u".//div[1]/text()").re(cell_re)[0]
                      for c in last_row[1:]]

        raw_data_time = last_row[0].xpath(u".//div[1]/text()").re(cell_re)[0]
        data_time = parser.parse(raw_data_time).replace(
            tzinfo=timezone(self.tz))

        station_data = dict()
        for name, value, unit in zip(poll_name, poll_value, poll_units):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(name)
            pollutant.set_raw_value(value)
            pollutant.set_raw_units(unit)
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #21
0
    def get_st_data(self, resp):
        """Parse the apTable grid for one station and yield an AppItem.

        The station id is taken from the "AP=<id>" query parameter of the
        response URL; pollutant names come from the header row, values from
        the second row, and both lists are zipped column-for-column.
        """
        regex = u"AP=(.+)"
        st_id = str(findall(regex, resp.url)[0])

        row_names = resp.xpath(u'//*[@id="apTable"]/table/tr[1]/th/span')
        row_data = resp.xpath(u'//*[@id="apTable"]/table/tr[2]/td')

        new_dt = self.check_date(resp)

        # extract_first() yields None for empty cells, which keeps the
        # name/value lists aligned (same effect as the old try/except).
        names = [name.xpath(u"text()").extract_first() for name in row_names]
        vals = [val.xpath(u"text()").extract_first() for val in row_data]

        data = zip(names, vals)
        data.pop(0)  # the first column is the timestamp, not a pollutant

        st_data = dict()
        for _name, _val in data:
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
예제 #22
0
    def get_station_data(self, resp):
        """Yield one AppItem per station from the newest observation batch.

        Filters the cleaned data down to the rows carrying the most recent
        timestamp, groups them by station, and resolves every pollutant
        record through Feature.
        """
        all_data = self.get_clean_data(resp)

        # keep only the rows with the most recent timestamp
        current_data_time = all_data[u"date"].max()
        latest = all_data[all_data[u"date"] == current_data_time]

        columns = [
            u"station_name", u"pollutant_name", u"pollutant_value", u"unit"
        ]
        for station_id, group in latest[columns].groupby(by=u"station_name"):
            station_data = dict()

            for record in group.itertuples(index=False):
                # record layout follows `columns`: name, value, units
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(record[1])
                pollutant.set_raw_value(record[2])
                pollutant.set_raw_units(record[3])

                resolved_name = pollutant.get_name()
                resolved_value = pollutant.get_value()
                if resolved_name is not None and resolved_value is not None:
                    station_data[resolved_name] = resolved_value

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = pd.to_datetime(
                    current_data_time).replace(tzinfo=timezone(self.tz))
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
예제 #23
0
    def get_station_data(self, resp):
        """Parse the EnvitechGrid1 table and yield one AppItem.

        Pollutant names and units are packed together in the header spans'
        @title attribute; values and the observation time come from the
        second table row.
        """
        header_cells = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]

        pollutant_name = list()
        pollutant_units = list()
        # each header span's @title looks like "<name>\n\n<units>"
        for cell in header_cells:
            title = cell.xpath(u".//span[1]/@title").extract_first()
            parts = title.split(u"\n\n")
            # collapse internal whitespace runs in the pollutant name
            pollutant_name.append(u" ".join(parts[0].split()))
            pollutant_units.append(parts[1])

        row_cells = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')

        # the site reports midnight as "24:00", which the parser rejects
        raw_time = row_cells[0].xpath(u".//span[1]/text()").extract_first()
        data_time = parser.parse(
            raw_time.replace(u"24:00", u"00:00")).replace(
                tzinfo=timezone(self.tz))

        pollutant_value = [
            cell.xpath(u'.//span[1]/text()').extract_first()
            for cell in row_cells[1:]
        ]

        station_data = dict()
        for raw_name, raw_value, raw_units in zip(pollutant_name,
                                                  pollutant_value,
                                                  pollutant_units):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)
            if pollutant.get_name() is not None \
                    and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
예제 #24
0
    def get_st_data(self, resp):
        """Parse the C1WebGrid1 table for one station and return an AppItem.

        The station id comes from the "ST_ID=<id>" url parameter; column
        names from the first table row, values from the fifth, with the
        first column holding the observation timestamp.
        """
        regex = u".*ST_ID=(.+)"
        station_id = str(findall(regex, resp.url)[0])

        # build the list of pollutant column names from the table header
        raw_names = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tbody/tr[1]/td/div/text()').extract()
        col_names = [name.strip(u'\n\t') for name in raw_names]
        col_names = col_names[1:]  # drop the timestamp column header

        # pull the raw cell values from the fifth row; missing cells
        # become None so columns stay aligned
        data_values = list()
        for cell in resp.xpath(u'//*[@id="C1WebGrid1"]/tbody/tr[5]/td'):
            matched = cell.xpath(u"div/text()").re(u"([\S].+[\S])")
            data_values.append(matched[0] if matched else None)

        # the first cell carries the observation timestamp
        data_date = parse(data_values[0]).replace(tzinfo=timezone(self.tz))

        station_data = dict()
        for _name, _val in zip(col_names, data_values[1:]):
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_date
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            return items
예제 #25
0
    def get_st_data(self, resp):
        """Parse the pollutant table (one row per station), yielding AppItems.

        The observation date comes from the page heading
        "Pollutant Concentrations for <date>"; each table row carries the
        station link (which encodes the station id) plus one value per
        pollutant column.
        """
        table = resp.xpath(u'//*[@id="right_column"]/div/table/tbody/tr')
        col_names = resp.xpath(u'//*[@id="right_column"]/div/table/thead//th/@abbr').extract()

        data_date = resp.xpath(u'//*[@id="right_column"]/div/h1/text()').extract()
        data_date = str(data_date[0])
        # BUG FIX: str.lstrip() strips a *character set*, not a prefix, so
        # the old lstrip(u"Pollutant Concentrations for ") could also eat
        # leading characters of the date itself.  Remove the literal prefix.
        prefix = u"Pollutant Concentrations for "
        if data_date.startswith(prefix):
            data_date = data_date[len(prefix):]
        dt = parse(data_date)
        new_dt = dt.replace(tzinfo=timezone(self.tz))

        #  get correct values, if there is no value add ""
        for row in table:
            cols = row.xpath(u"td")

            #  get id from href of the first cell's station link
            url = str(cols[0].xpath(u"div/a/@href").extract()[0])
            regex_station = u"stationid=(.+)"
            st_id = str(re.findall(regex_station, url)[0])

            # collect cell values; missing cells become None so the zip
            # with col_names stays aligned
            row_data = []
            for col in cols:
                text = col.xpath(u"div[1]/text()").extract()
                row_data.append(text[0] if text else None)

            st_data = dict()
            for _key, _val in zip(col_names, row_data):
                _tmp_dict = Kind(self.name).get_dict(r_key=_key, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id

                yield items
예제 #26
0
    def get_st_data(self, resp):
        """Yield one AppItem per station from the pre-parsed response.

        self.read_st_data() returns a pair: a mapping of station id ->
        pollutant values, and the shared observation datetime.
        """
        data = self.read_st_data(resp)
        st_data = data[0]
        new_dt = data[1]

        for st_id in st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data[st_id]
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            # no separate station names for this source — reuse the id
            items[u"st_name"] = st_id

            yield items
    def push_data(self, resp):
        """Convert accumulated records into AppItems, one per station.

        resp.meta["data"] holds observation records with at least the
        keys: station_id, name, value, unit, time.  Only rows carrying the
        newest valid timestamp are emitted.
        """
        data = resp.meta["data"]

        df = pd.DataFrame(data)

        # keep only rows with a usable numeric value
        df["value"] = pd.to_numeric(df["value"])
        df = df[pd.notnull(df["value"])]

        current_time = self.get_max_valid_date(df)
        current_data = df[df["time"] == current_time]
        grouped = current_data.groupby(by="station_id")

        for station_id, gr in grouped:
            station_data = dict()
            for poll in gr[["name", "value", "unit"]].itertuples(index=False):
                pollutant_name = poll[0]
                pollutant_value = poll[1]
                pollutant_units = poll[2]

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(pollutant_name)
                pollutant.set_raw_value(pollutant_value)
                pollutant.set_raw_units(pollutant_units)

                if pollutant.get_name() is not None and pollutant.get_value() is not None:
                    station_data[pollutant.get_name()] = pollutant.get_value()

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = pd.to_datetime(current_time).replace(tzinfo=timezone(self.tz))
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
예제 #28
0
    def get_st_data(self, resp):
        """Yield an AppItem for every station found on the page.

        The per-page helpers supply the shared observation date, the
        station id, and the pollutant values for each station entry.
        """
        data_date = self.get_date(resp)

        for station in self.get_page(resp):
            st_id = self.get_st_name(station[u"data"])
            st_data = self.get_value(station[u"data"])

            # skip stations with no resolvable pollutant values
            if not st_data:
                continue

            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_date
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items
예제 #29
0
    def get_st_data(self, resp):
        """Parse the current-data report for one station, returning an AppItem.

        The station key is assembled from the StationName, StateId and
        CityId query parameters of the response URL.
        """
        # build the local station id out of the url query parameters
        _state_id = re.findall(u"StateId=(.+?)&", resp.url)
        _city_id = re.findall(u"CityId=(.+)", resp.url)
        _station_id = re.findall(u"StationName=(.+?)&", resp.url)
        st_id = u"".join(
            (_station_id[0].replace(u"%20", u" "), _state_id[0], _city_id[0]))

        table = resp.xpath(u'//*[@id="lblReportCurrentData"]/table/child::*')
        data_date = resp.xpath(
            u'//*[@id="lblCurrentDateTime"]/text()').extract_first()

        st_data = {}
        for el in table:
            col = el.xpath(u"child::td")
            try:
                _name = col[0].xpath(u"text()").extract()[0]
                _val = col[3].xpath(u"span/text()").extract_first()

                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            except IndexError:
                # rows without enough cells (headers, separators) are skipped
                pass

        if data_date:
            stripped = data_date.replace(u"Date Time : ", u"")
            new_dt = parse(stripped).replace(tzinfo=timezone(self.tz))
        else:
            new_dt = None

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            return items
예제 #30
0
    def validate_station_data(self, meta):
        """Validate and merge the result dicts stored in the response META.

        meta[u"results"] holds three partial result dicts (res1/res2/res3),
        each convertible to a DataFrame with a shared u"date" column plus a
        pollutant column (co, pm10, no2 respectively).  Yields one AppItem
        per merged observation row.
        """
        results = meta.get(u"results")

        res1 = results.get(u"res1")
        res2 = results.get(u"res2")
        res3 = results.get(u"res3")

        # sort each frame chronologically before merging
        df1 = DataFrame.from_dict(res1).sort_values(by=u"date")
        df2 = DataFrame.from_dict(res2).sort_values(by=u"date")
        df3 = DataFrame.from_dict(res3).sort_values(by=u"date")

        # merge on the timestamp (pandas default is an inner join, so only
        # dates present in all three result sets survive)
        df = merge(df1, df2, on=u"date")
        df = merge(df, df3, on=u"date")

        # print(df)

        source_code = meta.get(u"station_code")

        for obs in df.itertuples():
            st_data = dict()
            # resolve each pollutant reading through the Kind mapper;
            # unknown/invalid readings come back falsy and are dropped
            co = Kind(self.name).get_dict(r_key=u"co", r_val=obs.co)
            pm10 = Kind(self.name).get_dict(r_key=u"pm10", r_val=obs.pm10)
            no2 = Kind(self.name).get_dict(r_key=u"no2", r_val=obs.no2)

            if co:
                st_data[co[u"key"]] = co[u"val"]
            if pm10:
                st_data[pm10[u"key"]] = pm10[u"val"]
            if no2:
                st_data[no2[u"key"]] = no2[u"val"]

            # NOTE(review): Timestamp.to_datetime() was removed in newer
            # pandas (to_pydatetime() replaces it) — assumes the pinned
            # legacy pandas version; confirm before upgrading.
            data_time = obs.date.to_datetime()
            data_time = data_time.replace(tzinfo=timezone(self.tz))

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = source_code

                yield items