示例#1
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` built from the newest rows of a CSV response.

        ``resp.text`` is read as header-less CSV with four columns (station
        name, date string, pollutant name, pollutant value).  Rows with a
        missing field are dropped and only the rows carrying the latest
        parsed timestamp are kept.  Every reading for which ``Feature``
        reports a non-``None`` name and value goes into the item's
        ``data_value`` mapping.  Nothing is yielded when that mapping is
        empty.
        """
        column_names = (u"station_name", u"str_date", u"pollutant_name",
                        u"pollutant value")
        frame = pd.read_csv(StringIO(resp.text), names=column_names)
        frame = frame.dropna(axis=0)
        frame[u"date_time"] = [
            parser.parse(raw) for raw in frame[u'str_date']
        ]

        current_data_time = frame[u"date_time"].max()
        latest = frame[frame[u"date_time"] == current_data_time]

        # Mask of rows whose timestamp equals the newest one of their
        # pollutant group.
        group_max = latest.groupby(
            by=u"pollutant_name")[u"date_time"].transform(max)
        newest = latest[group_max == latest[u"date_time"]].copy()

        # The CSV carries no unit column, so units come from this table;
        # names missing from it resolve to ``None``.
        units = {
            u"PM2.5": u"ug/m3",
            u"PM10-NEW": u"ug/m3",
            u"Sulfur Dioxide": u"ppb",
            u"Carbon Monoxide": u"ppb",
            u"Ozone 1 hour": u"ppb",
            u"Nitrogen Dioxide": u"ppb",
        }

        station_data = dict()
        for row in newest.itertuples():
            # Position 0 is the frame index; the CSV columns follow it.
            raw_name = row[3]
            raw_value = row[4]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(units.get(raw_name))

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
示例#2
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from the chart data embedded in the last head script.

        Each ``var data = ...;`` assignment in the script is parsed as JSON
        after its quotes and trailing comma are normalised, and only the last
        point of each series is used.  Series names and units are read from
        the ``name: '...'`` label of each series.  Only readings stamped with
        the newest timestamp across all series are kept.
        """
        script_text = resp.xpath(u'/html/head/script[last()]/text()')

        # Single quotes and the trailing "],]" are rewritten before the JSON
        # parse.
        series = [
            ujson.loads(chunk.replace(u"'", u'"').replace(u"],]", u"]]"))
            for chunk in script_text.re(u"var data = (.+);")
        ]
        last_points = [points[-1] for points in series]

        pollutant_value = [point[1] for point in last_points]
        pollutant_date = [
            parser.parse(point[0]).replace(tzinfo=timezone(self.tz))
            for point in last_points
        ]

        # The newest timestamp over all series is treated as "current".
        current_data_time = max(pollutant_date)

        pollutant_name = []
        pollutant_units = []
        for label in script_text.re(u"series: \[\s+\{\s+name: '(.+)',?"):
            # The label is parsed as HTML; its text has the form "name(unit)".
            label_text = Selector(text=label).xpath(u"/html/body/p/text()")
            pollutant_name.append(label_text.re(u"(.+)\(")[0])
            pollutant_units.append(label_text.re(u"\((.+)\)")[0])

        station_data = dict()
        for raw_name, raw_value, raw_units, stamp in zip(
                pollutant_name, pollutant_value, pollutant_units,
                pollutant_date):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            if stamp != current_data_time:
                # Stale series: its last point is older than the newest one.
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not (station_data and current_data_time):
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = current_data_time
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from the first data row of an HTML table.

        The first table row supplies the column names.  All ``<td>`` cells
        are flattened, reshaped into rows of that width, and the first row
        after the header is used.  The ``Дата і час`` column provides the
        timestamp (parsed day-first); every other column is passed to
        ``Feature`` as a reading, with its unit taken from a fixed table.

        Nothing is yielded when the header row is empty or when no reading
        resolves to a non-``None`` name and value.
        """
        raw_col_names = resp.xpath(u"/html/body/div[1]/div[2]/table/tr[1]/td").extract()
        col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]
        if not col_names:
            # Without a header row the row width is unknown (and the division
            # below would fail), so there is nothing to extract.
            return

        table_data = []
        for cell in resp.xpath(u'/html/body/div[1]/div[2]/table//td'):
            # Run the regex once per cell; cells without inner text become None.
            matches = cell.xpath(u".").re(u"<td>(.+)<\/td>")
            table_data.append(matches[0] if matches else None)

        # Floor division keeps the row count an integer: "/" yields a float on
        # Python 3, which ``reshape`` rejects.
        row_count = len(table_data) // len(col_names)
        table_data = np.asarray(table_data).reshape(row_count, len(col_names))

        # Row 0 of the reshaped table is the header itself.
        df = pd.DataFrame(table_data[1:, ], columns=col_names)
        raw_data = df.iloc[0].to_dict()
        raw_data_time = raw_data.pop(u"Дата і час", None)

        # NOTE(review): if ``timezone`` is ``pytz.timezone``, attaching it via
        # ``replace(tzinfo=...)`` may use the zone's LMT offset; ``localize``
        # may be intended -- confirm.
        data_time = parser.parse(raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))

        units = {
            u"Температура повітря": u"degc",
            u"Опади": u"mm",
            u"Рівень №2": u"NA",
            u"Рівень №1": u"NA",
            u"Рівень": u"NA",
            u"Температура води": u"degc",
        }

        station_data = dict()
        for poll_name, poll_value in raw_data.items():
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(poll_name)
            pollutant.set_raw_value(poll_value)
            # ``.get`` so that a column missing from ``units`` does not abort
            # the whole station with a KeyError.
            pollutant.set_raw_units(units.get(poll_name))

            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
示例#4
0
    def get_station_data(self, resp):
        """Return one ``AppItem`` for the newest ``<record>`` readings, or ``None``.

        Every child node of every ``<record>`` element becomes a row of
        (date, node name, text, ``unit`` attribute).  Rows with a missing
        field are dropped and only rows stamped with the latest date are
        handed to ``Feature``.  Unlike a generator-style callback, this
        method returns the item directly.
        """
        table = list()
        for rec in resp.xpath(u"//record"):
            day = rec.xpath(u"./@date").extract_first()
            hour = rec.xpath(u"./@hour").extract_first()
            # "2400" is rewritten to "0000" before parsing; the date part is
            # left as-is.
            if hour == u"2400":
                hour = u"0000"
            data_date = parser.parse(u" ".join((day, hour)))

            for node in rec.xpath(u"child::node()"):
                table.append({
                    u"date": data_date,
                    u"pollutant_name": node.xpath(u"name(.)").extract_first(),
                    u"pollutant_value": node.xpath(u"./text()").extract_first(),
                    u"pollutant_unit": node.xpath(u"./@unit").extract_first(),
                })

        frame = pd.DataFrame(table).dropna(axis=0)

        current_data_time = frame[u"date"].max()
        newest = frame[frame[u"date"] == current_data_time]
        readings = newest[[u"pollutant_name", u"pollutant_value", u"pollutant_unit"]]

        station_data = dict()
        for raw_name, raw_value, raw_units in readings.itertuples(index=False):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return None

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = pd.to_datetime(current_data_time).replace(tzinfo=timezone(self.tz))
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        return item
示例#5
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from the last row of the ``C1WebGrid1`` table.

        Row 1 holds pollutant names, row 2 their units and the last row the
        newest readings; in every row the first cell is skipped (in the last
        row it carries the timestamp).  Values containing a non-breaking
        space are treated as missing.
        """

        def cell_label(cell_html):
            # Strip the tags, then take the first "\r\n\t...\r\n" line.
            return re.findall(u"\r\n\t(.+)\r\n",
                              re.sub(u"<.+?>", u"", cell_html))[0]

        name_cells = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:].extract()
        poll_name = [cell_label(html) for html in name_cells]

        unit_cells = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:].extract()
        poll_unit = [cell_label(html) for html in unit_cells]

        last_row = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')

        raw_time = last_row[0].xpath(u'.//div[1]/text()').extract_first()
        # "24:00" is rewritten to "00:00" before parsing.
        raw_time = raw_time.replace(u"24:00", u"00:00")
        data_time = parser.parse(raw_time).replace(tzinfo=timezone(self.tz))

        pollutant_value = list()
        for cell in last_row[1:]:
            cleaned = clean(cell.xpath(u'.//div/text()').extract_first())
            pollutant_value.append(None if u"\xa0" in cleaned else cleaned)

        station_data = dict()
        for raw_name, raw_value, raw_units in zip(poll_name, pollutant_value,
                                                  poll_unit):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = data_time
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
示例#6
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from the last row of the ``C1WebGrid1`` table.

        Names come from row 1, units from row 2 and values from the last row.
        The first cell of each row is skipped, except that the first cell of
        the last row supplies the timestamp.  Every cell is read through the
        text of its first ``div``.
        """

        def first_div_line(cell):
            # First "\r\n\t...\r\n" line found in the cell's first div text.
            return cell.xpath(u".//div[1]/text()").re(u"\r\n\t(.+)\r\n")[0]

        name_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:]
        poll_name = [first_div_line(cell) for cell in name_cells]

        unit_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:]
        poll_units = [first_div_line(cell) for cell in unit_cells]

        last_row = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tr[last()]/td')
        poll_value = [first_div_line(cell) for cell in last_row[1:]]

        raw_data_time = first_div_line(last_row[0])
        data_time = parser.parse(raw_data_time).replace(
            tzinfo=timezone(self.tz))

        station_data = dict()
        for raw_name, raw_value, raw_units in zip(poll_name, poll_value,
                                                  poll_units):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = data_time
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
示例#7
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` per station for the newest timestamp in the data.

        ``self.get_clean_data(resp)`` supplies a frame from which the
        ``date``, ``station_name``, ``pollutant_name``, ``pollutant_value``
        and ``unit`` columns are read.  Only rows at the maximum ``date`` are
        used; they are grouped by station and each group with at least one
        usable reading produces an item.
        """
        all_data = self.get_clean_data(resp)

        current_data_time = all_data[u"date"].max()
        newest = all_data[all_data[u"date"] == current_data_time]
        readings = newest[[
            u"station_name", u"pollutant_name", u"pollutant_value", u"unit"
        ]]

        for _, group in readings.groupby(by=u"station_name"):
            station_data = dict()
            station_id = None

            for station, raw_name, raw_value, raw_units in group.itertuples(
                    index=False):
                # The station id is taken from the first row of the group.
                if station_id is None:
                    station_id = station

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(raw_name)
                pollutant.set_raw_value(raw_value)
                pollutant.set_raw_units(raw_units)

                if (pollutant.get_name() is None
                        or pollutant.get_value() is None):
                    continue
                station_data[pollutant.get_name()] = pollutant.get_value()

            if not station_data:
                continue

            item = AppItem()
            item[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            item[u"data_time"] = pd.to_datetime(
                current_data_time).replace(tzinfo=timezone(self.tz))
            item[u"data_value"] = station_data
            item[u"source"] = self.source
            item[u"source_id"] = station_id

            yield item
示例#8
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from row 2 of the ``EnvitechGrid1_GridTable`` table.

        Header cells (row 1, first cell skipped) carry a ``title`` attribute
        of the form ``name\\n\\nunit``.  Row 2 holds the timestamp in its
        first cell and one value per pollutant in the remaining cells.
        """
        header_cells = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
        title_parts = [
            cell.xpath(u".//span[1]/@title").extract_first().split(u"\n\n")
            for cell in header_cells
        ]
        pollutant_units = [parts[1] for parts in title_parts]
        # Collapse runs of whitespace inside the names.
        pollutant_name = [u" ".join(parts[0].split()) for parts in title_parts]

        data_row = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')

        raw_time = data_row[0].xpath(u".//span[1]/text()").extract_first()
        # "24:00" is rewritten to "00:00" before parsing.
        raw_time = raw_time.replace(u"24:00", u"00:00")
        data_time = parser.parse(raw_time).replace(tzinfo=timezone(self.tz))

        pollutant_value = [
            cell.xpath(u'.//span[1]/text()').extract_first()
            for cell in data_row[1:]
        ]

        station_data = dict()
        for raw_name, raw_value, raw_units in zip(
                pollutant_name, pollutant_value, pollutant_units):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = data_time
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
    def push_data(self, resp):
        """Yield one ``AppItem`` per station from the records in ``resp.meta["data"]``.

        The records are loaded into a DataFrame, ``value`` is converted to a
        numeric column and rows without a value are dropped.  Only the rows
        whose ``time`` equals ``self.get_max_valid_date(df)`` are used; they
        are grouped by ``station_id`` and each group with at least one usable
        reading produces an item.
        """
        data = resp.meta["data"]

        df = pd.DataFrame(data)

        df["value"] = pd.to_numeric(df["value"])
        df = df[pd.notnull(df["value"])]

        current_time = self.get_max_valid_date(df)
        current_data = df[df["time"] == current_time]
        # The leftover debug ``print(current_data)`` was removed: it wrote the
        # whole frame to stdout on every response.
        grouped = current_data.groupby(by="station_id")

        for station_id, gr in grouped:
            station_data = dict()
            for poll in gr[["name", "value", "unit"]].itertuples(index=False):
                pollutant_name = poll[0]
                pollutant_value = poll[1]
                pollutant_units = poll[2]

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(pollutant_name)
                pollutant.set_raw_value(pollutant_value)
                pollutant.set_raw_units(pollutant_units)

                # Keep only readings that resolved to both a name and a value.
                if pollutant.get_name() is not None and pollutant.get_value() is not None:
                    station_data[pollutant.get_name()] = pollutant.get_value()

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = pd.to_datetime(current_time).replace(tzinfo=timezone(self.tz))
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
示例#10
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` per station from JSON embedded in a ``<pre>`` element.

        The JSON object's ``d`` entry is a list of stations.  Each station
        supplies ``AqiTimeString`` (timestamp), ``SourceSiteID`` (station id)
        and ``LayerInfos`` (readings with ``ParameterName``, ``UnitName`` and
        ``Concentration``).
        """
        payload = ujson.loads(resp.xpath(u"//pre/text()").extract_first())

        for st in payload[u"d"]:
            data_time = parser.parse(st[u"AqiTimeString"])
            station_id = st[u"SourceSiteID"]

            station_data = dict()
            for record in st[u"LayerInfos"]:
                raw_name = record.get(u"ParameterName")
                raw_units = record.get(u"UnitName")
                raw_value = record.get(u"Concentration")

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(raw_name)
                pollutant.set_raw_value(raw_value)
                pollutant.set_raw_units(raw_units)

                if (pollutant.get_name() is None
                        or pollutant.get_value() is None):
                    continue
                station_data[pollutant.get_name()] = pollutant.get_value()

            # The spider's zone is attached only after the readings are read.
            data_time = data_time.replace(tzinfo=timezone(self.tz))

            if not (station_data and data_time):
                continue

            item = AppItem()
            item[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            item[u"data_time"] = data_time
            item[u"data_value"] = station_data
            item[u"source"] = self.source
            item[u"source_id"] = station_id

            yield item
示例#11
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` from the last body row of the main table.

        Header cells (first one skipped) give each pollutant's name
        (``abbr``) and unit (``span/abbr``); the last ``tbody`` row (first
        cell skipped) gives the values.  The timestamp and station code are
        taken from ``resp.meta``.
        """
        header_cells = resp.xpath(
            u'//*[@id="main"]/div[1]/div[1]/table/thead/tr/th')[1:]
        poll_name = [
            cell.xpath(u"abbr/text()").extract_first()
            for cell in header_cells
        ]
        poll_unit = [
            cell.xpath(u"span/abbr/text()").extract_first()
            for cell in header_cells
        ]

        value_cells = resp.xpath(
            u"//*[@id='main']/div[1]/div[1]/table/tbody/tr[last()]/td")[1:]
        poll_value = [
            cell.xpath(u"text()").extract_first() for cell in value_cells
        ]

        station_data = dict()
        for raw_name, raw_value, raw_units in zip(poll_name, poll_value,
                                                  poll_unit):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = resp.meta[u"date_time"]
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
示例#12
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` holding the single PM25 reading of the last table row.

        The first cell of the last row contains a time range whose end
        (``-HH:MM``) is combined with the date in ``resp.meta["date"]``; the
        second cell is the value.  Name and unit are fixed because the page
        reports only one pollutant.
        """
        last_row = resp.xpath(
            u"//*[@id='ContentPlaceHolder1_tablebody']/tr[last()]/td")
        hour = last_row[0].xpath(u"./text()").re(u"-(\d?\d:\d\d)")[0]
        raw_value = last_row[1].xpath(u"./text()").extract_first()

        day = resp.meta["date"].strftime("%d-%m-%Y")
        date_time = parser.parse(
            " ".join((day, hour)),
            dayfirst=True).replace(tzinfo=timezone(self.tz))

        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(u"PM25")
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(u"µg/m")

        station_data = dict()
        if not (pollutant.get_name() is None
                or pollutant.get_value() is None):
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = date_time
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta[u"code"]

        yield item
    def get_station_data(self, resp):
        """Yield one ``AppItem`` for the newest rows produced by ``self.data_to_df``.

        Rows with a missing field are dropped, then only the rows at the
        maximum ``time`` are passed to ``Feature`` through their ``name``,
        ``value`` and ``unit`` columns.
        """
        frame = self.data_to_df(resp).dropna(axis=0)

        current_data_time = frame[u"time"].max()
        newest = frame[frame[u"time"] == current_data_time]
        readings = newest[[u"name", u"value", u"unit"]]

        station_data = dict()
        for raw_name, raw_value, raw_units in readings.itertuples(
                index=False):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_units)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        item = AppItem()
        item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        item[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        item[u"data_value"] = station_data
        item[u"source"] = self.source
        item[u"source_id"] = resp.meta["code"]

        yield item
示例#14
0
    def get_station_data(self, resp):
        """Yield one ``AppItem`` per station found in the page's ``initSiteList`` script.

        The timestamp is read from the caption element.  The third script
        block on the page is searched for the body of ``initSiteList()``;
        inside it, each ``new SiteInfo("...")`` id is paired positionally
        with a ``.params = [...]`` JSON array of readings (``name``, ``val``,
        ``unit``).
        """
        caption = resp.xpath(u'//*[@id="ctl00_ContentPlaceHolder2_lbCaption"]/text()')
        raw_data_time = caption.re(u"Current Max Pollution Level \((\d\d?/\d\d?/\d\d\d\d \d\d?..)")[0]
        data_time = parser.parse(raw_data_time).replace(tzinfo=timezone(self.tz))

        scripts = resp.xpath(u'//*/script/text()').extract()
        # The site list lives in the third script block.
        init_body = re.findall(u"^.+?function initSiteList\(\)\s+\{(.+?)}\s+/\*", scripts[2], re.DOTALL)[0]

        param_blobs = re.findall(u"\.params = (\[.+?\]);", init_body)
        site_ids = re.findall(u"new SiteInfo\(\"(.+?)\"", init_body)

        for station_id, blob in zip(site_ids, param_blobs):
            readings = [(el[u"name"], el[u"val"], el[u"unit"]) for el in ujson.loads(blob)]

            station_data = dict()
            for raw_name, raw_value, raw_units in readings:
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(raw_name)
                pollutant.set_raw_value(raw_value)
                pollutant.set_raw_units(raw_units)

                if pollutant.get_name() is None or pollutant.get_value() is None:
                    continue
                station_data[pollutant.get_name()] = pollutant.get_value()

            if not station_data:
                continue

            item = AppItem()
            item[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            item[u"data_time"] = data_time
            item[u"data_value"] = station_data
            item[u"source"] = self.source
            item[u"source_id"] = station_id

            yield item
示例#15
0
    def get_station_data(self, resp):
        """Return one ``AppItem`` for the newest populated hour of a report table, or ``None``.

        The table's first two rows together form the pollutant names, the
        third row holds the units, and the rows from the fourth up to (but
        excluding) the last 18 are hourly readings.  The first cell of each
        row is skipped or, for data rows, used as the hour.  Header cells are
        repeated according to their ``colspan`` so that names, units and
        values line up column by column.

        The timestamp used is the latest one that still has more than one
        non-empty reading.  Unlike a generator-style callback, this method
        returns the item directly.
        """

        def expand_cells(cells):
            # One whitespace-normalised text per spanned column.  A cell
            # without text contributes an empty placeholder so the columns
            # that follow it stay aligned.
            texts = list()
            for cell in cells:
                cell_texts = cell.xpath(u"./text()").extract()
                if not cell_texts:
                    cell_texts = [u""]
                span = cell.xpath(u"./@colspan").extract_first(default=1)
                texts.extend(cell_texts * int(span))
            return [u" ".join(text.split()) for text in texts]

        raw_date = resp.xpath(
            u"id('col2')/div[1]/center[1]/h3/text()").extract_first()

        raw__all_data = resp.xpath(
            u"//*[@id='col2']/div[1]/center[2]/table/tr")

        poll_name_p1 = expand_cells(raw__all_data[0].xpath(u"td")[1:])
        poll_name_p2 = expand_cells(raw__all_data[1].xpath(u"td")[1:])

        # Join the two header rows column-wise; blank names become None.
        pollutant_name = [
            u" ".join(u" ".join(pair).split())
            for pair in zip(poll_name_p1, poll_name_p2)
        ]
        pollutant_name = [None if el == u"" else el for el in pollutant_name]

        # Units go through the same expansion as the names.  Previously an
        # empty unit cell added nothing, which shifted every later unit onto
        # the wrong pollutant.
        units = expand_cells(raw__all_data[2].xpath(u"td")[1:])
        units = [None if el == u"" else el for el in units]

        # NOTE(review): the trailing 18 rows are skipped; presumably they are
        # not hourly readings -- confirm against the page layout.
        raw_table = raw__all_data[3:-18]
        records = list()
        for row in raw_table:
            col = row.xpath(u"td")
            hour = col[0].xpath(u"center/text()").extract_first()

            raw_values = [
                cell.xpath(u"./text()").extract_first(default=u"")
                for cell in col[1:]
            ]
            values = [
                u" ".join(val.replace(u"\n", u"").split())
                for val in raw_values
            ]
            values = [None if val == u"" else val for val in values]

            raw_data_date = u" ".join((raw_date, hour))
            raw_data_date = u" ".join(raw_data_date.split())
            data_time = parser.parse(raw_data_date)

            for name, value, unit in zip(pollutant_name, values, units):
                records.append({
                    u"name": name,
                    u"value": value,
                    u"unit": unit,
                    u"date": data_time,
                })

        df = pd.DataFrame(records)
        df = df.dropna(axis=0)

        # Latest timestamp that has more than one complete reading.
        grouped = df.groupby(by="date", as_index=False).count()
        current_date = grouped[grouped["name"] > 1]["date"].max()

        curr_data = df[df[u"date"] == current_date]
        # The leftover debug ``print(curr_data)`` was removed.

        station_data = dict()
        for el in curr_data[[u"name", u"value",
                             u"unit"]].itertuples(index=False):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(el[0])
            pollutant.set_raw_value(el[1])
            pollutant.set_raw_units(el[2])

            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            # NOTE(review): if ``timezone`` is ``pytz.timezone``, attaching it
            # via ``replace(tzinfo=...)`` may use the zone's LMT offset --
            # confirm whether ``localize`` is intended.
            items[u"data_time"] = pd.to_datetime(current_date).replace(
                tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            return items
示例#16
0
    def get_station_data(self, resp):
        """Parse the multi-station HTML table and yield one item per station.

        The first table row supplies the column headers.  A data row whose
        first cell has no text (``None``) inherits the most recent non-empty
        first-cell text.  The last three columns are dropped, then every
        column between the second one and the last remaining one, so each
        record ends up as ``(first column, "Site Name", latest reading)``.
        Rows are grouped by ``Site Name`` and one ``AppItem`` is yielded for
        every group that produced at least one validated value.
        """
        day = self.get_date(resp)

        rows = resp.xpath(u"/html/body/div[1]/left/table/tbody/tr")
        header_cells = rows.pop(0).xpath(u"td")
        headers = [cell.xpath(u"p/b/text()").extract_first() for cell in header_cells]

        carried_name = None
        matrix = []
        for tr in rows:
            cells = tr.xpath(u"td")

            lead = cells.pop(0).xpath(u"tt/text()").extract_first()
            if lead:
                carried_name = lead
            if lead is None:
                # Continuation row: reuse the name seen on an earlier row.
                lead = carried_name

            matrix.append(
                [lead] + [cell.xpath(u"tt/text()").extract_first() for cell in cells])

        frame = pd.DataFrame(matrix, columns=headers)
        frame = frame.dropna(thresh=1, axis=1)
        # Trim the trailing three columns first, then the middle ones.
        frame.drop(labels=list(frame.columns.values)[-3:], axis=1, inplace=True)
        frame.drop(labels=list(frame.columns.values)[2:-1], axis=1, inplace=True)

        units = {
            u"o3": u"ppb",
            u"pm25": u"ug/m3",
            u"pm10": u"ug/m3",
            u"co": u"ppm",
            u"so2": u"ppb",
            u"no2": u"ppb",
        }

        for station_id, group in frame.groupby(by=u"Site Name"):
            # The header of the last remaining column is used as the hour.
            last_column = list(group.columns.values)[-1]
            data_time = parser.parse("{0} {1}:00".format(day, last_column))

            station_data = {}
            for rec in group.itertuples(index=False):
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(rec[0])
                pollutant.set_raw_value(rec[2])
                try:
                    pollutant.set_raw_units(units[pollutant.get_name()])
                except KeyError:
                    print(
                        "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>".format(pollutant.get_name()))

                if pollutant.get_name() is None or pollutant.get_value() is None:
                    continue
                station_data[pollutant.get_name()] = pollutant.get_value()

            if not station_data:
                continue

            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time.replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
    def get_test(self, resp):
        """Parse an hourly-forecast JSON response and yield one forecast item.

        Every entry of ``hourly_forecast`` becomes a layer keyed by its
        forecast time (parsed from ``FCTTIME["pretty"]``).  Each remaining
        key/value of the entry is passed through ``Feature``; only features
        whose name and value are both not ``None`` are kept.

        The item's ``data_time`` is one hour before the earliest forecast
        time.  Nothing is yielded when the response has no forecast entries
        or when the layer container is empty.

        :param resp: response whose ``text`` is the JSON document and whose
            ``meta["code"]`` is used as ``source_id``.
        """
        local_units = {
            u"temp": u"degc",
            u"snow": u"",
            u"wd": u"deg",
            u"temp_dew_p": u"degc",
            u"hum": u"%",
            u"sn_d": u"mm",
            u"pres": u"gpa",
            u"sky": u"%",
        }

        payload = ujson.loads(resp.text)

        lc = LayerContainer()

        min_forecast_date = None
        for rec in payload["hourly_forecast"]:
            fdate = parser.parse(rec.pop("FCTTIME")["pretty"])
            if min_forecast_date is None or fdate < min_forecast_date:
                min_forecast_date = fdate

            layer = dict()
            for key, val in rec.items():
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(key)
                pollutant.set_raw_value(self.get_value(val))

                try:
                    pollutant.set_raw_units(local_units[pollutant.get_name()])
                except KeyError:
                    if pollutant.get_name() is not None:
                        print(
                            "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>"
                            .format(pollutant.get_name()))
                    else:
                        print("Name is None: <<<<<<<<{0}>>>>>>".format(
                            pollutant.get_name()))

                if pollutant.get_name() is not None and pollutant.get_value(
                ) is not None:
                    layer[pollutant.get_name()] = pollutant.get_value()

            lc.add_layer(fdate, layer)

        if min_forecast_date is None:
            # No forecast entries at all: there is no reference time to
            # derive data_time from (subtracting from None raised TypeError).
            return

        # The observation time is taken as one hour before the first forecast.
        # NOTE(review): no tzinfo is attached here, unlike the station
        # callbacks; whether fdate is timezone-aware depends on what
        # parser.parse makes of the "pretty" string - confirm.
        curr_date = min_forecast_date - timedelta(hours=1)

        forecast_data = lc.get_layers()

        if forecast_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = curr_date
            items[u"forecast_data"] = forecast_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
示例#18
0
    def get_station_data(self, resp):
        """Scrape the ``meteostar_wrapper`` table (second ``div``) for one station.

        The reading time is built from the bold text of the wrapper's fifth
        paragraph plus the second-to-last validated hour header (or the only
        one, when there is just one).  For each data row the first numeric
        match in the ``td[last()-3]`` cell is taken as the value.  One
        ``AppItem`` is yielded if at least one validated value was found.
        """
        rows = resp.xpath(
            u'//*[@id="meteostar_wrapper"]/div[2]/table/tr')[2:-4]
        hours = self.validate_hours(
            resp.xpath(
                u'//*[@id="meteostar_wrapper"]/div[2]/table/tr[2]/th/text()'
            ).extract())
        if len(hours) > 1:
            hour = hours[len(hours) - 2]
        else:
            hour = hours[0]

        day = resp.xpath(
            u'//*[@id="meteostar_wrapper"]/p[5]/b/text()').extract_first()
        stamp = u" ".join((day, hour))
        data_time = parser.parse(stamp, dayfirst=True)
        data_time = data_time.replace(tzinfo=timezone(self.tz))

        units = {
            u"o3": u"ppb",
            u"pm25": u"ug/m3",
            u"co": u"ppm",
            u"so2": u"ppb",
            u"no2": u"ppb",
            u"no": u"ppb",
            u"n2o": u"ppb",
            u"ws": u"mph",
            u"wd": u"deg",
            u"temp": u"degf",
            u"pres": u"mbar",
        }

        station_data = {}
        for tr in rows:
            raw_name = tr.xpath(u"td[1]/a/b/text()").extract_first()
            matches = tr.xpath(u"td[last()-3]").re(
                u">(\d+?)<|>(\d+?\.\d+)<")
            # The alternation yields empty strings for the group that did
            # not take part in the match; keep the first non-empty capture.
            raw_value = next((m for m in matches if m != u""), None)

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            try:
                pollutant.set_raw_units(units[pollutant.get_name()])
            except KeyError:
                print(
                    "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>"
                    .format(pollutant.get_name()))

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]

        yield items
示例#19
0
    def get_station_data(self, resp):
        """Collect per-pollutant JSON pages and emit one item per station.

        The callback chains itself: each response carries the data for the
        pollutant code in ``resp.meta["code"]``.  Its rows are appended to
        the list in ``resp.meta["res_df"]`` and a new request is issued for
        the next code popped from ``resp.meta["all_codes"]``.  When that list
        is empty, ``pop()`` raises ``IndexError`` and the ``except`` branch
        merges everything collected so far and yields the station items.

        NOTE(review): the ``try`` spans the whole parsing section, so an
        ``IndexError`` from ``station[0]`` / ``station[1]`` would also jump
        to the final branch - confirm that this is intended.
        """
        try:
            # Pollutant code (the ``paramId`` request parameter) -> raw name.
            pollutant_name = {
            u"2": u"PM25",
            u"1": u"Ozone",
            u"5": u"CO",
            u"4": u"NO2",
            u"9": u"SO2"
        }

            # Pollutant code -> raw unit.
            pollutant_unit = {
                u"2": u"ug/m3",
                u"1": u"ppb",
                u"5": u"ppm",
                u"4": u"ppb",
                u"9": u"ppb"
            }

            json = ujson.loads(resp.text)
            one_pollutant_data = list()
            for station in json:
                # Each entry is indexed as [station description, readings].
                raw_station_name = station[0]
                station_name = raw_station_name["siteName"]

                raw_data = station[1]
                station_data = pd.DataFrame(raw_data)
                station_data[u"date"] = [parser.parse(x) for x in station_data[u'date']]
                station_data[u"station_name"] = station_name
                station_data[u"pollutant_name"] = pollutant_name.get(resp.meta[u"code"])
                if "hexColor" in station_data.columns.values:
                    del station_data[u"hexColor"]

                station_data[u"unit"] = pollutant_unit.get(resp.meta[u"code"])

                one_pollutant_data.append(station_data)

            one_pollutant_data = pd.concat(one_pollutant_data, ignore_index=True)

            # Accumulate this pollutant's frame in the list carried in meta.
            res_def = resp.meta["res_df"]
            res_def.append(one_pollutant_data)

            # Raises IndexError once every code has been requested, which
            # hands control to the except branch below.
            new_code = resp.meta[u"all_codes"].pop()
            url = add_or_replace_parameter(resp.meta["href"], "paramId", new_code)

            yield Request(
                url=url,
                callback=self.get_station_data,
                meta={
                    u"code": new_code,
                    u"all_codes": resp.meta[u"all_codes"],
                    u"res_df": res_def,
                    u"href": url
                }
            )

            # print(one_pollutant_data)
        except IndexError:
            # Final step: merge every collected pollutant frame.
            all_data = pd.concat(resp.meta["res_df"], ignore_index=True)
            # Rows whose aqi is -999.0 are blanked entirely and then dropped.
            all_data[all_data["aqi"] == -999.0] = np.nan
            all_data = all_data.dropna(axis=0)

            # The reported timestamp is two hours before the newest one.
            current_date = all_data["date"].max() - timedelta(hours=2)
            current_data = all_data[all_data["date"] == current_date]

            grouped = current_data[["pollutant_name", "aqi", "unit", "station_name"]].groupby(by="station_name")
            # print(grouped)

            for name, gr in grouped:
                station_id = name

                station_data = dict()
                for rec in gr.itertuples(index=False):
                    # Column order follows the selection above:
                    # pollutant_name, aqi, unit, station_name.
                    pollutant_name = rec[0]
                    pollutant_value = rec[1]
                    pollutant_unit = rec[2]

                    # print(pollutant_name, pollutant_value, pollutant_unit)

                    pollutant = Feature(self.name)
                    pollutant.set_source(self.source)

                    pollutant.set_raw_name(pollutant_name)
                    pollutant.set_raw_value(pollutant_value)
                    pollutant.set_raw_units(pollutant_unit)

                    # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())

                    # Keep only features with both a name and a value.
                    if pollutant.get_name() is not None and pollutant.get_value() is not None:
                        station_data[pollutant.get_name()] = pollutant.get_value()
                #
                if station_data:
                    items = AppItem()
                    items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                    items[u"data_time"] = pd.to_datetime(current_date).replace(tzinfo=timezone(self.tz))
                    items[u"data_value"] = station_data
                    items[u"source"] = self.source
                    items[u"source_id"] = station_id

                    yield items
    def get_station_data(self, resp):
        """Parse the last row of the last ``datatable`` into one station item.

        The header row supplies ``(name, unit)`` pairs through
        ``self.get_name_and_unit``; the last table row supplies the values.
        The first column is the hour: it is removed from the data and
        combined with the date taken from the table caption to build
        ``data_time``.

        :param resp: response for one station; ``resp.meta["code"]`` selects
            the timezone (via ``self.get_tz``) and becomes ``source_id``.
        """
        # //*[@id="datatable"]/tbody/tr[1]/th

        raw_data = resp.xpath(u"//*[@id='datatable'][last()]/tr[last()]/td")

        raw_date = resp.xpath(u"//*[@id='datatable'][last()]/tr[1]/th/text()").extract_first()
        # The date is the last four space-separated tokens of the caption.
        date = " ".join(raw_date.split(" ")[-4:])

        raw_col_names = resp.xpath(u"//*[@id='datatable'][last()]/tr[2]/th")

        poll_values = [el.xpath(u"./text()").extract_first() for el in raw_data]
        raw_poll_names = [self.get_name_and_unit(" ".join(el.xpath(u"./text()").extract())) for el in raw_col_names]
        poll_names = [el[0] for el in raw_poll_names]
        poll_units = [el[1] for el in raw_poll_names]

        # Materialise the zip: on Python 3 it is an iterator without pop().
        data = list(zip(poll_names, poll_values, poll_units))

        # First column holds the hour of the reading, not a pollutant.
        hour = data.pop(0)
        hour = hour[1]

        data_time = " ".join((date, hour))
        data_time = parser.parse(data_time).replace(tzinfo=timezone(self.get_tz(resp.meta[u"code"])))

        # Fallback units for columns whose header carries no unit.
        units = {
            u"wd": u"cardinals",
        }

        station_data = dict()
        for el in data:
            pollutant_name = el[0]
            pollutant_value = el[1]
            pollutant_units = el[2]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)

            # A unit of False means the header had none; use the local table.
            if pollutant_units is not False:
                pollutant.set_raw_units(pollutant_units)
            else:
                try:
                    pollutant.set_raw_units(units[pollutant.get_name()])
                except KeyError:
                    print(
                    u"There is no such pollutant in local units list <<<<<<<<{0}>>>>>>".format(
                        pollutant.get_name()))

            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
    def get_station_data(self, resp):
        """Parse the ``content1`` weather table and yield the first data row.

        All ``<td>`` cells of the table are flattened, reshaped into rows of
        ``len(col_names)`` cells and loaded into a DataFrame (the first row
        being the header).  The first data row is turned into an ``AppItem``;
        its date/time column becomes ``data_time`` (parsed day-first) and the
        other columns are fed through ``Feature`` with the local unit table.

        :raises KeyError: if the row contains a column missing from the
            local ``units`` table.
        """
        raw_col_names = resp.xpath(
            u'//*[@id="content1"]/div[1]/table/tr[1]/td').extract()
        col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]

        table = resp.xpath(u'//*[@id="content1"]/div[1]/table//td')

        # Evaluate the cell regex once per cell instead of twice.
        cell_pattern = u"<td>(.+)<\\/td>"
        table_data = []
        for el in table:
            found = el.xpath(u".").re(cell_pattern)
            table_data.append(found[0] if found else None)

        # Floor division: reshape() needs integer dimensions, and "/" yields
        # a float on Python 3.
        table_data = np.asarray(table_data).reshape(
            len(table_data) // len(col_names), len(col_names))

        df = pd.DataFrame(table_data[1:, ], columns=col_names)
        # Keep only the text before the parenthesised part of the
        # precipitation cell; empty cells count as 0.
        df[u"Опади"] = df[u"Опади"].apply(lambda x: re.search(
            u"(.+) \\s*\\(", x).group(1) if x is not None else 0)

        raw_data = df.iloc[0].to_dict()
        raw_data_time = raw_data.pop(u"Дата і час", None)

        data_time = parser.parse(
            raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))

        data = raw_data
        units = {
            u"Температура повітря": u"degc",
            u"Температура точки роси": u"degc",
            u"Опади": u"mm",
            u"Атмосферний тиск": u"mbar",
            u"Напрямок вітру": u"deg",
            u"Швидкість вітру": u"ms",
        }

        station_data = dict()
        for key, val in data.items():
            poll_name = key
            poll_value = val
            poll_units = units[key]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(poll_name)
            pollutant.set_raw_value(poll_value)
            pollutant.set_raw_units(poll_units)
            # Keep only features with both a name and a value.
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
    def get_station_data(self, resp):
        """Scrape the ``meteostar_wrapper`` table (third ``div``) for one station.

        The reading time combines the bold text of the wrapper's fifth
        paragraph with the second-to-last validated hour header (or the only
        one, when there is just one).  Per row, the name comes from the bold
        link text, the unit from the link's ``onmouseover`` attribute and the
        value from the first numeric match in the ``td[last()-4]`` cell.
        One ``AppItem`` is yielded if any validated value was collected.
        """
        rows = resp.xpath(
            u'//*[@id="meteostar_wrapper"]/div[3]/table/tr')[2:-4]

        hours = self.validate_hours(
            resp.xpath(
                u'//*[@id="meteostar_wrapper"]//table[1]/tr[2]/th/text()'
            ).extract())
        if len(hours) > 1:
            hour = hours[len(hours) - 2]
        else:
            hour = hours[0]

        day = resp.xpath(
            u'//*[@id="meteostar_wrapper"]/p[5]/b/text()').extract_first()
        stamp = u" ".join((day, hour))
        data_time = parser.parse(stamp).replace(tzinfo=timezone(self.tz))

        station_data = {}
        for tr in rows:
            name_sel = tr.xpath(u"td[1]/a/b/text()")
            if len(name_sel) == 1:
                raw_name = name_sel.extract()[0]
            else:
                raw_name = None

            unit_hits = tr.xpath(u"td[1]/a/@onmouseover").re(
                u"\d*\.\s+(\D+)',")
            if len(unit_hits) == 1:
                raw_unit = unit_hits[0].replace("Measured in ", "")
            else:
                raw_unit = None

            matches = tr.xpath(u"td[last()-4]").re(
                u">(\d+?)<|>(\d+?\.\d+)<")
            # Only one alternative captures; skip the empty placeholder.
            raw_value = next((m for m in matches if m != u""), None)

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(raw_name)
            pollutant.set_raw_value(raw_value)
            pollutant.set_raw_units(raw_unit)

            if pollutant.get_name() is None or pollutant.get_value() is None:
                continue
            station_data[pollutant.get_name()] = pollutant.get_value()

        if not station_data:
            return

        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]

        yield items
示例#23
0
    def get_station_data(self, resp):
        """Parse the ``Sites`` JSON feed and yield one item per station.

        Every reading of every site is flattened into a record holding the
        parameter description, average, units, averaging hour and the site's
        ``AQSSiteId``.  Only records carrying the maximum ``AverageHour``
        value are kept; they are grouped by station and each group with at
        least one validated value becomes an ``AppItem``.

        Nothing is yielded when no site has readings.

        :raises KeyError: if a reading lacks one of the expected keys.
        """
        json = ujson.loads(resp.text)

        param = ("ParameterDescription", "Average", "Units", "AverageHour")

        all_records = list()
        for site in json["Sites"]:
            station_id = site["AQSSiteId"]

            readings = site.get("Readings")
            if readings is not None:
                for rec in readings:
                    res = {key: rec[key] for key in param}
                    res["station_id"] = station_id
                    all_records.append(res)

        if not all_records:
            # An empty frame has no "AverageHour" column; the lookup below
            # would raise KeyError, so stop here instead.
            return

        all_data = pd.DataFrame(all_records)
        current_date_time = all_data["AverageHour"].max()
        current_data = all_data[all_data["AverageHour"] == current_date_time]

        # The timestamp is the same for every station: parse it once.
        data_time = parser.parse(current_date_time).replace(
            tzinfo=timezone(self.tz))

        grouped = current_data[[
            "station_id", "ParameterDescription", "Average", "Units"
        ]].groupby(by="station_id")

        for name, gr in grouped:
            station_data = dict()
            station_id = name
            for record in gr.itertuples(index=False):
                # Column order follows the selection above; record[0] is
                # the station id, already available as the group key.
                pollutant_name = record[1]
                pollutant_value = record[2]
                pollutant_units = record[3]

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(pollutant_name)
                pollutant.set_raw_value(pollutant_value)
                pollutant.set_raw_units(pollutant_units)

                # Keep only features with both a name and a value.
                if pollutant.get_name() is not None and pollutant.get_value(
                ) is not None:
                    station_data[pollutant.get_name()] = pollutant.get_value()

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
示例#24
0
    def get_station_data(self, resp):
        """Collect one pollutant page per request, then emit station items.

        The callback chains itself: each page's last table row is turned
        into a DataFrame (one record per station column), merged with the
        frame carried in ``resp.meta["global_data"]``, and a new request is
        issued for the next URL popped from ``resp.meta["urls"]``.  When the
        URL list is empty, ``pop()`` raises ``IndexError`` and the outer
        ``except`` branch groups the accumulated data by station and yields
        the items.

        NOTE(review): the outer ``try`` covers the whole parsing section, so
        an ``IndexError`` raised there (e.g. ``raw_poll_data[0]`` on an
        empty table) also jumps to the final branch - confirm intent.
        """
        try:
            pollutant_unit = {u"pm": u"ug/m3", u"ozone": u"ppb"}

            # The pollutant name is the page file name without the suffix.
            pollutant_name = resp.url.split(u"/")[-1].replace(
                u"_monitors.aspx", u"")
            # print(pollutant_name)

            # raw_station_names = resp.xpath(u"//*[@id='tblGrid']/thead/tr[1]/td")
            raw_table = resp.xpath(u"//*[@id='tblGrid'][1]").extract_first()
            # Flatten the markup: strip thead/tbody wrappers and turn header
            # cells into ordinary cells so plain tr/td paths address all rows.
            raw_table = re.sub(u"</?tbody>", u"", raw_table)
            raw_table = re.sub(u"</?thead>", u"", raw_table)
            raw_table = re.sub(u"</th>", u"</td>", raw_table)
            raw_table = re.sub(u"<th ", u"<td ", raw_table)

            table = Selector(text=raw_table)
            # print(table)
            # First row holds the station names; its first cell is skipped.
            raw_station_names = table.xpath(u"//tr[1]/td")[1:]
            # print(raw_station_names)

            station_names = [
                u" ".join(el.xpath(u"./b/text()").extract())
                for el in raw_station_names
            ]
            # Collapse runs of whitespace inside each name.
            station_names = [u" ".join(el.split()) for el in station_names]
            # print(station_names)

            # Last row: first cell is the hour, the rest are station values.
            raw_poll_data = table.xpath(u"//*[@id='tblGrid'][1]/tr[last()]/td")

            raw_hour = raw_poll_data[0].xpath(u"./text()").extract_first()
            try:
                raw_date = resp.xpath(
                    u"//*[@id='mainContent']/div[1]/p[2]/text()").re(
                        u"(\d\d?/\d\d?/\d\d\d\d)")[0]
            except IndexError:
                # NOTE(review): a None date makes the join below raise
                # TypeError - confirm whether a missing date can occur.
                raw_date = None

            # print(raw_date)
            raw_data_time = u" ".join((raw_date, raw_hour))
            data_time = parser.parse(raw_data_time).replace(
                tzinfo=timezone(self.tz))

            raw_poll_value = raw_poll_data[1:]
            poll_values = [
                el.xpath(u"font/text()").extract_first()
                for el in raw_poll_value
            ]

            data = zip(station_names, poll_values)

            table_data = [{
                u"station_id": el[0],
                u"pollutant_name": pollutant_name,
                u"pollutant_value": el[1],
                u"pollutant_unit": pollutant_unit.get(pollutant_name),
                u"date": data_time
            } for el in data]

            df = pd.DataFrame(table_data)

            # Merge this page with whatever earlier pages accumulated.
            if resp.meta.get(u"global_data") is not None:
                new_global_data = pd.concat(
                    [resp.meta.get(u"global_data"), df], ignore_index=True)
            else:
                new_global_data = df

            # print(new_global_data)
            resp.meta[u"global_data"] = new_global_data

            # pop() raises IndexError once every URL has been requested,
            # which hands control to the except branch below.
            yield Request(url=resp.meta[u"urls"].pop(),
                          callback=self.get_station_data,
                          meta={
                              u"urls": resp.meta[u"urls"],
                              u"global_data": resp.meta[u"global_data"]
                          })

        except IndexError:
            # pass
            data = resp.meta[u"global_data"]

            # Keep only the rows carrying the newest timestamp.
            current_data_time = data[u"date"].max()
            data = data[data[u"date"] == current_data_time]
            data = data[[
                u"station_id", u"pollutant_name", u"pollutant_value",
                u"pollutant_unit"
            ]]

            # print(data)

            grouped = data.groupby(by=u"station_id")

            for name, gr in grouped:
                station_data = dict()
                # print(name)
                # Taken from the first record of the group.
                station_id = None
                for record in gr.itertuples(index=False):
                    if station_id is None:
                        station_id = record[0]

                    # Column order follows the selection above.
                    pollutant_name = record[1]
                    pollutant_value = record[2]
                    pollutant_units = record[3]

                    # print(pollutant_name, pollutant_value, pollutant_units)

                    pollutant = Feature(self.name)
                    pollutant.set_source(self.source)
                    pollutant.set_raw_name(pollutant_name)
                    pollutant.set_raw_value(pollutant_value)
                    pollutant.set_raw_units(pollutant_units)

                    # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
                    # Keep only features with both a name and a value.
                    if pollutant.get_name(
                    ) is not None and pollutant.get_value() is not None:
                        station_data[
                            pollutant.get_name()] = pollutant.get_value()

                # print(station_data)
                data_time = pd.to_datetime(current_data_time).replace(
                    tzinfo=timezone(self.tz))

                if station_data and data_time:
                    items = AppItem()
                    items[u"scrap_time"] = datetime.now(
                        tz=timezone(SCRAPER_TIMEZONE))
                    items[u"data_time"] = data_time
                    items[u"data_value"] = station_data
                    items[u"source"] = self.source
                    items[u"source_id"] = station_id

                    yield items
示例#25
0
    def get_station_data(self, resp):
        """Parse air-quality and meteorology tables into one station item.

        Air-quality cells contribute alternating sub-name / "value unit"
        texts; the sub-name is prefixed with the cell heading.  Meteorology
        rows contribute ``(name, value, unit)`` where a compass direction is
        first converted to degrees.  Each triple goes through ``Feature``
        and the validated values are yielded as a single ``AppItem``.

        ``data_time`` is ``None`` when the "Last Updated on" text is missing.

        :raises IndexError: if a value text does not start with a number.
        """
        try:
            data_time = resp.xpath(u".//*[@id='Content_cphMain_pnLastUpdate']/p/text()").re(u"Last Updated on (.+)")[0]
            data_time = parser.parse(data_time).replace(tzinfo=timezone(self.tz))
        except IndexError:
            data_time = None

        # Leading number of a "value unit" text; compiled once for both the
        # pollutant and the meteorology sections.
        number_re = re.compile(u"^([-]?\\d*[.]?\\d+|\\d+[.]?\\d*)")

        raw_poll_data = resp.xpath(u".//*[@id='Content_cphMain_dlAirQualityParameters']/tr/td")
        data = list()
        for cell in raw_poll_data:
            poll_name_part = cell.xpath(u"h6/text()").extract_first()
            poll_name_part_add = cell.xpath(u"h6/sub/text()").extract_first()
            # Re-attach a subscript (the <sub> text) to the heading.
            poll_name_part = u"".join((poll_name_part, poll_name_part_add)) if poll_name_part_add is not None else poll_name_part
            poll_name_part = u" ".join(poll_name_part.split())

            _data = cell.xpath(u'div[not(@class="clearfix")]')
            _data = [div.xpath(u"text()").extract_first() for div in _data]

            _data = [u" ".join(text.split()) for text in _data]
            _data = [text for text in _data if text != u""]
            # Texts alternate: sub-name, value, sub-name, value, ...
            poll_subnames = _data[::2]
            poll_names = [" ".join((poll_name_part, sub)) for sub in poll_subnames]

            raw_poll_values = _data[1::2]

            poll_values = list()
            poll_units = list()
            for raw_value in raw_poll_values:
                value = number_re.findall(raw_value)[0]
                unit = u" ".join(number_re.sub(u"", raw_value).split())

                poll_values.append(value)
                poll_units.append(unit)
            data.extend(zip(poll_names, poll_values, poll_units))

        raw_weather_data = resp.xpath(u".//*[@id='Content_cphMain_gvMeteorology']/tr")

        # 16-point compass rose in degrees (22.5 degree steps).
        wind_dir = {
            u"N": u"0",
            u"NNE": u"22.5",
            u"NE": u"45",
            u"ENE": u"67.5",
            u"E": u"90",
            u"ESE": u"112.5",
            u"SE": u"135",
            u"SSE": u"157.5",
            u"S": u"180",
            u"SSW": u"202.5",
            u"SW": u"225",
            u"WSW": u"247.5",
            u"W": u"270",
            u"WNW": u"292.5",
            u"NW": u"315",
            u"NNW": u"337.5",
        }
        w_data = list()
        for row in raw_weather_data:
            name = row.xpath(u"td[1]/text()").extract_first()
            name = u" ".join(name.split()) if name is not None else None

            raw_poll_val_1 = u" ".join(row.xpath(u"td[2]/text()").extract())
            raw_poll_val_2 = u" ".join(row.xpath(u"td[2]/sup/text()").extract())
            raw_poll_val = u" ".join((raw_poll_val_1, raw_poll_val_2))
            raw_poll_val = u" ".join(raw_poll_val.split())
            if wind_dir.get(raw_poll_val) is not None:
                raw_poll_val = wind_dir.get(raw_poll_val) + u" deg"

            value = number_re.findall(raw_poll_val)[0]
            unit = u" ".join(number_re.sub(u"", raw_poll_val).split())

            w_data.append((name, value, unit))

        data.extend(w_data)

        station_data = dict()
        for el in data:
            pollutant_name = el[0]
            pollutant_value = el[1]
            pollutant_units = el[2]

            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)
            # Keep only features with both a name and a value.
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
示例#26
0
    def get_station_data(self, resp):
        """Yield one item with the most recent readings found on the page.

        The page holds a table with one row per measured parameter and one
        column per hour of the selected day. Every (parameter, hour) cell is
        passed through ``Feature``; rows with any missing field are dropped
        and only the readings of the latest remaining hour are emitted.

        :param resp: response of the station page; ``resp.meta[u"code"]`` is
            stored as ``source_id`` of the emitted item.
        :return: generator yielding at most one ``AppItem``.
        """
        raw_table = resp.xpath(
            u"//*[@id='meteostar_wrapper']/div[3]/table/tr")[2:-3]

        # The first three cells of this form row are read as month, day and
        # year; each value is the text of the currently selected <option>.
        raw_data_time = resp.xpath(
            u"//*[@id='meteostar_wrapper']/form/table[2]/tr[2]/td")
        selected_option = u"select/option[@selected='selected']/text()"
        month = raw_data_time[0].xpath(selected_option).extract_first()
        day = raw_data_time[1].xpath(selected_option).extract_first()
        year = raw_data_time[2].xpath(selected_option).extract_first()

        pollutants_hour = resp.xpath(
            u"//*[@id='meteostar_wrapper']/div[3]/table/tr[2]/th/text()"
        ).extract()
        pollutants_hour = self.validate_hours(pollutants_hour)
        data_times = [
            parser.parse(u" ".join((month, day, year, hour)))
            for hour in pollutants_hour
        ]

        # Units are looked up by the name reported by Feature.get_name().
        units = {
            u"o3": u"ppb",
            u"ws": u"mph",
            u"wd": u"deg",
            u"temp": u"degf",
            u"pm10": u"ug/m3",
            u"pm25": u"ug/m3",
            u"pm": u"ug/m3",
            u"co": u"ppm",
            u"rain": u"in",
            u"no": u"ppb",
            u"no2": u"ppb",
            u"pres": u"mbar",
            u"hum_rel": u"%",
            u"no_y": u"ppb",
            u"so2": u"ppb",
        }

        table = list()
        for row in raw_table:
            # None when the row has no linked parameter name.
            pollutant_name = row.xpath(u"td[1]/a/b/text()").extract_first()

            cells = row.xpath(u"td")[1:-2]
            pollutants_data = [
                self.coerce_float("".join(cell.xpath(u".//text()").extract()))
                for cell in cells
            ]

            # zip() stops at the shorter sequence, so surplus cells or hours
            # are ignored.
            for value, data_time in zip(pollutants_data, data_times):
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(pollutant_name)
                pollutant.set_raw_value(value)
                pollutant.set_raw_units(units.get(pollutant.get_name()))

                table.append({
                    "name": pollutant.get_name(),
                    "value": pollutant.get_value(),
                    "date": data_time,
                    "unit": pollutant.get_units()
                })

        if not table:
            # Nothing was scraped: an empty DataFrame has no "value" column,
            # so the lookups below would raise KeyError.
            return

        df = pd.DataFrame(table)
        df["value"] = df["value"].astype(float)
        # Drop every record with a missing name, value, date or unit.
        df = df.dropna(axis=0)

        current_data_time = df["date"].max()
        current_data = df[df["date"] == current_data_time]

        station_data = {
            name: value
            for name, value in current_data[["name", "value"]].itertuples(
                index=False)
        }

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(current_data_time).replace(
                tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
示例#27
0
    def get_station_data(self, resp):
        """Yield one item per site with the readings of the last table column.

        The report date is read from the form header, the data table is
        loaded into a DataFrame, and the last column left after dropping
        empty columns and rows is used both as the hour label and as the
        source of the values.

        :param resp: response containing the report page.
        :return: generator of ``AppItem`` objects, one per ``Site`` group that
            produced at least one usable reading.
        """
        date_text = resp.xpath(
            u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[1]/td/text()"
        ).extract()[1]
        # Collapse whitespace runs before parsing the date text.
        page_date = parser.parse(u" ".join(date_text.split()))
        str_date = u"{dd}-{mm}-{yyyy}".format(dd=page_date.day,
                                              mm=page_date.month,
                                              yyyy=page_date.year)

        # First and last header cells are skipped, as are the matching
        # first and last cells of every data row below.
        header_cells = resp.xpath(
            u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[last()]/td[1]/table/tr[1]/th"
        )[1:-1]
        headers = [
            cell.xpath(u"./text()").extract_first() for cell in header_cells
        ]

        body_rows = resp.xpath(
            u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[last()]/td[1]/table/tr[@id]"
        )
        row_dicts = [
            dict(
                zip(headers, [
                    cell.xpath(u"./text()").extract_first()
                    for cell in row.xpath(u"./td")[1:-1]
                ])) for row in body_rows
        ]

        frame = pd.DataFrame(row_dicts, columns=headers)
        # Keep any column holding at least one value, then discard rows that
        # are empty across the board.
        frame = frame.dropna(axis=1, thresh=1).dropna(axis=0, how=u'all')

        # The last surviving column name doubles as the hour of the reading.
        hour = frame.columns[-1]
        latest = frame[[u"Site", u"Param", u"UNITS", hour]]

        stamp = u"{0} {1}:00".format(str_date, hour)
        # NOTE(review): if `timezone` is pytz.timezone, replace(tzinfo=...)
        # may attach an LMT offset; confirm whether localize() is intended.
        data_time = parser.parse(
            stamp, dayfirst=True).replace(tzinfo=timezone(self.tz))

        for _site, group in latest.groupby(by=u"Site"):
            station_data = dict()
            station_id = None

            # Tuple layout: (Site, Param, UNITS, value of the hour column).
            for entry in group.itertuples(index=False):
                if station_id is None:
                    station_id = entry[0]

                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(entry[1])
                pollutant.set_raw_value(entry[3])
                pollutant.set_raw_units(entry[2])

                # Skip readings that Feature reports without a name or value.
                if pollutant.get_name() is None or pollutant.get_value(
                ) is None:
                    continue
                station_data[pollutant.get_name()] = pollutant.get_value()

            if not station_data:
                continue

            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
示例#28
0
    def get_station_data(self, resp):
        """Yield one item per ``createMultiStation(...)`` call in the page.

        Each call's argument list is taken from the last ``<script>`` of the
        document head, rewritten into JSON and parsed. The pollutant names,
        values and units are then read from HTML fragments stored near the
        end of the parsed record.

        :param resp: response containing the page with the station script.
        :return: generator of ``AppItem`` objects, one per station that
            produced at least one usable reading.
        """
        # Backslashes are doubled so the literal has no invalid escape
        # sequences; the resulting pattern is unchanged.
        raw_stations_data = resp.xpath(u"/html/head/script[last()]/text()").re(
            u"createMultiStation\\((.*)\\);")
        for station in raw_stations_data:
            # Rewrite the JavaScript arguments into JSON: wrap them in a
            # list, escape existing double quotes, turn single quotes into
            # double quotes, and quote the bare word `role` (presumably an
            # unquoted object key - verify against a live page).
            payload = u"[{0}]".format(station)
            payload = payload.replace(u'"', u'\\"')
            payload = payload.replace(u"'", u'"')
            payload = payload.replace(u"role", u'"role"')
            parsed = ujson.loads(payload)

            raw_data = parsed[3][0]
            station_id = raw_data[0]

            # The second-to-last entry is an HTML fragment holding the
            # timestamp of the reading.
            time_fragment = Selector(text=raw_data[-2])
            data_time = time_fragment.xpath(u"//td[2]/text()").extract_first()
            # NOTE(review): if `timezone` is pytz.timezone, replace(tzinfo=...)
            # may attach an LMT offset; confirm whether localize() is intended.
            data_time = parser.parse(data_time).replace(
                tzinfo=timezone(self.tz))

            # The two entries before it are concatenated and parsed as one
            # HTML fragment containing the pollutant table.
            pollutant_data = Selector(text=raw_data[-4] + raw_data[-3])

            # A pollutant name is split over a <u> and a <sub> element of
            # the same header cell.
            header_cells = pollutant_data.xpath(u"//table/tr[1]/td")[1:3]
            pollutants_name = [
                u" ".join((cell.xpath(u"u/text()").extract_first(),
                           cell.xpath(u"sub/text()").extract_first()))
                for cell in header_cells
            ]

            # Each value cell is split on spaces: first token is the value,
            # second token is the unit.
            value_cells = pollutant_data.xpath(u"//table/tr[2]/td")[1:3]
            value_parts = [
                cell.xpath(u"text()").extract_first().split(u" ")
                for cell in value_cells
            ]
            pollutant_value = [parts[0] for parts in value_parts]
            pollutant_units = [parts[1] for parts in value_parts]

            station_data = dict()
            for name, value, unit in zip(pollutants_name, pollutant_value,
                                         pollutant_units):
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(name)
                pollutant.set_raw_value(value)
                pollutant.set_raw_units(unit)
                if pollutant.get_name() is not None and pollutant.get_value(
                ) is not None:
                    station_data[pollutant.get_name()] = pollutant.get_value()

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items