def _load_datas(self, datas=None):
    """Load the dataset CSV, either from the remote zip or an in-memory string.

    Side effects: sets ``self._file/_rows/headers/release_date/dimension_keys/
    periods`` and updates the dataset's dimension_keys / last_update plus the
    start/end ordinals derived from the first and last periods.
    """
    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              store_filepath=self.store_path,
                              filename=self.filename,
                              use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # FIX: register the *extracted* file for deletion - the original
        # appended zip_filepath a second time and leaked the extracted file
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.dimension_keys = self.dimension_keys
    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def build_series(self, datas):
    """Build one series document for the current indicator/country pair.

    Raises ``errors.RejectEmptySeries`` when every observation is empty and a
    generic ``Exception`` when observations mix frequencies.
    """
    datas = datas["datas"]

    series = {}
    series['key'] = "%s.%s" % (self.current_indicator["id"], self.current_country)
    series['name'] = "%s - %s" % (self.current_indicator["name"],
                                  self.available_countries[self.current_country]["name"])
    series['frequency'] = self._search_frequency(datas[0])

    #if self.current_indicator.get("sourceNote"):
    #    series["notes"] = self.current_indicator.get("sourceNote")

    values = []
    value_found = False

    for point in datas:
        frequency = self._search_frequency(point)
        if frequency != series['frequency']:
            raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (
                frequency, series['frequency'], series['key']))

        raw_value = point["value"]
        value = {
            'attributes': None,
            'release_date': self.release_date,
            # FIX: map a missing observation (None) to "" explicitly; the
            # original str(...).replace("None", "") would also mangle any
            # legitimate value containing the substring "None"
            'value': "" if raw_value is None else str(raw_value),
            'ordinal': get_ordinal_from_period(point["date"], freq=series['frequency']),
            'period': point["date"],
        }

        if not value_found and value["value"] != "":
            value_found = True

        if "obs_status" in point:
            obs_status = point.get("obs_status")
            if obs_status and len(obs_status) > 0:
                value["attributes"] = {"obs_status": obs_status}
                # register the obs_status concept/codelist on first use
                if not "obs_status" in self.dataset.codelists:
                    self.dataset.codelists["obs_status"] = self.obs_status
                if not "obs_status" in self.dataset.concepts:
                    self.dataset.concepts["obs_status"] = "Observation Status"

        values.append(value)

    if not value_found:
        msg = {"provider_name": self.provider_name,
               "dataset_code": self.dataset_code}
        raise errors.RejectEmptySeries(**msg)

    series['values'] = sorted(values, key=lambda x: x["ordinal"])
    series['provider_name'] = self.provider_name
    series['dataset_code'] = self.dataset_code
    series['start_date'] = series['values'][0]["ordinal"]
    series['end_date'] = series['values'][-1]["ordinal"]
    series['dimensions'] = {'country': self.current_country}
    series['attributes'] = None
    return series
def _process(self):
    """Generator yielding (row, None) for each data row of every WEO "all
    groups" URL, skipping URLs whose release date shows the dataset is
    already up to date. Ends with a (None, None) sentinel.
    """
    for url in self.urls:

        #TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        # FIX: raw string - "\w" in a plain literal is an invalid escape
        # (DeprecationWarning today, a SyntaxError in future Python versions)
        date_str = match(r".*WEO(\w{7})", url).groups()[0]  #Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")  #2006-09-01 00:00:00

        if not self._is_updated():
            # lazy %-args: message only rendered when INFO is enabled
            logger.info("upsert dataset[%s] bypass because is updated from release_date[%s]",
                        self.dataset_code, self.release_date)
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]", url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:
            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            # columns 8..second-to-last hold the year labels
            self.years = self.sheet.fieldnames[8:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)
            for row in self.sheet:
                # a row without 'Country Group Name' marks the end of the data
                if not row or not row.get('Country Group Name'):
                    break
                yield row, None

    yield None, None
def _process(self):
    """Generator yielding (row, None) for each country data row of every WEO
    URL, skipping URLs whose release date shows the dataset is already up to
    date. Ends with a (None, None) sentinel.
    """
    for url in self.urls:

        #TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        # FIX: raw string - "\w" in a plain literal is an invalid escape
        # (DeprecationWarning today, a SyntaxError in future Python versions)
        date_str = match(r".*WEO(\w{7})", url).groups()[0]  #Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")  #2006-09-01 00:00:00

        if not self.is_updated():
            # lazy %-args: message only rendered when INFO is enabled
            logger.info("upsert dataset[%s] bypass because is updated from release_date[%s]",
                        self.dataset_code, self.release_date)
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]", url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:
            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            # columns 9..second-to-last hold the year labels
            self.years = self.sheet.fieldnames[9:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)
            for row in self.sheet:
                # a row without 'Country' marks the end of the data
                if not row or not row.get('Country'):
                    break
                yield row, None

        #self.dataset.update_database(save_only=True)

    yield None, None
def _load_datas(self, datas=None):
    """Load dataset rows from the remote zip archive or an in-memory string.

    Sets parsing state on self and mirrors dimension_keys / last_update onto
    the dataset, plus the start/end ordinals of the period range.
    """
    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(
            url=self.url,
            store_filepath=self.store_path,
            filename=self.filename,
            use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # FIX: register the *extracted* file for deletion - the original
        # appended zip_filepath a second time and leaked the extracted file
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[
        self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(
        **kwargs)

    self.dataset.dimension_keys = self.dimension_keys

    #TODO: if "frequency" in self.dataset.dimension_keys:
    #    self.dataset.set_dimension_frequency("frequency")

    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def build_series(self, datas):
    """Build one series document for the current indicator/country pair."""
    points = datas["datas"]

    indicator = self.current_indicator
    country_name = self.available_countries[self.current_country]["name"]

    base_frequency = self._search_frequency(points[0])
    series = {
        'key': "%s.%s" % (indicator["id"], self.current_country),
        'name': "%s - %s" % (indicator["name"], country_name),
        'frequency': base_frequency,
    }
    if indicator.get("sourceNote"):
        series["notes"] = indicator.get("sourceNote")

    values = []
    for point in points:
        point_frequency = self._search_frequency(point)
        if point_frequency != base_frequency:
            raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (
                point_frequency, base_frequency, series['key']))

        entry = {
            'attributes': None,
            'release_date': self.release_date,
            # NOTE(review): a missing observation (None) is stored as the
            # string "None" here - sibling build_series variants store "";
            # confirm whether that difference is intentional
            'value': str(point["value"]),
            'ordinal': get_ordinal_from_period(point["date"], freq=base_frequency),
            'period': point["date"],
        }

        obs_status = point.get("obs_status")
        if "obs_status" in point and obs_status and len(obs_status) > 0:
            entry["attributes"] = {"obs_status": obs_status}

        values.append(entry)

    values.sort(key=lambda item: item["ordinal"])

    series['values'] = values
    series['provider_name'] = self.provider_name
    series['dataset_code'] = self.dataset_code
    series['start_date'] = values[0]["ordinal"]
    series['end_date'] = values[-1]["ordinal"]
    series['dimensions'] = {'country': self.current_country}
    series['attributes'] = None
    return series
def build_series(self, row):
    """Build a series BSON document from one CSV row.

    Each dimension cell is "<short_id>:<long_label>"; the cell is split once
    per dimension (the original split the same cell three times) and the
    long labels are reused for the series name.
    """
    series_key = row['KEY']

    dimensions = OrderedDict()
    long_labels = []
    for d in self.dimension_keys:
        parts = row[d].split(":")
        dim_short_id = parts[0]
        dim_long_id = parts[1]
        dimensions[d] = dim_short_id
        long_labels.append(dim_long_id)
        # register code -> label in the dataset codelists
        if not d in self.dataset.codelists:
            self.dataset.codelists[d] = {}
        self.dataset.codelists[d][dim_short_id] = dim_long_id
        #dimensions[d] = self.dimension_list.update_entry(d, dim_short_id, dim_long_id)

    series_name = " - ".join(long_labels)

    values = []
    for period in self.periods:
        values.append({
            'attributes': None,
            'release_date': self.release_date,
            'ordinal': get_ordinal_from_period(period, freq=self.frequency),
            #'period_o': period,
            'period': period,
            'value': row[period]
        })

    return {'provider_name': self.dataset.provider_name,
            'dataset_code': self.dataset.dataset_code,
            'name': series_name,
            'key': series_key,
            'values': values,
            'attributes': None,
            'dimensions': dimensions,
            'last_update': self.release_date,
            'start_date': self.start_date,
            'end_date': self.end_date,
            'frequency': self.frequency}
def update_sheet(self):
    """Advance to the next worksheet (fetching a new file when the current
    iterator is exhausted) and refresh per-sheet state: the data-column
    iterator, the period labels, the frequency and start/end ordinals.

    Raises errors.RejectFrequency for any sheet name other than
    'annual' / 'quarterly' / 'monthly' (daily is not supported yet).
    """
    try:
        self.sheet = next(self.sheets)
    except StopIteration:
        self.update_file()
        self.sheet = next(self.sheets)

    # data columns start at index 1; column 0 holds the period labels
    self.columns = iter(range(1, self.sheet.row_len(0)))

    period_cells = self.sheet.col_slice(0, start_rowx=2)
    first_cell = period_cells[0].value
    last_cell = period_cells[-1].value
    self.periods = []

    sheet_name = self.sheet.name
    if sheet_name == 'annual':
        self.frequency = 'A'
        self.start_date = get_ordinal_from_period(str(int(first_cell)), freq='A')
        self.end_date = get_ordinal_from_period(str(int(last_cell)), freq='A')
        self.periods = [str(int(cell.value)) for cell in period_cells]
    elif sheet_name == 'quarterly':
        self.frequency = 'Q'
        self.start_date = get_ordinal_from_period(first_cell, freq='Q')
        self.end_date = get_ordinal_from_period(last_cell, freq='Q')
        self.periods = [cell.value for cell in period_cells]
    elif sheet_name == 'monthly':
        # labels arrive as "1990M1" -> normalize to "1990-1"
        self.frequency = 'M'
        self.start_date = get_ordinal_from_period(first_cell.replace('M', '-'), freq='M')
        self.end_date = get_ordinal_from_period(last_cell.replace('M', '-'), freq='M')
        self.periods = [cell.value.replace('M', '-') for cell in period_cells]
    else:
        # 'daily' sheets (see historical TODO) remain unsupported
        raise errors.RejectFrequency(provider_name=self.provider_name,
                                     dataset_code=self.dataset_code,
                                     frequency=sheet_name)

    self.dataset.add_frequency(self.frequency)
def _get_datas(self):
    """Generator over every usable worksheet of every Excel file in the zip.

    Yields (settings, None) tuples, one per data column, where ``settings``
    carries the sheet, the column index, the decoded period labels and a
    partial series ``bson`` (frequency + start/end ordinals).

    Raises errors.RejectFrequency for sheet names other than
    annual/quarterly/monthly.
    """
    _zipfile = zipfile.ZipFile(self.filepath)

    for fname in _zipfile.namelist():

        info = _zipfile.getinfo(fname)

        # bypass directory
        if info.file_size == 0 or info.filename.endswith("/"):
            continue

        # this workbook is skipped here
        if "Commodity Prices" in fname:
            logger.warning("bypass %s" % fname)
            continue

        # if not self.release_date:
        #    last_update = clean_datetime(datetime(*self.zipfile.getinfo(fname).date_time[0:6]))

        # strip the 5-char extension to obtain the series name
        series_name = fname[:-5]

        logger.info("open excel file[%s] - series.name[%s]" % (fname, series_name))

        excel_book = xlrd.open_workbook(file_contents=_zipfile.read(fname))

        for sheet in excel_book.sheets():

            # skip default/empty sheets (English and French workbook defaults)
            if sheet.name in [
                "Sheet1", "Sheet2", "Sheet3", "Sheet4",
                "Feuille1", "Feuille2", "Feuille3", "Feuille4",
            ]:
                continue

            # column 0 holds the period labels, data starts at row 2
            periods = sheet.col_slice(0, start_rowx=2)
            start_period = periods[0].value
            end_period = periods[-1].value

            frequency = None
            start_date = None
            end_date = None

            if sheet.name == "annual":
                frequency = "A"
                start_date = get_ordinal_from_period(str(int(start_period)), freq="A")
                end_date = get_ordinal_from_period(str(int(end_period)), freq="A")
                periods = [str(int(p.value)) for p in periods]
            elif sheet.name == "quarterly":
                frequency = "Q"
                start_date = get_ordinal_from_period(start_period, freq="Q")
                end_date = get_ordinal_from_period(end_period, freq="Q")
                periods = [p.value for p in periods]
            elif sheet.name == "monthly":
                # monthly labels arrive as "1990M1" -> normalize to "1990-1"
                frequency = "M"
                start_date = get_ordinal_from_period(start_period.replace("M", "-"), freq="M")
                end_date = get_ordinal_from_period(end_period.replace("M", "-"), freq="M")
                periods = [p.value.replace("M", "-") for p in periods]
            # elif sheet.name == 'daily':
            #    frequency = 'D'
            #    start_date = self._translate_daily_dates(start_period)
            #    end_date = self._translate_daily_dates(end_period)
            #    TODO: periods = [p.value for p in periods]
            else:
                # any other sheet name is an unsupported frequency
                msg = {
                    "provider_name": self.provider_name,
                    "dataset_code": self.dataset_code,
                    "frequency": sheet.name,
                }
                raise errors.RejectFrequency(**msg)

            self.dataset.add_frequency(frequency)

            # one settings payload per data column (column 0 = periods)
            columns = iter(range(1, sheet.row_len(0)))
            for column in columns:
                settings = {
                    "column": column,
                    "sheet": sheet,
                    "periods": periods,
                    "series_name": series_name,
                    "bson": {"frequency": frequency, "start_date": start_date, "end_date": end_date},
                }
                yield settings, None
def test_get_ordinal_from_period(self):
    """
    >>> pd.Period("1970-Q1", freq="Q").ordinal
    0
    >>> pd.Period("1970-Q2", freq="Q").ordinal
    1
    >>> pd.Period("1970-Q3", freq="Q").ordinal
    2
    >>> pd.Period("1970-Q4", freq="Q").ordinal
    3
    >>> pd.Period("1971-Q1", freq="Q").ordinal
    4
    >>> pd.Period("1969-Q1", freq="Q").ordinal
    -4
    >>> pd.Period("1969-Q4", freq="Q").ordinal
    -1
    >>> pd.Period("1968-Q1", freq="Q").ordinal
    -8
    >>> pd.Period('1970', freq='A')
    Period('1970', 'A-DEC')
    >>> pd.Period('1970', freq='A').ordinal
    0
    >>> pd.Period('1970', freq='M').ordinal
    0
    >>> pd.Period('1970-01', freq='M').ordinal
    0
    >>> pd.Period('1970-02', freq='M').ordinal
    1
    >>> pd.Period('1969-12', freq='M').ordinal
    -1
    >>> pd.Period('1968-01', freq='M').ordinal
    -24
    >>> pd.Period('1971-01', freq='M').ordinal
    12
    >>> pd.Period('1969-01', freq='M').ordinal
    -12
    >>> pd.Period('1970-07', freq='M').ordinal
    6
    >>> pd.Period('1971-07', freq='M').ordinal
    18
    >>> pd.Period('1969-07', freq='M').ordinal
    -6
    """
    TEST_VALUES = [("1970", "A", 0),
                   ("1969", "A", -1),
                   ("1971", "A", 1),
                   ("1970-01-01", "A", 0),
                   ("19700101", "A", 0),
                   ("1970-01", "M", 0),
                   ("197001", "M", 0),
                   ("1970-02", "M", 1),
                   ("1969-12", "M", -1),
                   ("1969-01", "M", -12),
                   ("1971-01", "M", 12),
                   ("1970-07", "M", 6),
                   ("1971-07", "M", 18),
                   ("1969-07", "M", -6),
                   ("1970-Q1", "Q", 0),
                   ("1970Q1", "Q", 0),
                   ("1968-Q1", "Q", -8)]

    # first pass: uncached behaviour
    for date_str, freq, result in TEST_VALUES:
        _value = utils.get_ordinal_from_period(date_str, freq)
        msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (
            date_str, freq, result, _value)
        # FIX: assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(_value, result, msg)

    # second pass: the same values must hold once the cache is enabled
    cache.configure_cache()

    for date_str, freq, result in TEST_VALUES:
        _value = utils.get_ordinal_from_period(date_str, freq)
        msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (
            date_str, freq, result, _value)
        self.assertEqual(_value, result, msg)
def _register_codelist(self, concept, code, label):
    # Ensure dataset.codelists[concept] exists and map code -> label once.
    codelist = self.dataset.codelists.setdefault(concept, {})
    if code not in codelist:
        codelist[code] = label

def build_series(self, row):
    """Build one WEO series document from a tab-separated row.

    Registers every dimension/attribute code in the dataset codelists,
    flags observations from 'Estimates Start After' onwards as estimates
    and attaches subject/country notes when present.
    """
    dimensions = {}
    attributes = {}

    #'WEO Subject Code': (BCA, Current account balance)
    dimensions['WEO Subject Code'] = self.dimension_list.update_entry(
        'WEO Subject Code', row['WEO Subject Code'], row['Subject Descriptor'])
    self._register_codelist('WEO Subject Code',
                            dimensions['WEO Subject Code'],
                            row['Subject Descriptor'])

    #'ISO': (DEU, Germany)
    dimensions['ISO'] = self.dimension_list.update_entry(
        'ISO', row['ISO'], row['Country'])
    self._register_codelist('ISO', dimensions['ISO'], row['Country'])

    #'WEO Country Code': (134, Germany)
    dimensions['WEO Country Code'] = self.dimension_list.update_entry(
        'WEO Country Code', row['WEO Country Code'], row['Country'])
    self._register_codelist('WEO Country Code',
                            dimensions['WEO Country Code'], row['Country'])

    #'Units': (2, U.S. dollars)
    dimensions['Units'] = self.dimension_list.update_entry(
        'Units', '', row['Units'])
    self._register_codelist('Units', dimensions['Units'], row['Units'])

    attributes['Scale'] = self.attribute_list.update_entry(
        'Scale', '',  #row['Scale'],
        row['Scale'])
    self._register_codelist('Scale', attributes['Scale'], row['Scale'])

    #'BCA.DEU.2'
    # TODO: <Series FREQ="A" WEO Country Code="122" INDICATOR="AIP_IX" SCALE="0" SERIESCODE="122AIP_IX.A" BASE_YEAR="2010" TIME_FORMAT="P1Y" xmlns="http://dataservices.imf.org/compact/IFS">
    series_key = "%s.%s.%s" % (dimensions['WEO Subject Code'],
                               dimensions['ISO'],
                               dimensions['Units'])

    #'Current account balance - Germany - U.S. dollars',
    series_name = "%s - %s - %s" % (row['Subject Descriptor'],
                                    row['Country'],
                                    row['Units'])

    # observations from this year onwards are estimates
    estimation_start = None
    if row['Estimates Start After']:
        estimation_start = int(row['Estimates Start After'])

    values = []
    for period in self.years:
        value = {
            'attributes': None,
            'release_date': self.release_date,
            'ordinal': get_ordinal_from_period(period, freq=self.frequency),
            'period': period,
            'value': row[period].replace(',', '')  # strip thousands separators
        }
        if estimation_start and int(period) >= estimation_start:
            value["attributes"] = {'flag': 'e'}
        values.append(value)

    bson = {
        'provider_name': self.dataset.provider_name,
        'dataset_code': self.dataset.dataset_code,
        'name': series_name,
        'key': series_key,
        'values': values,
        'attributes': attributes,
        'dimensions': dimensions,
        'last_update': self.release_date,
        'start_date': self.start_date,
        'end_date': self.end_date,
        'frequency': self.frequency
    }

    notes = []
    if row['Subject Notes']:
        notes.append(row['Subject Notes'])
    if row['Country/Series-specific Notes']:
        notes.append(row['Country/Series-specific Notes'])
    if notes:
        bson["notes"] = "\n".join(notes)

    return bson
def build_series(self):
    """Build one series for the next data column of the current sheet.

    Advances the column iterator (moving to the next sheet when exhausted),
    resolves the column header to a commodity or country dimension, then
    pairs each cell value with the pre-computed period labels.
    """
    try:
        column = next(self.columns)
    except StopIteration:
        # current sheet exhausted: load the next one and restart its columns
        self.update_sheet()
        column = next(self.columns)

    dimensions = {}

    # header cell of the data column (row 0)
    col_header = self.sheet.cell_value(0, column)

    if self.series_name == 'Commodity Prices':
        # NOTE(review): update_entry uses concept 'Commodity' (capitalized)
        # while the codelist key is "commodity" - confirm the mismatch is
        # intentional
        dimensions['commodity'] = self.dimension_list.update_entry('Commodity', '', col_header)
        if not col_header in self.dataset.codelists["commodity"]:
            self.dataset.codelists["commodity"][col_header] = col_header
    else:
        if col_header in self.available_countries:
            dimensions['country'] = self.available_countries[col_header]["id"]
        elif col_header in self.manual_countries:
            dimensions['country'] = self.manual_countries[col_header]
        else:
            # unknown country label: fall back to the raw header as the code
            logger.warning("country not found [%s]" % col_header)
            #self.countries_not_found.add(col_header)
            dimensions['country'] = self.dimension_list.update_entry('country', '', col_header)
        if not dimensions['country'] in self.dataset.codelists["country"]:
            self.dataset.codelists["country"][dimensions['country']] = col_header

    # one observation per period; data values start at row 2
    values = []
    _values = [str(v) for v in self.sheet.col_values(column, start_rowx=2)]
    for i, v in enumerate(_values):
        value = {
            'attributes': None,
            'release_date': self.last_update,
            'ordinal': get_ordinal_from_period(self.periods[i], freq=self.frequency),
            'period': self.periods[i],  #str(period),
            'value': v
        }
        values.append(value)

    series = {}
    series['values'] = values

    series_key = self.series_name.replace(' ', '_').replace(',', '')
    # don't add a period if there is already one
    if series_key[-1] != '.':
        series_key += '.'
    series_key += col_header + '.' + self.frequency

    series['provider_name'] = self.provider_name
    series['dataset_code'] = self.dataset_code
    series['name'] = self.series_name + ' - ' + col_header + ' - ' + self.freq_long_name[self.frequency]
    series['key'] = series_key
    #series['values'] = values
    series['attributes'] = None
    series['dimensions'] = dimensions
    series['last_update'] = self.last_update
    #series['release_dates'] = release_dates
    series['start_date'] = self.start_date
    series['end_date'] = self.end_date
    series['frequency'] = self.frequency
    return series
def parse_dates(column):
    """Detect the frequency and date range of a worksheet date column.

    Scans for the first annual or quarterly date label, then walks forward
    validating that subsequent labels form a contiguous sequence.

    Returns (freq, start_date, end_date, first_row, last_row) where freq is
    'A' or 'Q' and the dates are period ordinals. Raises Exception on
    unrecognized or non-contiguous sequences.
    """
    # locate the first recognizable date label
    for row_nbr, c in enumerate(column):
        if type(c) is not str:
            continue
        matches = re.match(REGEX_ANNUAL, c)
        if matches:
            freq = 'A'
            start_year = int(matches.group(1))
            end_year = start_year
            first_row = row_nbr
            last_row = first_row
            break
        matches = re.match(REGEX_QUARTER, c)
        if matches:
            freq = 'Q'
            start_year = int(matches.group(1))
            start_quarter = parse_quarter(matches.group(2))
            # checking next year beginning
            matches = re.match(REGEX_QUARTER, column[row_nbr + 5 - start_quarter])
            if (not matches) or int(matches.group(1)) != start_year + 1:
                raise Exception('start_date not recognized')
            end_year = start_year
            end_quarter = start_quarter
            first_row = row_nbr
            last_row = first_row
            break
        if (row_nbr + 1) == len(column):
            raise Exception('start_date not recognized')

    if freq == 'A':
        # walk forward while rows continue the annual sequence
        for c in column[first_row + 1:]:
            if type(c) is not str:
                break
            matches = re.match(REGEX_ANNUAL, c)
            if not matches:
                break
            else:
                next_year = int(matches.group(1))
                if next_year != end_year + 1:
                    raise Exception('error in year sequence')
                end_year = next_year
                last_row = last_row + 1
    else:
        # walk forward while rows continue the quarterly sequence
        for c in column[first_row + 1:]:
            if type(c) is not str:
                break
            matches = re.match(REGEX_QUARTER, c)
            if not matches:
                break
            elif matches.group(1):
                # row carries a year label: must be the next year's Q1
                next_year = int(matches.group(1))
                if next_year != end_year + 1:
                    raise Exception('error in year sequence')
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != 1:
                    raise Exception('first quarter of the year is not 1')
                end_year = next_year
            else:
                # quarter-only label: must follow the previous quarter
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != end_quarter + 1:
                    raise Exception('error in quarter sequence')
            end_quarter = next_quarter
            last_row = last_row + 1

    if freq == 'A':
        start_date = get_ordinal_from_period(str(start_year), freq='A')
        end_date = get_ordinal_from_period(str(end_year), freq='A')
    elif freq == 'Q':
        start_date = get_ordinal_from_period('%sQ%s' % (start_year, start_quarter), freq='Q')
        end_date = get_ordinal_from_period('%sQ%s' % (end_year, end_quarter), freq='Q')

    return (freq, start_date, end_date, first_row, last_row)
def _get_datas(self):
    """Generator yielding (settings, None) per data column of each worksheet.

    Scans every Excel workbook inside the zip (skipping directories and the
    'Commodity Prices' workbook), decodes the period column of each data
    sheet and emits one settings payload per data column.

    Raises errors.RejectFrequency for sheet names other than
    annual/quarterly/monthly.
    """
    _zipfile = zipfile.ZipFile(self.filepath)

    for fname in _zipfile.namelist():

        info = _zipfile.getinfo(fname)

        #bypass directory
        if info.file_size == 0 or info.filename.endswith('/'):
            continue

        # this workbook is skipped here
        if 'Commodity Prices' in fname:
            logger.warning("bypass %s" % fname)
            continue

        #if not self.release_date:
        #    last_update = clean_datetime(datetime(*self.zipfile.getinfo(fname).date_time[0:6]))

        # strip the 5-char extension to obtain the series name
        series_name = fname[:-5]

        logger.info("open excel file[%s] - series.name[%s]" % (fname, series_name))

        excel_book = xlrd.open_workbook(file_contents=_zipfile.read(fname))

        for sheet in excel_book.sheets():

            # skip default/empty sheets (English and French workbook defaults)
            if sheet.name in ['Sheet1', 'Sheet2', 'Sheet3', 'Sheet4',
                              'Feuille1', 'Feuille2', 'Feuille3', 'Feuille4']:
                continue

            # column 0 carries the period labels, data starts at row 2
            periods = sheet.col_slice(0, start_rowx=2)
            start_period = periods[0].value
            end_period = periods[-1].value

            frequency = None
            start_date = None
            end_date = None

            if sheet.name == 'annual':
                frequency = 'A'
                start_date = get_ordinal_from_period(str(
                    int(start_period)), freq='A')
                end_date = get_ordinal_from_period(str(int(end_period)),
                                                   freq='A')
                periods = [str(int(p.value)) for p in periods]
            elif sheet.name == 'quarterly':
                frequency = 'Q'
                start_date = get_ordinal_from_period(start_period, freq='Q')
                end_date = get_ordinal_from_period(end_period, freq='Q')
                periods = [p.value for p in periods]
            elif sheet.name == 'monthly':
                # monthly labels arrive as "1990M1" -> normalize to "1990-1"
                frequency = 'M'
                start_date = get_ordinal_from_period(start_period.replace(
                    'M', '-'), freq='M')
                end_date = get_ordinal_from_period(end_period.replace(
                    'M', '-'), freq='M')
                periods = [p.value.replace('M', '-') for p in periods]
            #elif sheet.name == 'daily':
            #    frequency = 'D'
            #    start_date = self._translate_daily_dates(start_period)
            #    end_date = self._translate_daily_dates(end_period)
            #    TODO: periods = [p.value for p in periods]
            else:
                # any other sheet name is an unsupported frequency
                msg = {
                    "provider_name": self.provider_name,
                    "dataset_code": self.dataset_code,
                    "frequency": sheet.name
                }
                raise errors.RejectFrequency(**msg)

            self.dataset.add_frequency(frequency)

            # one payload per data column (column 0 = periods)
            columns = iter(range(1, sheet.row_len(0)))
            for column in columns:
                settings = {
                    "column": column,
                    "sheet": sheet,
                    "periods": periods,
                    "series_name": series_name,
                    "bson": {
                        "frequency": frequency,
                        "start_date": start_date,
                        "end_date": end_date,
                    }
                }
                yield settings, None
def test_get_ordinal_from_period(self):
    """
    >>> pd.Period("1970-Q1", freq="Q").ordinal
    0
    >>> pd.Period("1970-Q2", freq="Q").ordinal
    1
    >>> pd.Period("1970-Q3", freq="Q").ordinal
    2
    >>> pd.Period("1970-Q4", freq="Q").ordinal
    3
    >>> pd.Period("1971-Q1", freq="Q").ordinal
    4
    >>> pd.Period("1969-Q1", freq="Q").ordinal
    -4
    >>> pd.Period("1969-Q4", freq="Q").ordinal
    -1
    >>> pd.Period("1968-Q1", freq="Q").ordinal
    -8
    >>> pd.Period('1970', freq='A')
    Period('1970', 'A-DEC')
    >>> pd.Period('1970', freq='A').ordinal
    0
    >>> pd.Period('1970', freq='M').ordinal
    0
    >>> pd.Period('1970-01', freq='M').ordinal
    0
    >>> pd.Period('1970-02', freq='M').ordinal
    1
    >>> pd.Period('1969-12', freq='M').ordinal
    -1
    >>> pd.Period('1968-01', freq='M').ordinal
    -24
    >>> pd.Period('1971-01', freq='M').ordinal
    12
    >>> pd.Period('1969-01', freq='M').ordinal
    -12
    >>> pd.Period('1970-07', freq='M').ordinal
    6
    >>> pd.Period('1971-07', freq='M').ordinal
    18
    >>> pd.Period('1969-07', freq='M').ordinal
    -6
    """
    TEST_VALUES = [
        ("1970", "A", 0),
        ("1969", "A", -1),
        ("1971", "A", 1),
        ("1970-01-01", "A", 0),
        ("19700101", "A", 0),
        ("1970-01", "M", 0),
        ("197001", "M", 0),
        ("1970-02", "M", 1),
        ("1969-12", "M", -1),
        ("1969-01", "M", -12),
        ("1971-01", "M", 12),
        ("1970-07", "M", 6),
        ("1971-07", "M", 18),
        ("1969-07", "M", -6),
        ("1970-Q1", "Q", 0),
        ("1970Q1", "Q", 0),
        ("1968-Q1", "Q", -8)
    ]

    # first pass: uncached behaviour
    for date_str, freq, result in TEST_VALUES:
        _value = utils.get_ordinal_from_period(date_str, freq)
        msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (date_str,
                                                                 freq,
                                                                 result,
                                                                 _value)
        # FIX: assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(_value, result, msg)

    # second pass: the same values must hold once the cache is enabled
    cache.configure_cache()

    for date_str, freq, result in TEST_VALUES:
        _value = utils.get_ordinal_from_period(date_str, freq)
        msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (date_str,
                                                                 freq,
                                                                 result,
                                                                 _value)
        self.assertEqual(_value, result, msg)
def _build_series(self, group, p_series, obs):
    """Build one series BSON from group + series dimensions and observations.

    Pass 1 over *obs* registers every observation-level attribute key on the
    dataset (attribute_keys / concepts / codelists); pass 2 builds the value
    documents. Dimension codes are then resolved through the dataset
    codelists and the series key/name derived from the dimensions.
    """
    dimensions = OrderedDict()
    attributes = OrderedDict()
    bson = OrderedDict()

    # merged dimension dict: group-level keys overridden by series-level ones
    dim = group.copy()
    dim.update(p_series)

    # per-key list of attribute values across all observations
    attrib = defaultdict(list)

    frequency, start_date, end_date = get_dates(dim, obs)

    self.dataset.add_frequency(frequency)

    values = list()

    # pass 1: discover attribute keys and register their codes
    for v in obs:
        Obs_attribute_keys = [
            k for k in v.keys() if k not in ['time-period', 'obs-value']
        ]
        for key in Obs_attribute_keys:
            if key not in self.dataset.attribute_keys:
                self.dataset.attribute_keys.append(key)
                self.dataset.concepts[key] = key
                self.dataset.codelists[key] = {}
            if v.get(key) not in self.dataset.codelists[key]:
                self.dataset.codelists[key][v.get(key)] = v.get(key)

    # pass 2: build one value document per observation
    for v in obs:
        period = v['time-period']
        a = OrderedDict()
        for k in self.dataset.attribute_keys:
            try:
                a[k] = v[k]
            except KeyError:
                # attribute absent on this observation: store empty string
                a[k] = ''
            attrib[k].append(a[k])
        value = {
            'attributes': a,
            'release_date': self.release_date,
            'ordinal': get_ordinal_from_period(period, freq=frequency),
            'period': period,
            'value': v['obs-value']
        }
        values.append(value)

    for key in self.dataset.dimension_keys:
        dimensions[key] = self.dimension_list.update_entry(
            key, dim[key], self.dataset.codelists[key][slugify(dim[key])])

    for key in self.dataset.attribute_keys:
        attributes[key] = self.attribute_list.update_entry(
            key, str(attrib[key]), attrib[key])

    serie_key = self.fix_series_keys(dimensions)
    serie_name = self.fix_series_names(dim, serie_key)

    bson['values'] = values
    bson['provider_name'] = self.provider_name
    bson['dataset_code'] = self.dataset_code
    bson['name'] = serie_name
    bson['key'] = str(serie_key)
    bson['start_date'] = start_date
    bson['end_date'] = end_date
    bson['last_update'] = self.release_date
    bson['dimensions'] = dimensions
    bson['frequency'] = frequency
    bson['attributes'] = attributes
    return bson
def get_dates(dim, obs):
    """Return (frequency, start_ordinal, end_ordinal) for an observation list.

    The frequency comes from dim['freq']; the range spans the first and last
    observations' 'time-period' values.
    """
    freq = dim['freq']
    first, last = obs[0], obs[-1]
    return (
        freq,
        get_ordinal_from_period(first['time-period'], freq=freq),
        get_ordinal_from_period(last['time-period'], freq=freq),
    )
def build_series(self, datas):
    """Build one series document (country + indicator + frequency variant).

    Registers the indicator/frequency/obs_status codes on the dataset,
    strips the sorting-only 'ordinal' field from the final values, and
    raises errors.RejectEmptySeries when every observation is empty.
    """
    datas = datas["datas"]

    series = {}
    series["last_update"] = self.release_date
    series["frequency"] = self._search_frequency(datas[0])
    series["key"] = "%s.%s.%s" % (self.current_indicator["id"],
                                  self.current_country,
                                  series["frequency"])
    series["name"] = "%s - %s - %s" % (
        self.current_indicator["name"],
        self.available_countries[self.current_country]["name"],
        constants.FREQUENCIES_DICT[series["frequency"]],
    )

    # if self.current_indicator.get("sourceNote"):
    #    series["notes"] = self.current_indicator.get("sourceNote")

    values = []
    value_found = False

    for point in datas:
        frequency = self._search_frequency(point)
        if frequency != series["frequency"]:
            raise Exception(
                "Diff frequency [%s] != [%s] - series[%s]"
                % (frequency, series["frequency"], series["key"])
            )

        raw_value = point["value"]
        value = {
            "attributes": None,
            # FIX: map a missing observation (None) to "" explicitly; the
            # original str(...).replace("None", "") would also mangle any
            # legitimate value containing the substring "None"
            "value": "" if raw_value is None else str(raw_value),
            # tmp value - removed again below after sorting
            "ordinal": get_ordinal_from_period(point["date"], freq=series["frequency"]),
            "period": point["date"],
        }

        if not value_found and value["value"] != "":
            value_found = True

        if "obs_status" in point:
            obs_status = point.get("obs_status")
            if obs_status and len(obs_status) > 0:
                value["attributes"] = {"obs_status": obs_status}
                # register the obs_status concept/codelist on first use
                if not "obs_status" in self.dataset.codelists:
                    self.dataset.codelists["obs_status"] = self.obs_status
                if not "obs_status" in self.dataset.concepts:
                    self.dataset.concepts["obs_status"] = "Observation Status"
                if not "obs_status" in self.dataset.attribute_keys:
                    self.dataset.attribute_keys.append("obs_status")

        values.append(value)

    if not value_found:
        msg = {"provider_name": self.provider_name,
               "dataset_code": self.dataset_code}
        raise errors.RejectEmptySeries(**msg)

    series["values"] = sorted(values, key=lambda x: x["ordinal"])
    series["provider_name"] = self.provider_name
    series["dataset_code"] = self.dataset_code
    series["start_date"] = series["values"][0]["ordinal"]
    series["end_date"] = series["values"][-1]["ordinal"]

    # PATCH - ordinal was only needed for sorting and start/end extraction
    for v in series["values"]:
        v.pop("ordinal")

    series["dimensions"] = {
        "country": self.current_country,
        "indicator": self.current_indicator["id"],
        "frequency": series["frequency"],
    }

    if not self.current_indicator["id"] in self.dataset.codelists["indicator"]:
        self.dataset.codelists["indicator"][self.current_indicator["id"]] = self.current_indicator["name"]

    if not series["frequency"] in self.dataset.codelists["frequency"]:
        self.dataset.codelists["frequency"][series["frequency"]] = constants.FREQUENCIES_DICT[series["frequency"]]

    series["attributes"] = None

    self.dataset.add_frequency(series["frequency"])

    return series
def parse_dates(column):
    """Detect the frequency and date span covered by a column of date labels.

    Scans *column* for the first cell matching an annual (REGEX_ANNUAL) or
    quarterly (REGEX_QUARTER) label, then consumes the following cells as
    long as they continue a consistent annual or quarterly sequence.

    Returns:
        (freq, start_date, end_date, first_row, last_row) where freq is
        'A' or 'Q', the dates are period ordinals (get_ordinal_from_period)
        and the rows are indexes into *column*.

    Raises:
        Exception: if no date label is found, or the year/quarter sequence
        is inconsistent.
    """
    for row_nbr, c in enumerate(column):
        if type(c) is not str:
            continue

        matches = re.match(REGEX_ANNUAL, c)
        if matches:
            freq = 'A'
            start_year = int(matches.group(1))
            end_year = start_year
            first_row = row_nbr
            last_row = first_row
            break

        matches = re.match(REGEX_QUARTER, c)
        if matches:
            freq = 'Q'
            start_year = int(matches.group(1))
            start_quarter = parse_quarter(matches.group(2))
            # checking next year beginning
            # NOTE(review): assumes the column holds at least one full year
            # of quarterly labels; a shorter column raises IndexError here.
            matches = re.match(REGEX_QUARTER, column[row_nbr + 5 - start_quarter])
            if (not matches) or int(matches.group(1)) != start_year + 1:
                raise Exception('start_date not recognized')
            end_year = start_year
            end_quarter = start_quarter
            first_row = row_nbr
            last_row = first_row
            break
    else:
        # BUGFIX: the original raised only when the *last* cell was a
        # non-matching str; an empty column or a trailing non-str cell fell
        # through and crashed with NameError on `freq` below.
        raise Exception('start_date not recognized')

    if freq == 'A':
        for c in column[first_row + 1:]:
            if type(c) is not str:
                break
            matches = re.match(REGEX_ANNUAL, c)
            if not matches:
                break
            next_year = int(matches.group(1))
            if next_year != end_year + 1:
                raise Exception('error in year sequence')
            end_year = next_year
            last_row = last_row + 1
    else:
        for c in column[first_row + 1:]:
            if type(c) is not str:
                break
            matches = re.match(REGEX_QUARTER, c)
            if not matches:
                break
            elif matches.group(1):
                # label carries a year: it must start the next year at Q1
                next_year = int(matches.group(1))
                if next_year != end_year + 1:
                    raise Exception('error in year sequence')
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != 1:
                    raise Exception('first quarter of the year is not 1')
                end_year = next_year
                # BUGFIX: reset the quarter counter on year rollover; without
                # this, the Q2 label of the second year failed the
                # quarter-sequence check below (end_quarter was still 4),
                # rejecting every quarterly series longer than one year.
                end_quarter = next_quarter
            else:
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != end_quarter + 1:
                    raise Exception('error in quarter sequence')
                end_quarter = next_quarter
            last_row = last_row + 1

    if freq == 'A':
        start_date = get_ordinal_from_period(start_year, freq='A')
        end_date = get_ordinal_from_period(end_year, freq='A')
    elif freq == 'Q':
        start_date = get_ordinal_from_period('%sQ%s' % (start_year, start_quarter), freq='Q')
        end_date = get_ordinal_from_period('%sQ%s' % (end_year, end_quarter), freq='Q')

    return (freq, start_date, end_date, first_row, last_row)
def _build_series(self, group, p_series, obs):
    """Assemble one series document (OrderedDict) from parsed observations.

    group and p_series are merged into a single dimension mapping
    (series-level entries override group-level ones); obs is an iterable
    of observation dicts keyed by 'time-period', 'obs-value' plus any
    number of attribute columns.
    """
    dimensions = OrderedDict()
    attributes = OrderedDict()
    bson = OrderedDict()

    # merge dimensions: p_series values override group values on key clash
    dim = group.copy()
    dim.update(p_series)

    attrib = defaultdict(list)

    frequency, start_date, end_date = get_dates(dim, obs)
    self.dataset.add_frequency(frequency)

    values=list()

    # First pass: register every attribute column seen in the observations
    # on the dataset (attribute_keys / concepts / codelists) before values
    # are built, so the second pass can rely on a complete key list.
    for v in obs:
        Obs_attribute_keys = [k for k in v.keys() if k not in ['time-period', 'obs-value']]
        for key in Obs_attribute_keys:
            if key not in self.dataset.attribute_keys:
                self.dataset.attribute_keys.append(key)
                self.dataset.concepts[key] = key
                self.dataset.codelists[key] = {}
            if v.get(key) not in self.dataset.codelists[key]:
                # identity codelist: the raw attribute value is its own label
                self.dataset.codelists[key][v.get(key)] = v.get(key)

    # Second pass: build one value entry per observation; attribute columns
    # missing from an observation default to '' so every value carries the
    # full attribute key set.
    for v in obs:
        period = v['time-period']
        a=OrderedDict()
        for k in self.dataset.attribute_keys:
            try:
                a[k]=v[k]
            except KeyError:
                a[k]=''
            attrib[k].append(a[k])
        value = {
            'attributes': a,
            'release_date': self.release_date,
            'ordinal': get_ordinal_from_period(period, freq=frequency),
            'period': period,
            'value': v['obs-value']
        }
        values.append(value)

    for key in self.dataset.dimension_keys:
        dimensions[key] = self.dimension_list.update_entry(key, dim[key], self.dataset.codelists[key][slugify(dim[key])])

    for key in self.dataset.attribute_keys:
        # NOTE(review): the series-level attribute entry is keyed on the
        # str() of the per-observation value list — presumably intentional,
        # but verify against attribute_list.update_entry's contract.
        attributes[key] = self.attribute_list.update_entry(key, str(attrib[key]), attrib[key])

    serie_key = self.fix_series_keys(dimensions)
    serie_name = self.fix_series_names(dim, serie_key)

    bson['values'] = values
    bson['provider_name'] = self.provider_name
    bson['dataset_code'] = self.dataset_code
    bson['name'] = serie_name
    bson['key'] = str(serie_key)
    bson['start_date'] = start_date
    bson['end_date'] = end_date
    bson['last_update'] = self.release_date
    bson['dimensions'] = dimensions
    bson['frequency'] = frequency
    bson['attributes'] = attributes
    return bson
def build_series(self, datas):
    """Convert one indicator/country payload into a series dict.

    Raises errors.RejectEmptySeries when no observation carries a value,
    and Exception when observations mix frequencies.
    """
    observations = datas["datas"]

    frequency = self._search_frequency(observations[0])
    key = "%s.%s.%s" % (self.current_indicator["id"],
                        self.current_country, frequency)
    name = "%s - %s - %s" % (self.current_indicator["name"],
                             self.available_countries[self.current_country]["name"],
                             constants.FREQUENCIES_DICT[frequency])

    def register_obs_status():
        # declare the obs_status attribute on the dataset only once
        if "obs_status" not in self.dataset.codelists:
            self.dataset.codelists["obs_status"] = self.obs_status
        if "obs_status" not in self.dataset.concepts:
            self.dataset.concepts["obs_status"] = "Observation Status"
        if "obs_status" not in self.dataset.attribute_keys:
            self.dataset.attribute_keys.append("obs_status")

    values = []
    value_found = False
    for obs in observations:
        obs_frequency = self._search_frequency(obs)
        if obs_frequency != frequency:
            raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (obs_frequency, frequency, key))

        text_value = str(obs["value"]).replace("None", "")
        value = {
            "attributes": None,
            "value": text_value,
            "ordinal": get_ordinal_from_period(obs["date"], freq=frequency),  # tmp value
            "period": obs["date"],
        }
        if text_value != "":
            value_found = True

        status = obs.get("obs_status")
        if status and len(status) > 0:
            value["attributes"] = {"obs_status": status}
            register_obs_status()

        values.append(value)

    if not value_found:
        raise errors.RejectEmptySeries(provider_name=self.provider_name,
                                       dataset_code=self.dataset_code)

    values = sorted(values, key=lambda v: v["ordinal"])

    series = {
        "last_update": self.release_date,
        "frequency": frequency,
        "key": key,
        "name": name,
        "values": values,
        "provider_name": self.provider_name,
        "dataset_code": self.dataset_code,
        "start_date": values[0]["ordinal"],
        "end_date": values[-1]["ordinal"],
        "dimensions": {
            "country": self.current_country,
            "indicator": self.current_indicator["id"],
            "frequency": frequency,
        },
        "attributes": None,
    }

    # PATCH: ordinal is only a sort/bounds helper, not stored on values
    for value in values:
        value.pop("ordinal")

    if self.current_indicator["id"] not in self.dataset.codelists["indicator"]:
        self.dataset.codelists["indicator"][self.current_indicator["id"]] = self.current_indicator["name"]
    if frequency not in self.dataset.codelists["frequency"]:
        self.dataset.codelists["frequency"][frequency] = constants.FREQUENCIES_DICT[frequency]
    self.dataset.add_frequency(frequency)
    return series