def run(self, distribution, catalog):
    """Validate one time series distribution, read from its URL or a
    local source file.

    Args:
        distribution: metadata dict of the distribution to validate.
        catalog: catalog object that contains the distribution's dataset.

    Returns:
        bool: True if the distribution passes all validations.

    Raises:
        ValueError: if the distribution URL is missing or unreachable.
    """
    distribution_id = distribution.get(IDENTIFIER)
    url = distribution.get(DOWNLOAD_URL)

    if not self.read_local:
        # Bounded HEAD request: without a timeout an unresponsive server
        # would hang the whole validation run indefinitely.
        if not url or requests.head(url, timeout=30).status_code != 200:
            msg = u'{} {}'.format(strings.INVALID_DISTRIBUTION_URL,
                                  distribution_id)
            raise ValueError(msg)

    # Workaround for pandas failing to read non-ascii URLs: percent-encode
    # everything except the path/scheme separators.
    url = url.encode('UTF-8')
    url = urllib.parse.quote(url, safe='/:')

    dataset = catalog.get_dataset(distribution[DATASET_IDENTIFIER])
    df = pd.read_csv(url, parse_dates=[settings.INDEX_COLUMN])
    df = df.set_index(settings.INDEX_COLUMN)

    validate_distribution(df, catalog, dataset, distribution)
    return True
def test_validate(self):
    """A known-good distribution from the sample catalog passes validation."""
    catalog = DataJson(os.path.join(SAMPLES_DIR, "data.json"))
    distrib_meta = catalog.get_distribution(identifier="125.1")

    time_index = "indice_tiempo"
    df = pd.read_csv(distrib_meta["downloadURL"],
                     parse_dates=[time_index]).set_index(time_index)

    dataset_meta = catalog.get_dataset(
        identifier=distrib_meta["dataset_identifier"])

    validate_distribution(df, catalog, dataset_meta, distrib_meta)
def test_repeated_field_id(self):
    """Validate a distribution from a catalog with a repeated field id."""
    catalog = DataJson(os.path.join(SAMPLES_DIR, "repeated_field_id.json"))

    distribution = catalog.get_distribution(identifier="125.1")
    dataset = catalog.get_dataset(
        identifier=distribution["dataset_identifier"])

    time_index = "indice_tiempo"
    df = pd.read_csv(distribution["downloadURL"],
                     parse_dates=[time_index]).set_index(time_index)

    validate_distribution(df, catalog, dataset, distribution)
def validate(self):
    """Run time series validations over this distribution's dataframe.

    Logs the outcome at debug level and re-raises any validation error
    so the caller can handle it.

    Raises:
        Exception: whatever `validate_distribution` raises on failure.
    """
    logging.debug('Valida la distribución')
    try:
        validate_distribution(
            df=self._df,
            catalog=self.parent.parent.metadata,
            _dataset_meta=self.parent.metadata,
            distrib_meta=self.metadata,
        )
        # Lazy %-style args: the message is only formatted if debug
        # logging is actually enabled.
        logging.debug('Distribución %s válida', self.identifier)
    except Exception:
        logging.debug('Distribución %s inválida', self.identifier)
        raise
def run(self):
    """Scrape this distribution's dataframe from its Excel source file
    and validate it.

    Returns:
        pd.DataFrame: the scraped distribution data.

    Raises:
        Exception: re-raised from any scraping or validation failure,
            after logging it at debug level.
    """
    file_source = os.path.join(
        CATALOGS_DIR_INPUT, self.catalog_metadata.get('identifier'),
        'sources',
        self.distribution_metadata.get('scrapingFileURL').split('/')[-1])
    try:
        xl = self.catalog_context['catalog'][self.catalog_metadata.get(
            'identifier')]['xl'].get(file_source.split('/')[-1])

        # Hoisted: the identifier is needed in several lookups below.
        distribution_identifier = self.distribution_metadata.get('identifier')

        distribution_params = self.gen_distribution_params(
            self.catalog_metadata, distribution_identifier)
        distrib_meta = self.catalog_metadata.get_distribution(
            distribution_identifier)
        # Dataset id is the prefix of the distribution id ("<dataset>.<n>").
        dataset_meta = self.catalog_metadata.get_dataset(
            distribution_identifier.split(".")[0])

        df = self.scrape_dataframe(xl, **distribution_params)
        if isinstance(df, list):
            df = pd.concat(df, axis=1)

        # VALIDACIONES
        worksheet = distribution_params["worksheet"]
        headers_coord = distribution_params["headers_coord"]
        headers_value = distribution_params["headers_value"]
        validate_distribution_scraping(xl, worksheet, headers_coord,
                                       headers_value, distrib_meta)
        validate_distribution(df, self.catalog_metadata, dataset_meta,
                              distrib_meta, distribution_identifier)

        return df
    except Exception:
        logging.debug('Falló la descarga de la distribución')
        raise
def run(self, distribution_model: Distribution, catalog: DataJson):
    """Validate one time series distribution of a catalog against its
    metadata.

    Returns:
        bool: True if the distribution passes all validations.

    Raises:
        ValueError: if the distribution's dataset has no identifier.
    """
    dataframe = self.init_df(distribution_model)

    dataset_identifier = distribution_model.dataset.identifier
    if dataset_identifier is None:
        raise ValueError(
            NO_DATASET_IDENTIFIER.format(distribution_model.identifier))

    dataset = catalog.get_dataset(dataset_identifier)
    distribution = catalog.get_distribution(distribution_model.identifier)

    validate_distribution(dataframe, catalog, dataset, distribution)
    return True
def scrape_distribution(xl, catalog, distribution_identifier):
    """Scrape one distribution's dataframe out of an Excel workbook and
    run the scraping and time series validations on it.

    Returns:
        pd.DataFrame: the scraped and validated distribution data.
    """
    params = gen_distribution_params(catalog, distribution_identifier)
    distrib_meta = catalog.get_distribution(distribution_identifier)
    dataset_meta = catalog.get_dataset(distribution_identifier.split(".")[0])

    scraped = scrape_dataframe(xl, **params)
    df = pd.concat(scraped, axis=1) if isinstance(scraped, list) else scraped

    # VALIDACIONES
    validate_distribution_scraping(xl, params["worksheet"],
                                   params["headers_coord"],
                                   params["headers_value"], distrib_meta)
    validate_distribution(df, catalog, dataset_meta, distrib_meta,
                          distribution_identifier)

    return df
def analyze_distribution(catalog_id, catalog, dataset_id, distribution_id):
    """Read a distribution's CSV from local storage and validate its
    time series.

    Args:
        catalog_id: identifier of the catalog on disk.
        catalog: catalog metadata object.
        dataset_id: identifier of the dataset the distribution belongs to.
        distribution_id: identifier of the distribution to analyze.

    Returns:
        tuple: (path to the distribution CSV, DataFrame indexed by its
        time index column).
    """
    distrib_meta = catalog.get_distribution(distribution_id)
    dataset_meta = catalog.get_dataset(dataset_id)
    distribution_path = get_distribution_path(
        catalog_id, dataset_meta["identifier"], distribution_id,
        CATALOGS_DIR_INPUT)

    time_index = "indice_tiempo"
    df = pd.read_csv(
        distribution_path,
        index_col=time_index,
        parse_dates=[time_index],
        # dates must be exactly "YYYY-MM-DD"; arrow enforces the format
        date_parser=lambda x: arrow.get(x, "YYYY-MM-DD").datetime
    )

    validate_distribution(df, catalog, dataset_meta, distrib_meta,
                          distribution_id)

    return distribution_path, df