示例#1
0
    def run(self, distribution, catalog):
        """Validate one time-series distribution of a catalog.

        The distribution's CSV is read with pandas from its download URL,
        indexed by ``settings.INDEX_COLUMN`` and passed to
        ``validate_distribution`` together with its dataset and catalog.

        Args:
            distribution: distribution metadata, read here through ``.get``
                and ``[]`` lookups.
            catalog: catalog object exposing ``get_dataset``.

        Returns:
            bool: True when every step finishes without raising. This method
            never returns False; failures surface as exceptions.

        Raises:
            ValueError: if the distribution has no download URL, or, when
                ``self.read_local`` is false, a HEAD request to that URL does
                not answer with status 200.
        """
        distribution_id = distribution.get(IDENTIFIER)
        url = distribution.get(DOWNLOAD_URL)

        # A missing URL is invalid in every mode. Without this guard the
        # local-read path crashed below with AttributeError on ``None.encode``.
        url_is_invalid = not url
        if not url_is_invalid and not self.read_local:
            # Remote sources must also be reachable right now.
            url_is_invalid = requests.head(url).status_code != 200
        if url_is_invalid:
            msg = u'{} {}'.format(strings.INVALID_DISTRIBUTION_URL,
                                  distribution_id)
            raise ValueError(msg)

        # Workaround kept from the original: pandas was failing to read
        # non-ASCII URLs, so the URL is percent-encoded ('/' and ':' are
        # left untouched).
        url = url.encode('UTF-8')
        url = urllib.parse.quote(url, safe='/:')

        dataset = catalog.get_dataset(distribution[DATASET_IDENTIFIER])
        df = pd.read_csv(url, parse_dates=[settings.INDEX_COLUMN])
        df = df.set_index(settings.INDEX_COLUMN)

        validate_distribution(df, catalog, dataset, distribution)

        return True
示例#2
0
    def test_validate(self):
        """Run ``validate_distribution`` on distribution "125.1" of the
        ``data.json`` sample catalog.

        There are no explicit assertions: the test passes when nothing raises.
        """
        time_index = "indice_tiempo"
        data_json = DataJson(os.path.join(SAMPLES_DIR, "data.json"))
        distribution = data_json.get_distribution(identifier="125.1")

        # Load the CSV first, then promote the time column to the index.
        raw_frame = pd.read_csv(distribution["downloadURL"],
                                parse_dates=[time_index])
        indexed_frame = raw_frame.set_index(time_index)

        dataset = data_json.get_dataset(
            identifier=distribution["dataset_identifier"])

        validate_distribution(indexed_frame, data_json, dataset, distribution)
示例#3
0
    def test_repeated_field_id(self):
        """Run ``validate_distribution`` on distribution "125.1" of the
        ``repeated_field_id.json`` sample catalog.

        NOTE(review): no assertion or expected-exception check is visible in
        this method; presumably a decorator or harness outside this view
        expects the validation to fail -- confirm.
        """
        time_index = "indice_tiempo"
        data_json = DataJson(
            os.path.join(SAMPLES_DIR, "repeated_field_id.json"))

        distribution_meta = data_json.get_distribution(identifier="125.1")
        dataset_meta = data_json.get_dataset(
            identifier=distribution_meta["dataset_identifier"])

        # Load the CSV first, then promote the time column to the index.
        raw_frame = pd.read_csv(distribution_meta["downloadURL"],
                                parse_dates=[time_index])
        indexed_frame = raw_frame.set_index(time_index)

        validate_distribution(indexed_frame, data_json, dataset_meta,
                              distribution_meta)
示例#4
0
    def validate(self):
        """Validate this distribution's data frame against its metadata.

        Calls ``validate_distribution`` with this object's data frame, its own
        metadata, its parent's metadata (as the dataset) and its
        grandparent's metadata (as the catalog). The outcome is logged at
        debug level and any exception is re-raised unchanged.
        """
        logging.debug('Valida la distribución')

        try:
            # Everything is gathered inside the ``try`` so that a failure while
            # reading any attribute is logged as an invalid distribution too.
            validation_kwargs = {
                'df': self._df,
                'catalog': self.parent.parent.metadata,
                '_dataset_meta': self.parent.metadata,
                'distrib_meta': self.metadata,
            }
            validate_distribution(**validation_kwargs)
            logging.debug(f'Distribución {self.identifier} válida')
        except Exception:
            logging.debug(f'Distribución {self.identifier} inválida')
            raise
示例#5
0
    def run(self):
        """Scrape this distribution from its source workbook and validate it.

        The workbook is looked up in ``self.catalog_context`` by catalog
        identifier and source file name, scraped into a DataFrame with the
        parameters from ``gen_distribution_params``, and then checked by both
        ``validate_distribution_scraping`` and ``validate_distribution``.

        Returns:
            The scraped DataFrame. When ``scrape_dataframe`` returns a list,
            its items are concatenated column-wise first.

        Raises:
            Exception: anything raised while looking up, scraping or
                validating is logged at debug level and re-raised unchanged.
        """
        file_source = os.path.join(
            CATALOGS_DIR_INPUT, self.catalog_metadata.get('identifier'),
            'sources',
            self.distribution_metadata.get('scrapingFileURL').split('/')[-1])

        try:
            catalog_id = self.catalog_metadata.get('identifier')
            distribution_id = self.distribution_metadata.get('identifier')

            # ``os.path.basename`` instead of splitting on '/': the path was
            # built with ``os.path.join``, whose separator is OS-dependent.
            xl = self.catalog_context['catalog'][catalog_id]['xl'].get(
                os.path.basename(file_source))

            distribution_params = self.gen_distribution_params(
                self.catalog_metadata, distribution_id)
            distrib_meta = self.catalog_metadata.get_distribution(
                distribution_id)
            # The dataset identifier is the part of the distribution
            # identifier before the first dot.
            dataset_meta = self.catalog_metadata.get_dataset(
                distribution_id.split(".")[0])

            df = self.scrape_dataframe(xl, **distribution_params)

            if isinstance(df, list):
                df = pd.concat(df, axis=1)

            # Validations
            worksheet = distribution_params["worksheet"]
            headers_coord = distribution_params["headers_coord"]
            headers_value = distribution_params["headers_value"]

            validate_distribution_scraping(xl, worksheet, headers_coord,
                                           headers_value, distrib_meta)
            validate_distribution(df, self.catalog_metadata, dataset_meta,
                                  distrib_meta, distribution_id)

            return df

        except Exception:
            logging.debug('Falló la descarga de la distribución')
            raise
示例#6
0
    def run(self, distribution_model: Distribution, catalog: DataJson):
        """Validate one time-series distribution against its catalog.

        Builds the distribution's DataFrame with ``self.init_df``, resolves
        the dataset and distribution metadata from ``catalog`` and passes
        everything to ``validate_distribution``.

        Args:
            distribution_model: model whose ``identifier`` and
                ``dataset.identifier`` select the metadata to validate.
            catalog: catalog providing ``get_dataset`` and
                ``get_distribution``.

        Returns:
            bool: True when validation finishes without raising.

        Raises:
            ValueError: if the model's dataset has no identifier.
        """
        data_frame = self.init_df(distribution_model)

        dataset_identifier = distribution_model.dataset.identifier
        if dataset_identifier is None:
            error_message = NO_DATASET_IDENTIFIER.format(
                distribution_model.identifier)
            raise ValueError(error_message)

        dataset_meta = catalog.get_dataset(dataset_identifier)
        distribution_meta = catalog.get_distribution(
            distribution_model.identifier)

        validate_distribution(data_frame, catalog, dataset_meta,
                              distribution_meta)
        return True
def scrape_distribution(xl, catalog, distribution_identifier):
    """Scrape one distribution from a workbook and validate the result.

    Args:
        xl: workbook object handed to ``scrape_dataframe`` and
            ``validate_distribution_scraping``.
        catalog: catalog providing ``get_distribution`` and ``get_dataset``.
        distribution_identifier: dotted identifier; the part before the
            first dot is used as the dataset identifier.

    Returns:
        The scraped DataFrame. When ``scrape_dataframe`` returns a list, its
        items are concatenated column-wise first.
    """
    params = gen_distribution_params(catalog, distribution_identifier)
    distribution = catalog.get_distribution(distribution_identifier)
    dataset_id = distribution_identifier.split(".")[0]
    dataset = catalog.get_dataset(dataset_id)

    scraped = scrape_dataframe(xl, **params)
    frame = pd.concat(scraped, axis=1) if isinstance(scraped, list) else scraped

    # Validations: first the scraped headers, then the resulting frame.
    validate_distribution_scraping(
        xl,
        params["worksheet"],
        params["headers_coord"],
        params["headers_value"],
        distribution,
    )
    validate_distribution(
        frame, catalog, dataset, distribution, distribution_identifier)

    return frame
def analyze_distribution(catalog_id, catalog, dataset_id, distribution_id):
    """Load a distribution's CSV from disk and validate it.

    Args:
        catalog_id: catalog identifier used to build the CSV path.
        catalog: catalog providing ``get_distribution`` and ``get_dataset``.
        dataset_id: identifier of the dataset owning the distribution.
        distribution_id: identifier of the distribution to analyze.

    Returns:
        tuple: ``(distribution_path, df)`` -- the CSV path that was read and
        the DataFrame indexed by the ``indice_tiempo`` column.
    """
    distribution = catalog.get_distribution(distribution_id)
    dataset = catalog.get_dataset(dataset_id)

    csv_path = get_distribution_path(
        catalog_id,
        dataset["identifier"],
        distribution_id,
        CATALOGS_DIR_INPUT,
    )

    def _parse_day(value):
        # Dates are parsed with arrow using the fixed "YYYY-MM-DD" format.
        return arrow.get(value, "YYYY-MM-DD").datetime

    index_name = "indice_tiempo"
    frame = pd.read_csv(
        csv_path,
        index_col=index_name,
        parse_dates=[index_name],
        date_parser=_parse_day,
    )

    validate_distribution(
        frame, catalog, dataset, distribution, distribution_id)

    return csv_path, frame