Example #1
def run_thematic_dataset_indexing(dataset: MetadataDataset):
    indexed_resources_ids = []
    metadata_dataset = ""
    if len(dataset.resources) == 0:
        return []

    for metadata in [dataset.title, dataset.notes, dataset.tags]:
        metadata_dataset += f"{metadata} "

    metadata_resources = ""
    if len(solr_dataset.search(f'id:{dataset.id}', fl='id')) != 0:
        log.info("Atualizando recursos e conjunto de dados")
        solr_resource.delete(f"package_id:{dataset.id}")
        solr_dataset.delete(f"id:{dataset.id}")

    for resource in dataset.resources:
        log.info('id_resource: ' + resource.id)
        metadata_resource = ""
        for metadata in [resource.name, resource.description]:
            metadata_resource += f"{metadata} "
        solr_resource.add({'id': resource.id, 'package_id': resource.package_id,
                           'metadata': metadata_resource + metadata_dataset})
        if not resource.thematic_indexing:
            indexed_resources_ids.append(resource.id)
        metadata_resources += f"{metadata_resource} "
    metadata_dataset += f" {metadata_resources}"
    solr_dataset.add({'id': dataset.id, 'metadata': metadata_dataset})
    return indexed_resources_ids
Example #2
def run_temporal_dataset_indexing(dataset: MetadataDataset,
                                  update_num_package_resources):
    indexed_resources_ids = []
    if update_num_package_resources:
        update_num_resources(dataset.id, dataset.num_resources)
    for resource in dataset.resources:
        log.info('id_resource: ' + resource.id)
        run_temporal_index(resource, dataset.num_resources, dataset.title,
                           dataset.notes)
        indexed_resources_ids.append(resource.id)
    return indexed_resources_ids
Example #3
def run(task_hour=config.scheduled_hour):
    while True:
        day = datetime.datetime.now()
        download_and_persist_metadata()
        remove()
        finish = index(task_hour, day.date())
        if finish:
            day = datetime.datetime.now().date()
            log.info("Amanhã os metadados serão atualizados.")
            while datetime.datetime.now(
            ).hour != task_hour and day == datetime.datetime.now().date():
                sleep(1)
Example #4
def remove():
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Removendo recursos que não pertencem mais a base de dados.")
    while True:
        results = get_deleted_resources()
        if not results:
            break
        for result in results:
            delete_spatial_indexes(result[0], result[1])
            delete_temporal_indexes(result[0], result[1])
            delete_thematic_indexes(result[1])
Example #5
def delete_thematic_indexes(resource):
    log.info(f'removendo índice temático de resource {resource.id}')
    solr_resource.delete(id=resource.id)
    if len(solr_resource.search(f'package_id:{resource.package_id}', rows=1)) == 0:
        solr_dataset.delete(id=resource.package_id)
        return
    metadata_resource = ''
    for metadata in [resource.name, resource.description]:
        metadata_resource += f"{metadata} "
    doc = solr_dataset.search(f'id:{resource.package_id}').docs
    doc = doc[0]
    doc['metadata'] = doc['metadata'].replace(metadata_resource, '')
    solr_dataset.delete(id=resource.package_id)
    solr_dataset.add({'id': doc['id'], 'metadata': doc['metadata']})
Example #6
def types_and_indexes(csv_file: list, driver_: GraphDatabase):
    """
        Existem colunas com, por exemplo, nomes de bairros que são também nomes de municípios, portanto este método busca
    diminuir a identificação de colunas de “falsos” locais, verificando uma certa quantidade de
    linhas do CSV e selecionando os tipos de locais(Municípios, UFs, Regiões) que mais apareceram, bem como os índices
    que indicam onde estão os locais encontrados.
    """
    quant_rows = 0
    quant_none_place = 0
    quant_rows_with_place_found = 0
    list_patterns = []
    for row in csv_file:
        quant_rows += 1
        pattern_type = ""
        pattern_index = ""
        undefined_type = False
        for j in range(len(row)):
            res = None
            try:
                res = return_type_place(row[j], driver_)
            except Exception:
                log.info("Erro ao verificar se o termo é um local e se possui um tipo."
                         "(índice a partir do 0) i={0}, j={1}".format(str(quant_rows), str(j)))
            if res:
                if res != "UNDEFINED":
                    pattern_type = "+".join([pattern_type, res])
                    pattern_index = "+".join([pattern_index, str(j)])
                else:
                    undefined_type = True
                    break
        if pattern_type:
            quant_rows_with_place_found += 1
        if pattern_type and not undefined_type:
            list_patterns.append("|".join([pattern_type[1:], pattern_index[1:]]))
        else:
            quant_none_place += 1
    if list_patterns and quant_rows_with_place_found > quant_none_place:
        try:
            m = mode(list_patterns)
        except StatisticsError as err:
            m = list_patterns[0]
        types_and_index = m.split("|")
        types_in_order = types_and_index[0].split("+")
        index_cols = types_and_index[1].split("+")
        log.info("Tipos de lugares e índices encontrados: " + m)
        return types_in_order, [int(i) for i in index_cols]
    else:
        return [], []
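The pattern strings built above encode both the detected place types and the column indexes (e.g. "MUNICIPIO+UF|0+2"), and statistics.mode picks the most frequent one. A minimal sketch of that voting step, with a hypothetical return_type_place stub standing in for the Neo4j lookup (the type labels are assumptions):

from statistics import mode, StatisticsError

# Hypothetical stub: the real code resolves each cell against Neo4j via return_type_place.
def return_type_place(term, driver_=None):
    known = {"Recife": "MUNICIPIO", "PE": "UF", "Nordeste": "REGIAO"}
    return known.get(term)

rows = [["Recife", "2020", "PE"],
        ["Olinda", "2020", "PE"],
        ["Recife", "2021", "PE"]]

patterns = []
for row in rows:
    types, idxs = [], []
    for j, cell in enumerate(row):
        res = return_type_place(cell)
        if res:
            types.append(res)
            idxs.append(str(j))
    if types:
        patterns.append("|".join(["+".join(types), "+".join(idxs)]))

try:
    winner = mode(patterns)  # most common pattern, e.g. "MUNICIPIO+UF|0+2"
except StatisticsError:
    winner = patterns[0]
types_part, index_part = winner.split("|")
print(types_part.split("+"), [int(i) for i in index_part.split("+")])  # ['MUNICIPIO', 'UF'] [0, 2]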
Example #7
def date_parser(text):
    match = search(text)
    dates = []
    for m in match:
        # '62.428.073/0001-36' <<< ('0001', 12)
        found = None
        try:
            found = dateparser.search.search_dates(
                m[0],
                languages=['pt'],
                settings={'RELATIVE_BASE': datetime.datetime(1000, 1, 1)})
        except OverflowError:
            log.info("date OverflowError")
        if found and config.min_date_allowed <= found[0][
                1] <= config.max_date_allowed:
            dates.append((found[0][1], m[1]))
    return dates
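date_parser delegates the actual parsing to dateparser's full-text search, which returns (matched text, datetime) pairs; the datetime in each pair is what the range check above compares. A rough usage sketch, assuming the dateparser package is installed and with a fabricated sample text:

import datetime
import dateparser.search

found = dateparser.search.search_dates(
    "Dados coletados em 12 de março de 2020",
    languages=['pt'],
    settings={'RELATIVE_BASE': datetime.datetime(1000, 1, 1)})
# expected shape: [('12 de março de 2020', datetime.datetime(2020, 3, 12, 0, 0))]
if found:
    print(found[0][1])  # the parsed datetime compared against config.min/max_date_allowed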
Example #8
def run_temporal_index(resource, num_resources_package, package_title,
                       package_notes):
    funcs = [
        lambda:
        (date_parser(resource.name), f"resource.name: {resource.name}"),
        lambda: (date_parser(resource.description),
                 f"resource.description: {resource.description}"),
        lambda:
        (date_parser(package_title), f"package.title: {package_title}"),
        lambda:
        (date_parser(package_notes), f"package.notes: {package_notes}"),
    ]
    for func in funcs:
        dates = func()
        if dates[0]:
            break
    # -----------------------------------------------------------------------------------------------------------------
    if dates[0]:
        log.info(f"Datas em {dates[1]}")

    if len(dates[0]) == 0:
        interval = None
        if resource.created:
            date = datetime.strptime(resource.created, '%Y-%m-%dT%H:%M:%S.%f')
            interval = (date, date)
            log.info("Datas em resource.created")
    elif len(dates[0]) == 1:
        date = dates[0][0]
        interval = (date[0], date[0] + relativedelta(months=date[1] - 1, day=31)) \
            if date[1] != 0 else (date[0], date[0])
    else:
        dates = [
            (date[0], date[0] +
             relativedelta(months=date[1] - 1, day=31)) if date[1] != 0 else
            (date[0], date[0]) for date in dates[0]
        ]
        least_recent = dates[0][0]
        last = dates[0][1]
        for date in dates[1:]:
            if date[0] < least_recent:
                least_recent = date[0]
            if date[1] > last:
                last = date[1]
        interval = [least_recent, last]

    if interval:
        interval = [i.date() for i in interval]
        if resource.updated or resource.temporal_indexing:
            log.info(f"recurso {resource.id} marcado para atualização")
            delete_temporal_index(resource.id)

        insert_index(resource.id, interval[0], interval[1],
                     resource.package_id, num_resources_package)
    log.info(f"intervalo encontrado: {interval}")
Example #9
def count_places_id(index_cols, types_in_order, len_row, csv_file, places_id_dict, start_csv, lock, quant_indexed_rows):
    quant_row = 0
    not_found_place = 0
    quant_indexed_rows_aux = 0
    places_id_dict_aux = {}
    if index_cols:
        neo4j_index = Neo4jIndex()
        for row in csv_file:
            quant_row += 1
            if len(row) == len_row:
                try:
                    res = neo4j_index.find_places_return_id([row[i] for i in index_cols], types_in_order)
                except CypherSyntaxError:
                    res = None
                    log.info(f"CypherSyntaxError. Erro na linha {quant_row}")
                if not res and not_found_place < 2:
                    not_found_place += 1
                    log.info(f"Não foi encontrado 'locais' na linha {quant_row + start_csv}")
                elif res:
                    quant_indexed_rows_aux += 1
                    for id_ in res:
                        try:
                            places_id_dict_aux[id_] += 1
                        except KeyError:
                            places_id_dict_aux[id_] = 1
            else:
                log.info(f"Linha do CSV não corresponde ao tamanho encontrado. Num. linha: {quant_row + start_csv}")

        lock.acquire()
        quant_indexed_rows.value += quant_indexed_rows_aux
        for key in places_id_dict_aux.keys():
            try:
                places_id_dict[key] += places_id_dict_aux[key]
            except KeyError:
                places_id_dict[key] = places_id_dict_aux[key]
        lock.release()
    del csv_file
Example #10
def index(update_hour, current_date, num=0):
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Indexando recursos...")
    while True:
        packages = get_dataset(num)
        now = datetime.datetime.now()
        if now.hour >= update_hour and now.date() > current_date:
            log.info("Parando indexação para atualizar metadados.")
            return False
        if not packages:
            log.info("Todos os recursos foram atualizados hoje.")
            return True
        for package in packages:
            resources = package.resources
            if resources:
                log.info(
                    '#------------------------------------------------------------------------------------------#'
                )
                log.info(str(num) + ' - ' + package.id)
                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>indexação espacial<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                package.resources = [
                    resource for resource in resources
                    if not resource.spatial_indexing
                ]
                update_num_package_resources = False
                if len(package.resources) < len(resources):
                    update_num_package_resources = True
                spatial_indexing_done(
                    run_spatial_dataset_indexing(package,
                                                 update_num_package_resources))

                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>indexação temporal<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                if not package.temporal_indexing:
                    package.resources = resources
                    temporal_indexing_done(
                        run_temporal_dataset_indexing(package, False),
                        package.id)
                else:
                    package.resources = [
                        resource for resource in resources
                        if not resource.temporal_indexing
                    ]
                    update_num_package_resources = False
                    if len(package.resources) < len(resources):
                        update_num_package_resources = True
                    temporal_indexing_done(
                        run_temporal_dataset_indexing(
                            package, update_num_package_resources))
                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>indexação temática<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                if not package.thematic_indexing:
                    package.resources = resources
                    thematic_indexing_done(
                        run_thematic_dataset_indexing(package), package.id)
                for resource in resources:
                    if not resource.thematic_indexing:
                        package.resources = resources
                        thematic_indexing_done(
                            run_thematic_dataset_indexing(package))
                        break
            num += 1
Example #11
def download_and_persist_metadata():
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Baixando e persistindo metadados.")
    log.info(f'Verificando conexão com {url_portal}...')
    while True:
        try:
            assert dados_gov.action.site_read()
            break
        except CKANAPIError as err:
            log.info(err)
            sleep(10)
    log.info('conexão estabelecida.')
    with engine.connect() as conn:
        conn.execute("UPDATE metadata_resources SET excluded=TRUE;")
    page = 0
    time0 = time()
    limit = config.metadata['limit']
    offset = config.metadata['offset']
    while True:
        log.info(f"Página(até {limit} metadados de recursos): " + str(page))
        while True:
            try:
                metadata = dados_gov.action.current_package_list_with_resources(
                    limit=limit, offset=offset * page)
                break
            except CKANAPIError:
                log.info(
                    f"Não foi possível recuperar a página {page}. Tentando novamente..."
                )
                sleep(10)

        page += 1

        new_metadata = pd.json_normalize(metadata)
        dataset = new_metadata[[
            'id', 'maintainer', 'author', 'name', 'url', 'notes',
            'metadata_created', 'tags', 'metadata_modified', 'title'
        ]]
        resources = pd.json_normalize(metadata, 'resources')
        resources = resources[[
            'id', 'package_id', 'url', 'description', 'name', 'format',
            'created', 'last_modified'
        ]]

        num_csv = resources['format'].eq('CSV').astype(int).groupby(
            resources['package_id']).sum()
        dataset = pd.merge(dataset, num_csv, left_on='id', right_on='package_id', how='left')\
            .rename(columns={'format': 'num_resources'})
        resources['spatial_indexing'] = False
        resources['temporal_indexing'] = False
        resources['thematic_indexing'] = False
        resources['updated'] = False
        resources['excluded'] = False

        tags = []
        organizations = []
        organization_id = []
        for m in metadata:
            try:
                organizations.append(m['organization']['name'])
                organization_id.append(m['organization']['id'])
            except TypeError:
                organizations.append(None)
                organization_id.append(None)
            tags.append(", ".join([tag['name'] for tag in m['tags']]))
        dataset['tags'] = tags
        dataset['organization_name'] = organizations
        dataset['organization_id'] = organization_id
        dataset['temporal_indexing'] = False
        dataset['thematic_indexing'] = False
        dataset['portal_id'] = id_portal

        dataset.to_sql(name='metadata_dataset',
                       con=engine,
                       if_exists='append',
                       index=False)
        resources.to_sql(name='metadata_resources',
                         con=engine,
                         if_exists='append',
                         index=False)

        if len(metadata) < limit:
            break

    time1 = time()
    log.info("Tempo para baixar e persistir os metadados: " +
             str(time1 - time0) + 's')

    with engine.connect() as conn:
        num_dataset = conn.execute(
            "SELECT count(*) FROM metadata_dataset;").fetchone()[0]
        num_resources = conn.execute(
            "SELECT count(*) FROM metadata_resources;").fetchone()[0]
        log.info("Quantidade de conjuntos de dados: " + str(num_dataset))
        log.info("Quantidade de recursos: " + str(num_resources))
    dados_gov.close()
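current_package_list_with_resources returns nested JSON, which the two pd.json_normalize calls split into a package-level frame and a resource-level frame (the second call uses 'resources' as the record path). A minimal sketch of that flattening and of the CSV count, on a fabricated payload:

import pandas as pd

# Fabricated, minimal CKAN-like payload just to show the two normalization passes.
metadata = [{
    'id': 'ds-1', 'title': 'Example dataset', 'notes': 'notes',
    'resources': [
        {'id': 'r-1', 'package_id': 'ds-1', 'format': 'CSV', 'name': 'table A'},
        {'id': 'r-2', 'package_id': 'ds-1', 'format': 'PDF', 'name': 'report'},
    ],
}]

dataset = pd.json_normalize(metadata)                  # one row per package
resources = pd.json_normalize(metadata, 'resources')   # one row per resource

# Same grouping as above: how many CSV resources each package has.
num_csv = resources['format'].eq('CSV').astype(int).groupby(resources['package_id']).sum()
print(num_csv)  # package_id ds-1 -> 1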
Example #12
def delete_spatial_indexes(num_resources, resource):
    log.info(f'removendo índice espacial de resource {resource.id}')
    delete_spatial_index(resource.id, driver)
    update_num_resources(resource.package_id, num_resources, driver)
Example #13
def run_spatial_dataset_indexing(dataset: MetadataDataset, update_num_package_resources):
    detector_charset = UniversalDetector()
    indexed_resources_ids = []
    if update_num_package_resources:
        update_num_resources(dataset.id, dataset.num_resources, driver)
    for resource in dataset.resources:
        if path.exists("./spatial_indexing/tmp_csv.csv"):
            remove("./spatial_indexing/tmp_csv.csv")
        log.info('id_resource: ' + resource.id + ' url: ' + resource.url)
        try:
            with get(resource.url, timeout=config.request_timeout, stream=True) as request:
                encoding = request.encoding
                log.info('headers: ' + str(request.headers))
                content_type = request.headers.get('Content-Type', '')
                excluded_types = ('text/html', 'text/css', 'text/xml',
                                  'application/vnd.ms-excel',
                                  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
                if not any(excluded in content_type for excluded in excluded_types):
                    with open('./spatial_indexing/tmp_csv.csv', 'wb') as file_contents:
                        detector_charset.reset()
                        for chunk in request.iter_content(chunk_size=config.csv_chunk_size):
                            file_contents.write(chunk)
                            detector_charset.feed(chunk)
                        detector_charset.close()
                    log.info(f'charset para csv encontrado na requisição: {encoding}')
                    log.info(f"charset para csv encontrado em verificação[charset: confiança(max. 1.0)]: "
                             f"{detector_charset.result['encoding']}: {detector_charset.result['confidence']}")
                    if not encoding or detector_charset.result['confidence'] >= 0.9:
                        encoding = detector_charset.result['encoding']
                    index(resource, dataset.num_resources, encoding)
                    indexed_resources_ids.append(resource.id)
        except ConnectionError:
            log.info('Erro de Conexão, ConnectionError')
        except ReadTimeout:
            log.info('Erro de Conexão, ReadTimeout')
        except MissingSchema:
            log.exception('requests.exceptions.MissingSchema', exc_info=True)
        except ChunkedEncodingError:
            log.info('a conexão foi encerrada, ChunkedEncodingError')
        except ContentDecodingError:
            log.exception('ContentDecodingError', exc_info=True)
        except StopIteration:
            log.info('Chunk vazio')
    return indexed_resources_ids
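The encoding choice above combines the charset reported by the HTTP response with an incremental charset detector, trusting the detector only when it is confident or when the header gave nothing. A compact sketch of that detection loop over in-memory chunks, assuming UniversalDetector comes from the chardet package:

from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
chunks = ['município;uf\n'.encode('latin-1'), 'São Paulo;SP\n'.encode('latin-1')]

detector.reset()
for chunk in chunks:  # in the real code the chunks come from request.iter_content(...)
    detector.feed(chunk)
detector.close()

header_encoding = None  # e.g. the response carried no charset in Content-Type
if not header_encoding or detector.result['confidence'] >= 0.9:
    header_encoding = detector.result['encoding']
print(header_encoding, detector.result['confidence'])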
Example #14
def index(resource: MetadataResources, num_package_resources, encoding, quant_process=config.num_cpu_to_index):
    try:
        with open('./spatial_indexing/tmp_csv.csv', 'r', encoding=encoding, newline=None) as file_contents:
            res = analyze_csv(file_contents)
            if not res:
                return
            dialect = res[0]
            len_row = res[1]
            types_and_indexes_ = res[2]

            with Manager() as manager:
                places_id_dict = manager.dict()
                quant_indexed_rows = Value('d', 0.0)
                lock = Lock()
                time_i = time()
                while True:
                    csv_file = reader(file_contents.readlines(config.csv_chunk_size), dialect, quoting=QUOTE_ALL)
                    csv_file = list(csv_file)
                    if not csv_file:
                        break
                    # -----------------------------------------------------------------------------------------#
                    types_in_order = types_and_indexes_[0]
                    index_cols = types_and_indexes_[1]
                    len_csv_file = len(csv_file)
                    csv_file_division = int(len_csv_file / quant_process)
                    # -----------------------------------------------------------------------------------------#
                    if quant_indexed_rows.value > 0:
                        log.info(f"Já existiam {quant_indexed_rows.value} linhas indexadas deste resource. "
                                 f"O total de linhas indexadas será atualizado")
                    processes = []
                    for i in range(quant_process):
                        start = csv_file_division * i
                        if i != quant_process - 1:
                            end = csv_file_division * (1 + i)
                            processes.append(Process(
                                target=count_places_id,
                                args=(index_cols, types_in_order, len_row, csv_file[start:end],
                                      places_id_dict, start, lock, quant_indexed_rows)))
                        else:
                            processes.append(Process(target=count_places_id,
                                                     args=(
                                                         index_cols, types_in_order, len_row,
                                                         csv_file[start:],
                                                         places_id_dict, start, lock, quant_indexed_rows)))

                    for process in processes:
                        process.start()

                    for process in processes:
                        process.join()

                    log.info(f"{len_csv_file} linhas foram verificadas")

                if resource.updated or resource.spatial_indexing:
                    log.info(f"recurso {resource.id} marcado para atualização")
                    delete_spatial_index(resource.id, driver)
                total_places_references = sum(places_id_dict.values())
                for key in places_id_dict:
                    try:
                        insert_into_resource_place(key, resource, num_package_resources, total_places_references,
                                                   places_id_dict[key], driver)
                    except CypherSyntaxError:
                        log.info(f"CypherSyntaxError. key:{key} quant_places: {places_id_dict[key]} "
                                 f"quant_indexed_rows: {quant_indexed_rows.value} "
                                 f"resource: {resource.id, resource.package_id}")
                time_f = time()
                log.info(f"Quantidades de linhas indexadas do CSV: {quant_indexed_rows.value}")
                log.info(f"Tempo de indexação(em segundos): {time_f - time_i}")
                if quant_indexed_rows.value:
                    log.info(
                        f"Tempo médio de indexação de linha do CSV(em segundos): "
                        f"{(time_f - time_i) / quant_indexed_rows.value}\n")
    except (UnicodeError, _csv.Error) as err:
        # encoding = 'ISO-8859-1'
        log.info('Ao decodificar o arquivo, uma exceção ocorreu...')
        log.info(err)
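Each chunk of CSV rows is split across quant_process workers by integer division, with the last process taking whatever remains. The slicing scheme in isolation:

rows = list(range(10))  # stand-in for one chunk of parsed CSV rows
quant_process = 3
division = int(len(rows) / quant_process)  # 3

slices = []
for i in range(quant_process):
    start = division * i
    end = division * (i + 1) if i != quant_process - 1 else len(rows)
    slices.append(rows[start:end])

print(slices)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]] - the last slice absorbs the remainder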
Example #15
def analyze_csv(file):
    try:
        dialect = Sniffer().sniff(''.join(read_lines_file(file, config.num_lines_to_check_csv_dialect)))
        log.info(f"delimiter: ({dialect.delimiter}) doublequote: ({dialect.doublequote}) "
                 f"escapechar: ({dialect.escapechar}) "
                 f"lineterminator: ({dialect.lineterminator}) quotechar: ({dialect.quotechar}) "
                 f"quoting: ({dialect.quoting}) "
                 f"skipinitialspace: ({dialect.skipinitialspace})")
    except _csv.Error:
        log.info("Não foi possível determinar o delimitador.")
        return ()
    try:
        file.seek(0)
        csv_file = reader(read_lines_file(file, config.num_lines_to_check_type_of_place), dialect, quoting=QUOTE_ALL)
        csv_file = list(csv_file)
    except _csv.Error as err:
        log.info("<><><><><><><><><><><><><><><>")
        log.info(err)
        return ()
    # Determine the likely length of each CSV row
    try:
        len_row = mode([len(x) for x in csv_file])
        log.info(f"tamanho provável da linha: {len_row}")
    except StatisticsError:
        log.info('Problema ao verificar qual o provável tamanho de cada linha do csv')
        return ()
    time_i = time()
    types_and_indexes_ = types_and_indexes(csv_file, driver)
    time_f = time()
    log.info(f"Tempo para verificação de tipos: {time_f - time_i}")
    if not types_and_indexes_[0]:
        return ()
    file.seek(0)
    return dialect, len_row, types_and_indexes_
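csv.Sniffer infers the dialect from a text sample and statistics.mode picks the most common row length; a short standalone run of the same two checks on a fabricated sample:

import csv
from statistics import mode

sample = "id;municipio;uf\n1;Recife;PE\n2;Olinda;PE\n"
dialect = csv.Sniffer().sniff(sample)
print(dialect.delimiter)  # ';'

rows = list(csv.reader(sample.splitlines(), dialect, quoting=csv.QUOTE_ALL))
print(mode([len(r) for r in rows]))  # 3, the likely row length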
Example #16
def delete_temporal_indexes(num_resources, resource):
    log.info(f'removendo índice temporal de resource {resource.id}')
    delete_temporal_index(resource.id)
    update_num_resources(resource.package_id, num_resources)