Example #1
def fetchSocrata(self, year=2019, querySize=10000):
    '''Fetch data from Socrata connection and return pandas dataframe'''
    # Load config files
    socrata_domain = self.config['Socrata']['DOMAIN']
    socrata_dataset_identifier = self.config['Socrata']['AP' + str(year)]
    socrata_token = self.token
    # Establish connection to Socrata resource
    client = Socrata(socrata_domain, socrata_token)
    # Fetch metadata
    metadata = client.get_metadata(socrata_dataset_identifier)
    # Loop for querying dataset, 1,000 rows per page
    queryDf = None
    for i in range(0, querySize, 1000):
        print(i)
        results = client.get(socrata_dataset_identifier,
                             limit=1000,
                             offset=i,
                             select="*",
                             order="updateddate DESC")
        tempDf = pd.DataFrame.from_dict(results)
        if queryDf is None:
            queryDf = tempDf.copy()
        else:
            queryDf = pd.concat([queryDf, tempDf], ignore_index=True)
    self.data = queryDf
    return queryDf
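The loop above originally leaned on sodapy's implicit default page size and rebuilt the dataframe with DataFrame.append, which newer pandas releases no longer provide. A minimal self-contained sketch of the same pagination pattern, with an explicit limit and pd.concat; the domain and dataset id below are placeholders, not values from the original:

import pandas as pd
from sodapy import Socrata

def fetch_paginated(domain, dataset_id, app_token=None,
                    total=10000, page_size=1000):
    '''Fetch up to `total` rows in pages of `page_size` rows.'''
    client = Socrata(domain, app_token)
    pages = []
    try:
        for offset in range(0, total, page_size):
            results = client.get(dataset_id,
                                 limit=page_size,
                                 offset=offset)
            if not results:  # ran out of rows before reaching `total`
                break
            pages.append(pd.DataFrame.from_records(results))
    finally:
        client.close()
    return pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

# Hypothetical usage -- substitute a real domain and 4x4 dataset id:
# df = fetch_paginated("data.example.gov", "xxxx-xxxx", total=5000)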
Example #2
def getDataset(dataset_id):
    table = ''
    vistas = ''
    try:
        # Creating Socrata client
        client = Socrata(cfg["web"],
                         cfg["token"],
                         username=cfg["email"],
                         password=cfg["password"])
        data = client.get(dataset_id, content_type="json")
        data = str(data)
        data = data.replace("'", "\"")
        data = data.upper()
        # Getting data to compare with the uploaded data
        table = pd.read_json(data)
        # Replacing NaN with ''
        table = table.fillna('')
        table = table.to_html(classes='table-striped " id = "my_table',
                              index=False)
        vistas = client.get_metadata(dataset_id)
        vistas = str(vistas.get("viewCount"))
        client.close()
    except BaseException as e:
        # If there is an error, reload login with an error message
        error = str(e)
        print('Error description:')
        print(error)
        client.close()
    return table, vistas
Example #3
class SocrataClient:
    def __init__(self, config=None):
        config = config['Socrata']

        domain = config['DOMAIN']
        token = None if config['TOKEN'] == 'None' else config['TOKEN']
        timeout = int(config['TIMEOUT'])

        self.client = Socrata(domain, token, timeout=timeout)
        self.attempts = int(config['ATTEMPTS'])
        self.config = config

    def __del__(self):
        self.client.close()

    def dataset_id(self, year):
        return self.config['AP' + str(year)]

    def get(self, year, **kwargs):
        id = self.dataset_id(year)
        for attempt in range(self.attempts):
            try:
                return self.client.get(id, **kwargs)
            except Exception as e:
                if attempt < self.attempts - 1:
                    continue
                else:
                    raise e

    def get_metadata(self, year):
        id = self.dataset_id(year)
        return self.client.get_metadata(id)
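The retry loop above re-issues failed requests back-to-back. A hedged variant, assuming transient network errors are worth spacing out, adds exponential backoff between attempts; the sleep schedule is an illustrative choice, not part of the original class:

import time

def get_with_backoff(client, dataset_id, attempts=3, base_delay=1.0, **kwargs):
    '''Call client.get(), sleeping 1s, 2s, 4s, ... between failed attempts.'''
    for attempt in range(attempts):
        try:
            return client.get(dataset_id, **kwargs)
        except Exception:
            if attempt == attempts - 1:
                raise  # out of attempts: propagate the last error
            time.sleep(base_delay * 2 ** attempt)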
Example #4
class SocrataClient:
    def __init__(self):
        self.client = Socrata(conf.DOMAIN, conf.TOKEN, timeout=conf.TIMEOUT)

    def __del__(self):
        self.client.close()

    def dataset_id(self, year):
        return conf.DATASET_IDS[year]

    def get(self, year, **kwargs):
        id = self.dataset_id(year)
        for attempt in range(conf.ATTEMPTS):
            try:
                return self.client.get(id, **kwargs)
            except Exception as e:
                if attempt < conf.ATTEMPTS - 1:
                    continue
                else:
                    raise e

    def get_metadata(self, year):
        id = self.dataset_id(year)
        return self.client.get_metadata(id)

    def get_datasets(self):
        '''
        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
Example #5
def check_for_new_data(app_token):
    """Test the Austin API to see if there is new data"""
    client = Socrata("data.austintexas.gov", app_token)
    austin_res = client.get_metadata(dataset_identifier="7d8e-dm7r")
    austin_max = datetime.datetime.fromtimestamp(austin_res['rowsUpdatedAt'])
    austin_max_pretty = austin_max.strftime('%Y-%m-%d %H:%M')
    return (austin_max, austin_max_pretty)
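The rowsUpdatedAt field in Socrata metadata is a Unix timestamp, which is why the helper converts it with datetime.fromtimestamp. A short usage sketch, assuming the check_for_new_data helper above is in scope and that a timestamp was stored on a previous run (the stored value here is hypothetical):

import datetime

# Hypothetical timestamp recorded on the previous run
last_seen = datetime.datetime(2021, 1, 1)

austin_max, austin_max_pretty = check_for_new_data(app_token="YOUR_APP_TOKEN")
if austin_max > last_seen:
    print("New data available, updated at", austin_max_pretty)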
Example #6
def get_dataset_name(x):
    """
    Pull in the name of the dataset
    for each corresponding api endpoint.
    """

    # Formatting for the final table:
    # display the full name of the dataset
    pd.options.display.max_colwidth = 200

    # Set up a basic, unauthenticated client
    # (an authenticated client is needed for non-public datasets)
    client = Socrata("opendata.mass-cannabis-control.com", None)

    # List comprehension to capture relevant metadata (i.e. the name)
    dataset_name = [client.get_metadata(y)['name'] for y in x]

    # Combine the api endpoints with the names of the associated datasets
    data = list(zip(x, dataset_name))

    # Store the final result in a dataframe
    api_table = pd.DataFrame(
        data, columns=['api_endpoints',
                       'Name']).drop_duplicates().reset_index(drop=True)

    return api_table
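A quick usage sketch for the helper above; the 4x4 ids below are placeholders rather than verified datasets on the portal:

# Hypothetical endpoint ids -- replace with real 4x4 ids
endpoints = ["xxxx-xxxx", "yyyy-yyyy"]
api_table = get_dataset_name(endpoints)
print(api_table)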
Example #7
File: cli.py Project: kspinka/socrata2sql
def main():
    arguments = docopt(__doc__, version=__version__)

    client = Socrata(arguments['<site>'], arguments['-a'])

    try:
        if arguments['ls']:
            datasets = list_datasets(client, arguments['<site>'])
            print(tabulate(datasets, headers='keys', tablefmt='psql'))
        elif arguments['insert']:
            dataset_id = arguments['<dataset_id>']
            metadata = client.get_metadata(dataset_id)

            engine, session, geo = get_connection(arguments['-d'], metadata)
            Binding = get_binding(client, dataset_id, metadata, geo,
                                  arguments['-t'])

            # Create the table
            try:
                Binding.__table__.create(engine)
            except ProgrammingError as e:
                # Catch these here because this is our first attempt to
                # actually use the DB
                if 'already exists' in str(e):
                    raise CLIError(
                        'Destination table already exists. Specify a new table'
                        ' name with -t.')
                raise CLIError('Error creating destination table: %s' % str(e))

            num_rows = get_row_count(client, dataset_id)
            bar = FillingCirclesBar('  ▶ Loading from API', max=num_rows)

            # Iterate the dataset and INSERT each page
            for page in get_dataset(client, dataset_id):
                to_insert = []
                for row in page:
                    to_insert.append(Binding(**parse_row(row, Binding)))

                session.add_all(to_insert)
                session.flush()
                bar.next(n=len(to_insert))

            bar.finish()

            ui.item(
                'Committing rows (this can take a bit for large datasets).')
            session.commit()

            success = 'Successfully imported %s rows from "%s".' % (
                num_rows, metadata['name'])
            ui.header(success, color='\033[92m')

        client.close()
    except CLIError as e:
        ui.header(str(e), color='\033[91m')
Example #8
File: views.py Project: rodcruzh/datavis
def datavis(request, dataset_id):
    ods = OpenDataSource.objects.get(pk=dataset_id)
    client = Socrata(ods.website, ods.token, username=ods.user, password=ods.password)
    dataset = DataSet.objects.get(pk=dataset_id)
    data = client.get(dataset.identifier)
    metadata = client.get_metadata(dataset.identifier)
    client.close()
    template = loader.get_template('datavis/datavis.html')
    data = json.dumps(data, indent=4, sort_keys=True)
    context = {'data': data, 'metadata': metadata, 'dataset': dataset}
    return HttpResponse(template.render(context, request))
Example #9
def test_get_metadata():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    response_data = "get_song_metadata.txt"
    setup_old_api_mock(adapter, "GET", response_data, 200)
    response = client.get_metadata(DATASET_IDENTIFIER)

    assert isinstance(response, dict)
    assert "newBackend" in response
    assert "attachments" in response["metadata"]

    client.close()
Example #10
    def cargar_base(self):
        """ Se conecta al API de Socrata y retorna la base de datos descargada del Portal de Datos Abiertos
        como dataframe. :ref:`Ver ejemplo <datos_gov.cargarself._base>`
    
        .. warning::
            Al descargar una base de datos utilizando el API de Socrata, esta omitirá cualquier
            columna que no contenga registros, lo cual puede generar inconsistencias con la información
            descrita en el portal de datos abiertos.

        :param api_id: (str) Identificación de la base de datos asociado con la API de Socrata.
        :param token: (str) opcional - token de usuario de la API Socrata.
        :param limite_filas: (int) (valor mayor a 0), indica el número máximo de filas a descargar de la base de datos \
        asociada al api_id. El límite está pensado para bases de gran tamaño que superen la capacidad del computador.
        :return: base de datos en formato dataframe.
        """

        client = Socrata(self.dominio_datos_gov, app_token=self.token)
        results = client.get(self.api_id, limit=self.limite_filas)
        self._base = pd.DataFrame.from_records(results)
        self.metadata = client.get_metadata(self.api_id)
Example #11
class SocrataClient:
    def __init__(self):
        conf = config['Socrata']

        domain = conf['DOMAIN']
        token = conf['TOKEN']
        timeout = int(conf['TIMEOUT'])

        self.client = Socrata(domain, token, timeout=timeout)
        self.attempts = int(conf['ATTEMPTS'])
        self.years = conf

    def __del__(self):
        self.client.close()

    def dataset_id(self, year):
        return self.years['AP' + str(year)]

    def get(self, year, **kwargs):
        id = self.dataset_id(year)
        for attempt in range(self.attempts):
            try:
                return self.client.get(id, **kwargs)
            except Exception as e:
                if attempt < self.attempts - 1:
                    continue
                else:
                    raise e

    def get_metadata(self, year):
        id = self.dataset_id(year)
        return self.client.get_metadata(id)

    def get_datasets(self):
        '''
        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
Example #12
File: tasks.py Project: la-counts/scdc
def scrape_datasets(dataset):
    dataportal = dataset.dataportal
    domain = dataportal.domain
    client = Socrata(domain, None)
    dataset.sourced_meta_data = client.get_metadata(dataset.identifier)
Example #13
class SocrataPortal(Portal):
    '''
    Stores SODA data.
    '''
    def __init__(self, site, dataset_id, app_token, tbl_name=None):
        Portal.__init__(self, site)
        self.col_mappings = {
            'checkbox': Boolean,
            'url': Text,
            'text': Text,
            'number': Numeric,
            'calendar_date': DateTime,
            'point': Geometry(geometry_type='POINT', srid=4326),
            'location': Geometry(geometry_type='POINT', srid=4326),
            'multipolygon': Geometry(geometry_type='MULTIPOLYGON', srid=4326)
        }
        self.site = site
        self.name = "Socrata"
        self.dataset_id = dataset_id
        self.app_token = app_token
        self.client = Socrata(self.site, self.app_token)
        self.tbl_name = utils.get_table_name(
            self.client.get_metadata(
                self.dataset_id)['name']).lower() if not tbl_name else tbl_name
        self.metadata = self.__get_metadata()
        self.srid = 4326

        self.num_rows = int(
            self.client.get(self.dataset_id,
                            select='COUNT(*) AS count')[0]['count'])
        self.data = self.__get_socrata_data(5000)

    def __get_metadata(self):
        '''
        Uses provided metadata to map column types to SQLAlchemy.
        '''
        ui.item("Gathering metadata")
        print()
        metadata = []
        for col in self.client.get_metadata(self.dataset_id)['columns']:
            print(col['fieldName'], ":", col['dataTypeName'])
            try:
                metadata.append(
                    (col['fieldName'], self.col_mappings[col['dataTypeName']]))
            except KeyError:
                warnings.warn('Unable to map "%s" to a SQL type.' %
                              col['fieldName'])
                continue
        return metadata

    def __get_socrata_data(self, page_size=5000):
        '''
        Iterate over a dataset's pages using the Socrata API
        '''
        ui.item("Gathering data (this can take a bit for large datasets).")
        page_num = 0
        more_pages = True

        while more_pages:
            try:

                api_data = self.client.get(
                    self.dataset_id,
                    limit=page_size,
                    offset=page_size * page_num,
                )

                if len(api_data) < page_size:
                    more_pages = False
                page_num += 1
                yield api_data

            except Exception:
                ui.item("Sleeping for 10 seconds to avoid timeout")
                time.sleep(10)

    def insert(self, circle_bar):
        for page in self.data:
            utils.insert_data(
                page, self.session, circle_bar, self.binding, srid=self.srid,
                socrata=True)
Example #14
class SocrataRepository(HarvestRepository):
    """ Socrata Repository """

    def setRepoParams(self, repoParams):
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []


    def _crawl(self):
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "socrata", 
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            result = self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count = item_count + 1
            if (item_count % self.update_log_after_numitems == 0):
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count, self.formatter.humanize(tdelta), item_count/tdelta) )

        self.logger.info("Found {} items in feed".format(item_count) )

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        record = {}

        record["title"] = socrata_record["name"]
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["contact"] = self.contact
        record["series"] = socrata_record.get("category", "")

        return record

    def _rate_limited(max_per_second):
        """ Decorator that prevents a function from being called faster than a set rate """
        import threading
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                # time.clock() was removed in Python 3.8; use perf_counter()
                elapsed = time.perf_counter() - last_time_called[0]
                left_to_wait = min_interval - elapsed

                if left_to_wait > 0:
                    time.sleep(left_to_wait)

                lock.release()

                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self,record):

        try:            
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record,record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), self.domain_metadata)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
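The decorator above guards the shared timestamp with a lock so it is safe across threads. A single-threaded, self-contained sketch of the same pattern, runnable on its own (the lock is dropped for brevity; the 2-calls-per-second cap is just an illustration):

import time
from functools import wraps

def rate_limited(max_per_second):
    """Allow at most max_per_second calls; sleep off any excess."""
    min_interval = 1.0 / float(max_per_second)

    def decorate(func):
        last_called = [0.0]  # mutable cell shared across calls

        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.perf_counter() - last_called[0]
            wait = min_interval - elapsed
            if wait > 0:
                time.sleep(wait)
            result = func(*args, **kwargs)
            last_called[0] = time.perf_counter()
            return result
        return wrapper
    return decorate

@rate_limited(2)
def ping(n):
    print("call", n)

for n in range(4):
    ping(n)  # runs at most 2 calls per second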
Example #15
def main(
    dataset_id,
    table_name,
    database,
    socrata_username,
    socrata_password,
    where_clause,
    existing_table_rows="drop",
):
    """
    Read in dataset from Socrata and write output to Platform
    Parameters
    --------
    dataset_id: str
        Socrata dataset identifier
    table_name: str, optional
        destination table in Platform (schema.table)
    database: str, optional
        destination database in Platform
    socrata_username: str, optional
        username for socrata account, required for private data sets
    socrata_password: str, optional
        password for socrata account, required for private data sets
    where_clause: str, optional
        SoQL for filtering dataset
    existing_table_rows: str, optional
        options to pass to dataframe_to_civis command

    Outputs
    ------
    Adds data as file output
    and, if table_name and database are specified, writes data to Platform
    """

    socrata_client = Socrata(
        "data.lacity.org", None, username=socrata_username, password=socrata_password
    )

    socrata_client.timeout = 50

    raw_metadata = socrata_client.get_metadata(dataset_id)

    dataset = _read_paginated(socrata_client, dataset_id, where=where_clause)

    civis_client = civis.APIClient()

    if dataset.empty:
        msg = f"No rows returned for dataset {dataset_id}."
        LOG.warning(msg)
        write_and_attach_jsonvalue(json_value=msg, name="Error", client=civis_client)
    else:
        data_file_name = (
            f"{dataset_id}_extract_{datetime.now().strftime('%Y-%m-%d')}.csv"
        )
        file_id = _store_and_attach_dataset(
            client=civis_client, df=dataset, filename=data_file_name
        )
        LOG.info(f"add the {file_id}")

        if table_name:
            # Optionally start table upload
            LOG.info(f"Storing data in table {table_name} on database {database}")
            print("writing table")
            run_id = os.environ["CIVIS_RUN_ID"]
            job_id = os.environ["CIVIS_JOB_ID"]
            dataset["civis_job_id"] = job_id
            dataset["civis_run_id"] = run_id
            table_upload = civis.io.dataframe_to_civis(
                dataset,
                database=database,
                table=table_name,
                existing_table_rows=existing_table_rows,
            ).result()
            LOG.info(f"using {table_upload}")

    # Parse raw_metadata to extract useful fields and attach both raw and
    # cleaned metadata as script outputs
    metadata_file_name = (
        f"{dataset_id}_metadata_{datetime.now().strftime('%Y-%m-%d')}.json"
    )

    metadata_paths = {
        "Proposed access level": "metadata.custom_fields.Proposed Access Level.Proposed Access Level",  # noqa: E501
        "Description": "description",
        "Data updated at": "rowsUpdatedAt",
        "Data provided by": "tableAuthor.screenName",
    }

    _, clean_metadata = _store_and_attach_metadata(
        client=civis_client,
        metadata=raw_metadata,
        metadata_paths=metadata_paths,
        filename=metadata_file_name,
    )

    if table_name:
        sql = f'COMMENT ON TABLE {table_name} IS \'{clean_metadata["Description"]}\''
        civis.io.query_civis(
            sql, database=database, polling_interval=2, client=civis_client
        ).result()
Example #16
def main():
    arguments = docopt(__doc__)

    site = arguments['<site>']

    if arguments['--HUD']:
        source = "HUD"
        dataset_id = site
        client = None
    elif arguments['--Socrata']:
        source = "Socrata"
        client = Socrata(site, arguments.get('-a'))

    try:
        if arguments.get('ls'):
            datasets = list_datasets(client, site)
            print(tabulate(datasets, headers='keys', tablefmt='psql'))
        elif arguments.get('insert'):        
            if source == "Socrata":
                dataset_id = arguments['<dataset_id>']
                metadata = client.get_metadata(dataset_id)['columns']
            if source == "HUD":
                metadata = json.loads(
                    urllib.request.urlopen(site).read())['fields']

            engine, session, geo = \
                get_connection(arguments['-d'], metadata, source)
            
            if arguments['-t']:
                Binding = get_binding(
                    metadata, geo, arguments['-t'], source
                )
            else:
                Binding = get_binding(
                    metadata, geo, dataset_id, source
                )

            # Create the table
            try:
                Binding.__table__.create(engine)
            except ProgrammingError as e:
                # Catch these here because this is our first attempt to
                # actually use the DB
                if 'already exists' in str(e):
                    raise CLIError(
                        'Destination table already exists. Specify a new table'
                        ' name with -t.'
                    )
                raise CLIError('Error creating destination table: %s' % str(e))

            num_rows, data = get_data(source, dataset_id, client)
            bar = FillingCirclesBar('  ▶ Loading from source', max=num_rows)

            # Iterate the dataset and INSERT each page
            if source == "Socrata":
                for page in data:
                    insert_data(page, session, bar, Binding)

            if source == "HUD":
                insert_data(data, session, bar, Binding)

            bar.finish()

            ui.item(
                'Committing rows (this can take a bit for large datasets).'
            )
            session.commit()

            success = 'Successfully imported %s rows.' % (
                num_rows
            )
            ui.header(success, color='\033[92m')
        if client:
            client.close()
    except CLIError as e:
        ui.header(str(e), color='\033[91m')
Example #17
class SocrataDataset(object):
    def __init__(self,
                 dataset_id,
                 socrata_client=None,
                 socrata_params=None,
                 float_fields=None):
        # Avoid mutable default arguments; fall back to empty containers
        self.dataset_id = dataset_id
        self.client = socrata_client
        if not socrata_client and socrata_params:
            self.client = Socrata(**socrata_params)
        self.socrata_params = socrata_params or {}
        self.col_dtype_dict = self.get_col_dtype_dict()
        self.float_fields = float_fields or []

    def get_col_dtype_dict(self):
        '''
        Retrieve the data dictionary of a Socrata data set as a dictionary,
        with the key being the column name and the value being the column data type.

        Returns:
            data dictionary of the Socrata data set, keyed by column name with
            the column data type as the value
        '''
        dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns']
        col_dtype_dict = {
            col['name']: col['dataTypeName']
            for col in dataset_col_meta
        }
        return col_dtype_dict

    def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None):
        '''
        Make sure the data type of each field in the data record matches the data type
        of the field in the Socrata data set.

        Parameters:
            rec: dictionary object of the data record
            col_dtype_dict: data dictionary of a Socrata data set, keyed by column
                name with the column data type as the value
            float_fields: list of fields that should be floats

        Returns:
            dictionary object of the data record, with number, string, and boolean fields
            modified to align with the data types of the corresponding Socrata data set
        '''
        col_dtype_dict = col_dtype_dict or self.col_dtype_dict
        float_fields = float_fields or self.float_fields

        identity = lambda x: x
        dtype_func = {'number': float, 'text': str, 'checkbox': bool}
        out = {}
        for k, v in rec.items():
            if k in float_fields and k in col_dtype_dict:
                out[k] = float(v)
            elif k in col_dtype_dict:
                if v is not None and v != '':
                    out[k] = dtype_func.get(
                        col_dtype_dict.get(k, 'nonexistentKey'), identity)(v)
        out = {k: v for k, v in out.items() if k in col_dtype_dict}
        return out

    def create_new_draft(self):
        draftDataset = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, self.dataset_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']),
            params={'method': 'copySchema'})
        logger.info(draftDataset.json())
        draftId = draftDataset.json()['id']
        return draftId

    def publish_draft(self, draftId):
        time.sleep(5)
        publishResponse = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, draftId),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']))
        logger.info(publishResponse.json())
        return publishResponse

    def delete_draft(self, draftId):
        time.sleep(5)
        deleteResponse = self.client.delete(draftId)
        if deleteResponse.status_code == 200:
            logger.info('Empty draft {} has been discarded.'.format(draftId))
        return deleteResponse

    def clean_and_upsert(self, recs, dataset_id=None):
        dataset_id = dataset_id or self.dataset_id
        out_recs = [self.mod_dtype(r) for r in recs]
        uploadResponse = self.client.upsert(dataset_id, out_recs)
        return uploadResponse
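A minimal usage sketch for the class above, assuming credentials are supplied through socrata_params; every value below is a placeholder:

socrata_params = {
    "domain": "data.example.gov",       # placeholder domain
    "app_token": "YOUR_APP_TOKEN",
    "username": "user@example.com",
    "password": "s3cret",
}
dataset = SocrataDataset("xxxx-xxxx", socrata_params=socrata_params)

# clean_and_upsert coerces each record to the dataset's column types,
# then upserts the cleaned records
recs = [{"station_id": "42", "temperature": "21.5"}]
response = dataset.clean_and_upsert(recs)
print(response)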
Example #18
def main(
    socrata_client_url: str,
    dataset_id: str,
    civis_table_name: str,
    civis_database: str,
    database_type: str,
    socrata_username: str,
    socrata_password: str,
    grant_group: str,
    varchar_len: str = None,
    action_existing_table_rows: str = "drop",
):
    """
    Read in dataset from Socrata and write output to Platform

    Parameters
    --------
    socrata_client_url: str
        url of socrata portal being referenced
    dataset_id: str
        Socrata dataset identifier
    civis_table_name: str
        destination table in Platform (schema.table)
    civis_database: str
        destination database in Platform
    database_type: str
        type of destination database
    socrata_username: str, optional
        username for socrata account, required for private data sets
    socrata_password: str, optional
        password for socrata account, required for private data sets
    grant_group: str
        string of group(s) that are passed to civis API to be granted select
        table access
    varchar_len: str
        sets the varchar length when datatypes are passed to civis API, 256 is
        the default
    action_existing_table_rows: str, optional
        options to pass to dataframe_to_civis command

    Outputs
    ------
    Adds data as file output and, if table_name and database are specified,
    writes data to Platform
    """

    socrata_client = Socrata(socrata_client_url,
                             None,
                             username=socrata_username,
                             password=socrata_password)
    # define socrata client

    civis_client = civis.APIClient()
    # define civis client

    socrata_client.timeout = 50

    sample_data = socrata_client.get(dataset_id,
                                     limit=5,
                                     content_type="csv",
                                     exclude_system_fields=False,
                                     offset=0)
    # collects sample data from dataset

    sample_data_df = results_to_df(sample_data)
    # writes sample data to dataframe

    if sample_data_df.empty:
        msg = f"No rows returned for dataset {dataset_id}."
        LOG.warning(msg)
        write_and_attach_jsonvalue(json_value=msg,
                                   name="Error",
                                   client=civis_client)
        os._exit(1)
    # provides exit if no rows available in dataset

    raw_metadata = socrata_client.get_metadata(dataset_id)
    # calls for raw metadata

    sql_type = select_sql_map(database_type, varchar_len)
    # defines appropriate sql types for datatype mapping depending on
    # specifications

    (
        civis_table_columns,
        point_columns,
        pandas_column_order,
        extra_columns,
    ) = create_col_type_dict(raw_metadata, sample_data_df, sql_type)
    # creates civis specific array of dicts that maps column name to
    # datatype using socrata metadata as guidance. Also, provides point
    # columns that are used to clean point column formatting during import.
    # And, provides array of columns that corresponds to order of the mapping
    # dict (civis_file_to_table is sensitive to order).

    print("Columns present in Metadata but not in data:", extra_columns)

    consolidated_csv_path = _read_paginated(
        client=socrata_client,
        dataset_id=dataset_id,
        point_columns=point_columns,
        column_order=pandas_column_order,
    )
    # reads in socrata data in chunks (using offset and page_limit), and
    # appends it all to one csv and outputs the path here

    data_file_name = f"{dataset_id}_extract_{datetime.now().strftime('%Y-%m-%d')}.csv"
    uploaded_file_id = _store_and_attach_dataset_csv(
        client=civis_client,
        csv_path=consolidated_csv_path,
        filename=data_file_name)
    print("file_id:", uploaded_file_id)
    LOG.info(f"add the {uploaded_file_id}")

    LOG.info(
        f"Storing data in table {civis_table_name} on database {civis_database}"
    )

    table_upload = civis.io.civis_file_to_table(
        file_id=uploaded_file_id,
        database=civis_database,
        table=civis_table_name,
        table_columns=civis_table_columns,
        existing_table_rows=action_existing_table_rows,
        headers=True,
    ).result()
    LOG.info(f"using {table_upload}")
    # takes in file id and writes to table

    metadata_file_name = (
        f"{dataset_id}_metadata_{datetime.now().strftime('%Y-%m-%d')}.json")
    # parse raw_metadata to extract useful fields and attach both raw and
    # cleaned metadata as script outputs

    upload_metadata_paths = {
        "Description": "description",
        "Data updated at": "rowsUpdatedAt",
        "Data provided by": "tableAuthor.screenName",
    }

    _, clean_metadata = _store_and_attach_metadata(
        client=civis_client,
        metadata=raw_metadata,
        metadata_paths=upload_metadata_paths,
        filename=metadata_file_name,
    )

    if civis_table_name:
        sql = f"""
                COMMENT ON TABLE {civis_table_name} IS
                \'{clean_metadata["Description"]}\'
                 """
        civis.io.query_civis(sql,
                             database=civis_database,
                             polling_interval=2,
                             client=civis_client).result()

    if grant_group:
        sql = f"GRANT ALL ON {civis_table_name} TO GROUP {grant_group}"
        civis.io.query_civis(sql,
                             database=civis_database,
                             polling_interval=2,
                             client=civis_client).result()
Example #19
from sodapy import Socrata

import os
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import plotly
import collections
import plotly.graph_objs as go

client = Socrata("data.cityofchicago.org",
                 "9ugcPuahbyTpHmzfeCefy30Ni",
                 username="******",
                 password="******")
metadata = client.get_metadata("cwig-ma7x")
plotly.tools.set_credentials_file(username='******',
                                  api_key='jINELDLWjEHsJLDibvbI')

socrata_token = os.environ.get("SODAPY_APPTOKEN")
results = client.get("cwig-ma7x", limit=170000)
df = pd.DataFrame.from_dict(results)
mapbox_access_token = 'pk.eyJ1IjoiYWxleHRpZmE3IiwiYSI6ImNqZ2x5aDR1NDF1cGgyd21qNW5kcWp0NzUifQ.5qMfl1OmJPDwIonDZergiA'
lan = []
lon = []
unique_name = []
u = []
for i in range(len(df['dba_name'])):

    if [df['dba_name'][i], df['address'][i]] in u:
        if df['results'][i] == 'Fail':
Example #20
def main (args):
    if len(args) == 0:
        print ('args: list of dataset id, state/province, city, country')
        exit(0)

    ids = []
    with open(args[0],'r') as id_file:
        for line in id_file:
            ids.append(line.strip('\n\r'))

    print (ids)

    graphql = config(section='graphql')
    rebloc = config(section='rebloc')

    headers = { 'X-Hasura-Access-Key': graphql['apitoken']}
    _transport = RequestsHTTPTransport(
                    url=graphql['endpoint'],
                    headers=headers,
                    use_json=True
                )

    graphql_client = Client(
                        transport=_transport,
                        fetch_schema_from_transport=True
                    )

    my_marketplace = ReblocMarketplace(rebloc['endpoint'],graphql_client)
    ownerid = my_marketplace.look_up_user_id(rebloc['registeremail'])

    api_config = config(section='sourceapi')
    domain_client = Socrata(api_config['domain'], api_config['token'])

    for dataset_identifier in ids:
        metadata = domain_client.get_metadata(dataset_identifier)

        open_data = MyOpenData(domain_client,metadata,dataset_identifier)

        try:
            schema = open_data.data_schema()
            print (schema)

            server_config = config(section='ipfs')
            gen = DocumentGenerator()

            seed = gen.sentence()
            print(seed)
            # 32 bytes encryption keys
            sample_key = hashlib.sha256(seed.encode('utf-8')).hexdigest()[:32].encode('utf8')
            print("key = %s" % sample_key)

            seed = gen.sentence()
            print(seed)
            # 32 bytes encryption keys
            data_key = hashlib.sha256(seed.encode('utf-8')).hexdigest()[:32].encode('utf8')
            print("key = %s" % data_key)

            # publish sample
            print ('publishing sample....')
            sample_info = open_data.publish_sample_data(
                                        sample_key,
                                        server_config['endpoint'],
                                        server_config['port'],
                                        sample_size=300
                                    )

            # publish full data
            print('publishing all data....')
            data_info = open_data.publish_all_data(
                                        data_key,
                                        server_config['endpoint'],
                                        server_config['port']
                                    )

            current_date_time = datetime.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")
            search_terms = "{property,taxes}"
            if metadata.get('tags') is not None:
                search_terms = "{" + ",".join(metadata['tags']) + "}"

            default_ipfs_gateway = "http://demo-app.rebloc.io:8080/ipfs/"
            default_price = 0.5

            if 0.00001 * data_info['num_of_rows'] > default_price:
                default_price = round (0.01 * data_info['num_of_rows'],2)

            dataset = {
                "id": str(uuid.uuid1()),
                "name": metadata['name'],
                "table_name": metadata['id'],
                "description":  metadata['description'],
                "country": "united states",
                "state_province": "california",
                "city": "{san mateo}",
                "topic": "{" + "assessment" + "}",
                "date_created": current_date_time,
                "date_modified": current_date_time,
                "dataset_owner_id": ownerid,
                "delivery_method": "IPFS/CSV",
                "enc_data_key": data_key.decode(),
                "enc_sample_key": sample_key.decode(),
                "sample_access_url": default_ipfs_gateway + sample_info['ipfs_hash'],
                "sample_hash": sample_info['md5_file_hash'],
                "access_url": default_ipfs_gateway + data_info['ipfs_hash'],
                'data_hash': data_info['md5_file_hash'],
                "num_of_records": data_info['num_of_rows'],
                "search_terms": search_terms,
                "price_high": default_price,
                "price_low": 0.5,
                "stage": 3,
                "schema": schema,
                "json_schema": json.dumps(schema)
            }
            print (dataset)
            # list draft datasets to marketplace
            result = my_marketplace.post_draft_dataset(dataset)
            print (result)
            print (dataset_identifier + ' completed')

        except Exception as err:
            print("error occurs:%s" % err)

    print ('done')
Example #21
print("Updating covid data")

if not (token := os.environ.get("SODAPY_APPTOKEN")):
    raise EnvironmentError("SODAPY_APPTOKEN not set")

domain = "data.ny.gov"
covid_id = "xdss-u53e"

client = Socrata(domain, token)

site_last_updated = 0
if "site-last-updated" in os.listdir("."):
    with open("site-last-updated", "r") as f:
        site_last_updated = int(f.read())

metadata = client.get_metadata(covid_id)
if data_last_updated := metadata.get("rowsUpdatedAt"):
    data_last_updated = int(data_last_updated)
    if site_last_updated >= data_last_updated:
        sys.exit(0)

testing_data = client.get_all(covid_id,
                              select="county, test_date, new_positives")

print("Cleaning data")

testing_data = [{
    "county":
    c["county"].lower(),
    "test_date":
    datetime.strptime(c["test_date"], "%Y-%m-%dT%H:%M:%S.%f"),
Example #22
def controllerCenter(allIDS):

    flagCounter = 0
    limit = 10

    token = 'sC4N6wXghMXaL2C3uUxVMphf0'
    client = Socrata('www.datos.gov.co',
                     token,
                     username="******",
                     password="******")

    client.timeout = 180

    print("Conjuntos de datos a evaluar", len(allIDS))

    for i in allIDS:

        resultado = ""
        flagCounter += 1

        try:

            try:

                # Generating the inputs for the process.

                datasetURL = "https://www.datos.gov.co/resource/{}.json".format(i)

                print(flagCounter)
                print(datasetURL)

                # Validate the URL and check that it is reachable
                statusDataset = requests.get(datasetURL, timeout=60)

                # Fetch the dataset's data
                datosDataSet = client.get(i, limit=limit)

                # Fetch the dataset's metadata
                metaDataset = client.get_metadata(i)

                # Check that the dataset is not empty
                frametoValidate = pd.DataFrame.from_records(datosDataSet)

                # Validate the dataset
                metaData = metaDataset['metadata']


            except KeyError as error:

                print(error)
                logging.error(str(error))
                indexCompletitud = 0
                indexCredibilidad = 0
                indexActualidad = 0
                indexTrazabilidad = 0
                indexDisponibildiad = 0
                indexConformidad = 0
                indexComprensibilidad = 0
                indexPortabilidad = 0
                indexConsistencia = 0
                indexExactitud = 0

            except TimeoutError as error:

                logging.error(str(error))

            except requests.exceptions.ConnectionError as error:

                logging.error(str(error))


            else:

                # Compute the indicators for each evaluated dataset

                if statusDataset.status_code == 200 and not frametoValidate.empty:

                    # Record the dataset id
                    resultado = str(i)

                    # First indicator: availability
                    resultado = resultado + ';' + str(10)

                    # Create the evaluation instance
                    evaluation = Evaluation()

                    # Completeness indicator
                    indexCompletitud = evaluation.indicadorCompletitud(frametoValidate)
                    resultado = resultado + ',' + str(indexCompletitud)

                    # Timeliness indicator
                    indexActualidad = evaluation.indicadorActualidad(metaDataset, metaData)
                    resultado = resultado + ',' + str(indexActualidad)

                    # Credibility indicator
                    indexCredibilidad = evaluation.indicadorCredibilidad(metaDataset)
                    resultado = resultado + ',' + str(indexCredibilidad)

                    # Traceability indicator
                    indexTrazabilidad = evaluation.indicadorTrazabilidad(metaDataset)
                    resultado = resultado + ',' + str(indexTrazabilidad)

                    # Conformity indicator
                    indexConformidad = evaluation.indicadorConformidad(metaDataset)
                    resultado = resultado + ',' + str(indexConformidad)

                    # Understandability indicator
                    indexComprensibilidad = evaluation.indicadorComprensibilidad(metaDataset, frametoValidate)
                    resultado = resultado + ',' + str(indexComprensibilidad)

                    # Portability indicator
                    indexPortabilidad = evaluation.indicadorPortabilidad(datosDataSet)
                    resultado = resultado + ',' + str(indexPortabilidad)

                    # Consistency indicator
                    indexConsistencia = evaluation.indicadorConsisetencia(frametoValidate)
                    resultado = resultado + ',' + str(indexConsistencia)

                    # Accuracy indicator
                    indexExactitud = evaluation.indicadorExactitud(metaDataset, frametoValidate)
                    resultado = resultado + ',' + str(indexExactitud)

                else:

                    print("error")

            finally:


                with open("Quality_Indicators.csv", 'a', encoding='UTF-8') as qIndicators:
                    qIndicators.write(str(resultado))
                    qIndicators.write('\n')

        except BrokenPipeError as errorBroken:

            logging.error(str(errorBroken))
            resultado = str(i) + ";0,0,0,0,0,0,0,0,0,0"
            with open("Quality_Indicators.csv", 'a', encoding='UTF-8') as qIndicators:
                qIndicators.write(str(resultado))
                qIndicators.write('\n')
Example #23
class SocrataDataset(object):
    """
    Helper class for interacting with datasets in Socrata.

    """
    logger = None

    def __init__(self,
                 dataset_id,
                 socrata_client=None,
                 socrata_params=None,
                 float_fields=None,
                 logger=None):
        """
        Initialization function of the SocrataDataset class.

        Parameters:
            dataset_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)
            socrata_client: Optional parameter if the user chooses to pass in the
                socrata_params parameter. If the user chooses not to pass in
                socrata_params, they can instead pass in a sodapy.Socrata object
                that has been initialized with the proper Socrata credentials.
            socrata_params: Optional parameter if the user chooses to pass in the
                socrata_client parameter. Dictionary object containing Socrata
                credentials. Must include the following fields: 'username',
                'password', 'app_token', 'domain'.
            float_fields: An array of Socrata field names that should be of
                float types (numbers with decimals).
            logger: Optional parameter. Could pass in a logger object or not pass
                in anything. If a logger object is passed in, information will be
                logged instead of printed. If not, information will be printed.
        """
        self.dataset_id = dataset_id
        self.client = socrata_client
        if not socrata_client and socrata_params:
            self.client = Socrata(**socrata_params)
        self.socrata_params = socrata_params or {}
        self.col_dtype_dict = self.get_col_dtype_dict()
        self.float_fields = float_fields or []
        self.print_func = print
        if logger:
            self.print_func = logger.info

    def get_col_dtype_dict(self):
        """
        Retrieve data dictionary of a Socrata data set in the form of a dictionary,
        with the key being the column name and the value being the column data type

        Returns:
            Data dictionary of a Socrata data set in the form of a dictionary,
            with the key being the column name and the value being the column data type.
        """
        dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns']
        col_dtype_dict = {
            col['name']: col['dataTypeName']
            for col in dataset_col_meta
        }
        return col_dtype_dict

    def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None):
        """
        Make sure the data type of each field in the data record matches the data type
        of the field in the Socrata data set.

        Parameters:
            rec: dictionary object of the data record
            col_dtype_dict: data dictionary of a Socrata data set in the form of a dictionary,
                with the key being the column name and the value being the column data type
            float_fields: list of fields that should be a float

        Returns:
            Dictionary object of the data record, with number, string, and boolean fields
            modified to align with the data type of the corresponding Socrata data set.
        """
        col_dtype_dict = col_dtype_dict or self.col_dtype_dict
        float_fields = float_fields or self.float_fields

        identity = lambda x: x
        dtype_func = {'number': float, 'text': str, 'checkbox': bool}
        out = {}
        for k, v in rec.items():
            if k in float_fields and k in col_dtype_dict:
                out[k] = float(v)
            elif (k in col_dtype_dict and v not in [None, '']):
                out[k] = dtype_func.get(
                    col_dtype_dict.get(k, 'nonexistentKey'), identity)(v)
        out = {k: v for k, v in out.items() if k in col_dtype_dict}
        return out

    def create_new_draft(self):
        """
        Create a new draft of the current dataset.

        Returns:
            Draft ID of the new draft.
        """
        draft_dataset = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, self.dataset_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']),
            params={'method': 'copySchema'})
        self.print_func(draft_dataset.json())
        draft_id = draft_dataset.json()['id']
        return draft_id

    def publish_draft(self, draft_id):
        """
        Publish the Socrata draft specified.

        Parameters:
            draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)

        Returns:
            Response of the publish draft request.
        """
        time.sleep(5)
        publish_response = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, draft_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']))
        self.print_func(publish_response.json())
        return publish_response

    def delete_draft(self, draft_id):
        """
        Delete the Socrata draft specified.

        Parameters:
            draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)

        Returns:
            Response of the delete draft request.
        """
        time.sleep(5)
        delete_response = self.client.delete(draft_id)
        if delete_response.status_code == 200:
            self.print_func('Empty draft {} has been discarded.'.format(draft_id))
        return delete_response

    def clean_and_upsert(self, recs, dataset_id=None):
        """
        Clean the given records and upsert them to the Socrata dataset.

        Parameters:
            recs: an array of dictionary objects of the data to upsert.
            dataset_id: 4x4 ID of the Socrata dataset (e.g. x123-bc12) to perform
            upserts to. This parameter is not required if you are performing upserts to the
            dataset you've initialized this class with.

        Returns:
            A dictionary object with the following fields:
            'Rows Deleted' - number of rows deleted due to the upsert request
            'Rows Updated' - number of rows updated due to the upsert request
            'Rows Created' - number of rows created due to the upsert request
        """
        dataset_id = dataset_id or self.dataset_id
        out_recs = [self.mod_dtype(r) for r in recs]
        upload_response = self.client.upsert(dataset_id, out_recs)
        return upload_response
Example #24
import os
from sodapy import Socrata
import json

socrata_token = os.environ.get("SODAPY_APPTOKEN")

nyc_realestate_domain = 'data.cityofnewyork.us'
nyc_realestate_dataset_identifier = 'm8p6-tp4b'

nyc_realestate_domain_client = Socrata(nyc_realestate_domain, socrata_token)
metadata = nyc_realestate_domain_client.get_metadata(
    nyc_realestate_dataset_identifier)
# print (metadata)
dataCols = []
for x in metadata['columns']:
    col = dict()
    col['name'] = x['fieldName']

    if x['dataTypeName'] == "calendar_date":
        col['type'] = "timestamp without time zone"
    elif x['dataTypeName'] == "number":
        col['type'] = "integer"
    else:
        col['type'] = x['dataTypeName']

    col['label'] = x['name'].replace('_', ' ')
    col['description'] = x['description']
    dataCols.append(col)

# print (json.dumps(dataCols,indent=4, sort_keys=False,default=str))
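The dataCols list built above is shaped like a set of column definitions; a hedged sketch of emitting a CREATE TABLE statement from it (the table name and unquoted identifiers are illustrative assumptions):

# Hypothetical destination table name
table_name = "nyc_realestate"

column_defs = ",\n    ".join(
    "{} {}".format(col['name'], col['type']) for col in dataCols)
ddl = "CREATE TABLE {} (\n    {}\n);".format(table_name, column_defs)
print(ddl)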
Example #25
    , 'resolution_description': sqlalchemy.types.Text()
    , 'resolution_action_updated_date': sqlalchemy.DateTime()
    , 'created_date': sqlalchemy.DateTime()
    , 'due_date': sqlalchemy.DateTime()
    , 'closed_date': sqlalchemy.DateTime()
    , 'descriptor': sqlalchemy.types.Text()
    , 'location': sqlalchemy.types.Text()
}

#### connect w/ Socrata 
endpoint = 'https://data.cityofnewyork.us/resource/p5f6-bkga.json'
domain = 'data.cityofnewyork.us'
data_id = 'p5f6-bkga'
token = myToken
client = Socrata(domain, token)
metadata = client.get_metadata(data_id)
columns = [x['name'] for x in metadata['columns']]
meta_amount = [x for x in metadata['columns'] if x['name'] == 'Agency'][0]
retries = 3

#### Start Operation
# 3006411 as of 2019-03-31
batches = int(3006411 // 50000) + 1
i=0
dropped_cols = []
while i < batches:
    fail_cnt = 0
    query ="""
    select
        *
    where
Example #26
class SocrataRepository(HarvestRepository):
    """ Socrata Repository """

    def setRepoParams(self, repoParams):
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []


    def _crawl(self):
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "socrata", 
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            result = self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count = item_count + 1
            if (item_count % self.update_log_after_numitems == 0):
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count, self.formatter.humanize(tdelta), item_count/tdelta) )

        self.logger.info("Found {} items in feed".format(item_count) )

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        record = {}

        record["title"] = socrata_record.get("name","").strip()
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["subject"] = socrata_record.get("category", "")
        record["title_fr"] = ""
        record["series"] = ""
        record["rights"] = []

        if ('license' in socrata_record) and socrata_record['license']:
            # Winnipeg, Nova Scotia, PEI
            record["rights"].append(socrata_record['license'].get("name", ""))
            record["rights"].append(socrata_record['license'].get("termsLink", ""))
            record["rights"] = "\n".join(record["rights"])
            record["rights"] = record["rights"].strip()

        if record["rights"] == "See Terms of Use":
            # Calgary, Edmonton
            record["rights"] = []

        # Repository-specific custom metadata fields may carry the licence;
        # check them only if no licence was found above.
        custom_fields = (socrata_record.get('metadata') or {}).get('custom_fields') or {}
        if not record["rights"] and custom_fields:
            if custom_fields.get('License/Attribution'):
                # Calgary
                license_attribution = custom_fields['License/Attribution']
                record["rights"] = (license_attribution.get('License URL')
                                    or license_attribution.get('License-URL')
                                    or record["rights"])
            elif custom_fields.get('Licence'):
                # Winnipeg
                record["rights"] = custom_fields['Licence'].get('Licence') or record["rights"]
            elif custom_fields.get('Attributes'):
                # Strathcona
                record["rights"] = custom_fields['Attributes'].get('Licence') or record["rights"]
        if not record["rights"]:
            record.pop("rights")


        # Continue to default to English for our current Socrata repositories.
        # For Nova Scotia, "fra" language refers to the dataset, not the metadata.
        
        # language = self.default_language
        # if "metadata" in socrata_record:
        #     if "custom_fields" in socrata_record["metadata"]:
        #         if "Detailed Metadata" in socrata_record["metadata"]["custom_fields"]:
        #             if "Language" in socrata_record["metadata"]["custom_fields"]["Detailed Metadata"]:
        #                 # Nova Scotia
        #                 language = socrata_record["metadata"]["custom_fields"]["Detailed Metadata"]["Language"]
        #         elif "Dataset Information" in socrata_record["metadata"]["custom_fields"]:
        #             if "Language" in socrata_record["metadata"]["custom_fields"]["Dataset Information"]:
        #                 # Prince Edward Island
        #                 language = socrata_record["metadata"]["custom_fields"]["Dataset Information"]["Language"]
        # language = language.lower()
        #
        # if language in ["fr", "fre", "fra", "french"]:
        #     language = "fr"
        # else:
        #     language = "en"

        return record


    @rate_limited(5)
    def _update_record(self,record):

        try:            
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record,record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            if self.dump_on_failure:
                try:
                    # socrata_record may be unbound if get_metadata() itself failed
                    print(socrata_record)
                except Exception:
                    pass
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
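# The @rate_limited(5) decorator used on _update_record above is defined
# elsewhere in the harvester project. As an assumption about its behaviour
# (not the project's actual code), a minimal "at most N calls per second"
# implementation might look like this:
import time
from functools import wraps

def rate_limited(max_per_second):
    """Decorator: sleep as needed so the wrapped function is called at most
    max_per_second times per second (single-threaded sketch)."""
    min_interval = 1.0 / float(max_per_second)

    def decorator(func):
        last_called = 0.0

        @wraps(func)
        def wrapper(*args, **kwargs):
            nonlocal last_called
            wait = min_interval - (time.time() - last_called)
            if wait > 0:
                time.sleep(wait)
            last_called = time.time()
            return func(*args, **kwargs)
        return wrapper
    return decorator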
Example #27
# def createDaemon():
#   try:
#     # Store the Fork PID
#     pid = os.fork()
#     if pid > 0:
#       print('PID: ',pid)
#       os._exit(0)
#   except OSError as e:
#     print('Unable to fork. Error:', e.errno, e.strerror)
#     os._exit(1)

#   runTask()

import datetime
import time

from sodapy import Socrata

if __name__ == '__main__':
    # API config (Do not change)
    app_token = "1CKHfUB8qIpEQKUM1JNdiEK1N"
    socrata_dataset_identifier = "xdss-u53e"

    client = Socrata("health.data.ny.gov", app_token)
    metadata = client.get_metadata(socrata_dataset_identifier)

    with open("Last updated.txt", 'r') as fp:
        last_updated = fp.read()
        last_updated_obj = datetime.datetime.strptime(last_updated,
                                                      '%Y-%m-%dT00:00:00.000')

    while True:
        runTask()
        with open("logs.txt", "a+") as fp:
            fp.write("Last run at " + str(time.ctime()) + "\n")
        time.sleep(600)
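# The script above reads the saved timestamp, but the comparison against the
# dataset's metadata is not shown. One way runTask() could decide whether new
# rows were published is to compare it with the 'rowsUpdatedAt' field (a Unix
# timestamp) returned by client.get_metadata(); the helper below is a
# hypothetical sketch, not part of the original script:
def has_new_data(client, dataset_id, last_updated_obj):
    """Return True if the dataset was updated after the saved timestamp."""
    metadata = client.get_metadata(dataset_id)
    rows_updated = datetime.datetime.fromtimestamp(metadata['rowsUpdatedAt'])
    return rows_updated > last_updated_obj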
Example #28
# dataset_identifier: '9mfq-cb36'
# sample query:
# https://data.cdc.gov/resource/9mfq-cb36.json?submission_date=2021-02-20T00:00:00.000

import requests
import json
import os
from sodapy import Socrata

socrata_domain = 'data.cdc.gov'
socrata_dataset_identifier = '9mfq-cb36'
socrata_token = os.environ['SOCRATA_APP_TOKEN']  # token set in secrets.sh and exported as an environment variable

client = Socrata(socrata_domain, socrata_token)

metadata = client.get_metadata('9mfq-cb36') # this is the metadata, not the data
########### FINDING OUT ABOUT THE METADATA #############
#print(metadata.keys())
    #OUTPUT: dict_keys(['id', 'name', 'assetType', 'attribution', 'attributionLink', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'licenseId', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'columns', 'grants', 'license', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])

#print(metadata['name'])
    #OUTPUT: United States COVID-19 Cases and Deaths by State over Time

#print([x['name'] for x in metadata['columns']], ('***'*20))
    #OUTPUT: ['submission_date', 'state', 'tot_cases', 'conf_cases', 'prob_cases', 'new_case', 'pnew_case', 'tot_death', 'conf_death', 'prob_death', 'new_death', 'pnew_death', 'created_at', 'consent_cases', 'consent_deaths']


#meta_amount = [x for x in metadata['columns'] if x['name'] == 'submission_date'][0]
#print(len(meta_amount), meta_amount)
    #OUTPUT: 10 {'id': 481357041, 'name': 'submission_date', 'dataTypeName': 'calendar_date', 'description': 'Date of counts', 'fieldName': 'submission_date', 'position': 1, 'renderTypeName': 'calendar_date', 'tableColumnId': 105526416, 'cachedContents': {'non_null': '24120', 'largest': '2021-02-26T00:00:00.000', 'null': '0', 'top': [{'item': '2020-03-28T00:00:00.000', 'count': '60'}, {'item': '2020-11-27T00:00:00.000', 'count': '60'}, {'item': '2021-01-07T00:00:00.000', 'count': '60'}, {'item': '2021-01-30T00:00:00.000', 'count': '60'}, {'item': '2020-05-08T00:00:00.000', 'count': '60'}, {'item': '2020-04-11T00:00:00.000', 'count': '60'}, {'item': '2021-01-17T00:00:00.000', 'count': '60'}, {'item': '2020-10-28T00:00:00.000', 'count': '60'}, {'item': '2020-09-04T00:00:00.000', 'count': '60'}, {'item': '2020-03-25T00:00:00.000', 'count': '60'}, {'item': '2020-05-01T00:00:00.000', 'count': '60'}, {'item': '2020-08-14T00:00:00.000', 'count': '60'}, {'item': '2020-03-14T00:00:00.000', 'count': '60'}, {'item': '2020-11-17T00:00:00.000', 'count': '60'}, {'item': '2020-08-02T00:00:00.000', 'count': '60'}, {'item': '2020-07-09T00:00:00.000', 'count': '60'}, {'item': '2020-01-29T00:00:00.000', 'count': '60'}, {'item': '2020-07-10T00:00:00.000', 'count': '60'}, {'item': '2020-12-25T00:00:00.000', 'count': '60'}, {'item': '2020-04-19T00:00:00.000', 'count': '60'}], 'smallest': '2020-01-22T00:00:00.000', 'cardinality': '402'}, 'format': {'view': 'date'}}
    ### Here you can see the most recent and earliest dates ('largest' and 'smallest'), and that there are 24120 non-null values
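########### QUERYING THE DATA #############
# The sample URL in the comments at the top of this example can also be issued
# through sodapy's client.get() with a SoQL 'where' filter. A short sketch --
# the date mirrors the sample URL, and the printed fields come from the column
# list above:
results = client.get(socrata_dataset_identifier,
                     where="submission_date = '2021-02-20T00:00:00.000'",
                     order="state")
print(len(results))                                 # one row per reporting jurisdiction
print(results[0]['state'], results[0]['new_case'])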