Example #1
0
def test_stream_reset_on_close_issue_190():
    """Re-opening a stream must reset its row pointer (regression for issue #190)."""
    source = [['1', 'english'], ['2', '中国人']]
    stream = Stream(source)
    stream.open()
    # BUG FIX: the original lines lacked `assert`, so the comparisons were
    # evaluated and discarded — the test could never fail.
    assert stream.read(limit=1) == [['1', 'english']]
    # Re-open without an explicit close; the first row must be returned again.
    stream.open()
    assert stream.read(limit=1) == [['1', 'english']]
    stream.close()
Example #2
0
def test_stream_local_csv_zip_multiple_open():
    """A zipped local CSV can be opened, read, and closed repeatedly.

    Mimics how `tableschema.iter()` drives a stream: open/read/close twice
    and expect identical results each pass.
    """
    expected = [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream = Stream('data/table.csv.zip')
    for _ in range(2):
        stream.open()
        # No `headers` option was given, so headers stay unset.
        assert stream.headers is None
        assert stream.read() == expected
        stream.close()
Example #3
0
def read_list_from_csv(
    url: str,
    headers: Union[int, List[int], List[str], None] = None,
    dict_form: bool = False,
    **kwargs: Any,
) -> List[Union[Dict, List]]:
    """Read a list of rows in dict or list form from a csv. The headers argument is either a row
       number or list of row numbers (in case of multi-line headers) to be considered as headers
       (rows start counting at 1), or the actual headers defined a list of strings. If not set,
       all rows will be treated as containing values.

    Args:
        url (str): URL or path to read from
        headers (Union[int, List[int], List[str], None]): Row number of headers. Defaults to None.
        dict_form (bool): Return dict (requires headers parameter) or list for each row. Defaults to False (list)
        **kwargs: Other arguments to pass to Tabulator Stream

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form

    Raises:
        ValueError: If dict_form is True but headers is None.
    """
    if dict_form and headers is None:
        raise ValueError("If dict_form is True, headers must not be None!")
    stream = Stream(url, headers=headers, **kwargs)
    stream.open()
    # FIX: close the stream even when read() raises — the original leaked the
    # underlying file/connection on any read failure.
    try:
        return stream.read(keyed=dict_form)
    finally:
        stream.close()
Example #4
0
def read_list_from_csv(filepath, dict_form=False, headers=None):
    # type: (str, bool, Optional[int]) -> List[Union[Dict, List]]
    """Read a list of rows in dict or list form from a csv.

    Args:
        filepath (str): Path to read from
        dict_form (bool): Return in dict form. Defaults to False.
        headers (Optional[int]): Row number of headers. Defaults to None.

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form

    """
    # (Docstring fixed: `headers` was documented as Optional[List[str]],
    # contradicting the Optional[int] type comment above.)
    stream = Stream(filepath, headers=headers)
    stream.open()
    # FIX: guarantee close() runs even if read() raises, so the underlying
    # file handle is never leaked.
    try:
        return stream.read(keyed=dict_form)
    finally:
        stream.close()
Example #5
0
    def create_datastore(self,
                         schema=None,
                         primary_key=None,
                         delete_first=0,
                         path=None):
        # type: (Optional[List[dict]], Optional[str], Optional[int], Optional[str]) -> None
        """For csvs, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
        all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

        Args:
            schema (List[dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
            primary_key (Optional[str]): Primary key of schema. Defaults to None.
            delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
            path (Optional[str]): Local path to file that was uploaded. Defaults to None.

        Returns:
            None

        Raises:
            HDXError: If delete_first is not 0, 1 or 2, if no URL is available,
                or (chained) if any step of the datastore upload fails.
        """
        # Step 1: optionally delete any pre-existing datastore.
        if delete_first == 0:
            pass
        elif delete_first == 1:
            self.delete_datastore()
        elif delete_first == 2:
            # Mode 2: delete only when no primary key is supplied.
            if primary_key is None:
                self.delete_datastore()
        else:
            raise HDXError(
                'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)'
            )
        # Step 2: obtain a local file to stream from. If no path was given,
        # download the resource from HDX and remember to clean up afterwards.
        if path is None:
            # Download the resource
            url, path = self.download()
            delete_after_download = True
        else:
            url = self.data.get('url', None)
            if not url:
                raise HDXError('No URL to download!')
            delete_after_download = False

        zip_path = None
        stream = None
        try:
            # Step 3: if the file is a zip, extract its first member next to
            # the archive and stream from the extracted file instead.
            extension = splitext(path)[1]
            if extension.lower() == '.zip':
                zip_file = zipfile.ZipFile(path)
                # NOTE(review): only the first archive member is used —
                # confirm multi-file zips are not expected here.
                filename = zip_file.namelist()[0]
                tempdir = dirname(abspath(path))
                zip_file.extract(filename, tempdir)
                zip_path = path
                path = join(tempdir, filename)

            # Post-parse hook: coerce every cell to str so the datastore
            # receives uniform text values.
            def convert_to_text(extended_rows):
                for number, headers, row in extended_rows:
                    for i, val in enumerate(row):
                        row[i] = str(val)
                    yield (number, headers, row)

            # Module-wide side effect: enlarge tabulator's byte sample,
            # presumably so format/encoding detection sees more of the file.
            tabulator.config.BYTES_SAMPLE_SIZE = 1000000
            stream = Stream(path, headers=1, post_parse=[convert_to_text])
            stream.open()
            # Step 4: build a default all-text schema from the header row
            # when the caller did not provide one.
            if schema is None:
                schema = list()
                for fieldname in stream.headers:
                    schema.append({'id': fieldname, 'type': 'text'})
            data = {
                'resource_id': self.data['id'],
                'force': True,
                'fields': schema,
                'primary_key': primary_key
            }
            self._write_to_hdx('datastore_create', data, 'id')
            # Without a primary key rows can only be inserted; with one,
            # upsert allows re-runs to update existing rows.
            if primary_key is None:
                method = 'insert'
            else:
                method = 'upsert'
            logger.debug('Uploading data from %s to datastore' % url)
            # Step 5: push rows in chunks until the stream is exhausted.
            offset = 0
            chunksize = 100
            rowset = stream.read(keyed=True, limit=chunksize)
            while len(rowset) != 0:
                data = {
                    'resource_id': self.data['id'],
                    'force': True,
                    'method': method,
                    'records': rowset
                }
                self._write_to_hdx('datastore_upsert', data, 'id')
                rowset = stream.read(keyed=True, limit=chunksize)
                # NOTE(review): offset is logged before being incremented, so
                # the logged value lags the rows actually uploaded by one chunk.
                logger.debug('Uploading: %s' % offset)
                offset += chunksize
        except Exception as e:
            # Wrap any failure in HDXError, chaining the original cause.
            six.raise_from(HDXError('Upload to datastore of %s failed!' % url),
                           e)
        finally:
            # Step 6: cleanup. Close the stream, then remove whatever local
            # artefacts this call created.
            if stream:
                stream.close()
            if delete_after_download:
                # We downloaded the file ourselves: remove it (and the zip it
                # came from, if any).
                unlink(path)
                if zip_path:
                    unlink(zip_path)
            else:
                if zip_path:
                    unlink(
                        path
                    )  # ie. we keep the zip but remove the extracted file
Example #6
0
def test_stream_read_closed():
    """Reading a stream that was never opened raises a helpful error."""
    stream = Stream('data/table.csv')
    with pytest.raises(exceptions.TabulatorException) as error_info:
        stream.read()
    # The message should point the user at the remedy.
    assert 'stream.open()' in str(error_info.value)
Example #7
0
def test_stream_source_error_data():
    """Inline JSON that is a list of scalars (not rows) must raise SourceError."""
    stream = Stream('[1,2]', scheme='text', format='json')
    # FIX: dropped the unused `as excinfo` capture — nothing inspected it.
    with pytest.raises(exceptions.SourceError):
        stream.open()
        stream.read()
Example #8
0
def test_stream_source_error_data():
    """Inline JSON that is a list of scalars (not rows) must raise SourceError."""
    stream = Stream('[1,2]', scheme='text', format='json')
    # FIX: dropped the unused `as excinfo` capture — nothing inspected it.
    with pytest.raises(exceptions.SourceError):
        stream.open()
        stream.read()