Пример #1
0
def download(state, year, cache=True):
    """
    Download one SINASC file from the Datasus FTP server.

    A parquet copy found under ``CACHEPATH`` is returned without contacting
    the server; this lookup happens regardless of ``cache``.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: when True, store the downloaded data as parquet under
        ``CACHEPATH``
    :return: pandas dataframe
    :raises ValueError: if ``year`` is earlier than 1994
    """
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SINASC/NOV/DNRES'
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SINASC/ANT/DNRES'
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH, 'SINASC_' + fname.split('.')[0] + '_.parquet')
    # Look at the cache before opening a connection so a cache hit needs
    # no network access at all.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)

    try:
        # Both context managers guarantee the connection and the local file
        # handle are closed, even when the transfer fails midway.
        with FTP('ftp.datasus.gov.br') as ftp:
            ftp.login()
            ftp.cwd(ftp_dir)
            with open(fname, 'wb') as local_file:
                ftp.retrbinary('RETR {}'.format(fname), local_file.write)
        df = read_dbc(fname, encoding='iso-8859-1')
    finally:
        # The downloaded file is only a temporary artifact; never leave it
        # behind, not even a partial one.
        if os.path.exists(fname):
            os.unlink(fname)
    if cache:
        df.to_parquet(cachefile)
    return df
Пример #2
0
def download(state, year, cache=True):
    """
    Download one SINASC file from the Datasus FTP server.

    A parquet copy found under ``CACHEPATH`` is returned without contacting
    the server; this lookup happens regardless of ``cache``.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: when True, store the downloaded data as parquet under
        ``CACHEPATH``
    :return: pandas dataframe
    :raises ValueError: if ``year`` does not have 4 digits or is earlier
        than 1994
    """
    # Explicit validation instead of ``assert``: assertions are stripped
    # when Python runs with -O, which would silently disable the check.
    if len(str(year)) != 4:
        raise ValueError("year must be a 4 digit integer")
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    if year >= 1996:
        ftp_dir = "/dissemin/publicos/SINASC/NOV/DNRES"
        fname = "DN{}{}.DBC".format(state, year)
    else:
        ftp_dir = "/dissemin/publicos/SINASC/ANT/DNRES"
        fname = "DNR{}{}.DBC".format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH,
                             "SINASC_" + fname.split(".")[0] + "_.parquet")
    # Look at the cache before opening a connection so a cache hit needs
    # no network access at all.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)

    try:
        # Both context managers guarantee the connection and the local file
        # handle are closed, even when the transfer fails midway.
        with FTP("ftp.datasus.gov.br") as ftp:
            ftp.login()
            ftp.cwd(ftp_dir)
            with open(fname, "wb") as local_file:
                ftp.retrbinary("RETR {}".format(fname), local_file.write)
        df = read_dbc(fname, encoding="iso-8859-1")
    finally:
        # The downloaded file is only a temporary artifact; never leave it
        # behind, not even a partial one.
        if os.path.exists(fname):
            os.unlink(fname)
    if cache:
        df.to_parquet(cachefile)
    return df
Пример #3
0
def read_data_from_state(state, year=2017):
    """
    Read the ``./data/DN<state><year>.dbc`` file into a pandas data frame.

    :param state: the Brazilian state code (e.g. "MG") used to build the
        file name
    :param year: year used to build the file name; defaults to 2017, the
        value that was previously hard-coded
    :return: pandas dataframe
    """
    print(f"reading {state} dbc file")
    fname = f'./data/DN{state}{year}.dbc'
    return read_dbc(fname, encoding='iso-8859-1')
Пример #4
0
def _fetch_file(fname, ftp, ftype):
    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except:
        raise Exception("File {} not available".format(fname))
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    os.unlink(fname)
    return df
Пример #5
0
def _fetch_file(fname, ftp, ftype):
    try:
        ftp.retrbinary("RETR {}".format(fname), open(fname, "wb").write)
    except error_perm:
        raise Exception("File {} not available".format(fname))
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))
    os.unlink(fname)
    return df
Пример #6
0
def download(state, year, cache=True, folder=None):
    """
    Load one SIM file, either from a local folder or from the Datasus FTP
    server.

    When ``folder`` is given the file is read from ``<folder>/<file name>``
    and neither the cache nor the server is consulted. Otherwise a parquet
    copy under ``CACHEPATH`` is used when ``cache`` is true and it exists,
    and the file is downloaded when it does not.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: when True, read from / write to the parquet cache under
        ``CACHEPATH``; when False the cache is neither read nor written
    :param folder: optional local directory that already holds the file
    :return: pandas dataframe
    :raises ValueError: if ``year`` is earlier than 1979
    :raises Exception: if the file cannot be retrieved from the server
    """
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SIM/CID10/DORES'
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SIM/CID9/DORES'
        fname = 'DOR{}{}.DBC'.format(state, str(year)[-2:].zfill(2))

    cachefile = os.path.join(CACHEPATH,
                             'SIM_' + fname.split('.')[0] + '_.parquet')

    if folder:
        # With a folder, the cache is not tried and nothing is downloaded.
        fname = "{}/{}".format(folder, fname)
        df = read_dbc(fname, encoding='iso-8859-1')
        if cache:
            df.to_parquet(cachefile)
        # NOTE(review): the source file inside ``folder`` is deleted after a
        # successful read, as before - confirm that callers expect this.
        os.unlink(fname)
        return df

    if cache and os.path.exists(cachefile):
        return pd.read_parquet(cachefile)

    try:
        # Context managers close the connection and the local file handle
        # even when a transfer fails.
        with FTP('ftp.datasus.gov.br') as ftp:
            ftp.login()
            ftp.cwd(ftp_dir)
            last_error = None
            # Second attempt uses the upper-cased name, as before.
            for remote_name in (fname, fname.upper()):
                try:
                    with open(fname, 'wb') as local_file:
                        ftp.retrbinary('RETR {}'.format(remote_name),
                                       local_file.write)
                except Exception as err:
                    last_error = err
                else:
                    break
            else:
                raise Exception(
                    "File {} not available".format(fname)) from last_error
        df = read_dbc(fname, encoding='iso-8859-1')
    finally:
        # Never leave the temporary (possibly partial) download behind.
        if os.path.exists(fname):
            os.unlink(fname)

    # Only write the cache when caching was requested.
    if cache:
        df.to_parquet(cachefile)
    return df
Пример #7
0
def _fetch_file(fname, path, ftype):
    ftp = FTP("ftp.datasus.gov.br")
    ftp.login()
    ftp.cwd(path)
    try:
        ftp.retrbinary("RETR {}".format(fname), open(fname, "wb").write)
    except:
        raise Exception("File {} not available".format(fname))
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))
    os.unlink(fname)
    return df
Пример #8
0
def download(state, year, disease, cache=True):
    """
    Downloads SINAN data directly from Datasus ftp server.

    A parquet copy found under ``CACHEPATH`` is returned without contacting
    the server; this lookup happens regardless of ``cache``.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param disease: disease name; its title-cased form must be a key of
        ``agravos``
    :param cache: when True, store the downloaded data as parquet under
        ``CACHEPATH``
    :return: pandas dataframe
    :raises ValueError: if ``year`` is earlier than 2007
    :raises KeyError: if the disease is not a key of ``agravos``
    :raises Exception: if the file cannot be retrieved from the server
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 2007:
        raise ValueError("SINAN does not contain data before 2007")

    disease_key = disease.title()
    if disease_key not in agravos:
        # Fail immediately instead of printing and then hitting the same
        # KeyError only after a connection was opened. An explicit check is
        # used because ``assert`` is stripped when Python runs with -O.
        print(
            f'Disease {disease} is not available in SINAN.\nAvailable diseases: {list_diseases()}'
        )
        raise KeyError(disease_key)
    dis_code = agravos[disease_key]
    fname = f'{dis_code}{state}{year2}.DBC'

    cachefile = os.path.join(CACHEPATH,
                             'SINAN_' + fname.split('.')[0] + '_.parquet')
    # Look at the cache before opening a connection so a cache hit needs
    # no network access at all.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)

    try:
        # Context managers close the connection and the local file handle
        # even when a transfer fails.
        with FTP('ftp.datasus.gov.br') as ftp:
            ftp.login()
            ftp.cwd("/dissemin/publicos/SINAN/DADOS/FINAIS")
            last_error = None
            # Second attempt uses the upper-cased name, as before.
            for remote_name in (fname, fname.upper()):
                try:
                    with open(fname, 'wb') as local_file:
                        ftp.retrbinary('RETR {}'.format(remote_name),
                                       local_file.write)
                except Exception as err:
                    last_error = err
                else:
                    break
            else:
                raise Exception(
                    "{}\nFile {} not available".format(last_error, fname)
                ) from last_error
        df = read_dbc(fname, encoding='iso-8859-1')
    finally:
        # Never leave the temporary (possibly partial) download behind.
        if os.path.exists(fname):
            os.unlink(fname)
    if cache:
        df.to_parquet(cachefile)
    return df
Пример #9
0
def _fetch_file(fname: str, path: str, ftype: str) -> pd.DataFrame:
    """
    Fetch a single file.
    :return:
    Pandas Dataframe
    """
    ftp = FTP("ftp.datasus.gov.br")
    ftp.login()
    ftp.cwd(path)
    try:
        ftp.retrbinary("RETR {}".format(fname), open(fname, "wb").write)
    except:
        raise Exception("File {} not available".format(fname))
    if ftype == "DBC":
        df = read_dbc(fname, encoding="iso-8859-1")
    elif ftype == "DBF":
        dbf = DBF(fname, encoding="iso-8859-1")
        df = pd.DataFrame(list(dbf))

    if os.path.exists(fname):
        os.unlink(fname)
    return df
Пример #10
0
def download(state, year):
    """
    Download one SINASC file from the Datasus FTP server.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    :raises ValueError: if ``year`` is earlier than 1994
    """
    # The file names built below use an upper-case state code, so accept
    # lower-case input as well.
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SINASC/NOV/DNRES'
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SINASC/ANT/DNRES'
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])

    try:
        # Both context managers guarantee the connection and the local file
        # handle are closed, even when the transfer fails midway.
        with FTP('ftp.datasus.gov.br') as ftp:
            ftp.login()
            ftp.cwd(ftp_dir)
            with open(fname, 'wb') as local_file:
                ftp.retrbinary('RETR {}'.format(fname), local_file.write)
        return read_dbc(fname, encoding='iso-8859-1')
    finally:
        # Never leave the temporary (possibly partial) download behind.
        if os.path.exists(fname):
            os.unlink(fname)
Пример #11
0
def _fetch_file(fname, ftp, ftype):
    """
    Does the FTP fetching.
    :param fname: file name
    :param ftp: ftp connection object
    :param ftype: file type: DBF|DBC
    :return: pandas dataframe
    """
    print("Downloading {}...".format(fname))
    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except:
        try:
            ftp.retrbinary('RETR {}'.format(fname.lower()), open(fname, 'wb').write)
        except:
            raise Exception("File {} not available".format(fname))
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    os.unlink(fname)
    return df
Пример #12
0
def _fetch_file(fname, ftp, ftype):
    """
    Does the FTP fetching.
    :param fname: file name
    :param ftp: ftp connection object
    :param ftype: file type: DBF|DBC
    :return: pandas dataframe
    """
    print(f'Downloading {fname}...')
    try:
        ftp.retrbinary(f'RETR {fname}', open(fname, 'wb').write)
    except:
        try:
            ftp.retrbinary(f'RETR {fname.lower()}', open(fname, 'wb').write)
        except:
            raise Exception(f'File {fname} not available')
    if ftype == 'DBC':
        df = read_dbc(fname, encoding='iso-8859-1')
    elif ftype == 'DBF':
        dbf = DBF(fname, encoding='iso-8859-1')
        df = pd.DataFrame(list(dbf))
    os.unlink(fname)
    return df
Пример #13
0
def download(state, year, cache=True):
    """
    Download one SIM file from the Datasus FTP server.

    A parquet copy found under ``CACHEPATH`` is returned without contacting
    the server; this lookup happens regardless of ``cache``.

    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :param cache: when True, store the downloaded data as parquet under
        ``CACHEPATH``
    :return: pandas dataframe
    :raises ValueError: if ``year`` is earlier than 1979
    :raises Exception: if the file cannot be retrieved from the server
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    if year >= 1996:
        ftp_dir = '/dissemin/publicos/SIM/CID10/DORES'
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp_dir = '/dissemin/publicos/SIM/CID9/DORES'
        fname = 'DOR{}{}.DBC'.format(state, year2)
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    # Look at the cache before opening a connection so a cache hit needs
    # no network access at all.
    if os.path.exists(cachefile):
        return pd.read_parquet(cachefile)

    try:
        # Context managers close the connection and the local file handle
        # even when a transfer fails.
        with FTP('ftp.datasus.gov.br') as ftp:
            ftp.login()
            ftp.cwd(ftp_dir)
            last_error = None
            # Second attempt uses the upper-cased name, as before.
            for remote_name in (fname, fname.upper()):
                try:
                    with open(fname, 'wb') as local_file:
                        ftp.retrbinary('RETR {}'.format(remote_name),
                                       local_file.write)
                except Exception as err:
                    # ``Exception`` rather than a bare except, so
                    # KeyboardInterrupt and SystemExit are not swallowed.
                    last_error = err
                else:
                    break
            else:
                raise Exception(
                    "File {} not available".format(fname)) from last_error
        df = read_dbc(fname, encoding='iso-8859-1')
    finally:
        # Never leave the temporary (possibly partial) download behind.
        if os.path.exists(fname):
            os.unlink(fname)
    if cache:
        df.to_parquet(cachefile)
    return df
Пример #14
0
 def test_read_dbc(self):
     """read_dbc on the bundled sample file yields a non-empty DataFrame."""
     sample_path = b"test_data/sids.dbc"
     loaded = read_dbc(sample_path)
     # Check the type first, then that at least one cell was read.
     self.assertIsInstance(loaded, pd.DataFrame)
     self.assertGreater(loaded.size, 0)
Пример #15
0
 def test_read_dbc(self):
     """read_dbc on the bundled sample file yields a non-empty DataFrame."""
     sample_path = b'test_data/sids.dbc'
     loaded = read_dbc(sample_path)
     # Check the type first, then that at least one cell was read.
     self.assertIsInstance(loaded, pd.DataFrame)
     self.assertGreater(loaded.size, 0)