Example #1
def read_sas(file_path):
    pieces = []
    with sas.SAS7BDAT(file_path) as f:
        pieces.append(f.to_data_frame())
        data = pd.concat(pieces)
    # data = data[data['HUBSUPDO_HUB_PART_NUM'].str[0] != 'U']
    return data
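This example assumes the sas7bdat package was imported under the alias sas and pandas as pd. A minimal usage sketch under those assumptions (the file path is a placeholder):

import sas7bdat as sas  # assumed alias used by the example above
import pandas as pd

df = read_sas('data.sas7bdat')  # placeholder path
print(df.head())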
Example #2
    def build(self):
        import sas7bdat

        for k, v in self.metadata.sources.items():
            zfn = self.filesystem.download(k)
            fn = self.filesystem.unzip(zfn)

            print(fn)

            sas = sas7bdat.SAS7BDAT(fn)

            for row in sas.readData():
                print(row)
Example #3
    def test_sas7bdat(self):       
        import sas7bdat
        import swat.tests as st

        myFile = os.path.join(os.path.dirname(st.__file__), 'datasources', 'cars.sas7bdat')

        dmh = swat.datamsghandlers.SAS7BDAT(myFile)

        tbl = self.s.addtable(table='cars', **dmh.args.addtable).casTable

        f = tbl.to_frame()
        s = sas7bdat.SAS7BDAT(myFile).to_data_frame()

        self.assertTablesEqual(f, s, sortby=SORT_KEYS)
Example #4
    def iterload(self):
        import sas7bdat
        self.dat = sas7bdat.SAS7BDAT(str(self.source),
                                     skip_header=True,
                                     log_level=logging.CRITICAL)
        self.columns = []
        for col in self.dat.columns:
            self.addColumn(
                ColumnItem(col.name.decode('utf-8'),
                           col.col_id,
                           type=SASTypes.get(col.type, anytype)))

        with self.dat as fp:
            yield from Progress(fp, total=self.dat.properties.row_count)
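SASTypes and anytype are not defined in this excerpt; they come from the surrounding loader code. A plausible stand-in, stated purely as an assumption, maps the sas7bdat column types to Python types:

# Assumed definitions; the original project supplies its own SASTypes and anytype.
anytype = str
SASTypes = {
    'string': str,
    'number': float,
}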
Example #5
def convert(path):
    print(path)
    files = glob(path)
    print('Converting %d files in %s' % (len(files), path))
    for f in files:
        if f[-9:] == '.sas7bdat':
            fout = f[:-9] + '.csv'
            try:
                raw = sas7bdat.SAS7BDAT(f)
                raw.convert_file(fout)
            except:
                try:
                    traceback.print_last()
                except:
                    print('Exception trying to print exception: %s' % f)
Example #6
def aggregate_data(sourcefile, targetfile):
    reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile),
                               skip_header=True)
    # map column names to column indices
    COL2IDX = {
        column.name.decode('utf-8'): column.col_id
        for column in reader.columns
    }

    def get_field(row, column):
        return row[COL2IDX[column]]

    def row2flu(row):
        for i in range(1, 9):
            dx = get_field(row, "dx{}".format(i))
            flu_cat = get_flu_cat(dx)
            if flu_cat is not None:
                return flu_cat
        return 0

    def row2epiweek(row):
        date = get_field(row, 'd_event')
        year, month, day = date.year, date.month, date.day
        week_tuple = epi.Week.fromdate(year, month, day).weektuple()
        year, week_num = week_tuple[0], week_tuple[1]
        return year, week_num

    results_dict = dict()
    for r, row in enumerate(reader):
        # if (r >= 1000000): break
        if get_field(row, 'type') != "Outpt":
            continue
        year, week_num = row2epiweek(row)
        dmisid = get_field(row, 'DMISID')
        flu_cat = row2flu(row)

        key_list = [year, week_num, dmisid, flu_cat]
        curr_dict = results_dict
        for i, key in enumerate(key_list):
            if i == len(key_list) - 1:
                if key not in curr_dict:
                    curr_dict[key] = 0
                curr_dict[key] += 1
            else:
                if key not in curr_dict:
                    curr_dict[key] = dict()
                curr_dict = curr_dict[key]

    results_path = os.path.join(TARGET_DIR, targetfile)
    with open(results_path, 'wb') as f:
        pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL)
    return
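The nested results_dict built above is effectively a counter keyed by (year, week_num, dmisid, flu_cat). A more compact equivalent using collections.Counter, shown only as a sketch and not part of the original code:

from collections import Counter

counts = Counter()  # flat counter instead of nested dicts
for key in [(2020, 5, '0001', 1), (2020, 5, '0001', 1)]:  # stand-in keys
    counts[key] += 1
assert counts[(2020, 5, '0001', 1)] == 2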
Example #7
    def reload(self):
        import sas7bdat
        self.dat = sas7bdat.SAS7BDAT(self.source.resolve(),
                                     skip_header=True,
                                     log_level=logging.CRITICAL)
        self.columns = []
        for col in self.dat.columns:
            self.addColumn(
                ColumnItem(col.name.decode('utf-8'),
                           col.col_id,
                           type=SASTypes.get(col.type, anytype)))

        with self.dat as fp:
            self.rows = []
            for row in Progress(fp, total=self.dat.properties.row_count):
                self.rows.append(row)
Example #8
    def test_sas7bdat(self):
        try:
            import sas7bdat
        except ImportError:
            tm.TestCase.skipTest(self, 'sas7bdat package is not available')

        import swat.tests as st

        myFile = os.path.join(os.path.dirname(st.__file__), 'datasources',
                              'cars.sas7bdat')

        dmh = swat.datamsghandlers.SAS7BDAT(myFile)

        tbl = self.s.addtable(table='cars', **dmh.args.addtable).casTable

        f = tbl.to_frame()
        s = sas7bdat.SAS7BDAT(myFile).to_data_frame()

        self.assertTablesEqual(f, s, sortby=SORT_KEYS)
Example #9
def convert_files(files: List[str]) -> None:
    """Create a csv file for each matching sas7bdat file."""

    matching = [f for f in files if f.endswith('.sas7bdat')]
    num_files = len(matching)
    if num_files != len(files):
        print(
            f'Ignoring {len(files) - num_files:d}/{len(files):d} files that were not '
            '.sas7bdat files.')
    print(f'Attempting to convert {num_files:d} files')

    for i, filename in enumerate(sorted(matching)):
        csv_filename = filename.rsplit('.sas7bdat', 1)[0] + '.csv'
        print(f'{i + 1:d}/{num_files:d}: Converting {filename} to {csv_filename}')
        if os.path.exists(csv_filename):
            print('Skipping, resulting csv already exists')
            continue
        with sas7bdat.SAS7BDAT(filename) as infile:
            infile.convert_file(csv_filename)
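A minimal way to drive convert_files from the command line, assuming shell-expanded paths are passed as arguments (this entry point is an assumption, not part of the original):

import sys

if __name__ == '__main__':
    convert_files(sys.argv[1:])  # e.g. python convert.py data/*.sas7bdat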
Example #10
def flatten_iterator(files_pattern: str) -> Iterator[Dict[str, str]]:
    """Iterate over all FHS files as if they were one big file.

    It iterates through the file and yields each result separately. It adds an
    extra field at the end '__file__' which contains the filename from which the
    record was extracted.

    Args:
        files_pattern: a glob pattern for the files to flatten. They should all
            have the same schema. Must end with .csv or .sas7bdat.

    Yields:
        each record as a dict using the headers as keys.
    """

    files = glob.glob(files_pattern)
    if not files:
        raise ValueError(f'No files found matching {files_pattern}')

    headers = None

    print(f'Flattening {len(files):d} files')

    for current_file in sorted(files):
        reader = None
        if files_pattern.endswith('sas7bdat'):
            reader = sas7bdat.SAS7BDAT(current_file).readlines()
        elif files_pattern.endswith('csv'):
            reader = csv.reader(open(current_file))
        else:
            raise ValueError(
                f'Can only process .csv and .sas7bdat files. Got pattern {files_pattern}'
            )
        header_line = next(reader)
        if headers is None:
            headers = header_line + ['__file__']
        elif headers[:-1] != header_line:
            raise ValueError(
                f"Headers from file {current_file} don't match those of previous "  # pylint: disable=unsubscriptable-object
                f'files. Was expecting:\n{headers[:-1]}\n'
                f'  got:\n{header_line}')
        for line in reader:
            yield dict(zip(headers, line + [current_file]))
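Each yielded record is a plain dict whose keys are the shared header plus '__file__'. A small consumption sketch (the glob pattern is a placeholder):

for record in flatten_iterator('data/*.csv'):  # placeholder pattern
    print(record['__file__'], record)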
Example #11
def read_sas(filePath: 'string') -> 'dataFrame':
    pieces = []
    with sas.SAS7BDAT(filePath) as f:
        pieces.append(f.to_data_frame())
        data = pd.concat(pieces)
    return data
Example #12
 def __init__(self, path, nrecs=1000, transformers=None, **kwargs):
     import sas7bdat
     super(SAS7BDAT,
           self).__init__(sas7bdat.SAS7BDAT(path, **kwargs).to_data_frame(),
                          nrecs=nrecs,
                          transformers=transformers)
Example #13
def main(args):

    filelist = [
        x for x in os.listdir(args.inputdir)
        if os.path.isfile(args.inputdir + x) and ".zip" in x
    ]

    filelist = group_by_filename(filelist)

    for grp in filelist:

        # skip these data sets as they require special handling
        if grp in ["AllClinical", "AccelData"]:
            continue

        # create a tmp directory for this group's extracted SAS files
        tmp_dir = "%s%s/" % (TMP_ROOT, grp)
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        bdatfmt = {}
        var_map, var_fmt, var_labels = {}, {}, {}

        for i, zipfname in enumerate(filelist[grp]):

            filename = "%s%s" % (args.inputdir, zipfname)
            zf = zipfile.ZipFile(filename, 'r')
            manifest = sorted(zf.namelist())
            sasfiles = [x for x in manifest if "sas7bdat" in x]

            if len(sasfiles) > 1:
                continue

            # dump SAS to a temporary file
            data = zf.read(sasfiles[0])
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, i)
            with open(tmpfile, "wb") as tmp:
                tmp.write(data)

            d = sas7bdat.SAS7BDAT(tmpfile)

            # SAS header format
            var_ids = [(col.name, col.format, col.label)
                       for col in d.header.parent.columns]
            for var, dtype, label in var_ids:
                key = (norm_col_name(var)
                       if var not in ["ID", "VERSION"] else var.lower())
                if key not in var_map:
                    var_map[key] = {}
                    var_fmt[key] = {}
                    var_labels[key] = []
                var_map[key][var] = 1
                var_fmt[key][dtype] = 1
                var_labels[key] += [label]

            # types actually produced by sas7bdat
            header = [
                norm_col_name(col.name) for col in d.header.parent.columns
            ]
            for idx, row in enumerate(d):
                if idx == 0:
                    continue

                for j in range(0, len(row)):
                    t = type(row[j])
                    if header[j] not in bdatfmt:
                        bdatfmt[header[j]] = {}
                    bdatfmt[header[j]][t] = bdatfmt[header[j]].get(t, 0) + 1

        # normalize labels
        for var in var_labels:
            var_labels[var] = norm_col_label(var_labels[var])

        var_fmt["id"] = var_fmt["version"] = {"$": 1}

        # assign majority type as column data type
        for var in bdatfmt:
            types = sorted(bdatfmt[var].items(),
                           key=operator.itemgetter(1),
                           reverse=1)
            bdatfmt[var] = types[0][0]

        # confirm consistent data types across all fields
        # use this as a data type in SQL schema
        sql_types = {}
        for var in var_fmt:
            if var not in bdatfmt:
                sys.stderr.write("FATAL ERROR -- exiting")
                sys.stderr.write("%s %s" % (grp, var))
                sys.exit()

            # unicode/str in the original Python 2 code; str/bytes under Python 3
            sql_types[var] = "TEXT" if bdatfmt[var] in (str, bytes) else "NUMERIC"

            if "MMDDYY" in var_fmt[var]:
                sql_types[var] = "DATE"

        # manually add visit column to tables containing
        # multiple visits
        if grp not in ["Outcomes"]:
            sql_types["vid"] = "INTEGER"

        pkeys = primary_key_defs[grp] if grp in primary_key_defs else []
        schema = create_table_schema(grp, sql_types, var_labels, pkeys)

        print(schema)
        print()

        if grp == "Enrollees":
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, 0)
            data = sas7bdat.SAS7BDAT(tmpfile)
            enrollees_sql_insert(grp, data, sql_types)
            continue

        # only 1 file (don't use VID)
        if len(filelist[grp]) == 1:
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, 0)
            data = sas7bdat.SAS7BDAT(tmpfile)
            sql_insert(grp, data, sql_types, vid=None)
            continue

        for i in range(0, len(filelist[grp])):
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, i)
            data = sas7bdat.SAS7BDAT(tmpfile)
            sql_insert(grp, data, sql_types, vid=i)
Example #14
                       columns=['col3', 'col4'],
                       index=[0, 1])
df1.join(df2, how='inner')  # inner join (outer, left and right are also available); joins directly on the index

# horizontal merge
df1 = pandas.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
df2 = pandas.DataFrame({'col2': [4, 5, 7], 'col3': [1, 2, 2]})
print(df1.merge(df2))  # inner merge on the shared col2 column; rows whose col2 values do not match are dropped
print(df1.merge(df2, how='left', left_on='col1',
                right_on='col3'))  # left join on explicitly specified key columns
"""
    数据输入、输出
"""
# 和SAS数据文件的交互,pandas.read_sas处理gb2312的中文是乱码,这里使用sas7bdat包中的函数,先转换成DataFrame
sasdata1 = pandas.read_sas('D:\\06-JerryTech\\pf_data1.sas7bdat')
sasdata2 = sas7bdat.SAS7BDAT('D:\\06-JerryTech\\pf_data1.sas7bdat',
                             encoding="gb2312").to_data_frame()
writer = pandas.ExcelWriter('D:\\06-JerryTech\\pf_data11.xlsx')
sasdata2.to_excel(writer)
writer.save()
"""
    其他常用、有用功能
"""
# 透视表,参数:index行,columns列,values要统计的列,aggfunc统计函数,margins加汇总值,normalize='index'算占比
df = pandas.DataFrame([[1, 11, 3], [4, 17, 6], [7, 14, 9], [7, 11, 12],
                       [4, 14, 15], [1, 17, 18]],
                      columns=["A", "B", "C"])
g = pandas.crosstab(index=df.A,
                    columns=df.B,
                    values=df.C,
                    aggfunc=numpy.mean,
                    margins=True,
Example #15
def readsas(file_path):
    pieces = []
    with sas.SAS7BDAT(file_path) as f:
        pieces.append(f.to_data_frame())
        data = pd.concat(pieces)
    return data
Example #16
 def __init__(self, fname):
     self.infile = sas.SAS7BDAT(fname)
     self.lines = self.infile.readlines()
     # For now, we just throw this out
     header = next(self.lines)  # noqa