def read_sas(file_path):
    pieces = []
    with sas.SAS7BDAT(file_path) as f:
        pieces.append(f.to_data_frame())
    data = pd.concat(pieces)
    # data = data[data['HUBSUPDO_HUB_PART_NUM'].str[0] != 'U']
    return data

def build(self):
    import sas7bdat
    for k, v in self.metadata.sources.items():
        zfn = self.filesystem.download(k)
        fn = self.filesystem.unzip(zfn)
        print(fn)
        sas = sas7bdat.SAS7BDAT(fn)
        for row in sas.readData():
            print(row)

def test_sas7bdat(self):
    import sas7bdat
    import swat.tests as st
    myFile = os.path.join(os.path.dirname(st.__file__),
                          'datasources', 'cars.sas7bdat')
    dmh = swat.datamsghandlers.SAS7BDAT(myFile)
    tbl = self.s.addtable(table='cars', **dmh.args.addtable).casTable
    f = tbl.to_frame()
    s = sas7bdat.SAS7BDAT(myFile).to_data_frame()
    self.assertTablesEqual(f, s, sortby=SORT_KEYS)

def iterload(self):
    import sas7bdat
    self.dat = sas7bdat.SAS7BDAT(str(self.source),
                                 skip_header=True,
                                 log_level=logging.CRITICAL)
    self.columns = []
    for col in self.dat.columns:
        self.addColumn(
            ColumnItem(col.name.decode('utf-8'), col.col_id,
                       type=SASTypes.get(col.type, anytype)))
    with self.dat as fp:
        yield from Progress(fp, total=self.dat.properties.row_count)

def convert(path):
    print(path)
    files = glob(path)
    print('Converting %d files in %s' % (len(files), path))
    for f in files:
        if f[-9:] == '.sas7bdat':
            fout = f[:-9] + '.csv'
            try:
                raw = sas7bdat.SAS7BDAT(f)
                raw.convert_file(fout)
            except:
                try:
                    traceback.print_last()
                except:
                    print('Exception trying to print exception: %s' % f)

def aggregate_data(sourcefile, targetfile):
    reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile),
                               skip_header=True)
    # map column names to column indices
    COL2IDX = {
        column.name.decode('utf-8'): column.col_id
        for column in reader.columns
    }

    def get_field(row, column):
        return row[COL2IDX[column]]

    def row2flu(row):
        for i in range(1, 9):
            dx = get_field(row, "dx{}".format(i))
            flu_cat = get_flu_cat(dx)
            if flu_cat != None:
                return flu_cat
        return 0

    def row2epiweek(row):
        date = get_field(row, 'd_event')
        year, month, day = date.year, date.month, date.day
        week_tuple = epi.Week.fromdate(year, month, day).weektuple()
        year, week_num = week_tuple[0], week_tuple[1]
        return year, week_num

    results_dict = dict()
    for r, row in enumerate(reader):
        # if (r >= 1000000): break
        if get_field(row, 'type') != "Outpt":
            continue
        year, week_num = row2epiweek(row)
        dmisid = get_field(row, 'DMISID')
        flu_cat = row2flu(row)
        key_list = [year, week_num, dmisid, flu_cat]
        curr_dict = results_dict
        for i, key in enumerate(key_list):
            if i == len(key_list) - 1:
                if key not in curr_dict:
                    curr_dict[key] = 0
                curr_dict[key] += 1
            else:
                if key not in curr_dict:
                    curr_dict[key] = dict()
                curr_dict = curr_dict[key]

    results_path = os.path.join(TARGET_DIR, targetfile)
    with open(results_path, 'wb') as f:
        pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL)
    return

def reload(self):
    import sas7bdat
    self.dat = sas7bdat.SAS7BDAT(self.source.resolve(),
                                 skip_header=True,
                                 log_level=logging.CRITICAL)
    self.columns = []
    for col in self.dat.columns:
        self.addColumn(
            ColumnItem(col.name.decode('utf-8'), col.col_id,
                       type=SASTypes.get(col.type, anytype)))
    with self.dat as fp:
        self.rows = []
        for row in Progress(fp, total=self.dat.properties.row_count):
            self.rows.append(row)

def test_sas7bdat(self):
    try:
        import sas7bdat
    except ImportError:
        tm.TestCase.skipTest(self, 'sas7bdat package is not available')
    import swat.tests as st
    myFile = os.path.join(os.path.dirname(st.__file__),
                          'datasources', 'cars.sas7bdat')
    dmh = swat.datamsghandlers.SAS7BDAT(myFile)
    tbl = self.s.addtable(table='cars', **dmh.args.addtable).casTable
    f = tbl.to_frame()
    s = sas7bdat.SAS7BDAT(myFile).to_data_frame()
    self.assertTablesEqual(f, s, sortby=SORT_KEYS)

def convert_files(files: List[str]) -> None:
    """Create a csv file for each matching sas7bdat file."""
    matching = [f for f in files if f.endswith('.sas7bdat')]
    num_files = len(matching)
    if num_files != len(files):
        print(
            f'Ignoring {len(files) - num_files:d}/{len(files):d} files that were not '
            '.sas7bdat files.')
    print(f'Attempting to convert {num_files:d} files')
    for i, filename in enumerate(sorted(matching)):
        csv_filename = filename.rsplit('.sas7bdat', 1)[0] + '.csv'
        print(f'{i + 1:d}/{num_files:d}: Converting {filename} to {csv_filename}')
        if os.path.exists(csv_filename):
            print('Skipping, resulting csv already exists')
            continue
        with sas7bdat.SAS7BDAT(filename) as infile:
            infile.convert_file(csv_filename)

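A brief usage sketch for convert_files; the exports/ directory below is a hypothetical placeholder:

import glob

# Placeholder directory; convert_files itself reports and skips non-.sas7bdat entries.
convert_files(glob.glob('exports/*'))
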
def flatten_iterator(files_pattern: str) -> Iterator[Dict[str, str]]:
    """Iterate over all FHS files as if they were one big file.

    It iterates through the files and yields each record separately. It adds an
    extra field at the end, '__file__', which contains the filename from which
    the record was extracted.

    Args:
        files_pattern: a glob pattern for the files to flatten. They should all
            have the same schema. Must end with .csv or .sas7bdat.

    Yields:
        each record as a dict using the headers as keys.
    """
    files = glob.glob(files_pattern)
    if not files:
        raise ValueError(f'No files found matching {files_pattern}')
    headers = None
    print(f'Flattening {len(files):d} files')
    for current_file in sorted(files):
        reader = None
        if files_pattern.endswith('sas7bdat'):
            reader = sas7bdat.SAS7BDAT(current_file).readlines()
        elif files_pattern.endswith('csv'):
            reader = csv.reader(open(current_file))
        else:
            raise ValueError(
                f'Can only process .csv and .sas7bdat files. Got pattern {files_pattern}')
        header_line = next(reader)
        if headers is None:
            headers = header_line + ['__file__']
        elif headers[:-1] != header_line:  # pylint: disable=unsubscriptable-object
            raise ValueError(
                f"Headers from file {current_file} don't match those of previous "
                f'files. Was expecting:\n{headers[:-1]}\n'
                f' got:\n{header_line}')
        for line in reader:
            yield dict(zip(headers, line + [current_file]))

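A hedged usage sketch for flatten_iterator; the glob pattern is a placeholder, not a real FHS path:

# Placeholder pattern; all matched files must share the same schema.
for record in flatten_iterator('fhs/*.sas7bdat'):
    # Each record is a dict keyed by the header row, plus the source filename under '__file__'.
    print(record['__file__'], len(record))
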
def read_sas(filePath: 'string') -> 'dataFrame':
    pieces = []
    with sas.SAS7BDAT(filePath) as f:
        pieces.append(f.to_data_frame())
    data = pd.concat(pieces)
    return data

def __init__(self, path, nrecs=1000, transformers=None, **kwargs):
    import sas7bdat
    super(SAS7BDAT, self).__init__(
        sas7bdat.SAS7BDAT(path, **kwargs).to_data_frame(),
        nrecs=nrecs, transformers=transformers)

def main(args):
    filelist = [
        x for x in os.listdir(args.inputdir)
        if os.path.isfile(args.inputdir + x) and ".zip" in x
    ]
    filelist = group_by_filename(filelist)

    for grp in filelist:
        # skip these data sets as they require special handling
        if grp in ["AllClinical", "AccelData"]:
            continue

        # create tmp directory
        tmp_dir = "%s%s/" % (TMP_ROOT, grp)
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        bdatfmt = {}
        var_map, var_fmt, var_labels = {}, {}, {}

        for i, zipfname in enumerate(filelist[grp]):
            filename = "%s%s" % (args.inputdir, zipfname)
            zf = zipfile.ZipFile(filename, 'r')
            manifest = sorted(zf.namelist())
            sasfiles = [x for x in manifest if "sas7bdat" in x]
            if len(sasfiles) > 1:
                continue

            # dump SAS to a temporary file
            data = zf.read(sasfiles[0])
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, i)
            with open(tmpfile, "wb") as tmp:
                tmp.write(data)
            d = sas7bdat.SAS7BDAT(tmpfile)

            # SAS header format
            var_ids = [(col.name, col.format, col.label)
                       for col in d.header.parent.columns]
            for var, dtype, label in var_ids:
                key = norm_col_name(var) if var not in ["ID", "VERSION"] else var.lower()
                if key not in var_map:
                    var_map[key] = {}
                    var_fmt[key] = {}
                    var_labels[key] = []
                var_map[key][var] = 1
                var_fmt[key][dtype] = 1
                var_labels[key] += [label]

            # types actually created by sas7bdat
            header = [norm_col_name(col.name) for col in d.header.parent.columns]
            for idx, row in enumerate(d):
                if idx == 0:
                    continue
                for j in range(0, len(row)):
                    t = type(row[j])
                    if header[j] not in bdatfmt:
                        bdatfmt[header[j]] = {}
                    bdatfmt[header[j]][t] = bdatfmt[header[j]].get(t, 0) + 1

        # normalize labels
        for var in var_labels:
            var_labels[var] = norm_col_label(var_labels[var])
        var_fmt["id"] = var_fmt["version"] = {"$": 1}

        # assign majority type as column data type
        for var in bdatfmt:
            types = sorted(bdatfmt[var].items(), key=operator.itemgetter(1), reverse=1)
            bdatfmt[var] = types[0][0]

        # confirm consistent data types across all fields
        # use this as a data type in SQL schema
        sql_types = {}
        for var in var_fmt:
            if var not in bdatfmt:
                sys.stderr.write("FATAL ERROR -- exiting")
                sys.stderr.write("%s %s" % (grp, var))
                sys.exit()
            sql_types[var] = "TEXT" if bdatfmt[var] in [unicode, str] else "NUMERIC"
            if "MMDDYY" in var_fmt[var]:
                sql_types[var] = "DATE"

        # manually add visit column to tables containing multiple visits
        if grp not in ["Outcomes"]:
            sql_types["vid"] = "INTEGER"

        pkeys = primary_key_defs[grp] if grp in primary_key_defs else []
        schema = create_table_schema(grp, sql_types, var_labels, pkeys)
        print schema
        print

        if grp == "Enrollees":
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, 0)
            data = sas7bdat.SAS7BDAT(tmpfile)
            enrollees_sql_insert(grp, data, sql_types)
            continue

        # only 1 file (don't use VID)
        if len(filelist[grp]) == 1:
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, 0)
            data = sas7bdat.SAS7BDAT(tmpfile)
            sql_insert(grp, data, sql_types, vid=None)
            continue

        for i in range(0, len(filelist[grp])):
            tmpfile = "%s%s.sas7bdat" % (tmp_dir, i)
            data = sas7bdat.SAS7BDAT(tmpfile)
            sql_insert(grp, data, sql_types, vid=i)

                     columns=['col3', 'col4'], index=[0, 1])
df1.join(df2, how='inner')  # inner join (also outer, left, right); joined directly on the index

# horizontal merge
df1 = pandas.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
df2 = pandas.DataFrame({'col2': [4, 5, 7], 'col3': [1, 2, 2]})
print(df1.merge(df2))  # inner join on the shared column; rows whose col2 values don't match are dropped
print(df1.merge(df2, how='left', left_on='col1', right_on='col3'))  # specify the join keys and merge horizontally

""" Data input and output """
# Interacting with SAS data files: pandas.read_sas garbles gb2312-encoded Chinese,
# so use the sas7bdat package here and convert to a DataFrame first
sasdata1 = pandas.read_sas('D:\\06-JerryTech\\pf_data1.sas7bdat')
sasdata2 = sas7bdat.SAS7BDAT('D:\\06-JerryTech\\pf_data1.sas7bdat', encoding="gb2312").to_data_frame()
writer = pandas.ExcelWriter('D:\\06-JerryTech\\pf_data11.xlsx')
sasdata2.to_excel(writer)
writer.save()

""" Other common, useful features """
# Pivot table. Parameters: index=rows, columns=columns, values=the column to aggregate,
# aggfunc=aggregation function, margins adds totals, normalize='index' computes proportions
df = pandas.DataFrame([[1, 11, 3], [4, 17, 6], [7, 14, 9],
                       [7, 11, 12], [4, 14, 15], [1, 17, 18]],
                      columns=["A", "B", "C"])
g = pandas.crosstab(index=df.A, columns=df.B, values=df.C,
                    aggfunc=numpy.mean, margins=True,

def readsas(file_path):
    pieces = []
    with sas.SAS7BDAT(file_path) as f:
        pieces.append(f.to_data_frame())
    data = pd.concat(pieces)
    return data

def __init__(self, fname):
    self.infile = sas.SAS7BDAT(fname)
    self.lines = self.infile.readlines()
    # For now, we just throw this out
    header = next(self.lines)  # noqa

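Most of the snippets above boil down to the same core pattern; a minimal sketch, assuming the sas7bdat and pandas packages and a placeholder example.sas7bdat file:

import pandas as pd
import sas7bdat

# 'example.sas7bdat' is a placeholder path; to_data_frame() loads the whole file into pandas.
with sas7bdat.SAS7BDAT('example.sas7bdat') as f:
    df = f.to_data_frame()
print(df.head())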