def read(self):
    """
    Read the SAS dataset and return as a Pandas DataFrame or an iterator to read the file in chunks.
    """

    self.reader = None
    retry = False

    # If encoding is not specified, we try some common codecs
    if self.encoding is None:
        # Try reading with each of the default codecs
        for cp in self.default_encoding:
            try:
                self.reader = pd.read_sas(self.filepath, encoding=cp, **self.read_sas_kwargs)
                self.encoding = cp
                break
            except UnicodeDecodeError:
                continue
            except (OverflowError, ValueError) as e:
                # Pandas has some known limitations when reading SAS files
                # https://github.com/pandas-dev/pandas/issues/20927
                # https://github.com/pandas-dev/pandas/issues/16615
                self._print_exception("Exception when reading the file with pandas. "
                                      "A second attempt will be made using the SAS7BDAT module", e)
                retry = True
                break

    # If pandas failed to read the file, we retry with the SAS7BDAT module
    if retry:
        for cp in self.default_encoding:
            handle = None
            try:
                handle = SAS7BDAT(self.filepath, skip_header=False, encoding=cp, encoding_errors="strict")
                self.reader = handle.to_data_frame()
                self.encoding = cp
                break
            except UnicodeDecodeError:
                continue
            finally:
                # Close the handle whether or not decoding succeeded
                if handle is not None:
                    handle.close()

    # Instantiate the reader if we haven't already done so
    if self.reader is None:
        try:
            self.reader = pd.read_sas(self.filepath, **self.read_sas_kwargs)
        except (OverflowError, ValueError) as e:
            self._print_exception("Exception when reading the file with pandas. "
                                  "A second attempt will be made using the SAS7BDAT module", e)

            # Fall back to UTF-8 if no encoding has been specified
            cp = self.encoding if self.encoding is not None else 'utf_8'

            # Retry with the SAS7BDAT module
            handle = SAS7BDAT(self.filepath, skip_header=False, encoding=cp, encoding_errors="ignore")
            self.reader = handle.to_data_frame()
            handle.close()

    # Send metadata on the result to Qlik
    self._send_table_description()

    # Read the SAS dataset
    return self.reader
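# A minimal standalone sketch of the same pandas-first, sas7bdat-fallback
# pattern used by read() above. The file path and the codec list are
# assumptions, not part of the original class.
import pandas as pd
from sas7bdat import SAS7BDAT

def read_sas_with_fallback(path, codecs=('utf_8', 'latin_1', 'cp1252')):
    for cp in codecs:
        try:
            return pd.read_sas(path, encoding=cp)
        except UnicodeDecodeError:
            continue
        except (OverflowError, ValueError):
            break  # known pandas limitations; fall back to sas7bdat
    handle = SAS7BDAT(path, skip_header=False, encoding_errors='ignore')
    try:
        return handle.to_data_frame()
    finally:
        handle.close()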
def sas2sqlite(sasfile, sqlitedb):
    import sqlite3
    from sas7bdat import SAS7BDAT

    # Read data from SAS
    f = SAS7BDAT(sasfile)
    x = f.header.cols
    y = [''] * len(x)
    for i, n in enumerate(x):
        if n[1][2] == "numeric":
            y[i] = n[0] + ' real'
        else:
            y[i] = n[0] + ' varchar({})'.format(n[1][1])
    _table = f.header.dataset.title()
    cmd1 = "CREATE TABLE {} ({})".format(_table, ', '.join(y))
    cmd2 = 'INSERT INTO {} VALUES ( {} )'.format(_table, ','.join(['?'] * len(x)))

    conn = sqlite3.connect(sqlitedb)
    c = conn.cursor()
    # The first row yielded by readData() is the header; the rest are data rows
    for i, line in enumerate(f.readData()):
        if i == 0:
            c.execute('DROP TABLE IF EXISTS {}'.format(_table))
            c.execute(cmd1)
        else:
            c.execute(cmd2, line)
    conn.commit()
    c.close()
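# Hypothetical usage of sas2sqlite() above. The file names are assumptions,
# and the table name is derived from the SAS dataset header at runtime, so
# 'Sales' is also an assumption here.
import sqlite3

sas2sqlite('sales.sas7bdat', 'sales.db')
conn = sqlite3.connect('sales.db')
for row in conn.execute('SELECT * FROM Sales LIMIT 5'):
    print(row)
conn.close()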
def read_sas7bdata_pd(fname):
    data = []
    with SAS7BDAT(fname) as f:
        for row in f:
            data.append(row)
    # The first row is the header; the rest are data rows
    return pd.DataFrame(data[1:], columns=data[0])
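# For comparison, pandas can usually build the same DataFrame in one call;
# a sketch, with the file name and encoding assumed:
import pandas as pd

df = pd.read_sas('example.sas7bdat', format='sas7bdat', encoding='latin-1')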
def _read_sas(filename):
    filename = os.path.join(_WS[0], filename)
    with SAS7BDAT(filename) as f:
        reader = f.readlines()
        # The first line yielded is the column header
        header = [c.strip() for c in next(reader)]
        for line in reader:
            yield {k: v for k, v in zip(header, line)}
def main():
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--sasFile", action="store", dest="sasFile", help="Input sas file")
    parser.add_option("-o", "--csvFile", action="store", dest="csvFile", help="Output csv file")
    (options, args) = parser.parse_args()

    for file in [options.sasFile, options.csvFile]:
        if not file:
            parser.print_help()
            sys.exit(0)

    sasFile = options.sasFile
    outFile = options.csvFile
    FW = csv.writer(open(outFile, 'wb'))
    with SAS7BDAT(sasFile) as f:
        for row in f:
            FW.writerow(row)
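# Hypothetical invocation of the converter above (the script name is assumed):
#   python sas2csv.py -i input.sas7bdat -o output.csv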
def ex_11():
    """
    Importing SAS files

    In this exercise, you'll figure out how to import a SAS file as a
    DataFrame using SAS7BDAT and pandas. The file 'sales.sas7bdat' is already
    in your working directory and both pandas and matplotlib.pyplot have
    already been imported as follows:

    import pandas as pd
    import matplotlib.pyplot as plt

    The data are adapted from the website of the undergraduate textbook
    Principles of Economics by Hill, Griffiths and Lim.
    :return:
    """
    # from sas7bdat import SAS7BDAT -> at the top with the imports

    # Save file to a DataFrame: df_sas
    with SAS7BDAT('sales.sas7bdat') as file:
        df_sas = file.to_data_frame()

    # Print head of DataFrame
    print(df_sas.head())

    # Plot histogram of DataFrame features (pandas and pyplot already imported)
    pd.DataFrame.hist(df_sas[['P']])
    plt.ylabel('count')
    plt.show()
def region_iterator(base_path, file_name):
    """Iterate over all region folders.

    The data we received is split over 26 folders for the individual regions.
    This iterator helps to load a specific data file for each region and to
    extract the region ID from the path.

    base_path -- folder containing the folders for individual regions
    file_name -- which of the 12 files per folder should be loaded
    returns -- iterator of (region_id, DataFrame) tuples
    """
    input_path = os.path.join(base_path, '**', file_name)
    for f_name in glob.glob(input_path):
        region_id = re.search(r'/Reg(\d+)/', f_name).group(1)
        _LOGGER.info('>>> importing data for region %s', region_id)
        _LOGGER.info('reading from SAS file')
        if file_name.endswith('.sas7bdat'):
            data_frame = SAS7BDAT(f_name).to_data_frame()
        elif file_name.endswith('.csv'):
            # pd.read_csv replaces the deprecated pd.DataFrame.from_csv
            data_frame = pd.read_csv(f_name, index_col=0, parse_dates=True)
        else:
            raise ValueError(
                'Expected sas7bdat or csv files only, got {}'.format(file_name))
        yield (region_id, data_frame)
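# A sketch of consuming region_iterator(); the base path and the per-region
# file name are assumptions.
frames = {}
for region_id, df in region_iterator('/data/regions', 'claims.sas7bdat'):
    frames[region_id] = df
print(sorted(frames))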
def createNewPay(self, filename):
    print(" start " + str(datetime.datetime.now()))
    result_file = open(filename, 'wt')
    for source in glob.glob(self.settings['pay_old']['location'] + self.settings['pay_old']['pattern']):
        print("processing " + source)
        i = source.index('payee_pmt_line_')
        year = int(source[i + 15:i + 19])
        c = 0
        count = 0
        with SAS7BDAT(source, skip_header=True) as reader:
            for row in reader:
                row.append(year)
                result_file.write(", ".join(map(str, row)) + '\n')
                c += 1
                if c >= int(self.settings['pay_old']['blocksize']):
                    print(" " + str(count) + " " + str(datetime.datetime.now()))
                    count += 1
                    blocks = int(self.settings['pay_old']['blocks'])
                    if blocks > 0 and count > blocks:
                        break
                    c = 0
        print(" " + str(count) + " " + str(datetime.datetime.now()))
    print(" end " + str(datetime.datetime.now()))
    result_file.close()
def main():
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--sasFile", action="store", dest="sasFile", help="Input sas file")
    (options, args) = parser.parse_args()

    for file in [options.sasFile]:
        if not file:
            parser.print_help()
            sys.exit(0)

    sasFile = options.sasFile
    with SAS7BDAT(sasFile) as f:
        for row in f:
            line = str(row[0])
            for i in range(1, len(row)):
                row[i] = str(row[i])
                row[i] = row[i].replace(",", ";")
                line += ", %s" % row[i]
            print(line)
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
    if dai_file_path is None:
        return []

    # Check the datatype of user-defined variables
    if not isinstance(dai_file_path, str):
        raise ValueError("Variable: 'dai_file_path' should be <str>")
    if not isinstance(file_names, list):
        raise ValueError("Variable: 'file_names' should be <list>")

    # TODO: add checks that files exist
    data_sets = {}
    for f in file_names:
        full_path_to_file = os.path.join(dai_file_path, f)
        if not os.path.exists(full_path_to_file):
            raise ValueError("File <<" + full_path_to_file + ">> does not exist!")
        with SAS7BDAT(full_path_to_file, skip_header=False) as reader:
            X = reader.to_data_frame()
            print(X.head())
            data_sets.update({f: X})
    return data_sets
def conditionsymptoms(sclst, condlst, filepath):
    with SAS7BDAT(filepath) as f:
        intrv = f.to_data_frame()
    symcond = intrv[sclst].copy()
    symcond['C3Q03'] = symcond['C3Q03'].apply(lambda x: (-1) if x in [6, 7] else x)
    symcond['C3Q03'] = symcond['C3Q03'].apply(lambda x: 3 if np.isnan(x) else x)
    symcond.fillna(0, inplace=True)
    conditions = symcond[condlst]
    conditions.columns = [
        'IDNUMXR', 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18
    ]
    condition = pd.melt(conditions, id_vars='IDNUMXR')
    condition = condition[condition.value != 0]
    condlst.remove('IDNUMXR')
    symcond.drop(condlst, axis=1, inplace=True)
    return symcond, condition
def read_data(self):
    """Use the sas7bdat package to read the meta information from every
    sas file under the specified directory
    """
    data = OrderedDict()
    for x in self.sas_files:
        current_sas_name = x.replace('.sas7bdat', '')
        current_file = os.path.join(self.directory, x)
        current = SAS7BDAT(current_file)
        statinfo = os.stat(current_file)

        meta = [['Position', 'Name', 'Type', 'Length', 'Format', 'Label']]
        for i, col in enumerate(current.header.parent.columns, 1):
            _metaline = [i] + list(map(self.to_unicode,
                                       [col.name, col.type, col.length, col.format, col.label]))
            meta.append(_metaline)

        _detail = current.header.properties.__dict__
        blacklist = ['col_count_p1', 'col_count_p2', 'lcp', 'lcs',
                     'filename', 'endianess', 'u64']
        detail = OrderedDict()
        for k in sorted(_detail):
            if k in blacklist:
                continue
            v = _detail[k]
            if isinstance(v, str):
                v = v.lower()
            detail[k.upper()] = v
        detail['FILE_SIZE'] = self.convert_size(statinfo.st_size)

        data[current_sas_name] = {'meta': meta, 'detail': detail}
        current.close()
    return data
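# A sketch of rendering the 'meta' rows collected above as aligned text;
# here 'data' stands for the OrderedDict returned by read_data().
for name, info in data.items():
    print(name)
    for line in info['meta']:
        print('  ' + ' | '.join(str(v) for v in line))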
def loadSAS(self):
    try:
        f = SAS7BDAT(str(self.filename))
        df = f.to_data_frame()
    except Exception:
        print("Unexpected error occurred")
        return
    l = list(df.columns)
    print(l)
    head = self.tableWidget_3.horizontalHeader()
    head.setStretchLastSection(True)
    nrow = len(df.index)
    if nrow > 100:
        nrow = 100
    #self.datatable = QtGui.QTableWidget(parent=self)
    self.tableWidget_3.setColumnCount(len(df.columns))
    self.tableWidget_3.setRowCount(nrow)
    for i in range(nrow):
        for j in range(len(df.columns)):
            self.tableWidget_3.setItem(
                i, j, QtGui.QTableWidgetItem(str(df.iat[i, j])))
    self.tableWidget_3.setHorizontalHeaderLabels(l)
    self.headerName = l
    self.nonSelectedVariables = self.headerName
    self.data = df
    st = str(nrow) + " of " + str(len(df.index)) + " rows have been shown"
    self.label.setText(st)
    self.label.setVisible(True)
    self.initDict()
    self.initComboBox()
def main(options, args):
    logLevel = logging.DEBUG if options.debug else logging.INFO
    input_paths = [args[0]]
    if len(args) == 1:
        output_paths = ['%s.csv' % os.path.splitext(args[0])[0]]
    elif len(args) == 2 and (args[1] == '-' or args[1].lower().endswith('.csv')):
        output_paths = [args[1]]
    else:
        assert all(x.lower().endswith('.sas7bdat') for x in args)
        input_paths = args
        output_paths = ['%s.csv' % os.path.splitext(x)[0] for x in input_paths]
    assert len(input_paths) == len(output_paths)
    for input_path, output_path in zip(input_paths, output_paths):
        parser = SAS7BDAT(input_path, logLevel=logLevel)
        if options.header:
            parser.logger.info(str(parser.header))
            continue
        rows_iter = parser.readData()
        # The first row yielded by readData() is the column header
        columns = next(rows_iter)
        data_frame = pd.DataFrame(
            (row for row in rows_iter if row),
            columns=columns,
        )
        print(data_frame)
def SAS2DataFrame(inputFullPath, year=None):
    with SAS7BDAT(inputFullPath, skip_header=False) as reader:
        df = reader.to_data_frame()
    df.columns = [col.lower() for col in df.columns]
    if year and 'wave' in df.columns:
        df = df[df['wave'] == year]
    return df
def main():
    with SAS7BDAT(IN_FILE) as f:
        raw = [row for row in f][1:]

    # Drop observations from the beginning years where the returns are empty
    data = []
    country = ""
    start = False
    for obs in raw:
        if obs[0] != country:
            start = False
            country = obs[0]
        if obs[-1] is not None:
            start = True
        if start:
            data.append(obs)

    # If ret[i] is empty, set ret[i] = avg(ret[:i-1]) for the same country
    for i in range(len(data)):
        obs = data[i]
        if obs[-1] is None:
            pre = [x[-1] for x in data[:i] if x[0] == obs[0]]
            obs[-1] = sum(pre) / len(pre)

    # Output file
    with open(OUT_FILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['country', 'year', 'ret', 'ret2'])
        for obs in data:
            writer.writerow(obs)
def get_list2():
    with open("tickers2.txt", 'w', encoding='utf8') as wf:
        with SAS7BDAT("data_fin_merged.sas7bdat") as rf:
            l = [row[1] for row in rf]
        print(l[:5])
        tickers = set(l[1:])
        for t in tickers:
            wf.write(t + '\n')
def read_from_sas(filename):
    """
    Return a dataframe from the given SAS table.
    https://pypi.python.org/pypi/sas7bdat
    """
    with SAS7BDAT(filename + ".sas7bdat") as f:
        df = f.to_data_frame()
    return df
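# Hypothetical call; note that read_from_sas() appends the '.sas7bdat'
# extension itself, so the argument is the path without it.
df = read_from_sas('data/sales')
print(df.shape)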
def merge_qc(rtdf, cog_file, qcVars):
    """
    Merge Reaction Time data with metadata from the core dataset. This
    includes rater Z score, computer, complete, and time administered.
    """
    with SAS7BDAT(cog_file) as f:
        cogdf = f.to_data_frame()
    rtdf.index.names = ['vetsaid', 'TrialType']
    rt_qc = rtdf.join(cogdf[qcVars].set_index('vetsaid'), how='left')
    return rt_qc
def read_next(self):
    with SAS7BDAT(self.fp) as fh:
        columns = {}
        for i, line in enumerate(fh):
            if i == 0:
                # The first row holds the column names; map each name to its index
                columns = {name: idx for idx, name in enumerate(line)}
                continue
            yield TextItem(
                meta_list=[line[columns[m]] for m in self.meta],
                text=[line[columns[t]] for t in self.text]
            )
def _send_table_description(self, func=None):
    """
    Send the table description to Qlik as metadata.
    Only used when the SSE is called from the Qlik load script.
    """

    # Set up the table description to send as metadata to Qlik
    self.table = SSE.TableDescription()

    if func is None:
        self.table.name = "SAS_Dataset"

        if isinstance(self.reader, pd.DataFrame):
            self.sample_data = self.reader.head(5)
        else:
            # Read the SAS file to get sample data
            sample_response = pd.read_sas(self.filepath, format=self.format, encoding=self.encoding, chunksize=5)
            # Get the first chunk of data as a Pandas DataFrame
            self.sample_data = next(sample_response)
            # Close the file reader
            sample_response.close()

        # Fetch field labels from SAS variable attributes if required.
        # This may fail for wide tables due to metadata limits. For such cases use the get_labels function.
        if self.labels:
            # Use the sas7bdat library to read the file
            handle = SAS7BDAT(self.filepath, skip_header=False)
            # Get labels for the variables
            labels = [col.label.decode(self.encoding) for col in handle.columns]
            # Close the file reader
            handle.close()
        else:
            # Get the variable names from the sample data
            labels = self.sample_data.columns

        # Set field names
        for col in labels:
            # Set up fields for the table
            self.table.fields.add(name=col)

        if self.debug:
            self._print_log(2)
    elif func == "get_labels":
        self.table.name = "SAS_Labels"
        self.table.fields.add(name="variable")
        self.table.fields.add(name="label")

        if self.debug:
            self._print_log(4)

    # Send table description
    table_header = (('qlik-tabledescription-bin', self.table.SerializeToString()),)
    self.context.send_initial_metadata(table_header)
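# A minimal sketch of the chunked read used above to pull sample rows without
# loading the whole dataset; the file path and chunk size are assumptions.
import pandas as pd

reader = pd.read_sas('example.sas7bdat', chunksize=5)
sample = next(reader)
reader.close()
print(sample.columns.tolist())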
def load_sas(fn, schema):
    '''Load a SAS file according to the given schema.

    Parameters:
    fn : str
        Path to the SAS file.
    schema : DataFrame
        Data frame containing the schema of the study.

    Returns:
    df_sub : DataFrame
        The data available in the SAS file, subset to the columns listed
        in the schema.
    '''
    with SAS7BDAT(fn) as f:
        df = f.to_data_frame()

    if 'id' in df.columns:
        df['id'] = df['id'].astype(int)
        df['centre'] = np.asarray(df['id']) // 1000
        df = df.set_index(['centre', 'id'])
    elif 'centre' in df.columns:
        print('WARNING: no id column found in df {}'.format(fn))
        df['id'] = -9
        df['centre'] = df['centre'].astype(int)
        df = df.set_index(['centre', 'id'])
    else:
        print('WARNING: no id or centre column found in df {}'.format(fn))
        df['id'] = -9
        df['centre'] = -9
        df = df.set_index(['centre', 'id'])

    schema_cols = list(schema.index)
    cols = [c for c in df.columns if c in schema_cols]
    dtypes = schema.loc[cols].groupby('Name').first()['Type']
    df_sub = df[cols].copy()

    for col in dtypes.index:
        if dtypes[col] == 'int':
            try:
                df_sub[col] = df_sub[col].astype(pd.Int64Dtype())
            except Exception:
                msg = 'error converting int column {}, leaving as float'
                print(msg.format(col))
                try:
                    df_sub[col] = df_sub[col].astype(float)
                except Exception:
                    print('not a valid float either! dropping {}'.format(col))
                    df_sub = df_sub.drop(col, axis=1)
        elif dtypes[col] == 'choice' or dtypes[col] == 'check':
            try:
                df_sub[col] = pd.Categorical(df_sub[col].astype(pd.Int64Dtype()))
            except Exception:
                msg = 'error converting categorical column {}, dropping it'
                print(msg.format(col))
                df_sub = df_sub.drop(col, axis=1)
        elif dtypes[col] == 'string' or dtypes[col] == 'date':
            df_sub = df_sub.drop(col, axis=1)

    return df_sub
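# Hypothetical call to load_sas(). The schema layout mirrors what the function
# expects: variable names as an index named 'Name' plus a 'Type' column; the
# file name and variables are assumptions.
import pandas as pd

schema = pd.DataFrame(
    {'Type': ['int', 'choice']},
    index=pd.Index(['age', 'sex'], name='Name')
)
df = load_sas('baseline.sas7bdat', schema)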
def sas_to_csv(dirt, dataset):
    print("\n\nReading data from", dirt + "sasdat/" + dataset + ".sas7bdat")
    with SAS7BDAT(dirt + "sasdat/" + dataset + ".sas7bdat") as f:
        df = f.to_data_frame()
    print("\n\nData description:\n\n", df.describe())
    cols = df.columns
    df.to_csv(dirt + dataset + 'd.csv', encoding='utf-8', index=False, header=True)
    print("\n\nCheck column\n\n", cols)
    return df
def sas_to_csv(self):
    print("\n\nReading data from", self.path + self.taskname + '/' + self.dataname)
    with SAS7BDAT(self.path + self.taskname + '/' + self.dataname + '.sas7bdat') as f:
        df = f.to_data_frame()
    print("\n\nData description:\n\n", df.describe())
    return df
def importing_sas_files():
    with SAS7BDAT('data/sales.sas7bdat') as file:
        df_sas = file.to_data_frame()
    print(df_sas.head())
    print(df_sas.info())
    pd.DataFrame.hist(df_sas[['P']])
    plt.ylabel('count')
    plt.show()
def merge_qc(axcptdf, cog_file, qcVars):
    """
    Merge AX-CPT data with metadata from the core dataset. This includes
    rater Z score, computer, version, complete, and time administered.
    """
    with SAS7BDAT(cog_file) as f:
        cogdf = f.to_data_frame()
    axcpt_qc = pd.merge(axcptdf, cogdf[qcVars], left_index=True,
                        right_on='vetsaid', how='left')
    return axcpt_qc
def sas2pd(sasfile):
    import pandas as pd
    from sas7bdat import SAS7BDAT

    a = []
    for i, x in enumerate(SAS7BDAT(sasfile).readData()):
        if i == 0:
            cols = x
        else:
            a.append(x)
    df = pd.DataFrame(a)
    df.columns = cols
    return df
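# An equivalent result using sas7bdat's built-in DataFrame conversion; the
# context manager also guarantees the file handle is closed (file name assumed):
from sas7bdat import SAS7BDAT

with SAS7BDAT('example.sas7bdat') as f:
    df = f.to_data_frame()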
def sasConvert(bucket, key):
    print('--Loading function')
    try:
        print('--Acquiring Current Time')
        t = datetime.now()
        formatted_time = t.strftime('%d-%m-%y%H%M%S')
        print('--Formatted Time: ' + formatted_time)

        if key[-9:] == '.sas7bdat':
            print('--Generating File Names')
            keySubExt = key[:-9]
            keyName = keySubExt.rsplit('/', 1)
            tmpFile = 'sastocsv/tmp/' + keyName[1] + '-' + formatted_time + '.sas7bdat'
            tmpOutFile = 'sastocsv/output/' + keyName[1] + '-' + formatted_time + '.csv'
            s3OutName = keyName[0] + '/' + 'output/' + keyName[1] + '-' + 'convertedSAS' + '-' + formatted_time + '.csv'

            # Print filenames
            print('*keySubExt: ' + keySubExt)
            print('*keyName: ' + keyName[0] + ' 2: ' + keyName[1])
            print('*tmpFile: ' + tmpFile)
            print('*tmpOutFile: ' + tmpOutFile)
            print('*s3OutName: ' + s3OutName)

            print('--Downloading SAS7BDAT File')
            s3.meta.client.download_file(bucket, key, tmpFile)

            print('--Reading SAS7BDAT File to DataFrame')
            with SAS7BDAT(tmpFile) as f:
                df = f.to_data_frame()

            print('--Converting to CSV File')
            df.to_csv(tmpOutFile, index=False)

            print('--Uploading to S3 Bucket')
            s3.meta.client.upload_file(tmpOutFile, bucket, s3OutName)
            os.remove(tmpFile)
            os.remove(tmpOutFile)
            return s3OutName
        else:
            print('Invalid File Type')
            return 'Invalid File Type'
    except Exception as e:
        print(e)
        print('Error converting file to CSV')
        raise
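# Hypothetical Lambda-style entry point wiring for sasConvert(); the event
# shape follows the standard S3 put-notification format.
def lambda_handler(event, context):
    record = event['Records'][0]['s3']
    return sasConvert(record['bucket']['name'], record['object']['key'])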
def sas2meta(self, encode):
    '''Read variable names and labels from the SAS dataset.

    encode -- codec to use when decoding the header, e.g. 'latin-1'
    localpath -- path to the folder where the dataset is stored
    '''
    if self.isS3:
        # Bring the SAS file to local storage under the data folder
        self.load_sas_from_s3()
        sas = SAS7BDAT(os.path.join('data', self.dataset), encoding=encode)
    else:
        sas = SAS7BDAT(os.path.join(self.localpath, self.dataset), encoding=encode)

    varlist = list(map(lambda i: i.name.decode("utf-8"), sas.columns))
    variablelabels = list(map(lambda i: i.label.decode("utf-8"), sas.columns))

    df = pd.DataFrame({
        'ID': list(map(lambda i: i.name.decode("utf-8").upper(), sas.columns)),
        'text': list(map(lambda i: i.label.decode("utf-8"), sas.columns))
    })
    df['text'] = df["ID"] + ' ' + df["text"]
    df['text'] = df['text'].str.replace('_', ' ')
    return df
def read(self):
    logging.info('Getting Terms and Negation.')
    res = []
    with SAS7BDAT(self.fp) as fh:
        # Map decoded column names to their positions
        columns = {name: i for i, name in enumerate(fh.column_names_strings)}
        for row in fh:
            res.append([
                row[columns['id']],
                row[columns['text']],
                row[columns['cui']],
                row[columns['valence']],
                row[columns['regexvariation']],
                row[columns['wordorder']],
            ])
    return res