def denormalize(zip_name):
    """
    Denormalize diary and hhold NFS data.

    We do this so we don't have to do this join every time. It's not that
    expensive to store. The merged table is written to CSV and, when
    ``has_feather`` is truthy, to feather format as well.

    Parameters
    ----------
    zip_name : str
        Path of the zip archive. It is extracted into a directory named
        after the archive without its extension, unless that directory
        already exists.
    """
    path, _ = os.path.splitext(zip_name)
    if not os.path.exists(path):
        # The context manager closes the archive handle even when the
        # extraction fails part-way.
        with zipfile.ZipFile(zip_name) as zip_file:
            zip_file.extractall(path=path)

    fnames = glob.glob(pjoin(path, '*'))
    diary_name = glob.fnmatch.filter(fnames, '*diary data.txt')[0]
    hhold_name = glob.fnmatch.filter(fnames, '*household data.txt')[0]

    diary = pd.read_csv(diary_name, sep="\t")
    hhold = pd.read_csv(hhold_name, sep="\t", usecols=hhold_cols)
    dta = diary.merge(hhold)

    out_stem = path.rstrip('/')
    dta.to_csv(out_stem + '.csv', index=False)
    if has_feather:
        feather.write_dataframe(dta, out_stem + '.feather')
def _ft(self, tblname, dbname=None, type=None, df=None):
    """Read or write one table as a feather file.

    The file lives at ``<basedir>/databases/<type>.<dbname>.<tblname>.ft``
    where ``basedir`` comes from ``cf.options.basedir``.

    Parameters
    ----------
    tblname : str
        Table name used in the file name.
    dbname : str, optional
        Defaults to ``self.name``.
    type : str, optional
        Defaults to ``self.type``.
    df : DataFrame, optional
        When omitted the table is read and returned; when given it is
        written and ``None`` is returned.

    Returns
    -------
    DataFrame or None
    """
    if type is None:
        type = self.type
    if dbname is None:
        dbname = self.name

    filename = os.path.expanduser(
        os.path.join(cf.options.basedir, "databases",
                     "{}.{}.{}.ft".format(type, dbname, tblname))
    )

    if df is None:
        # Read mode: restore the index that was stashed in the "idx" column.
        df = ft.read_dataframe(filename)
        if "idx" in df.columns.values:
            df.set_index("idx", drop=True, inplace=True)
            df.index.name = None
        return df

    # Write mode: a non-int64 index is carried along as an "idx" column
    # (on a copy, so the caller's frame is untouched).
    # ``Index.dtype_str`` was removed in pandas 1.0; ``str(index.dtype)``
    # is what it used to return.
    if str(df.index.dtype) != "int64" and not df.empty:
        df = df.copy()
        df["idx"] = df.index
    ft.write_dataframe(df, filename)
    return
def parse_all(nrows=None):
    """Parse every file reported by ``glob_files`` and write one feather per name.

    Parameters
    ----------
    nrows : int, optional
        Debug aid: limit the number of rows read from each file. ``None``
        (the default) reads every row.

    Returns
    -------
    dict
        Maps each ``x['name']`` to the list of parsed frames for that name.
        The concatenation of each list is written to
        ``<name lower-cased, spaces -> underscores>.feather``.
    """
    frames_by_name = dict()
    for x in glob_files():
        # ``nrows`` used to be accepted but never used; forward it so the
        # debug limit actually takes effect.
        df = pd.read_csv(x['filename'], header=None,
                         skiprows=x['skiprows'], nrows=nrows)
        df = _mangle_with_header_etc(df, x)
        frames_by_name.setdefault(x['name'], list()).append(df)

    for name, frames in frames_by_name.items():
        df = pd.concat(frames)
        filename = name.lower().replace(' ', '_') + '.feather'
        print("writing {} {}".format(filename, df.shape))
        feather.write_dataframe(df, filename)
    return frames_by_name
def mergeFeathers(files, mergedFilename, writeCSV, deleteSource=True):
    """Concatenate several feather files into one merged file.

    Parameters
    ----------
    files : list of str
        Feather file names; empty strings are skipped.
    mergedFilename : str
        Output file name.
    writeCSV : bool
        Write CSV to ``mergedFilename`` instead of feather.
    deleteSource : bool
        Remove the source files after a successful merge.

    Returns
    -------
    str
        ``mergedFilename``, or ``''`` when there was nothing to merge.

    Notes
    -----
    If the feather write fails, a CSV is written next to it (``.feather``
    replaced by ``.csv``). If that fails too, the list of unmerged source
    files is written there instead and the sources are kept.
    """
    sources = [f for f in files if not f == '']
    data = [feather.read_dataframe(f) for f in sources]
    if not data:
        print('mergeFeathers: No files to merge!')
        return ''

    df = pd.concat(data, sort=False, axis=0, ignore_index=True, copy=False)

    if writeCSV:
        df.to_csv(mergedFilename)
    else:
        # ``except Exception`` (not bare ``except``) so that Ctrl-C and
        # SystemExit are not swallowed by the fallback logic.
        try:
            feather.write_dataframe(df, mergedFilename)
        except Exception:
            print('Error writing merged feather: Trying CSV')
            print(df.shape)
            traceback.print_exc()
            fallback = mergedFilename.replace('.feather', '.csv')
            try:
                df.to_csv(fallback)
            except Exception:
                print('Error writing merged CSV: Writing list of unmerged temp files.')
                with open(fallback, 'w') as fh:
                    for f in files:
                        fh.write(f + '\n')
                # Nothing was merged, so the sources must survive.
                deleteSource = False

    if deleteSource:
        for f in sources:
            try:
                os.remove(f)
            except OSError:
                print('Could not delete merged temp file: %s' % f)
    return mergedFilename
def extract_svs(in_file, depth, chroms):
    """Create a feather file of structural variants of interest, for Circos plots.

    Parameters
    ----------
    in_file : str
        VCF file passed to ``_find_svcaller`` and ``parse_svs``.
    depth
        Passed through to ``parse_svs``.
    chroms : sequence
        When non-empty, keep only variants with at least one breakpoint on
        one of these chromosomes.

    Only variants whose two breakpoints are both on autosomes "1".."22" are
    kept. The result is written to ``OUTPUT_DIR/<basename(in_file)>.feather``.
    """
    allowed_chroms = {str(x) for x in range(1, 23)}
    columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2",
               "file", "caller", "svtype"]

    log.debug("Building dataframe from VCF file...")
    # Rows are collected in a list and the frame is built once:
    # ``DataFrame.append`` in a loop copies the whole frame every time and
    # no longer exists in pandas >= 2.0.
    rows = []
    for caller in _find_svcaller(in_file):
        for p1, p2, svtype in parse_svs(in_file, depth):
            if len(chroms) == 0 or (p1[0] in chroms or p2[0] in chroms):
                if p1[0] in allowed_chroms and p2[0] in allowed_chroms:
                    rows.append({"chrom1": p1[0], "start1": p1[1], "end1": p1[2],
                                 "chrom2": p2[0], "start2": p2[1], "end2": p2[2],
                                 "file": in_file, "caller": caller,
                                 "svtype": svtype})
    df = pd.DataFrame(rows, columns=columns)

    try:
        out_file = os.path.join(OUTPUT_DIR, os.path.basename(in_file))
        log.info("Exporting to interoperable feather file {}.feather".format(out_file))
        feather.write_dataframe(df, "{}.feather".format(out_file))
    except feather.ext.FeatherError:
        log.error("Failed to serialize feather object (most likely empty source dataframe)")
def test_factor_rep():
    """Check that a categorical column survives a Python -> R -> Python trip.

    The iris frame is written with ``Species`` as a categorical, R reads
    it, rebuilds the factor and writes it back; the result must equal the
    frame that was written.
    """
    src_path = util.random_path()
    dst_path = util.random_path()
    r_script = """
library(feather)
iris <- read_feather("{0}")
iris$Species <- as.factor(as.character(iris$Species))
write_feather(iris, "{1}")
""".format(src_path, dst_path)

    created = []
    try:
        species_levels = ['setosa', 'versicolor', 'virginica']
        frame = pd.read_csv('iris.csv')
        frame['Species'] = pd.Categorical(frame['Species'],
                                          categories=species_levels)

        feather.write_dataframe(frame, src_path)
        util.run_rcode(r_script)
        roundtripped = feather.read_dataframe(dst_path)

        # Paths are registered for cleanup only once both files exist.
        created.extend([src_path, dst_path])
        assert_frame_equal(roundtripped, frame)
    finally:
        util.remove_paths(created)
def __to_feather__(self, uri: str):
    """Write ``self.inner_data`` to *uri* as feather and wrap the result.

    Raises
    ------
    RuntimeError
        If no "feather" type is registered on ``DataObject``.

    Returns
    -------
    The object built by the registered feather type's ``from_uri``, with
    this object recorded as its ``source``.
    """
    registry = DataObject.registered_types
    if "feather" not in registry:
        raise RuntimeError("Cannot convert to feather.")

    # Imported lazily so the dependency is only needed when actually used.
    import feather

    feather.write_dataframe(self.inner_data, uri)
    feather_type = DataObject.registered_types["feather"]
    return feather_type.from_uri(uri, source=self)
def import_data_set(in_path, sep=r"\s", name=None):
    """Read a header-less delimited text file and store it as feather.

    Parameters
    ----------
    in_path : str
        File to read.
    sep : str
        Separator passed to ``pandas.read_csv``. The default is the regex
        ``\s`` (one whitespace character); it is spelled as a raw string
        because ``"\s"`` is an invalid escape sequence.
    name : str, optional
        Output file name inside ``out_path``. Defaults to the input's base
        name with its extension replaced by ``.data``.
    """
    data = pd.read_csv(in_path, sep=sep, header=None)
    if name is None:
        stem, _ = os.path.splitext(os.path.basename(in_path))
        name = stem + ".data"
    feather.write_dataframe(data, join(out_path, name))
def test_overwritten_file(self):
    """A shorter frame written over an existing file must still round-trip."""
    target = random_path()
    total = 100

    np.random.seed(0)
    ints = np.random.randint(0, 10, size=total)

    # First write: the full-length column.
    full_frame = pd.DataFrame({"ints": ints})
    feather.write_dataframe(full_frame, target)

    # Second write (inside the helper): only the first half, same path.
    half_frame = pd.DataFrame({"ints": ints[0 : total // 2]})
    self._check_pandas_roundtrip(half_frame, path=target)
def _check_pandas_roundtrip(self, df, expected=None):
    """Write *df* to a fresh temp path, read it back and compare.

    The path is recorded in ``self.test_files``. The frame read back must
    equal *expected*, or *df* itself when *expected* is omitted.
    """
    tmp_path = random_path()
    self.test_files.append(tmp_path)

    feather.write_dataframe(df, tmp_path)
    if not os.path.exists(tmp_path):
        raise Exception('file not written')

    loaded = feather.read_dataframe(tmp_path)
    reference = df if expected is None else expected
    assert_frame_equal(loaded, reference)
def maybe_parse(path):
    """Return the parsed frame for *path*, using ``<path>.feather`` as a cache.

    On a cache miss the file is parsed with ``parse`` and stored with its
    index reset; on a hit the cached frame is loaded and re-indexed on
    ``ut_ms``.
    """
    cache_path = path + ".feather"

    if not os.path.exists(cache_path):
        print("parsing %s" % path)
        parsed = parse(path)
        feather.write_dataframe(parsed.reset_index(), cache_path)
        return parsed

    print("loading %s from cache" % path)
    cached = feather.read_dataframe(cache_path)
    return cached.set_index("ut_ms")
def save_df(df, path, index=False):
    """Save *df* to *path*, choosing the writer from the path.

    * ``'-'`` or ``None``: print what ``default_csv_writer`` returns.
    * non-feather path: ``default_csv_writer`` writes it.
    * feather path: written through ``featherpmm`` when available,
      otherwise plain ``feather``.

    Raises
    ------
    Exception
        For a feather path when the feather module is not available.
    """
    if path is None or path == '-':
        print(default_csv_writer(df, None, index=index))
        return

    if file_format(path) != 'feather':
        default_csv_writer(df, path, index=index)
        return

    if not feather:
        raise Exception('The Python feather module is not installed.\n'
                        'Use:\n pip install feather-format\n'
                        'to add capability.\n')

    if featherpmm:
        dataset = featherpmm.Dataset(df, name='verification')
        featherpmm.write_dataframe(dataset, path)
    else:
        feather.write_dataframe(df, path)
def mergeSamples(batchFolder, extractionFunc, extractionKwargs, matchStr='*.feather', test=False, metaCols=None, filters=None):
    """Go through each feather file (sample) in a batch folder, apply the
    analysis function, and merge together.

    Parameters
    ----------
    batchFolder : str
        Folder containing ``metadata.csv`` and the sample feather files.
    extractionFunc : callable
        Called as ``extractionFunc(sample_df, **extractionKwargs)``; the
        result gets a ``sample_name`` column.
    extractionKwargs : dict
    matchStr, test
        Passed through to ``matchSamples``.
    metaCols : list of str, optional
        Metadata columns to keep; ``sample_name`` is always included.
    filters : dict, optional
        ``{column: allowed_values}``; only samples whose metadata matches
        every filter are processed.

    Returns
    -------
    str
        Name of a temporary feather file (created in ``batchFolder``) with
        the merged result, or ``''`` when nothing was extracted.
    """
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherLU = matchSamples(batchFolder, matchStr=matchStr, test=test)

    if metaCols is not None:
        # Work on a copy so the caller's list is not modified.
        metaCols = list(metaCols)
        if 'sample_name' not in metaCols:
            metaCols.append('sample_name')
        mDf = mDf[metaCols]
    mDf = mDf.set_index('sample_name')

    feathers = []
    print('Extracting from batch %s (%s)' % (batchFolder, time.ctime()))
    sttime = time.time()
    for i, (sample_name, fn) in enumerate(featherLU.items(), start=1):
        # Keep only samples whose meta data matches all of the filters.
        if filters is not None and any(
                mDf.loc[sample_name, col] not in valList
                for col, valList in filters.items()):
            continue

        f = feather.read_dataframe(fn)
        # ``x`` is pre-bound so the error handler can tell whether the
        # extraction itself failed (x is None) or only the labelling did.
        x = None
        try:
            x = extractionFunc(f, **extractionKwargs)
            x.loc[:, 'sample_name'] = sample_name
        except Exception:
            print('Error extracting from batch %s, sample %s (%d)' % (batchFolder, sample_name, i))
            if x is not None:
                print(x.shape)
                print(x.head())
            traceback.print_exc()
        if x is not None:
            feathers.append(x)

    if len(feathers) > 0:
        outDf = pd.merge(pd.concat(feathers, axis=0), mDf.reset_index(),
                         how='left', left_on='sample_name', right_on='sample_name')
        print('Finished batch %s (%1.0f minutes)' % (batchFolder, (time.time() - sttime) / 60), flush=True)
        # Write to a temporary merge file and return filename. The handle
        # is only used to reserve a unique name; feather writes the data.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.feather', prefix='merged_tmp_',
                                         dir=batchFolder, delete=False) as fh:
            tmpFilename = fh.name
        feather.write_dataframe(outDf, tmpFilename)
    else:
        tmpFilename = ''
    return tmpFilename
def __setitem__(self, key, df):
    """Store *df* under *key* as a feather file and refresh its bookkeeping.

    Records the file name and on-disk size, updates (or creates) the
    timestamped heap entry for *key*, then prunes files.
    """
    target = self._filename_from_key(key)
    feather.write_dataframe(df, target)

    self._fn_cache[key] = target
    self._sz_cache[key] = os.stat(target).st_size

    if key not in self._heap_map:
        # New key: a fresh [timestamp, key] entry shared by map and heap.
        entry = [time.time(), key]
        self._heap_map[key] = entry
        heapq.heappush(self._heap, entry)
    else:
        # Existing key: bump the timestamp in place, then re-heapify to
        # ensure the heap invariant.
        self._heap_map[key][0] = time.time()
        heapq.heapify(self._heap)

    self.__prune_files()
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string
        File path

    Raises
    ------
    ValueError
        If *df* is not a DataFrame, has anything other than an unnamed
        default integer index, or has non-string column names.
    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    feather = _try_import()
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index
    if not isinstance(df.index, Int64Index):
        # The message previously ran ".reset_index()" and "to" together.
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_dataframe(df, path)
def test_num_rows_attr(self):
    """``FeatherReader.num_rows`` matches the written frame, including an empty one."""
    # Non-empty frame: five rows.
    populated = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
    populated_path = random_path()
    self.test_files.append(populated_path)
    feather.write_dataframe(populated, populated_path)
    assert feather.FeatherReader(populated_path).num_rows == len(populated)

    # Empty frame: zero rows.
    empty = pd.DataFrame({})
    empty_path = random_path()
    self.test_files.append(empty_path)
    feather.write_dataframe(empty, empty_path)
    assert feather.FeatherReader(empty_path).num_rows == 0
def read_dtas(fname):
    """Convert a Stata file with mis-decoded GB18030 text to feather.

    Column names and every object-dtype column are re-encoded as latin-1
    and decoded as GB18030. The result is written next to the input with
    the extension replaced by ``.feather``.

    Parameters
    ----------
    fname : str
        Path of the Stata file.
    """
    import os

    # splitext only strips the final extension, so directories or file
    # names that contain dots are preserved.
    savename = os.path.splitext(fname)[0] + ".feather"

    with open(fname, "rb") as pa:
        df = pd.read_stata(pa)
    df2 = df.copy()

    cols = []
    for name in df.columns:
        try:
            cols.append(name.encode('latin-1').decode('gb18030'))
        except UnicodeError:
            # Name cannot be re-decoded; leave a placeholder.
            cols.append("")
    # Column 22 is given a hard-coded fallback label when its name could
    # not be decoded. The guard avoids an IndexError on narrower files.
    if len(cols) > 22 and cols[22] == "":
        cols[22] = '登记注册机关级鸸ど绦姓管聿棵'
    df2.columns = cols
    print(df2.columns)
    ##E.g.:
    #Index(['_组织机构代码', '_单位详细名称', '_行业代码', '_主要业务活动1', '_主要业务活动2', '_主要业务活动3',
    #       '_行政区划代码', '_省', '_地', '_县', '_乡', '_地址', '_街道办事处', '_法定代表人', '_开业年',
    #       '_开业月', '_区号', '_固定电话', '_分机号', '_传真号码', '_传真分机号', '_邮政编码',
    #       '登记注册机关级鸸ど绦姓管聿棵', '_登记注册号工商行政管理部门', '_登记注册机关级别编制部门', '_登记注册号编制部门',
    #       '_登记注册机关级别民政部门', '_登记注册号民政部门', '_登记注册机关级别国家税务部门', '_登记注册号国家税务部门',
    #       '_登记注册机关级别地方税务部门', '_登记注册号地方税务部门', '_登记注册机关级别其他', '_登记注册号其他', '_登记注册类型',
    #       '_企业控股情况', '_隶属关系', '_企业营业状态', '_执行会计制度类别', '_代码', '_名称', '_年初存货',
    #       '_年初产成品', '_流动资产合计', '_应收账款', '_存货', '_产成品', '_固定资产合计', '_固定资产原价',
    #       '_累计折旧', '_本年折旧', '_资产总计', '_流动负债合计', '_应付账款', '_非流动负债合计', '_负债合计',
    #       '_所有者权益合计', '_实收资本', '_国家资本', '_集体资本', '_法人资本', '_个人资本', '_港澳台资本',
    #       '_外商资本', '_营业收入', '_主营业务收入', '_营业成本', '_主营业务成本', '_营业税金及附加',
    #       '_主营业务税金及附加', '_其他业务利润', '_销售费用', '_管理费用', '_税金', '_财务费用', '_利息收入',
    #       '_利息支出', '_资产减值损失', '_公允价值变动收益', '_投资收益', '_营业利润', '_营业外收入', '_补贴收入',
    #       '_营业外支出', '_利润总额', '_应交所得税', '_应付职工薪酬', '_应交增值税', '_工业总产值', '_工业销售产值',
    #       '_出口交货值'],
    #      dtype='object')

    for i, col in enumerate(cols):
        print(i, col)
        if df2[col].dtype == 'object':
            print("Object type found, attempting conversion")
            df2[col] = df2[col].apply(lambda x: x.encode('latin-1').decode('gb18030'))

    feather.write_dataframe(df2, savename)
def _load_data(self, data_file, columns, usecols):
    """Load *data_file* through a feather cache stored at ``<data_file>.fth``.

    On the first call the tab-separated file is read (``names=columns``,
    ``usecols=usecols``) and written to the cache; the data is then always
    loaded from the cache, selecting *columns*.

    Returns
    -------
    DataFrame
    """
    cache_file = data_file + '.fth'

    cache_missing = not os.path.exists(cache_file)
    if cache_missing:
        logging.info('convert csv file to feather')
        csv_frame = pd.read_csv(data_file, sep='\t', names=columns,
                                usecols=usecols)
        logging.info('csv data shape {}'.format(csv_frame.shape))
        feather.write_dataframe(csv_frame, cache_file)
        csv_frame.head()

    logging.info('loading data {}'.format(cache_file))
    loaded = feather.read_dataframe(cache_file, columns=columns,
                                    use_threads=True)
    logging.info('data shape {}'.format(loaded.shape))
    return loaded
def concat_pieces(pieces, fname, featherfile, statafile, label, dblabel, add2db, user, password, host):
    """Concatenate *pieces* and export the result in several formats.

    Parameters
    ----------
    pieces : list of DataFrame
        Frames to concatenate; when empty only a notice is printed.
    fname : str
        Output path of the ``^``-separated CSV.
    featherfile : str
        Output path of the feather file.
    statafile : str
        Output path of the Stata file; the string ``"0"`` disables it.
    label : str
        Label used in the printed messages.
    dblabel : str
        Table label (upper-cased) used when writing to the database.
    add2db : int
        ``1`` to also store the frame via ``add_frame_to_db``.
    user, password, host
        Passed through to ``add_frame_to_db``.
    """
    if len(pieces) > 0:
        df = pd.concat(pieces, ignore_index=True, sort=False)  # changed 20180716 (sort=False)
        df = order(df, ['RSSD9001', 'RSSD9999', 'year', 'qid'])
        df.to_csv(fname, index=False, sep="^")
        feather.write_dataframe(df, featherfile)
        if statafile != "0":
            if check_file_exists(statafile, "w"):
                df.to_stata(statafile)
                makelables(df, statafile)
        # print(...) with a single argument behaves identically on
        # Python 2 and 3; the old print statement does not parse on 3.
        print('\n%s has a count of %s.' % (label, df["qid"].count()))
        if add2db == 1:
            add_frame_to_db(df, dblabel.upper(), user, password, host)
    else:
        print(label + "leeg")
def dataframe_to_display(data_frame):
    """Serialize a data frame to feather and return its bytes as JSON.

    The frame is written to a temporary file, the file's raw bytes are
    read back, and a JSON array with one integer per byte is returned.

    Parameters
    ----------
    data_frame : DataFrame

    Returns
    -------
    str
        JSON list of byte values.
    """
    # mkstemp returns an open descriptor; close it so it is not leaked
    # (feather reopens the file by name).
    fd, filepath = tempfile.mkstemp()
    os.close(fd)
    try:
        feather.write_dataframe(data_frame, filepath)
        with open(filepath, 'rb') as handle:
            array_data = handle.read()
    finally:
        # Remove the temporary file even when writing or reading fails.
        os.remove(filepath)

    # Iterating the raw bytes yields the individual byte values.
    return json.dumps(list(array_data))
def gen_df_bec_by_hs6():
    """Return the hs6 -> category table, building and caching it if needed.

    If the cached feather file exists it is loaded. Otherwise every row of
    ``gen_df_bec_hs()`` is expanded into one row per comma-separated hs6
    code, duplicates on ``hs6`` are dropped (first occurrence wins) and
    the result is written to the cache.

    Returns
    -------
    DataFrame
        Columns ``hs6`` and ``category``.
    """
    df_bec_by_hs6_path_str = r'SourceMaterial\df_bec_by_hs6.feather'
    df_bec_by_hs6_path = Path(df_bec_by_hs6_path_str)

    if df_bec_by_hs6_path.exists():
        return feather.read_dataframe(df_bec_by_hs6_path)

    df_bec_raw = gen_df_bec_hs()
    # Collect the per-row frames and concatenate once; concatenating
    # inside the loop re-copies the accumulated frame on every iteration.
    # The leading empty frame keeps the result identical to the old
    # accumulate-from-empty loop (including for an empty input).
    pieces = [pd.DataFrame()]
    for _, row in df_bec_raw.iterrows():
        piece = pd.DataFrame()
        piece['hs6'] = pd.Series(row['hs6'].split(','))
        piece['category'] = row['大類']
        pieces.append(piece)
    df_bec_by_hs6 = pd.concat(pieces)

    df_bec_by_hs6 = df_bec_by_hs6.drop_duplicates(['hs6'])
    feather.write_dataframe(df_bec_by_hs6, df_bec_by_hs6_path_str)
    return df_bec_by_hs6
def _check_pandas_roundtrip(self, df, expected=None, path=None, columns=None):
    """Write *df*, read it back (optionally a column subset) and compare.

    Parameters
    ----------
    df : DataFrame
        Frame to write.
    expected : DataFrame, optional
        Frame the result must equal; defaults to *df*.
    path : str, optional
        Target file; a random path is used when omitted. The path is
        recorded in ``self.test_files`` either way.
    columns : optional
        Passed to ``feather.read_dataframe``.
    """
    target = random_path() if path is None else path
    self.test_files.append(target)

    feather.write_dataframe(df, target)
    if not os.path.exists(target):
        raise Exception('file not written')

    loaded = feather.read_dataframe(target, columns)
    reference = df if expected is None else expected
    assert_frame_equal(loaded, reference)
def rewrite_as_feather_file():
    """Collect per-record label files into ``spo2_hypoxemia.feather``.

    Reads ``spo2_records.csv`` (header row skipped) and, for the first ten
    records only, loads the JSON stored in
    ``SpO2_and_hypoxemia_labels/<record>.txt``. Each entry gets a
    ``patient`` id (characters 1-6 of the record name) and an
    ``experiences_hypoxemia`` flag that is true when any ``hypoxemia``
    value equals 1.
    """
    with open('spo2_records.csv') as csv_file:
        records = list(csv.reader(csv_file, delimiter=','))[1:]

    entries = []
    for fields in records[:10]:
        record_name = fields[1]
        patient_id = int(record_name[1:7])

        label_path = 'SpO2_and_hypoxemia_labels/' + record_name + '.txt'
        with open(label_path, 'r') as json_file:
            entry = json.load(json_file)

        entry['patient'] = patient_id
        hypoxemia_flags = np.asarray(entry['hypoxemia'])
        entry['experiences_hypoxemia'] = np.any(hypoxemia_flags == 1)
        entries.append(entry)

    feather.write_dataframe(pd.DataFrame(entries), 'spo2_hypoxemia.feather')
def save_to_r_dataset(df, path):
    """Write a pandas dataframe to *path* in feather format.

    Parameters
    ----------
    df : dataframe
        Pandas dataframe.
    path : str
        Path to save.

    Returns
    -------
    None
    """
    feather.write_dataframe(df, path)
def feather_clean(in_directory):
    """Utility function to clean feather files.

    Every ``*.feather`` file directly inside *in_directory* is rewritten
    in place without the rows whose ``strike`` equals 5.

    Parameters
    ----------
    in_directory : str or Path
        Directory holding the feather files.
    """
    # Accept plain strings as well as Path objects. The previous
    # ``Path.is_dir(in_directory)`` call discarded its result, so it
    # checked nothing; ``os.listdir`` below already raises for a missing
    # or non-directory path.
    in_directory = Path(in_directory)

    for item in os.listdir(in_directory):
        if not item.endswith('.feather'):
            continue
        # The same string path is used for reading and writing.
        target = str(in_directory / item)

        # Remove options with strikes at 5$
        option_df = feather.read_dataframe(target)
        idx = option_df['strike'] == 5
        option_df = option_df.drop(option_df.index[idx])

        feather.write_dataframe(option_df, target)
def convert_to_feather(file_path, out_path):
    '''Convert the Environment & Climate Change Canada csv files into
    feather files, to allow for faster processing.

    Parameters
    ----------
    file_path : string
        Directory holding the csv files provided by Environment & Climate
        Change Canada, not including the name of the file.
    out_path : string
        Directory the new feather files are written to, not including the
        new file name.
    '''
    for station_name in os.listdir(file_path):
        # os.path.join works with or without a trailing separator on the
        # directory arguments.
        source = os.path.join(file_path, station_name)
        df = pd.read_csv(source, sep=',', engine='c', low_memory=False,
                         encoding='latin1')
        # splitext strips the real extension rather than assuming it is
        # exactly four characters long.
        stem = os.path.splitext(station_name)[0]
        feather.write_dataframe(df, os.path.join(out_path, stem + '.feather'))
def import_solar_data():
    """Handle an uploaded solar data file and store it as a feather profile.

    The upload (form field ``file``) is saved under ``data/temp/``, parsed
    with ``generic_data_to_dataframe``, its first column is renamed to
    ``Datetime`` and the frame is written to
    ``data/solar_profiles/<name>.feather``.

    Returns
    -------
    A JSON response with either a ``message`` or an ``error`` key.
    """
    file = request.files['file']

    # The filename comes from the client: keep only its final component so
    # that names such as "../../x" cannot escape the target directories.
    client_name = (file.filename or '').replace('\\', '/')
    file_name = os.path.splitext(os.path.basename(client_name))[0]
    if not file_name:
        return jsonify({'error': 'Invalid file name.'})

    file_path = 'data/temp/' + file_name
    file.save(file_path)
    solar_data = generic_data_to_dataframe(file_path)

    # Check if the file format is in the correct format
    try:
        solar_data.rename(columns={solar_data.columns[0]: 'Datetime'}, inplace=True)
    except Exception:
        # Anything that is not a frame with at least one column.
        return jsonify({'error': 'Invalid data format.'})

    feather.write_dataframe(solar_data, 'data/solar_profiles/' + file_name + '.feather')
    return jsonify({'message': "Successfully imported file."})
def main(argv):
    """Command-line entry point: obtain network properties and plot them.

    If ``--data_frame`` names an existing file the properties are loaded
    from it. Otherwise the network file is parsed with the parser selected
    by ``args.format``, the properties are computed, and (when a data frame
    path was given) cached there. The result is passed to ``bokeh_plot``.
    """
    args = _argument_parser().parse_args(argv)
    cache_path = args.data_frame

    cache_available = cache_path is not None and os.path.exists(cache_path)
    if cache_available:
        df = feather.read_dataframe(cache_path)
    else:
        from . import parsers

        parser = getattr(parsers, args.format).parser
        print('reading network data')
        network = parser(args.datafile, max_num_nodes=args.max_num_nodes)

        print('extracting data')
        df = network_properties(
            network,
            in_degree_threshold=args.in_degree_threshold,
            pagerank_threshold=args.pagerank_threshold,
            damping=args.damping,
        )
        if cache_path is not None:
            feather.write_dataframe(df, cache_path)

    print('preparing plots')
    bokeh_plot(df, output=args.output_file, loglog=args.loglog)
def main(data_input, cluster_input, output):
    """Attach cluster assignments to first-visit data and write the result.

    Parameters
    ----------
    data_input : str
        Feather file with at least ``visit_id`` and ``subject_id`` columns.
    cluster_input : str
        CSV of cluster assignments indexed by ``subject_id``.
    output : str
        Feather file to write; the log goes to ``<output>.log``.
    """
    basicConfig(level=INFO, handlers=[
        StreamHandler(),
        FileHandler('{}.log'.format(output), mode='w')
    ])

    # Load the data.
    info('Loading data')
    data = feather.read_dataframe(data_input)
    info('Result: {}'.format(data.shape))

    info('Loading clusters')
    # ``read_csv(..., squeeze=True)`` no longer exists in pandas 2.0;
    # squeezing the columns afterwards is the documented replacement.
    clusters = pd.read_csv(cluster_input, index_col='subject_id').squeeze('columns')
    info('Result: {}'.format(clusters.shape))

    # Filter the data: first visit only, restricted to clustered subjects.
    info('Filtering data')
    data = data.loc[data['visit_id'] == 1].drop('visit_id', axis=1).set_index('subject_id')
    data = data.loc[clusters.index]

    # Merge the cluster assignments in.
    info('Merging clusters')
    data['classification'] = clusters

    # Write the output.
    info('Writing output')
    data['classification'] = data['classification'].astype('category')
    feather.write_dataframe(data.reset_index(), output)
def _check_pandas_roundtrip(self, df, expected=None, path=None, columns=None, null_counts=None):
    """Write *df*, read it back and check both contents and null counts.

    Parameters
    ----------
    df : DataFrame
        Frame to write.
    expected : DataFrame, optional
        Frame the result must equal; defaults to *df*.
    path : str, optional
        Target file; a random path is used when omitted. The path is
        recorded in ``self.test_files`` either way.
    columns : optional
        Column selection passed to the reader and to ``_get_null_counts``.
    null_counts : array-like, optional
        Expected null counts; defaults to zero for every expected column.
    """
    target = random_path() if path is None else path
    self.test_files.append(target)

    feather.write_dataframe(df, target)
    if not os.path.exists(target):
        raise Exception("file not written")

    loaded = feather.read_dataframe(target, columns)
    reference = df if expected is None else expected
    assert_frame_equal(loaded, reference)

    if null_counts is None:
        null_counts = np.zeros(len(reference.columns))
    observed_nulls = self._get_null_counts(target, columns)
    np.testing.assert_array_equal(observed_nulls, null_counts)
def process_all(allrundirs):
    """Load the catalog of every run directory and save one merged table.

    The run name is the first ``/``-separated component of each directory.
    The merged table is sorted by run name, start time and channel,
    re-indexed, and written to ``sequencing_summary.feather``.
    """
    catalogs = []
    for rundir in allrundirs:
        runname = rundir.split('/')[0]
        print('Loading {} ... '.format(runname), end='')
        sys.stdout.flush()

        catalog = load_catalog(rundir, runname)
        catalogs.append(catalog)
        print('{:,}'.format(len(catalog)))

    print('Loading finished. Saving merged table... ', end='')
    sys.stdout.flush()

    merged = pd.concat(catalogs, axis=0)
    merged = merged.sort_values(by=['run_name', 'start_time', 'channel'])
    merged = merged.reset_index(drop=True)
    print('({:,} reads total)'.format(len(merged)))

    feather.write_dataframe(merged, 'sequencing_summary.feather')
    print('Done.')
def subset_data(session_id, pvalue_slider_value, foldchange_slider_value, basemean_slider_value, cluster_dropdown_value, go_dropdown_value, organism_type):
    """Filter a session's data by the current UI controls and store the subset.

    Reads ``temp_data_files/<session_id>``, applies the cluster, GO-term
    and slider filters that are set, and writes the result to
    ``temp_data_files/<session_id>_subset``.

    Parameters
    ----------
    session_id : str or None
        ``None`` raises ``dash.exceptions.PreventUpdate``.
    pvalue_slider_value, foldchange_slider_value, basemean_slider_value
        ``(min, max)`` pairs or ``None``; applied to ``neg_log10_padj``,
        ``log2FoldChange`` and ``log10basemean`` respectively.
    cluster_dropdown_value
        Keep only rows of this cluster when not ``None``.
    go_dropdown_value : list or None
        GO terms; only applied when non-empty and ``organism_type`` is
        ``'mouse'``.
    organism_type : str

    Returns
    -------
    None
    """
    if session_id is None:
        raise dash.exceptions.PreventUpdate()

    df = feather.read_dataframe('temp_data_files/' + session_id)
    df = df.rename(index=str, columns={'symbol': 'gene_ID'})

    if cluster_dropdown_value is not None:
        df = df[df['cluster'] == cluster_dropdown_value]

    if go_dropdown_value is not None and len(go_dropdown_value) != 0:
        print("go dropdown triggered")
        if organism_type == 'mouse':
            df = df[df['gene_ID'].isin(
                mouse_go_assocs.golist_to_collapsed_gene_list(
                    go_dropdown_value))]
        print(go_dropdown_value)

    slider_filters = (
        ('neg_log10_padj', pvalue_slider_value),
        ('log2FoldChange', foldchange_slider_value),
    )
    for column, bounds in slider_filters:
        if bounds is not None:
            df = df[df[column].between(bounds[0], bounds[1])]

    # scRNAseq data has no basemean column; skip the filter in that case
    # explicitly instead of swallowing every exception.
    if basemean_slider_value is not None and 'log10basemean' in df.columns:
        df = df[df['log10basemean'].between(basemean_slider_value[0],
                                            basemean_slider_value[1])]

    feather.write_dataframe(df, 'temp_data_files/' + session_id + '_subset')
    return None
def __saveToS32(obj, bucket, s3path, prefix=""):
    """Serialize *obj* to ``/tmp`` and upload the file to S3.

    A pandas DataFrame is written as ``/tmp/<prefix>_lastobject.feather``,
    a ``dict`` as ``/tmp/<prefix>_lastobject.json``; any other object is
    ignored. The file is uploaded to ``<s3path>/<file name>`` in *bucket*,
    using a multipart upload when it is at least ``CHUNK`` bytes.

    Parameters
    ----------
    obj : DataFrame or dict
    bucket : str
        Bucket name.
    s3path : str
        Key prefix inside the bucket.
    prefix : str
        Prefix of the temporary file name.

    Raises
    ------
    Exception
        Whatever a failed multipart part upload raises; the multipart
        upload is cancelled first.
    """
    clz = obj.__class__.__name__
    lastobject = None
    if isinstance(obj, pandas.core.frame.DataFrame):
        import feather as ft
        ft.write_dataframe(obj, "/tmp/{}_lastobject.feather".format(prefix))
        lastobject = "{}_lastobject.feather".format(prefix)
    elif clz == "dict":
        import json
        with open("/tmp/{}_lastobject.json".format(prefix), "w") as outfile:
            json.dump(obj, outfile)
        lastobject = "{}_lastobject.json".format(prefix)

    CHUNK = 52428800  # multipart threshold and part size, in bytes

    if lastobject is None:
        return

    import math
    import os
    import boto
    from boto.s3.key import Key

    source_path = "/tmp/{}".format(lastobject)
    c = boto.connect_s3()
    b = c.get_bucket(bucket)
    source_size = os.stat(source_path).st_size
    keyname = "{}/{}".format(s3path, lastobject)

    if source_size >= CHUNK:
        from filechunkio import FileChunkIO
        ## multipart upload
        ## http://boto.cloudhackers.com/en/latest/s3_tut.html#storing-large-data
        chunk_count = int(math.ceil(source_size / float(CHUNK)))
        mp = b.initiate_multipart_upload(keyname)
        try:
            for i in range(chunk_count):
                offset = CHUNK * i
                part_size = min(CHUNK, source_size - offset)
                # Read from the file that was actually written (in /tmp).
                with FileChunkIO(source_path, 'r', offset=offset, bytes=part_size) as fp:
                    mp.upload_part_from_file(fp, part_num=i + 1)
        except Exception:
            # Do not leave a half-finished upload behind.
            mp.cancel_upload()
            raise
        else:
            # Only finalize once every part has been uploaded.
            mp.complete_upload()
    else:
        k = Key(b)
        k.key = keyname
        k.set_contents_from_filename(source_path)
def main(basic_input, medication_input, joint_injection_input, joint_input, output, filter_output):
    """Filter the joint data down to the patients passing every mask.

    Loads the four inputs, builds the basic, medication and joint-count
    masks, combines them, writes the retained joint data to *output*
    (feather) and the table of all masks to *filter_output* (CSV). The log
    is also written to ``<output>.log``.
    """
    log_handlers = [
        StreamHandler(),
        FileHandler('{}.log'.format(output), mode='w'),
    ]
    basicConfig(level=INFO, handlers=log_handlers)

    loaded = load_data(basic_input, medication_input,
                       joint_injection_input, joint_input)
    basic_data, medication_data, joint_injection_data, joint_data = loaded

    info('Generating masks')
    masks_basic = get_basic_masks(basic_data)
    masks_medications = get_medication_masks(medication_data,
                                             joint_injection_data)
    mask_joints = get_joint_count_masks(joint_data)

    # A patient is retained only when all three combined masks agree.
    mask_all = (masks_basic['basic_combined']
                & masks_medications['medications_combined']
                & mask_joints['joint_count'])

    masks_all = masks_basic.join(masks_medications, how='outer')
    masks_all = masks_all.join(mask_joints, how='outer')
    masks_all['all_combined'] = mask_all

    info('{} patients will be retained'.format(mask_all.sum()))

    info('Filtering data')
    retained_ids = mask_all.index[mask_all == True]
    data = joint_data.set_index('subject_id').loc[retained_ids].reset_index()

    info('Writing outputs')
    data.info()
    feather.write_dataframe(data, output)
    masks_all.to_csv(filter_output)
def write_conslengths_feather(strand_lengths, strand_ids, seq_id):
    """Tabulate strand-id and strand-length counts per barrel size and save them.

    For each index ``x`` in 0..22 the entries of ``strand_ids[x]`` and
    ``strand_lengths[x]`` are counted. Ids are tabulated for keys
    ``0 .. x-1`` and lengths for keys ``1 .. x``; an index with no entries
    is replaced by ``0``. The rows for barrel sizes 8, 10, 12, 14, 16, 18
    and 22 are copied into 7x22 arrays and written to
    ``data/ConsStrandIDs<seq_id>.feather`` and
    ``data/ConsLengths<seq_id>.feather``.
    """
    def _tabulate(groups, first_key):
        # Count each group, then turn the first 23 counters into dense
        # lists of counts (missing keys count as 0).
        counters = [Counter(group) for group in groups]
        for size in range(23):
            tally = counters[size]
            if len(tally) == 0:
                counters[size] = 0
            else:
                counters[size] = [tally[key]
                                  for key in range(first_key, first_key + size)]
        return counters

    id_table = _tabulate(strand_ids, 0)
    length_table = _tabulate(strand_lengths, 1)

    barrel_sizes = [8, 10, 12, 14, 16, 18, 22]
    export_lengths = np.zeros([7, 22])
    export_IDs = np.zeros([7, 22])
    for row, size in enumerate(barrel_sizes):
        for col in range(size):
            export_lengths[row][col] = length_table[size][col]
            export_IDs[row][col] = id_table[size][col]
    print(export_IDs)

    ids_frame = pandas.DataFrame(export_IDs)
    feather.write_dataframe(ids_frame.copy(), "data/ConsStrandIDs%s.feather" % seq_id)
    lengths_frame = pandas.DataFrame(export_lengths)
    feather.write_dataframe(lengths_frame.copy(), "data/ConsLengths%s.feather" % seq_id)
def main(location, output_file, supported_loc={"BC", "WA"}):
    """Downloads data from the url and saves the dataframe in feather or csv format.

    Parameters
    ===========
    location: str
        Space-separated location id(s) of the page(s) to fetch,
        for example 'BC' or 'BC WA'.
    output_file: str
        File name along with the path for saving the data; the extension
        must be ``feather`` or ``csv``.
    supported_loc: set
        Set containing the strings of all supported location ids.
        It is only read, never modified.

    Raises
    ======
    Exception
        For an unsupported location, an unreachable URL or an unsupported
        output format.
    NotADirectoryError
        When the output file cannot be written.
    """
    location_df = list()
    for loc_id in location.split(" "):
        if loc_id not in supported_loc:
            raise Exception("{} location not supported. Location should be from: ".format(loc_id) + str(supported_loc))
        url = "http://www.nuforc.org/webreports/ndxl" + loc_id + ".html"
        try:
            location_df.append(pd.read_html(url)[0])
        except Exception as err:
            # Chain the cause so the real failure is not lost.
            raise Exception("URL " + url + " is not reachable") from err

    aliens_df = pd.concat(location_df, ignore_index=True)

    extension = output_file.split(".")[-1]
    if extension == "feather":
        try:
            feather.write_dataframe(aliens_df, output_file)
        except Exception as err:
            raise NotADirectoryError(output_file + " path does not exist.") from err
    elif extension == "csv":
        try:
            aliens_df.to_csv(output_file, index=False)
        except Exception as err:
            raise NotADirectoryError(output_file + " path does not exist.") from err
    else:
        raise Exception("File format not supported")
def _fetch_ckan_package(ckan, package_id, headers, csv_dir, feather_dir):
    """Download every resource of one CKAN package as CSV and convert it to feather."""
    package = ckan.action.package_show(id=package_id)
    for resource in package['resources']:
        name = resource['name']
        print('... fetching ' + name + ' from energydata.uct.ac.za')
        csv_path = os.path.join(csv_dir, name + '.csv')

        # Download resources from data portal
        request = urllib.request.Request(resource['url'], headers=headers)
        with urllib.request.urlopen(request) as response, open(csv_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)

        # write profiles to disk
        frame = pd.read_csv(csv_path)
        feather.write_dataframe(frame, os.path.join(feather_dir, name + '.feather'))


def appData():
    """Fetch the DLR tables and seasonal ADTD profiles from energydata.uct.ac.za.

    Prompts for the user's API key, then downloads every resource of the
    ``dlr-database-tables-94-14`` package into ``csv_table`` /
    ``feather_table`` and every resource of ``dlr-seasonal-adtd-profiles``
    into ``csv_adtd`` / ``feather_adtd``.
    """
    # fetch tables from energydata.uct.ac.za
    # NOTE(review): the prompt and header lines were garbled in the source
    # (the text between them had been replaced by asterisks); they are
    # restored here as two statements - confirm against the original.
    apikey = input(
        'Enter your APIKEY from http://energydata.uct.ac.za/user/YOUR_USERNAME: ')
    headers = {'Authorization': apikey}
    ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/', apikey=apikey, get_only=True)

    _fetch_ckan_package(ckan, 'dlr-database-tables-94-14', headers,
                        csv_table, feather_table)
    _fetch_ckan_package(ckan, 'dlr-seasonal-adtd-profiles', headers,
                        csv_adtd, feather_adtd)
    return
def get_current_stus_ada(save_path, **context):
    """Compute year-to-date attendance for currently enrolled students.

    Reads ``students.feather`` and ``attend_student.feather`` from
    *save_path*, keeps students with ``enroll_status == 0``, sums their
    enrolled/present/absent counts, derives ``ada`` (present / enrolled *
    100) and a dense ``ada_rank`` within each school and grade, and writes
    the sorted result to ``attend_student_ytd.feather``.

    Returns
    -------
    str
        Path of the file that was written.
    """
    students = feather.read_dataframe("{0}/students.feather".format(save_path))
    attendance = feather.read_dataframe(
        "{0}/attend_student.feather".format(save_path))

    # Only the student numbers of currently enrolled students are needed.
    enrolled_now = students[students.enroll_status == 0].copy()
    enrolled_now = pd.DataFrame(enrolled_now['student_number'])

    current_attendance = pd.merge(attendance, enrolled_now,
                                  on="student_number", how="inner")

    group_keys = ['student_number', 'lastfirst', 'grade_level', 'school_abbrev']
    grouped = current_attendance.groupby(group_keys, as_index=False)
    ytd = grouped.aggregate({
        'enrolled': 'sum',
        'present': 'sum',
        'absent': 'sum'
    })

    ytd = ytd.assign(ada=ytd.present / ytd.enrolled * 100)

    by_school_grade = ytd.groupby(['school_abbrev', 'grade_level'],
                                  group_keys=False)
    ytd['ada_rank'] = by_school_grade['ada'].rank("dense", ascending=True)

    ytd = ytd.sort_values(['school_abbrev', 'grade_level', 'ada_rank'])

    write_path = "{0}/attend_student_ytd.feather".format(save_path)
    feather.write_dataframe(ytd, write_path)
    return write_path
def saveFeatherFullData(output_npy, label_npy, u_dates, lakename, trial, PGRNN=True, includeTest=False):
    # convert predictions/labels numpy arrays into pandas dataframes and save as feather
    # @output_npy = prediction matrix (depths x days)
    # @label_npy = label matrix (depths x days)
    # @u_dates = numpy array of unique dates (np.datetime64 type)
    # @lakename = string nhd id (str)
    # @trial = trial index, used in the output file name (str or int)
    # @PGRNN = True labels the output file 'PGRNN', False labels it 'RNN'
    # @includeTest = not used by this function
    #
    # Each frame has a 'date' column followed by one 'depth_<i>' column per depth.
    # The label file is only written when it does not exist yet; the output
    # file is always written.
    trial = str(trial)
    predictions = pd.DataFrame({'date': u_dates})
    labels = pd.DataFrame({'date': u_dates})

    for depth in range(output_npy.shape[0]):
        column = 'depth_' + str(depth)
        predictions = pd.concat(
            [predictions, pd.DataFrame({column: output_npy[depth, :]})], axis=1)
        labels = pd.concat(
            [labels, pd.DataFrame({column: label_npy[depth, :]})], axis=1)

    model_tag = 'PGRNN' if PGRNN else 'RNN'
    o_path = '../../scripts/manylakes2/outputs_full/' + lakename + model_tag + '_output_' + 'trial' + trial + '.feather'
    l_path = '../../scripts/manylakes2/labels/' + lakename + '_label.feather'

    # save em
    if not os.path.isfile(l_path):
        feather.write_dataframe(labels, l_path)
    feather.write_dataframe(predictions, o_path)
def csv_to_feather(csv, featherFileName, featherOutLoc=None):
    r'''
    Convert a .csv file into a .feather file.

    Inputs:
        csv - (string) csv file; Location of .csv file + fileName; INCLUDE .csv
        featherFileName - (string) name to save feather file as; INCLUDE .feather
        featherOutLoc - (string) folder to save feather file to

    Output:
        Returns None; creates .feather file at specified location,
        defaults to current working directory

    Input example:
        r'H:\Data\HF data\heart failure data.csv'
    '''
    # The docstring is a raw string because the example path contains
    # backslashes that are not valid escape sequences.
    import os

    dataframe = pd.read_csv(csv)
    if featherOutLoc is None:
        target = featherFileName
    else:
        # os.path.join works whether or not the folder ends with a separator.
        target = os.path.join(featherOutLoc, featherFileName)
    feather.write_dataframe(dataframe, target)
def main():
    """Convert the CSV file named by the first command-line argument to feather.

    The output file sits next to the input, with the extension replaced by
    ``.feather``. A failed write is reported but does not abort the program.
    """
    # get args:
    args = sys.argv
    in_file = args[1]
    checkExists(in_file)

    print("Reading in file: ", in_file)
    df = pandas.read_csv(in_file)
    print("Successfully read in file: ", in_file)

    # write feather:
    out_file_name = os.path.splitext(in_file)[0] + ".feather"
    print("Writing file: ", out_file_name)
    try:
        feather.write_dataframe(df, out_file_name)
    except Exception as exc:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # and no longer hides why the write failed.
        print("File not written.")
        sys.stderr.write("Reason: {}\n".format(exc))
    else:
        print("Successfully wrote file: ", out_file_name)
def _check_pandas_roundtrip(self, df, expected=None, path=None, columns=None, null_counts=None):
    """Write ``df`` to a feather file, read it back and compare.

    ``path`` defaults to a fresh ``random_path()``; the path used is recorded
    in ``self.test_files``. ``expected`` defaults to ``df`` and
    ``null_counts`` defaults to zero for every expected column.
    """
    target = random_path() if path is None else path
    self.test_files.append(target)

    feather.write_dataframe(df, target)
    if not os.path.exists(target):
        raise Exception('file not written')

    roundtripped = feather.read_dataframe(target, columns)
    reference = df if expected is None else expected
    assert_frame_equal(roundtripped, reference)

    if null_counts is None:
        wanted_nulls = np.zeros(len(reference.columns))
    else:
        wanted_nulls = null_counts
    actual_nulls = self._get_null_counts(target, columns)
    np.testing.assert_array_equal(actual_nulls, wanted_nulls)
def save_data_to_R(outpath, header, row, data):
    """Write ``data`` as a feather file that R can read.

    Parameters
    ----------
    outpath : str
        Destination path; ``.feather`` is appended unless the path already
        ends with it.
    header : str or sequence
        Column labels, either a comma-separated string or a sequence.
    row : sequence
        Row labels, used as the dataframe index.
    data : array-like with a ``dtype`` attribute, or None
        Values to write. Nothing is written when ``data`` is None.

    Raises
    ------
    RuntimeError
        If the ``feather`` package cannot be imported.
    """
    if data is None:
        return
    # Test the suffix rather than a substring, so a path that merely contains
    # '.feather' (for example in a directory name) still gets the extension.
    if not outpath.endswith('.feather'):
        outpath += '.feather'

    import pandas as pd
    try:
        import feather
    except ImportError:
        raise RuntimeError(
            "Cannot export to R, require python package 'feather-format'")

    row = np.array(row)
    if isinstance(header, string_types):
        header = header.split(',')
    header = np.array(header)
    df = pd.DataFrame(data=data, index=row, columns=header, dtype=data.dtype)
    feather.write_dataframe(df, outpath)
def averageTrialsToFinalOutput(lakename, realization, best_hid, best_norm='NA', trials=2, PGRNN=True):
    """Compile the two trial outputs of an experiment into merged feather files.

    @lakename = string nhd id
    @realization = realization index, used in the output directory name (str or int)
    @best_hid = number of best hidden units for experiment
    @best_norm (optional) = if l1 norm is used in hyperparameter optimization this is the value
    @trials (not implemented yet) = number of trials per experiment setup; trials 0 and 1 are always read
    @PGRNN = True if PGRNN, False if RNN

    Two files are written:
      * ``..._BESTmerged.feather``  - an unmodified copy of trial 0
      * ``..._BESTmerged2.feather`` - element-wise average of trial 0 and trial 1
        over every column after the first, plus the 'date' column of trial 0
    """
    realization = str(realization)
    pg = 'PGRNN' if PGRNN else 'RNN'

    # Every path shares the same stem; only the suffix differs.
    stem = ('../../scripts/manylakes/outputs' + realization + '/' + lakename + pg
            + '_output_' + 'nhid' + str(best_hid) + '_norm' + str(best_norm))
    o_path1 = stem + '_trial0.feather'
    o_path2 = stem + '_trial1.feather'
    merge_path = stem + '_BESTmerged.feather'
    merge_path2 = stem + '_BESTmerged2.feather'

    obs1 = pd.read_feather(o_path1)
    obs2 = pd.read_feather(o_path2)

    # First merged file: plain copy of trial 0.
    feather.write_dataframe(obs1.copy(), merge_path)

    # Average only the value columns (everything after the first column).
    # Slicing with iloc before taking .values keeps the arithmetic numeric
    # instead of going through an object array that includes the dates.
    averaged = (obs1.iloc[:, 1:].values + obs2.iloc[:, 1:].values) / 2
    obs_merged2 = pd.DataFrame(averaged, index=obs1.index, columns=obs1.columns[1:])
    # The dates come from trial 0; the original referenced an undefined `df` here.
    obs_merged2['date'] = pd.to_datetime(obs1['date'])
    # Restore the column order of the trial files.
    obs_merged2 = obs_merged2[list(obs1.columns)]
    feather.write_dataframe(obs_merged2, merge_path2)
def save_to_r_dataset(df, path, save_as_csv=False):
    """Save a pandas dataframe in a format readable from R.

    Parameters
    ----------
    df : dataframe
        Pandas dataframe.
    path : str
        Path to save.
    save_as_csv : bool
        When true the frame is written as CSV without its index;
        otherwise it is written as a feather file.

    Returns
    -------
    None
    """
    if not save_as_csv:
        feather.write_dataframe(df, path)
        return None

    df.to_csv(path, index=False)
    return None
def parse_files(cols_to_keep=None):
    """Parse every 'cdx-' index file and save the combined records as feather.

    Parameters
    ----------
    cols_to_keep : list of str, optional
        Columns passed to ``_parse_index_file`` for each file. Defaults to
        ``['ts', 'url', 'languages']``.

    The combined frame is written to ``../data/raw/cc_urls_<yearmonth>``.
    """
    # None sentinel instead of a mutable default list shared across calls.
    if cols_to_keep is None:
        cols_to_keep = ['ts', 'url', 'languages']

    # List the folder once rather than again for every progress message.
    file_names = os.listdir(index_folder)
    n_files = len(file_names)

    frames = []
    i = 1
    for file in file_names:
        if 'cdx-' in file:
            if i % 10 == 0 or i == 1:
                sys.stdout.write('\rParsing file {} out of {}'.format(i, n_files))
                sys.stdout.flush()
            file_path = os.path.join(index_folder, file)
            frames.append(_parse_index_file(file_path, cols_to_keep))
            i += 1

    # One concat at the end avoids re-copying the accumulated frame per file.
    # pd.concat rejects an empty list, so keep the empty-frame result.
    master_df = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    print('Parsing complete! {} total records extracted.'.format(len(master_df)))

    logging.debug('Saving DataFrame...')
    data_path = '../data/raw/'
    master_file = 'cc_urls_' + yearmonth
    feather.write_dataframe(master_df, os.path.join(data_path, master_file))
def main(input, output):
    """Concatenate the feather files listed in ``input`` and write them to ``output``.

    Log messages go to the console and to ``<output>.log``, which is
    overwritten on each run.
    """
    log_handlers = [
        StreamHandler(),
        FileHandler('{}.log'.format(output), mode='w'),
    ]
    basicConfig(level=INFO, handlers=log_handlers)

    # Concatenate away.
    info('Concatenating data')
    frames = (feather.read_dataframe(source) for source in input)
    concatenated = pd.concat(frames)

    # Write the data.
    info('Writing data')
    feather.write_dataframe(concatenated, output)
def write_player_id_file():
    """
    Writes the player id file to disk in feather format

    This file maps player IDs to names, positions, handedness, teams, and
    jersey numbers. Using IDs is a way to avoid having to correct the
    numerous spelling inconsistencies in the data.

    The table always comes from ``get_player_id_file()``. The previous
    version first tried to use ``PLAYER_IDS``, but the name was assigned
    inside the function and was therefore local, so that attempt always
    raised ``UnboundLocalError`` and fell through to the same call. The
    dead try/except is removed without changing what gets written.
    NOTE(review): if the intent was to persist a module-level ``PLAYER_IDS``
    cache, that needs an explicit ``global`` declaration; confirm with the
    callers before changing it.
    """
    import feather

    player_ids = get_player_id_file()
    player_ids.sort_values(by="ID", inplace=True)

    # Normalise column dtypes before writing.
    player_ids['#'] = player_ids['#'].astype(int)
    for column in ('ID', 'Name', 'Pos', 'Team', 'Hand'):
        player_ids[column] = player_ids[column].astype(str)

    player_ids = player_ids.drop_duplicates()
    feather.write_dataframe(player_ids, PLAYER_ID_FILE)
def _R_repr(obj): if isinstance(obj, bool): return 'TRUE' if obj else 'FALSE' elif isinstance(obj, (int, float, str)): return repr(obj) elif isinstance(obj, Sequence): if len(obj) == 0: return 'c()' # if the data is of homogeneous type, let us use c() # otherwise use list() # this can be confusion but list can be difficult to handle if homogeneous_type(obj): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' else: return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' elif obj is None: return 'NULL' elif isinstance(obj, dict): return 'list(' + ','.join('{}={}'.format(x, _R_repr(y)) for x,y in obj.items()) + ')' elif isinstance(obj, set): return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' else: import numpy import pandas if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\ numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \ numpy.float64)): return repr(obj) elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass numpy matrix as R matrix' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_) return 'data.matrix(read_feather("{}"))'.format(feather_tmp_) elif isinstance(obj, numpy.ndarray): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' elif isinstance(obj, pandas.DataFrame): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name try: data = obj.copy() feather.write_dataframe(data, feather_tmp_) except: # if data cannot be written, we try to manipulate data # frame to have 
consistent types and try again for c in data.columns: if not homogeneous_type(data[c]): data[c] = [str(x) for x in data[c]] feather.write_dataframe(data, feather_tmp_) return 'read_feather("{}")'.format(feather_tmp_) else: return repr('Unsupported datatype {}'.format(short_repr(obj)))
try: sim_data = [x.get(timeout = 300) for x in res] except mp.TimeoutError: print("Simulation failed: timeout.") del(pool) sys.exit(1) print("Pool.map finished for risk_model %d" % risk_model) for sid in sim_data: iter_data = iter_data.append(sid, ignore_index = True) print("Finishing job for risk model %d" % risk_model) return iter_data eu_results = do_simulations(100, risk_model = 0) eu_iter_data = eu_results # eu_iter_data.to_csv('eu_iter_data_p.csv') feather.write_dataframe(eu_iter_data, 'eu_iter_data_p.feather') regret_results = do_simulations(100, risk_model = 1) regret_iter_data = regret_results # regret_iter_data.to_csv('regret_iter_data_p.csv') feather.write_dataframe(regret_iter_data, 'regret_iter_data_p.feather') prospect_results = do_simulations(100, risk_model = 2) prospect_iter_data = prospect_results #prospect_iter_data.to_csv('prospect_iter_data_p.csv') feather.write_dataframe(prospect_iter_data, 'prospect_iter_data_p.feather') mixed_results = do_simulations(100, risk_model = 3) mixed_iter_data = mixed_results # mixed_iter_data.to_csv('mixed_iter_data_p.csv') feather.write_dataframe(mixed_iter_data, 'mixed_iter_data_p.feather')
try: bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%m/%d/%Y %H:%M:%S") except ValueError: try: bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%m/%d/%Y %H:%M") except ValueError: bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%Y-%m-%d %H:%M:%S") # Set the startdate and stopdate - minutes and seconds reset to 0 (in the following format - 2016-03-01 06:00:00) # This has been done so that we could aggregate departures and arrivals per hour to identify bike usage bikedata['dtstartdatehour'] = bikedata.dtstarttime.apply(lambda x:x.replace(minute=0,second=0)) bikedata['dtstopdatehour'] = bikedata.dtstoptime.apply(lambda x:x.replace(minute=0,second=0)) bikedata['startdatehour'] = bikedata.dtstartdatehour.apply(lambda x:x.strftime('%Y-%m-%d %H:%M:%S')) # bikedata['startdate'] = bikedata.dtstarttime.dt.date.apply(lambda x:x.strftime('%Y-%m-%d')) # datetime.datetime columns are not supported by Feather # So, deleting the column after pre-processing. # TODO: Can we do without converting the dates in the first place? bikedata.drop('dtstarttime', axis=1, inplace=True) bikedata.drop('dtstoptime', axis=1, inplace=True) print fname, ' - ', len(bikedata) listFiles.append(bikedata) df = pd.concat(listFiles) print len(df) feather.write_dataframe(df, dirname + feather_output_filename) print "Successfully written into feather format"
import datetime
import sys

import feather
import pandas

# Usage: <script> TYPE DATE
# Converts '<TYPE>.<DATE>.csv' in the current directory to '<TYPE>.<DATE>.feather'.
if __name__ == '__main__':
    _, type_, date = sys.argv
    csv_file = '{}.{}.csv'.format(type_, date)
    # DataFrame.from_csv no longer exists in current pandas; read_csv with
    # index_col=0 and parse_dates=True reproduces its defaults (first column
    # as index, dates parsed).
    df = pandas.read_csv(csv_file, index_col=0, parse_dates=True).reset_index()
    feather_file = '{}.{}.feather'.format(type_, date)
    feather.write_dataframe(df, feather_file)
    print('{} {} {} done'.format(datetime.datetime.now(), type_, date))
import pandas
import feather

# Read flights data and select flights to O'Hare
flights = pandas.read_csv("tests/testthat/data/flights.csv")
to_ohare = flights['dest'] == "ORD"
flights = flights[to_ohare]

# Select carrier and delay columns and drop rows with missing values
kept_columns = ['carrier', 'dep_delay', 'arr_delay']
flights = flights[kept_columns].dropna()
flights.head(10)

# Write to feather file for reading from R
feather.write_dataframe(flights, "tests/testthat/data/flights.feather")
co is, on average, 2.4 days after ci """ df.date_time = pd.to_datetime(df.date_time, errors='coerce') df.srch_ci = pd.to_datetime(df.srch_ci, errors='coerce') df.srch_co = pd.to_datetime(df.srch_co, errors='coerce') df.srch_ci = df.srch_ci.fillna(df.date_time + timedelta(days=35)) df.srch_co = df.srch_co.fillna(df.srch_ci + timedelta(days=2)) return df print(78*'=') print("Reading train...") df_train = feather.read_dataframe('../data/train_only_booked.feather') print("Creating Features for Train...") df_train_features = create_features(df_train, train=True) print("Writing Feather...") feather.write_dataframe(df_train_features, '../data/train_only_booked_features.feather') gc.collect() print(78*'=') print("Reading holdout...") df_hold = feather.read_dataframe('../data/holdout.feather') print("Munging Holdout") df_hold_feat = create_features(df_hold) print("Writing Feather...") feather.write_dataframe(df_hold_feat, '../data/holdout_features.feather') gc.collect() print(78*'=') print("Reading LB Test...") df_lb = feather.read_dataframe('../data/test.feather') print("Creating Features for Public_LB")
import feather
import pandas
import numpy as np
import datetime

# One three-element column per supported dtype; the integer columns share
# the same values and differ only in width/signedness.
int_values = [1, 1, 0]

Abool = np.array([True, True, False])
Aint8 = np.array(int_values, dtype=np.int8)
Aint16 = np.array(int_values, dtype=np.int16)
Aint32 = np.array(int_values, dtype=np.int32)
Aint64 = np.array(int_values, dtype=np.int64)
Auint8 = np.array(int_values, dtype=np.uint8)
Auint16 = np.array(int_values, dtype=np.uint16)
Auint32 = np.array(int_values, dtype=np.uint32)
Auint64 = np.array(int_values, dtype=np.uint64)
Afloat32 = np.array([1.0, np.nan, 0.0], dtype=np.float32)
Afloat64 = np.array([np.inf, 1.0, 0.0], dtype=np.float64)
Autf8 = ["hey", "there", "sailor"]
Abinary = [b"hey", b"there", b"sailor"]
# Adate = [datetime.datetime(2016,1,1).date(),datetime.datetime(2016,1,2).date(),datetime.datetime(2016,1,3).date()]
Adatetime = [datetime.datetime(2016, 1, day) for day in (1, 2, 3)]
# don't conform to Arrow!
Acat = pandas.Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False)
# don't conform to Arrow!
Acatordered = pandas.Categorical(["d", "e", "f"], categories=["d", "e", "f"], ordered=True)

columns = {
    "Abool": Abool,
    "Aint8": Aint8,
    "Aint16": Aint16,
    "Aint32": Aint32,
    "Aint64": Aint64,
    "Auint8": Auint8,
    "Auint16": Auint16,
    "Auint32": Auint32,
    "Auint64": Auint64,
    "Afloat32": Afloat32,
    "Afloat64": Afloat64,
    "Autf8": Autf8,
    "Abinary": Abinary,
    "Adatetime": Adatetime,
    "Acat": Acat,
    "Acatordered": Acatordered,
}
df = pandas.DataFrame(columns)
feather.write_dataframe(df, "/home/test.feather")
import os
import uuid

import feather
import numpy as np
import pandas as pd

try:
    from pandas.testing import assert_frame_equal
except ImportError:
    # Older pandas only exposes the helper under pandas.util.testing.
    from pandas.util.testing import assert_frame_equal

# Round-trip a large frame (4M rows x 100 identical float columns) through
# feather and check that it comes back unchanged.
nrows = 4000000
ncols = 100

data = np.random.randn(nrows)
df = pd.DataFrame({'c{0}'.format(i): data for i in range(ncols)})


def guid():
    """Return a random hex string used to make the temp file name unique."""
    return uuid.uuid4().hex


path = 'test_{0}.feather'.format(guid())

try:
    feather.write_dataframe(df, path)
    df2 = feather.read_dataframe(path)
    assert_frame_equal(df, df2)
finally:
    # Best-effort cleanup; the file may not exist if the write failed.
    try:
        os.remove(path)
    except OSError:
        pass
def convert_feather(df, output_filename):
    """Write ``df`` to ``output_filename`` using ``feather.write_dataframe``."""
    feather.write_dataframe(df, output_filename)
shop_id = df_shop.shop_id.iloc[0] df_shop_test['shop_id'] = [shop_id] * 14 df_shop_test['day'] = pd.date_range('2016-11-01', '2016-11-14') days_max = df_shop.days_from_beginning.max() df_shop_test['days_from_beginning'] = np.arange(days_max + 1, days_max + 15) df_shop_test['pays_count'] = np.nan df_shop_test['week_id'] = [0] * 7 + [-1] * 7 df_shop_test['biweek_id'] = 0 return df_shop_test dfs = [] for i in tqdm(shops): df_shop = df_pays[df_pays.shop_id == i] dfs.append(df_shop) df_shop_test = generate_test_df(df_shop) dfs.append(df_shop_test) df_pays = pd.concat(dfs).reset_index(drop=1) df_pays['dow'] = df_pays.day.dt.dayofweek.astype('uint8') df_pays['is_weekend'] = df_pays.dow.isin([5, 6]) feather.write_dataframe(df_pays, 'data/df_pays_na_test.feather')
def nnd_hotdeck_using_feather(receiver = None, donor = None, matching_variables = None, z_variables = None):
    """
    Not working

    Write ``receiver`` and ``donor`` as feather files in the ``tmp``
    sub-directory of ``config_files_directory`` and print an R script that
    would match them with ``StatMatch::NND.hotdeck``. The script is only
    printed, never executed.

    Parameters
    ----------
    receiver, donor : dataframes (both required)
    matching_variables : str or sequence of str (required)
        Name(s) of the matching variable(s); several names become an R
        character vector ``c("a", "b")``.
    z_variables : inserted verbatim between double quotes as ``z.vars``
    """
    import feather
    assert receiver is not None and donor is not None
    assert matching_variables is not None

    temporary_directory_path = os.path.join(config_files_directory, 'tmp')
    assert os.path.exists(temporary_directory_path)
    receiver_path = os.path.join(temporary_directory_path, 'receiver.feather')
    donor_path = os.path.join(temporary_directory_path, 'donor.feather')
    feather.write_dataframe(receiver, receiver_path)
    feather.write_dataframe(donor, donor_path)

    if isinstance(matching_variables, str):
        match_vars = '"{}"'.format(matching_variables)
    elif len(matching_variables) == 1:
        match_vars = '"{}"'.format(matching_variables[0])
    else:
        # Several variables: build an R character vector instead of the
        # former "todo" placeholder.
        match_vars = 'c({})'.format(
            ', '.join('"{}"'.format(variable) for variable in matching_variables)
        )

    # The paths are quoted so they are valid R string literals, with forward
    # slashes so that Windows separators are not read as escapes.
    r_script = """
rm(list=ls())
gc()
devtools::install_github("wesm/feather/R")
library(feather)
library(StatMatch)

receiver <- read_feather("{receiver_path}")
donor <- read_feather("{donor_path}")
summary(receiver)
summary(donor)

# variables
receiver = as.data.frame(receiver)
donor = as.data.frame(donor)
gc()
match_vars = {match_vars}
# don_class = c("sexe")
out.nnd <- NND.hotdeck(
  data.rec = receiver, data.don = donor, match.vars = match_vars
  )

# out.nndsummary(out.nnd$mtc.ids)
# head(out.nnd$mtc.ids, 10)
# head(receiver, 10)

fused.nnd.m <- create.fused(
    data.rec = receiver, data.don = donor,
    mtc.ids = out.nnd$mtc.ids,
    z.vars = "{z_variables}"
    )
summary(fused.nnd.m)
""".format(
        receiver_path = receiver_path.replace(os.sep, '/'),
        donor_path = donor_path.replace(os.sep, '/'),
        match_vars = match_vars,
        z_variables = z_variables,
        )
    print(r_script)
import os

import pandas as pd
import feather

os.getcwd()
# Project root: the working directory with the "feature_eng" part removed.
fp = os.getcwd().replace("feature_eng", "")

train = feather.read_dataframe(fp + "data/train.feather")

# Every identifying column becomes part of the row index; each value of
# "Semana" becomes its own column of "Demanda_uni_equil".
index_columns = [
    "Cliente_ID",
    "Producto_ID",
    "Agencia_ID",
    "Canal_ID",
    "Ruta_SAK",
    "Venta_uni_hoy",
    "Venta_hoy",
    "Dev_uni_proxima",
    "Dev_proxima",
]
df = pd.pivot_table(
    train,
    values="Demanda_uni_equil",
    index=index_columns,
    columns="Semana",
).reset_index()

feather.write_dataframe(df, fp + "data/week_split_train.feather")