def read_monthly(infile, dd):
    """
    Parameters
    ----------
    infile: str
    dd: DataFrame

    Returns
    -------
    df: DataFrame
    """
    widths = dd.length.tolist()
    logger.info("Reading monthly {}".format(infile))
    if isinstance(infile, StringIO):
        df = pd.read_fwf(infile, widths=widths, names=dd.id.values)
    elif infile.endswith('.zip'):
        archive = zipfile.ZipFile(infile)
        filename = archive.namelist()[0]
        parent_dir = str(Path(infile).parent)
        colspec = pd.concat([dd.start - 1, dd.end], axis=1)
        colspec = colspec.values.tolist()
        with ensure_cleanup_zip(archive, filename, parent_dir):
            df = pd.read_fwf(os.path.join(parent_dir, filename),
                             colspecs=colspec, names=dd.id.values)
    # TODO: Fix stripping of 0s
    return df
def load_names():
    # Last names
    last_names = pd.read_fwf('Names/dist.all.last', header=None,
                             widths=[14, 7, 7, 7])
    first_male = pd.read_fwf('Names/dist.male.first', header=None,
                             widths=[14, 7, 7, 7])
    first_female = pd.read_fwf('Names/dist.female.first', header=None,
                               widths=[14, 7, 7, 7])
    subset_last_name = last_names[last_names[2] <= 70]
    subset_first_male = first_male[first_male[2] <= 80]
    subset_first_female = first_female[first_female[2] <= 80]
    names = pd.concat([subset_last_name[0], subset_first_male[0],
                       subset_first_female[0]], ignore_index=True)
    return names
def parsing(filename, T_set='False', form='wtreg'):
    """filename must be in path/TxYYMMDD format. Returns a pandas DataFrame.

    The log file will be run through a checker to make sure that there are
    no bad lines. Thresholds will be converted from hex format to dBm.
    If T_set is set to 'True', only the thresholds, latitudes, longitudes
    and altitudes will be returned with the station identifier as a suffix;
    otherwise the entire log file will be parsed.
    """
    check_log(filename, form)
    if os.path.isfile(filename):
        dateparse = lambda x: pd.datetime.strptime(x, '%m/%d/%y %H:%M:%S')
        namelist = ['ID', 'Datetime', 'Version', 'Threshold', '?',
                    'Triggers', 'GPS_Number', 'GPS_Mode', 'Temp',
                    'Lat', 'Lon', 'Alt']
        if form == 'wtreg':
            widths_list = [1, 18, 4, 5, 12, 7, 3, 3, 3, 9, 10, 8]
            collist = [1, 3, 9, 10, 11]
        if form == 'old7':
            widths_list = [1, 18, 4, 5, 7, 7, 3, 3, 3, 9, 10, 8]
            collist = [1, 3, 9, 10, 11]
        if form == 'newok':
            widths_list = [1, 18, 4, 5, 12, 7, 3, 3, 4, 4, 9, 10, 8]
            collist = [1, 3, 10, 11, 12]
            namelist = ['ID', 'Datetime', 'Version', 'Threshold', '???',
                        'Triggers', 'GPS_Number', 'GPS_Mode', 'Temp', 'Batt',
                        'Lat', 'Lon', 'Alt']
        if T_set == 'True':
            df = pd.read_fwf(filename, widths=widths_list, names=namelist,
                             usecols=collist, parse_dates=[0],
                             date_parser=dateparse, na_values='\n')
            station = filename[-7]
            df['Threshold'] = df['Threshold'].apply(hex2page)
            df = df.rename(columns={'Threshold': 'Threshold_%s' % station,
                                    'Lat': 'Lat_%s' % station,
                                    'Lon': 'Lon_%s' % station,
                                    'Alt': 'Alt_%s' % station})
        else:
            df = pd.read_fwf(filename, widths=widths_list, names=namelist,
                             parse_dates=[1], date_parser=dateparse,
                             na_values='\n')
            df['Threshold'] = df['Threshold'].apply(hex2page)
        df = df.set_index('Datetime')
        return df
def run():
    session = get_session()

    class AveragePriceData(Model):
        # ['footnote_codes', 'item_name', 'end_year', 'area_name',
        #  'begin_year', 'area_code', 'item_code', 'begin_period',
        #  'end_period']
        series_id = Text(primary_key=True)
        footnote_codes = Text()
        item_name = Text()
        begin_year = Integer()
        end_year = Integer()
        area_name = Text()
        area_code = Text()
        item_code = Text()
        begin_period = Text()
        end_period = Text()

    sync_table(AveragePriceData)

    # read the master data (ap.series)
    series = pandas.read_csv(path.format("ap/ap.series"), sep='\t',
                             skiprows=1,
                             names=["series_id", "area_code", "item_code",
                                    "footnote_codes", "begin_year",
                                    "begin_period", "end_year",
                                    "end_period"])
    # not sure why i'm getting extra spaces, cleaning that up
    series["item_code"] = series["item_code"].map(lambda x: str(x).strip())
    series.set_index("series_id", inplace=True)

    # load areas
    area = pandas.read_fwf(path.format("ap/ap.area"), widths=[4, 100],
                           names=["area_code", "area_name"], skiprows=2)
    area.set_index("area_code", inplace=True)

    footnotes = pandas.read_fwf(path.format("ap/ap.footnote"), skiprows=1,
                                widths=[1, 100],
                                names=["footnote_code", "footnote_text"])
    footnotes.set_index("footnote_code", inplace=True)

    items = pandas.read_fwf(path.format("ap/ap.item"), widths=[7, 100],
                            skiprows=2, names=["item_code", "item_name"])
    items.set_index("item_code", inplace=True)

    result = series.join(area, on="area_code").join(items, on="item_code")
    print result.head(5)

    for k, v in result.iterrows():
        vals = v.to_dict()
        vals["series_id"] = k
        try:
            AveragePriceData.create(**vals)
        except Exception as e:
            print e
            print vals
            break
        print "Created {}".format(k)
def _parse_ghcnd_stnmeta(fpath_stns, fpath_stninv, elems, start_end=None,
                         bbox=None):
    stns = pd.read_fwf(fpath_stns,
                       colspecs=[(0, 11), (12, 20), (21, 30), (31, 37),
                                 (38, 40), (41, 71), (2, 3), (76, 79)],
                       header=None,
                       names=['station_id', 'latitude', 'longitude',
                              'elevation', 'state', 'station_name',
                              'network_code', 'hcn_crn_flag'])
    stns['station_name'] = stns.station_name.apply(unicode, errors='ignore')
    stns['provider'] = 'GHCND'
    stns['sub_provider'] = (stns.network_code.
                            apply(lambda x: _NETWORK_CODE_TO_SUBPROVIDER[x]))

    if bbox is not None:
        mask_bnds = ((stns.latitude >= bbox.south) &
                     (stns.latitude <= bbox.north) &
                     (stns.longitude >= bbox.west) &
                     (stns.longitude <= bbox.east))
        stns = stns[mask_bnds].copy()

    stn_inv = pd.read_fwf(fpath_stninv,
                          colspecs=[(0, 11), (31, 35), (36, 40), (41, 45)],
                          header=None,
                          names=['station_id', 'elem', 'start_year',
                                 'end_year'])
    stn_inv['elem'] = stn_inv.elem.str.lower()
    stn_inv = stn_inv[stn_inv.elem.isin(elems)]
    stn_inv = stn_inv.groupby('station_id').agg({'end_year': np.max,
                                                 'start_year': np.min})
    stn_inv = stn_inv.reset_index()

    stns = pd.merge(stns, stn_inv, on='station_id')

    if start_end is not None:
        start_date, end_date = start_end
        mask_por = (((start_date.year <= stns.start_year) &
                     (stns.start_year <= end_date.year)) |
                    ((stns.start_year <= start_date.year) &
                     (start_date.year <= stns.end_year)))
        stns = stns[mask_por].copy()

    stns = stns.reset_index(drop=True)
    stns = stns.set_index('station_id', drop=False)

    return stns
def _basis_set_order(chunk, mapr, sets):
    # Gaussian only prints the atom center
    # and label once for all basis functions
    first = len(chunk[0]) - len(chunk[0].lstrip(' ')) + 1
    df = pd.read_fwf(six.StringIO('\n'.join(chunk)),
                     widths=[first, 4, 3, 2, 4], header=None)
    df[1].fillna(method='ffill', inplace=True)
    df[1] = df[1].astype(np.int64) - 1
    df[2].fillna(method='ffill', inplace=True)
    df.rename(columns={1: 'center', 3: 'N', 4: 'ang'}, inplace=True)
    df['N'] = df['N'].astype(np.int64) - 1
    if 'XX' in df['ang'].values:
        df[['L', 'l', 'm', 'n']] = df['ang'].map(
            {'S': [0, 0, 0, 0],
             'XX': [2, 2, 0, 0], 'XY': [2, 1, 1, 0], 'XZ': [2, 1, 0, 1],
             'YY': [2, 0, 2, 0], 'YZ': [2, 0, 1, 1], 'ZZ': [2, 0, 0, 2],
             'PX': [1, 1, 0, 0], 'PY': [1, 0, 1, 0], 'PZ': [1, 0, 0, 1],
             }).apply(tuple).apply(pd.Series)
    else:
        df['L'] = df['ang'].str[:1].str.lower().map(lmap).astype(np.int64)
        df['ml'] = df['ang'].str[1:]
        df['ml'].update(df['ml'].map({'': 0, 'X': 1, 'Y': -1, 'Z': 0}))
        df['ml'] = df['ml'].astype(np.int64)
    cnts = {key: -1 for key in range(10)}
    pcen, pl, pn, shfns = 0, 0, 1, []
    for cen, n, l, seht in zip(df['center'], df['N'], df['L'],
                               df['center'].map(sets)):
        if not pcen == cen:
            cnts = {key: -1 for key in range(10)}
        if (pl != l) or (pn != n) or (pcen != cen):
            cnts[l] += 1
        shfns.append(mapr[(seht, l)][cnts[l]])
        pcen, pl, pn = cen, l, n
    df['shell'] = shfns
    df.drop([0, 2, 'N', 'ang'], axis=1, inplace=True)
    df['frame'] = 0
    return df
def parse_classification_report(classification_report):
    """Parse a sklearn classification report into a DataFrame."""
    return pd.read_fwf(
        StringIO(classification_report),
        index_col=0,
        colspecs=[(0, 12), (12, 22), (22, 32), (32, 42), (42, 52)]
    ).dropna()
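# A minimal usage sketch (an addition for illustration; not part of the
# original function). sklearn.metrics.classification_report emits a
# fixed-width text table, which is why read_fwf with hard-coded colspecs
# can parse it; whether the (0, 12), (12, 22), ... colspecs line up
# exactly depends on the sklearn version and on the label lengths.
from io import StringIO

import pandas as pd
from sklearn.metrics import classification_report

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
report = classification_report(y_true, y_pred)
df = parse_classification_report(report)
print(df)  # one row per class (plus avg rows): precision, recall, f1-score, support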
def ReadFemResp1995():
    """Reads respondent data from NSFG Cycle 5.

    returns: DataFrame
    """
    dat_file = '1995FemRespData.dat.gz'
    names = ['a_doi', 'timesmar', 'mardat01', 'bdaycenm', 'post_wt']
    colspecs = [(12359, 12363),
                (3538, 3540),
                (11758, 11762),
                (13, 16),
                (12349, 12359)]
    df = pandas.read_fwf(dat_file,
                         compression='gzip',
                         colspecs=colspecs,
                         names=names)

    df['cmmarrhx'] = df.mardat01
    df['cmbirth'] = df.bdaycenm
    df['cmintvw'] = df.a_doi
    df['finalwgt'] = df.post_wt

    df.timesmar.replace([98, 99], np.nan, inplace=True)
    df['evrmarry'] = (df.timesmar > 0).astype(int)

    CleanData(df)
    return df
def load_dataframe(fobj, compression='gzip'):
    """Given an open file for `hip_main.dat.gz`, return a parsed dataframe.

    If your copy of ``hip_main.dat`` has already been unzipped, pass the
    optional argument ``compression=None``.
    """
    try:
        from pandas import read_fwf
    except ImportError:
        raise ImportError(PANDAS_MESSAGE)

    names, colspecs = zip(
        ('hip', (2, 14)),
        ('magnitude', (41, 46)),
        ('ra_degrees', (51, 63)),
        ('dec_degrees', (64, 76)),
        ('parallax_mas', (79, 86)),  # TODO: have Star load this
        ('ra_mas_per_year', (87, 95)),
        ('dec_mas_per_year', (96, 104)),
    )
    df = read_fwf(fobj, colspecs, names=names, compression=compression)
    df = df.assign(
        ra_hours=df['ra_degrees'] / 15.0,
        epoch_year=1991.25,
    )
    return df.set_index('hip')
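# A usage sketch (an addition for illustration; not part of the original).
# The function expects an already-open file object; pandas performs the
# gzip decompression itself when compression='gzip'.
with open('hip_main.dat.gz', 'rb') as fobj:
    stars = load_dataframe(fobj)
# HIP 87937 is Barnard's Star
print(stars.loc[87937, ['magnitude', 'ra_degrees', 'dec_degrees']])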
def classify_1(k, training_set):
    a, test_files = commands.getstatusoutput('ls ./testDigits')
    test_files = test_files.split('\n')
    count = np.empty(len(test_files))
    for i, test_file in enumerate(test_files):
        test = pd.read_fwf('./testDigits/' + test_file,
                           header=None, widths=[1] * 32)
        result_list = []
        for key in training_set.keys():
            diff = (training_set[key] == test).sum().sum()
            result_list.append([diff, key[0]])
        result_frame = DataFrame(result_list,
                                 columns=['distance', 'number'])
        indice = result_frame['distance'].argsort()[-k:]
        result = result_frame.ix[indice, 'number'].value_counts().index[0]
        count[i] = result == test_file[0]
        print test_file[0], result, count[i]
    p = sum(count) / float(count.size)
    print count.size, 'test case', sum(count), 'right', p
    return p

#train_file_digit()
#print classify_1(30, train_file_digit())
def process_sst(txtfile):
    """
    Read mhl_sst_data from a fixed-width text file (in current directory,
    unless otherwise specified) and convert it to a netCDF file.
    If successful, the name of the saved file is returned.
    """
    extension = txtfile[-4:]
    if extension == '.TXT':
        format = 'old'
        colspecs = colspecs_old
        skip_rows = 7
        names = names_old
    elif extension == '.txt':
        format = 'new'
        colspecs = colspecs_new
        skip_rows = 9
        names = names_new
    else:
        raise ValueError('Unexpected file extension: %s' % extension)

    # extract array of formatted data from the fixed-width file
    data = pandas.read_fwf(txtfile, colspecs=colspecs, names=names,
                           header=None, skiprows=skip_rows)

    # convert time from string to decimal time, IMOS compliant
    (dtime, time) = convert_to_utc(data['Date_Time'], format)

    # use source filename to get deployment number.
    # extract spatial info from summary file
    site_code_short = os.path.basename(txtfile)[:3]
    spatial_data = get_spatial_data(txtfile, site_code_short, format)

    # generate NetCDF
    create_mhl_sst_ncfile(txtfile, site_code_short, data, time, dtime,
                          spatial_data)
def parse_momatrix(self):
    """
    Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

    Note:
        Must supply 'print "final vectors" "final vectors analysis"'
        for momatrix
    """
    key0 = "Final MO vectors"
    key1 = "center of mass"
    found = self.find(key0, key1)
    if found[key0]:
        start = found[key0][0][0] + 6
        end = found[key1][0][0] - 1
        c = pd.read_fwf(StringIO("\n".join(self[start:end])),
                        widths=(6, 12, 12, 12, 12, 12, 12),
                        names=list(range(7)))
        self.c = c
        idx = c[c[0].isnull()].index.values
        c = c[~c.index.isin(idx)]
        del c[0]
        nbas = len(self.basis_set_order)
        n = c.shape[0] // nbas
        coefs = []
        # The for loop below is like numpy.array_split(df, n); using
        # numpy.array_split with dataframes seemed to have strange results
        # where splits had wrong sizes?
        for i in range(n):
            coefs.append(c.iloc[i * nbas:(i + 1) * nbas, :]
                          .astype(float).dropna(axis=1).values.ravel("F"))
        c = np.concatenate(coefs)
        del coefs
        orbital, chi = _square_indices(len(self.basis_set_order))
        self.momatrix = MOMatrix.from_dict({'coef': c, 'chi': chi,
                                            'orbital': orbital, 'frame': 0})
def stations():
    stations = read_fwf("tests/ghcnd-stations.txt", header=None,
                        colspecs=[(0, 11), (12, 20), (21, 30), (31, 37),
                                  (38, 40), (41, 71), (72, 75), (76, 79),
                                  (80, 85)],
                        names=["station_id", "lat", "long", "elevation",
                               "state", "name", "gsn_flag", "hcn_flag",
                               "wmo_id"])
    stations['wmo_id'] = stations['wmo_id'].astype(str)
    return stations
def read(filename):
    """
    reads a fwf file into a pandas data frame

    Usage: data = read(filename)

    Note: 'ETA', 'DwellTime', 'Activity' are floats as they contain NaNs
    and cannot be automatically converted to integers.
    """
    # creating the widths of each column
    widths = [12, 21, 12, 12, 12, 9, 12, 12, 21, 255, 255, 25, 25, 12, 12,
              12, 12, 17, 51, 51, 12]
    # creating colspecs (containing starting and ending point of a column)
    cumsum = [sum(widths[:i + 1]) for i in range(len(widths))]
    # excluding the commas
    cumsum0 = [0] + cumsum[:-1]
    cumsum_short = [item - 1 for item in cumsum]
    colspecs = list(zip(cumsum0, cumsum_short))
    # reading the file
    try:
        data = pd.read_fwf(filename, colspecs=colspecs, skiprows=[1])
    except IOError:
        print('This file does not exist. Please, check the filename or '
              'the directory.')
        sys.exit()
    # specifying explicitly the data types
    data = data.astype('object')
    numeric_list = ['LON', 'LAT', 'ETA', 'DwellTime']
    data[numeric_list] = data[numeric_list].astype('float')
    return data
def parse_monthly(fp, cache=True, nrows=None):
    with open('interesting.json') as f:
        col_map = json.load(f)
    month = zip_fp_to_month(fp)
    hdf_key = month_to_hdf_key(month)
    dd_key = month_to_dd_key(month)
    span = dd_key_to_span(dd_key)
    cm = col_map[span]

    if cache:
        with pd.HDFStore('data/store.h5') as store:
            if hdf_key in store:
                return None

    data_fp = extract(fp)
    try:
        dd = month_to_dd(month)
        subset = dd.loc[dd.field.isin(cm.keys())]
        names = list(subset.field)
        colspecs = (subset[['start', 'end']]
                    .assign(start=lambda df: df.start - 1)
                    .values.tolist())
        # TALK: run through with subset first (nrows=100)
        df = (pd.read_fwf(data_fp, colspecs=colspecs, names=names,
                          usecols=names, nrows=nrows)
              .rename(columns=cm)
              .sort(['mis']))
        key = month_to_hdf_key(month)
        with pd.HDFStore(STOREPATH) as store:
            store.append(key, df, format='table', data_columns=True)
    except:
        os.remove(data_fp)
        raise
    os.remove(data_fp)
def refreshGetij(self, dt):
    Logger.info("refreshGetij: ")
    # http://getij.rws.nl/export.cfm?format=txt&from=02-04-2016&to=08-04-2016&uitvoer=1&interval=10&lunarphase=yes&location=SCHEVNGN&Timezone=MET_DST&refPlane=NAP&graphRefPlane=NAP
    getijPars = {
        'format': 'txt',
        'from': strftime("%d-%m-%Y"),
        'to': strftime("%d-%m-%Y"),
        'uitvoer': 1,
        'interval': 10,
        'lunarphase': 'yes',
        'location': 'SCHEVNGN',
        'Timezone': 'MET_DST',
        'refPlane': 'NAP',
        'graphRefPlane': 'NAP'
    }
    r = requests.get("http://getij.rws.nl/export.cfm", params=getijPars)
    Logger.info(r.text)
    dataFile = StringIO(r.text)
    df = pd.read_fwf(dataFile, widths=[17, 6, 3],
                     names=['timestamp', 'height', 'unit'], skiprows=14)
    self.loggerDataframe(df.head())
    dataFile.close()
    for index, row in df.iterrows():
        try:
            self.ws.addObservation(
                pd.Timestamp(row['timestamp']).tz_localize('MET'),
                'getij', float(row['height']), 'getij.rws.nl')
        except (ValueError, AttributeError):
            Logger.info("oops")
def get_janus_epimetheus_resonances():
    w = [len(" Janus1"), len(" reson"), len(" Resonance radius R")]

    def get_janos_epi_order(reso):
        a, b = reso.split(":")
        return int(a) - int(b)

    fname = pr.resource_filename("pyciss",
                                 "data/ring_janus_epimetheus_resonances.txt")
    with open(fname) as f:
        jan_epi_resonances = pd.read_fwf(
            f, skiprows=15, header=0, widths=w, skipfooter=1
        )
    # replace column names
    jan_epi_resonances.columns = ["moon", "reson", "radius"]
    # calculate order from resonance name
    jan_epi_resonances["order"] = jan_epi_resonances.reson.map(
        get_janos_epi_order)

    def func(x):
        "Remove space from resonance string"
        return ":".join(i.strip() for i in x.split(":"))

    jan_epi_resonances.reson = jan_epi_resonances.reson.map(func)
    # calculate name for axes display
    jan_epi_resonances["name"] = (
        jan_epi_resonances.moon + " " + jan_epi_resonances.reson
    )
    return jan_epi_resonances
def get_flare_catalog_fromfile(data_path):
    """Read GOES H-alpha and X-ray flare information from file.

    usage: [ha, xray] = get_flare_catalog; ha is a dict
    ha['location'][300] prints the 300th location
    keys are ha.keys() -- station_num, group_num, initial_time, final_time,
    peak_time, optical_importance, optical_brightness, xray_class,
    xray_size, NOAA_AR
    """
    # define data file location
    # ha_file = data_path + "/ha.txt"
    xray_file = data_path + "/xray.txt"
    print("Getting from file, years are only those downloaded")
    print("Reading X-ray flares from: ", xray_file)

    # code to read in xray data
    names = ["data code", "station code", "year", "month", "day",
             "init_ind", "init_time", "final_ind", "final_time",
             "peak_ind", "peak_time", "location", "optical", "something",
             "xray_class", "xray_size", "station", "blank", "NOAA_AR",
             "etc"]
    widths = [2, 3, 2, 2, 2, 2, 4, 1, 4, 1, 4, 7, 3, 22, 1, 3, 8, 8, 6, 24]
    # parse_dates translates dates to datetime
    xray_df = pd.read_fwf(xray_file, widths=widths, header=None,
                          names=names, parse_dates=[[2, 3, 4]])
    xray_df["location"] = [x if str(x)[0] == "N" or str(x)[0] == "S"
                           else None for x in xray_df["location"]]
    xray_df["init_date"] = create_datetime(xray_df["year_month_day"],
                                           xray_df["init_time"])
    xray_df["peak_date"] = create_datetime(xray_df["year_month_day"],
                                           xray_df["peak_time"])
    xray_df["final_date"] = create_datetime(xray_df["year_month_day"],
                                            xray_df["final_time"])
    xray_df = xray_df[["init_date", "peak_date", "final_date", "location",
                       "xray_class", "xray_size", "NOAA_AR"]]
    # remove all the lines that don't have either a valid peak date or a
    # valid init date (both have to be lacking)
    # print("len before", len(xray_df))
    # xray_df = xray_df[np.isfinite(xray_df['peak_date']) |
    #                   np.isfinite(xray_df['init_date'])]
    # print("len after", len(xray_df))
    ha_df = "not yet implemented"
    return (xray_df, ha_df)
def download_station_data(station_id):
    # station_id = 'USC00167344'
    station_url = ("ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all/"
                   + station_id + ".dly")
    # station id (11), year (4), month (2), element type (4), then 31
    # daily groups of a 5-character value plus three 1-character flags;
    # year and month are combined into one index via parse_dates below
    widths = [11, 4, 2, 4] + [5, 1, 1, 1] * 31
    # starts at 1 to exclude the station ID from the data;
    # 1, 2, 3 are the year, month and measurement type, then every 4th
    # column is a daily value (the flag columns are skipped)
    cols = [1, 2, 3] + list(range(4, 125, 4))
    # cols = [1, 2, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55,
    #         59, 63, 67, 71, 75, 79, 83, 87, 91, 95, 99, 103, 107, 111,
    #         115, 119, 123]
    names = ['year', 'month', 'data_type'] + list(range(1, 32))
    df = pd.read_fwf(station_url, widths=widths, usecols=cols, header=None,
                     names=names, parse_dates=[['year', 'month']],
                     index_col=0, na_values='-9999')
    return df
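# A usage sketch (an addition for illustration; not in the original).
# Fetches the .dly file for one GHCN-Daily station straight from the NOAA
# FTP server, reusing the station id from the comment above.
df = download_station_data('USC00167344')
tmax = df[df['data_type'] == 'TMAX']  # daily maxima, in tenths of deg C
print(tmax.head())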
def main(fileNameS, year):
    pathS = os.path.join(config.rawDataPathS, 'unemployment_statistics')
    conversionD = {'state_fips_code': (lambda x: str(x)),
                   'county_fips_code': (lambda x: str(x)),
                   'year': (lambda x: int(x)),
                   'labor_force':
                       (lambda x: int(str(x).translate(None, ','))),
                   'employed':
                       (lambda x: int(str(x).translate(None, ','))),
                   'unemployed_level':
                       (lambda x: int(str(x).translate(None, ',')))}
    tableDF = pd.read_fwf(os.path.join(pathS, fileNameS),
                          converters=conversionD,
                          names=['laus_code', 'state_fips_code',
                                 'county_fips_code', 'county_and_state',
                                 'year', 'labor_force', 'employed',
                                 'unemployed_level', 'unemployed_rate'],
                          skipfooter=3,
                          skiprows=6,
                          widths=[18, 7, 6, 50, 4, 14, 13, 11, 9])
    tableDF.loc[:, 'fips_code'] = (tableDF.state_fips_code +
                                   tableDF.county_fips_code).astype(int)

    # Select relevant columns and set index
    finalDF = tableDF.loc[:, ['fips_code', 'unemployed_rate']]
    finalDF.columns = ['FIPS', 'URate' + str(year)]
    finalDF = finalDF.sort(columns='FIPS')
    finalDF = finalDF.set_index('FIPS')
    return finalDF
def handle_data(self, data):
    '''
    Function to parse data between \<pre\> tags
    @param data: Input data
    '''
    if self.in_pre_tag == True and self.read_data == True:
        self.data_dict[self.label] = pd.read_fwf(
            StringIO(data), widths=[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7],
            header=0, skiprows=[0, 1, 3, 4])
        split_data = data.split('\n')
        headings = split_data[2].split()
        units = split_data[3].split()
        self.metadata_dict[self.label] = OrderedDict()
        self.metadata_dict[self.label]['units'] = [
            (heading, unit) for heading, unit in zip(headings, units)]
        self.read_data = False
        self.tmp = data
    elif self.in_pre_tag == True and self.read_data == False:
        station_metadata_dict = OrderedDict()
        for line in data.splitlines():
            if line != '':
                metadata = line.split(':')
                station_metadata_dict[metadata[0].strip()] = \
                    metadata[1].strip()
        self.metadata_dict[self.label]['metadata'] = station_metadata_dict
        self.read_data = True
    elif self.read_data == True and self.in_header == True:
        self.label = data.strip()
def ps(self, args=None, options='', all=True, verbose=True,
       as_frame='auto', raise_on_error=True):
    if args is None:
        args = ''
    if all:
        args += 'A'
    if verbose:
        args += 'f'
    if len(args) > 0 and args[0] != '-':
        args = '-' + args
    results = self.wait(('ps %s %s' % (args, options)).strip(),
                        raise_on_error=raise_on_error)
    if as_frame == 'auto':
        as_frame = has_pandas
    if as_frame:
        if not has_pandas:
            raise ImportError("Unable to import pandas")
        df = pd.read_fwf(StringIO(results))
        cmd_loc = df.columns.get_loc('CMD')
        if cmd_loc < len(df.columns):
            # stitch any columns split off the end of CMD back together
            col = df.icol(cmd_loc).fillna('')
            for i in range(cmd_loc + 1, len(df.columns)):
                col = col + df.icol(i).fillna('')
            df['CMD'] = col
        return df
    return results
def index_to_df(indexpath, label, convert_times=True):
    """The main reader function for PDS Indexfiles.

    In conjunction with an IndexLabel object that figures out the column
    widths, this reader should work for all PDS TAB files.

    Parameters
    ----------
    indexpath : str or pathlib.Path
        The path to the index TAB file.
    label : pdstools.IndexLabel object
        Label object that has both the column names and the column widths
        as attributes 'colnames' and 'colspecs'.
    convert_times : bool
        Switch to control if to convert columns with "TIME" in name
        (unless COUNT is as well in name) to datetime.
    """
    indexpath = Path(indexpath)
    df = pd.read_fwf(
        indexpath, header=None, names=label.colnames,
        colspecs=label.colspecs
    )
    if convert_times:
        for column in [i for i in df.columns
                       if "TIME" in i and "COUNT" not in i]:
            if column == "LOCAL_TIME":
                # don't convert local time
                continue
            print(f"Converting times for column {column}.")
            try:
                df[column] = pd.to_datetime(df[column])
            except ValueError:
                df[column] = pd.to_datetime(
                    df[column], format=utils.nasa_dt_format_with_ms
                )
        print("Done.")
    return df
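# A usage sketch (an addition for illustration; the file names are
# hypothetical and the IndexLabel constructor is assumed to take the label
# path -- check pdstools for the actual signature).
label = IndexLabel('cumindex.lbl')  # supplies colnames/colspecs from the label
df = index_to_df('cumindex.tab', label)
print(df.dtypes)  # TIME columns should now be datetime64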
def read_raso_fwf(pid, path):
    """Read a fixed-width format radiosonde file.

    These are the ones containing the climatology that was also used by
    Giovanni Massaro and Daniel Meyer.
    """
    colspecs = [(8, 17), (17, 26), (26, 36), (43, 49)]
    names = ["p", "z", "T", "Td"]

    def errfloat(x):
        return None if "/" in x else float(x)

    file = filename(path)
    valid = (dt.datetime
             .strptime(file, "%Y%m%d_%H%M.reduced.txt")
             .replace(tzinfo=dt.timezone.utc)
             .timestamp()
             )
    df = pd.read_fwf(path, colspecs=colspecs, names=names,
                     converters={n: errfloat for n in names}, skiprows=1)
    df["T"] = 273.15 + df["T"]
    df["Td"] = 273.15 + df["Td"]
    ps = pd.Series(np.repeat(pid, len(df)), name="profile")
    # Calculate specific humidity and cloud water content
    qvap = fml.qvap(df["p"], df["Td"])
    qliq = fml.qliq(df["z"], df["p"], df["T"], df["Td"])
    data = pd.concat([ps, df, qvap, qliq], axis=1).as_matrix().tolist()
    cloudy = 1 if (qliq > 0).any() else 0
    return pid, data, valid, cloudy, file
def parse_form_13f(fname):
    # note 'OTHER MANAGERS' field is absent from the fixed width column
    # definitions, presumably because no data is present in these
    # documents for this field.
    # The assertion will pick up if we get an unexpected no. of columns
    conformed_period_of_report, filed_as_of_date, no_of_columns = \
        parse_form_13f_head(fname)
    assert no_of_columns == len(Form13F.column_names), \
        'Not enough column_names/columns'

    # construct pandas.DataFrame from fixed width file, use the 0th column
    # as the label for the row (security)
    data_frame = pandas.read_fwf(
        fname,
        # ranges of the fixed width columns
        colspecs=[(0, 29), (29, 45), (45, 57), (57, 64), (64, 73),
                  (73, 79), (79, 92), (92, 112), (112, 123), (123, 132)],
        skiprows=[0, 1, 2, 3, 4],
        index_col=0,
        names=Form13F.column_names,
    )
    # drop label if all values in row are Na
    data_frame = data_frame.dropna(how='all')
    # for each column apply a function on each of the rows which strips
    # strings of tabs, newlines and spaces
    data_frame = data_frame.apply(lambda x: x.apply(
        lambda x: x.strip() if isinstance(x, str) else x)
    )
    return Form13F(fname, conformed_period_of_report, filed_as_of_date,
                   data_frame)
def readHypo71Sum(sumfile):
    """
    Read a summary file from hypoinverse in the y2k compliant hypo71 format

    Parameters
    ----------
    sumfile : str
        Path to the sum file

    Returns
    -------
    DataFrame populated with sumfile info
    """
    fw = [(0, 20), (19, 22), (22, 23), (23, 28), (28, 32), (32, 33),
          (33, 38), (38, 45), (52, 55), (55, 59), (59, 64), (64, 69),
          (69, 74), (74, 79)]
    cols = ['ds', 'latd', 'latc', 'latm', 'lond', 'lonc', 'lonm', 'depth',
            'numphase', 'azgap', 'stadist', 'rms', 'horerr', 'vererr']
    toDrop = ['ds', 'latd', 'latc', 'latm', 'lond', 'lonc', 'lonm']
    df = pd.read_fwf(sumfile, colspecs=fw, names=cols)
    latmul = [1 if x else -1 for x in df['latc'].isnull()]
    df['lat'] = np.multiply((df['latd'] + df['latm'] / 60.), latmul)
    lonmul = [1 if x else -1 for x in df['lonc'].isnull()]
    df['lon'] = np.multiply((df['lond'] + df['lonm'] / 60.), lonmul)
    utcs = [obspy.UTCDateTime(x.replace(' ', '')) for x in df.ds]
    irisws = [x.format_iris_web_service().replace(':', '-') for x in utcs]
    times = [x.timestamp for x in utcs]
    names = [x.split('.')[0] for x in irisws]
    df['times'] = times
    df['names'] = names
    df.drop(toDrop, axis=1, inplace=True)
    return df
def _read_stns(self):
    if self.download_updates and not self._download_run:
        self.download_local()

    stns = pd.read_fwf(
        os.path.join(self.path_ushcn_data, "ushcn-v2.5-stations.txt"),
        colspecs=[(0, 11), (12, 20), (21, 30), (31, 37), (38, 40),
                  (41, 71)],
        header=None,
        names=["station_id", "latitude", "longitude", "elevation", "state",
               "station_name"],
    )
    stns["station_name"] = stns.station_name.apply(unicode,
                                                   errors="ignore")
    stns["provider"] = "USHCN"
    stns["sub_provider"] = ""

    if self.bbox is not None:
        mask_bnds = (
            (stns.latitude >= self.bbox.south) &
            (stns.latitude <= self.bbox.north) &
            (stns.longitude >= self.bbox.west) &
            (stns.longitude <= self.bbox.east)
        )
        stns = stns[mask_bnds].copy()

    stns = stns.set_index("station_id", drop=False)
    return stns
def main():
    mpl.rcParams['font.size'] = 11
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.usetex'] = 'True'
    pgf_preamble = (r'\usepackage{/home/luismi/doc/2016/' +
                    r'spaceapps/latex_stylesheet}')
    mpl.rcParams['text.latex.preamble'] = pgf_preamble

    data = []
    for i in range(1, 6):
        data.append(pd.read_fwf('data/rocket/{}0000.csv'.format(i)))

    height = []
    p_c = []
    isp = []
    for d in data:
        height += (h(d.p.values[2:] * 1E5) / 1E3).tolist()
        isp += d.isp.values[2:].tolist()
    for i in range(1, 6):
        p_c += len(data[0].values[2:]) * [i * 10]

    triang = mpl.tri.Triangulation(p_c, height)
    fig, ax = new_ax()
    c1 = ax.tricontourf(triang, isp, np.arange(2900, 3151, 25),
                        cmap=mpl.cm.viridis)
    c2 = ax.tricontour(triang, isp, np.arange(2900, 3151, 50),
                       colors='k', linestyles='--')
    cb = plotty.colorbar(c1, pad=0.15, fraction=0.1)
    cb.set_label(r'$Isp$ [\si{\metre\per\second}]')
    ax.clabel(c2, fmt=r'%.0f \si{\metre\per\second}')
    ax.set_xlabel(r'$p_c$ [\si{\mega\pascal}]')
    ax.set_ylabel(r'$h$ [\si{\kilo\metre}]')
    plotty.candy(ax, ncol=2, cb=True, pad=0.15, fraction=0.1)
    ax.xaxis.set_ticks([10, 20, 30, 40])
    ax.yaxis.set_ticks([-4., -3., -2., -1., 0.])
    fig.savefig('img/isp.pdf')
def run_pandas():
    """
    Load records into pandas data frame.

    * PyPy: OK
    * Source: https://github.com/pydata/pandas
    * Docs: amazing
    * Independent: no
    * Small: no
    * Can specify column data types: yes
    * Can read in chunks: yes
    * Can skip columns: yes
    * Can stream: yes but it won't be a DataFrame
    * Return type: DataFrame
    * Memory usage: about 60Mb
    * Timing: around 0.5 sec
    """
    zp = pd.read_fwf(
        'data/ZIP.DAT',
        widths=[5, 2, 28, 1, 5, 7, 8, 3, 6, 1, 1, 4, 4, 3],
        names=['zip_code', 'state_code', 'city_name', 'type',
               'county_fips', 'lat', 'lon', 'area_code', 'fin_code',
               'last_line', 'facility', 'msa_code', 'pmsa_code', 'filler'],
        usecols=[0, 1, 2, 4, 5, 6, 7, 11, 12],
        converters={'zip_code': str, 'county_fips': str, 'area_code': str,
                    'msa_code': str, 'pmsa_code': str},
        header=None,
        skiprows=2
    )
    print 'Records:', len(zp)
def read_sica_file(self, file_path):
    arquivo = pd.read_fwf(file_path)
    print(arquivo)
core_file_extpath = glob.glob(path + "/*.xls")
core_fileext = [os.path.split(h)[1] for h in core_file_extpath]
core_file = [os.path.splitext(os.path.basename(j))[0]
             for j in core_fileext]
if len(core_file) != 0:
    core_file_s = core_file[0]
else:
    core_file_s = input("Excel file not found - please manually type "
                        "sample name:")

# Output excel file is created
writer = pd.ExcelWriter(core_file_s + " summary.xlsx")

for file in filenames:
    title = str(file) + " region in " + str(core_file_s)
    sn = pd.read_fwf(file + ".par", delim_whitespace=True)
    # Confirms file read
    print("Collected " + file)
    # calculates percentage integration and confirms totals add up to 100%
    columns = list(sn.columns)
    ints = list(sn[columns[3]])
    int_percents = []
    total_int = sum(ints)
    for integration in ints:
        int_percents.append(round(((integration / total_int) * 100), 4))
    sn["Percent Integration"] = int_percents
    # Total Integration Percentage Sum should be 100 as a sanity check
    sn["Total Integration Percentage Sum"] = sum(int_percents)
# Define data file field widths
field_widths = ([4, 2] + [1] * 12 + [2] + [1] * 2 + [2] + [1] * 2 +
                [2] * 2 + [1] * 2 + [4] * 2 + [2] * 2 + [4, 2] +
                [4] * 2 + [5] + [4] * 4)

# Data file field names
field_names = [
    'd1', 'd2', 'd3', 'worklic', 'd5', 'nwlic', 'd7', 'd8', 'd9', 'cars',
    'd11', 'd12', 'd13', 'sex', 'd15', 'household_position',
    'driving_licences', 'occupation', 'd19', 'd20', 'd21', 'mode', 'd23',
    'd24', 'pt_owalk_h', 'pt_dwalk_h', 'origin_zone', 'destination_zone',
    'pt_tot', 'pt_lines', 'pt_owalk', 'pt_dwalk', 'dist', 'car_time',
    'park_orig', 'park_dest', 'pt_wait'
]

# Read data
data = pd.read_fwf('./grenoble.dat', widths=field_widths, header=None,
                   names=field_names)

# Clean data
# Drop unnecessary columns
data = data.drop(columns=[
    'd1', 'd2', 'd3', 'd5', 'd7', 'd8', 'd9', 'd11', 'd12', 'd13', 'd15',
    'd19', 'd20', 'd21', 'd23', 'd24', 'destination_zone', 'pt_dwalk_h',
    'pt_dwalk', 'park_orig', 'pt_wait'
])

# Set NaN to zero
data = data.applymap(lambda x: 0.0 if np.isnan(x) else x)

# Convert all data to integers
data = data.applymap(lambda x: int(x))
def get_average_payload(comm):
    all_payloads_list = attributes_for_full[(
        attributes_for_full['2'] == comm)]['8'].tolist()
    return sum(all_payloads_list) / len(all_payloads_list)


def get_average_tare_weight(comm):
    all_tare_weight_list = attributes_for_full[(
        attributes_for_full['2'] == comm)]['9'].tolist()
    return sum(all_tare_weight_list) / len(all_tare_weight_list)


# using cost.dat
attributes_for_full = pandas.read_fwf(
    cost_dat_output,
    colspecs=split_width([5, 5, 10, 10, 10, 10, 10, 5, 5, 10]),
    header=None)
# attributes_for_full.columns = [['RR-Code', 'Commod.', 'TrainCost/hr',
#     'Cost/gross-ton-mile', 'terminal-processing-cost/car,fixed',
#     'terminal-cost/car-hr', 'transfer-cost/car', 'car-payload',
#     'car-tare-wt', 'Gross Car Weight', 'Cars per Train',
#     'Gross Train Weight']]
attributes_for_full.columns = [[
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '12'
]]
attributes_for_full = attributes_for_full[['1', '2', '8', '9']]

for i in range(1, no_of_commodity + 1):
    attributes_for_empty = pandas.DataFrame({
        '1': [0],
        '2': [i],
        '8': [get_average_payload(i)],
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

data = pd.read_fwf('brain_body.txt')
brain = data[['Brain']]
body = data[['Body']]

# Create linear regression object
regr = linear_model.LinearRegression()
regr.fit(brain.head(3), body.head(3))
predicted_data = regr.predict(pd.DataFrame({"values": [3.385]}))

challenge_dataframe = pd.read_table('challenge_dataset.txt',
                                    delim_whitespace=False,
                                    names=("testing", ))
challenge_dataframe_xvalues = pd.DataFrame(
    challenge_dataframe.testing.str.split(",").tolist(),
    columns=["xvalue", "yvalue"])[["xvalue"]]
print(challenge_dataframe_xvalues.head())
challenge_dataframe_yvalues = pd.DataFrame(
    challenge_dataframe.testing.str.split(",").tolist(),
    columns=["xvalue", "yvalue"])[["yvalue"]]

print(predicted_data.shape)
print(predicted_data)
print(challenge_dataframe_yvalues.shape)
print(challenge_dataframe_yvalues.head())
-h --help     Show this screen

weather_cleanse takes in a csv file and writes to an out_file that is a
csv. A command should thus be:
"python weather_cleanse.py input_file.csv out_file.csv"
'''
from docopt import docopt
import pandas as pd

ARGS = docopt(__doc__)
print(ARGS)

saved_cols = [1, 2, 3, 4]
station_df = pd.read_fwf('ghcnd-stations.txt')
station_df.columns = [
    "station_id", 'long', 'lat', 'elevation', 'location', 'gsn_flag',
    'hcn_flag', 'wmo_id'
]
station_df2 = station_df[['station_id', 'location']]
NYC_list = []
for row in station_df2.itertuples():
    if ('NEW YORK CNTRL PK TWR' in row.location
            or 'NEW YORK LAGUARDIA AP' in row.location
            or 'NEW YORK JFK INTL AP' in row.location):
        NYC_list.append([row.station_id, row.location])
station_df_result = pd.DataFrame(NYC_list,
                                 columns=['station_id', 'location'])

df = pd.read_csv(ARGS['FILE_IN'])
df.columns = ['station_id', 'date', 'condition', 'value', 'E', 'F', 'G',
              'H']
import pandas as pd
from Potential import *
from Graph import *
from HybridLBPLogVersion import HybridLBP
from EPBPLogVersion import EPBP
import numpy as np
from show_image import show_images
import time

row = 50
col = 50

data = pd.read_fwf('../Data/noisyImage.dat', header=None)
m = data.iloc[0:row, 0:col].values
m = m * 100

# show_images((m,), vmin=-30, vmax=130)

domain = Domain((-30, 130), continuous=True)

evidence = [None] * (col * row)
for i in range(row):
    for j in range(col):
        evidence[i * col + j] = RV(domain, m[i, j])

rvs = []
for _ in range(row * col):
    rvs.append(RV(domain))

fs = []
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

# read data
dataframe = pd.read_fwf('Lesson1/data/brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

# train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

# visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
import os

import pandas as pd
from plotnine import *

# This file reads the FIDE ranking files from 2005 to 2021 and shows the
# proportion of women in chess and the mean rating per sex.
# FIDE files have 2 different formats. Files with each format are in a
# different folder.

# Read files with old format
old_names = os.listdir("data\\old_format")
df_list_number = []  # Stores the number of women
df_list_elo = []  # Stores the mean rating
for name in old_names:
    year = name[3:5]  # Get year from file name
    data = pd.read_fwf(f'data\\old_format\\{name}', delimiter=' ')
    # Split the data in men and women
    data_w = data.loc[(data.Flag == "w") | (data.Flag == "wi")]
    data_m = data.loc[(data.Flag != "w") & (data.Flag != "wi")]
    # Calculate percentage of men and women
    percent_w = data_w.shape[0] / data.shape[0] * 100
    percent_m = 100 - percent_w
    # Calculate the mean rating of men and women
    elo_w = data_w.iloc[:, 3].mean()
    elo_m = data_m.iloc[:, 3].mean()
    # Add info to list as dictionary
    df_list_number.append({"year": "20" + str(year),
                           "percent": percent_w, "Sex": "Women"})
    df_list_number.append({"year": "20" + str(year),
                           "percent": percent_m, "Sex": "Men"})
    df_list_elo.append({"year": "20" + str(year), "elo": elo_w,
                        "Sex": "Women"})
    df_list_elo.append({"year": "20" + str(year), "elo": elo_m,
                        "Sex": "Men"})
from metpy.units import units

###########################################
# Change default to be better for skew-T
plt.rcParams['figure.figsize'] = (9, 9)

###########################################
# Upper air data can be obtained using the siphon package, but for this
# example we will use some of MetPy's sample data.

col_names = ['pressure', 'height', 'temperature', 'dewpoint', 'direction',
             'speed']

df = pd.read_fwf(get_test_data('jan20_sounding.txt', as_file_obj=False),
                 skiprows=5, usecols=[0, 1, 2, 3, 6, 7], names=col_names)

df['u_wind'], df['v_wind'] = mpcalc.wind_components(
    df['speed'], np.deg2rad(df['direction']))

# Drop any rows with all NaN values for T, Td, winds
df = df.dropna(subset=('temperature', 'dewpoint', 'direction', 'speed',
                       'u_wind', 'v_wind'), how='all').reset_index(drop=True)

###########################################
# We will pull the data out of the example dataset into individual
# variables and assign units.

p = df['pressure'].values * units.hPa
T = df['temperature'].values * units.degC
Td = df['dewpoint'].values * units.degC
print("Error: geocode failed on input %s with message %s" % (loc, e)) df = pd.DataFrame(np.array(coordinate).reshape(-1, 3)) df.columns = ["city", "latitude", "longitude"] return df df = coordinate(list_city) # the process of getting latitude and longitude might crash sometimes, so we need to store the data after we get all the # latitude and longitude tfile = open('coordinate.txt', 'a') tfile.write(df.to_string()) tfile.close() # we can use the coordinate directly from the text file data = pd.read_fwf('coordinate.txt') '''creat Map object''' m = folium.Map([35.8781, -100.6298], zoom_start=5) # mark each city as a point for index, row in data.iterrows(): folium.CircleMarker( location=[float(row['latitude']), float(row['longitude'])], radius=3, popup=row['city'], fill_color='#ffe6e6', # divvy color ).add_to(m) # plot heatmap m.add_children(
def read_crn(filename, map_variables=True):
    """Read a NOAA USCRN fixed-width file into a pandas dataframe.

    The CRN network consists of over 100 meteorological stations covering
    the U.S. and is described in [1]_ and [2]_. The primary goal of CRN is
    to provide long-term measurements of temperature, precipitation, and
    soil moisture and temperature. Additionally, global horizontal
    irradiance (GHI) is measured at each site using a photodiode
    pyranometer.

    Parameters
    ----------
    filename: str, path object, or file-like
        filepath or url to read for the fixed-width file.
    map_variables: boolean, default: True
        When true, renames columns of the Dataframe to pvlib variable
        names where applicable. See variable :const:`VARIABLE_MAP`.

    Returns
    -------
    data: Dataframe
        A dataframe with DatetimeIndex and all of the variables in the
        file.

    Notes
    -----
    CRN files contain 5 minute averages labeled by the interval ending
    time. Here, missing data is flagged as NaN, rather than the lowest
    possible integer for a field (e.g. -999 or -99). Air temperature is in
    deg C and wind speed is in m/s at a height of 1.5 m above ground
    level.

    Variables corresponding to standard pvlib variables are by default
    renamed, e.g. `SOLAR_RADIATION` becomes `ghi`. See the
    :const:`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.

    CRN files occasionally have a set of null characters on a line instead
    of valid data. This function drops those lines. Sometimes these null
    characters appear on a line of their own and sometimes they occur on
    the same line as valid data. In the latter case, the valid data will
    not be returned. Users may manually remove the null characters and
    reparse the file if they need that line.

    References
    ----------
    .. [1] U.S. Climate Reference Network
       `https://www.ncdc.noaa.gov/crn/qcdatasets.html
       <https://www.ncdc.noaa.gov/crn/qcdatasets.html>`_

    .. [2] Diamond, H. J. et. al., 2013: U.S. Climate Reference Network
       after one decade of operations: status and assessment. Bull. Amer.
       Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
    """

    # read in data
    # TODO: instead of parsing as strings and then post-processing, switch
    # to pd.read_fwf(..., dtype=dict(zip(HEADERS, DTYPES)),
    # skip_blank_lines=True) when our minimum pandas >= 1.2.0
    # (skip_blank_lines bug for <1.2.0). As a workaround, parse all values
    # as strings, then drop NaN, then cast to the appropriate dtypes, and
    # mask "sentinel" NaN (e.g. -9999.0)
    data = pd.read_fwf(filename, header=None, names=HEADERS, widths=WIDTHS,
                       dtype=str)
    # drop empty (bad) lines
    data = data.dropna(axis=0, how='all')
    # can't set dtypes in read_fwf because int cols can't contain NaN, so
    # do it here instead
    data = data.astype(dict(zip(HEADERS, DTYPES)))
    # finally, replace -999 values with NaN
    data = data.replace(NAN_DICT, value=np.nan)

    # set index
    # UTC_TIME does not have leading 0s, so must zfill(4) to comply
    # with %H%M format
    dts = data[['UTC_DATE', 'UTC_TIME']].astype(str)
    dtindex = pd.to_datetime(dts['UTC_DATE'] + dts['UTC_TIME'].str.zfill(4),
                             format='%Y%m%d%H%M', utc=True)
    data = data.set_index(dtindex)

    if map_variables:
        data = data.rename(columns=VARIABLE_MAP)

    return data
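# A usage sketch (an addition for illustration; the file name is
# hypothetical but follows the NOAA CRN subhourly naming scheme).
df = read_crn('CRNS0101-05-2021-NY_Millbrook_3_W.txt')
print(df['ghi'].describe())  # SOLAR_RADIATION renamed to 'ghi' by VARIABLE_MAP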
if not (args.text or args.data):
    parser.error("No action specified, must be either --text or --data")

# Get router text and split into sections
status_text = get_router_adsl_status_text(IP_ADDR)
sections = extract_sections(status_text)

if args.data:
    # Import pandas here, as it is quite slow (1-2 seconds) to import
    import pandas as pd

    # Get dataframe from port section, fixed width
    port_stringio = StringIO("\n".join(sections['port']))
    port_df = pd.read_fwf(
        port_stringio,
        index_col=0,  # First column is index
        skipinitialspace=True,  # Remove extra whitespace
        delimiter=" :"  # Both space and colon are delimiters
    )

    # Convert to list if only one value given
    if isinstance(args.data, str):
        args.data = [args.data]

    # Return each value
    for n, k in enumerate(args.data):
        if n > 0:
            print(":", end="")  # Add separator
        # Get correct series, upstream or downstream
        direction = "Upstream" if k.endswith("up") else "Downstream"
        series = port_df[direction]
        # Get row
""" Created on Wed Nov 6 17:35:27 2019 @author: Ahmad Aiman Mohd Nazir """ #import the libraries import numpy as np import pandas as pd import matplotlib.pyplot as plt #read txt file f = open("Conductivity(raw).txt", "r") df = pd.read_fwf('Conductivity(raw).txt', sep=" ", header=None) df.columns = ['xk', 'yk'] #plot the raw data plt.scatter(df['xk'], df['yk'], label='raw data') plt.xlabel('T') plt.ylabel('Conductivity') plt.title('Conductivity versus T') plt.legend(loc='upper right') plt.show() #calculate the corresponding values for linear regression sumx = sum(df['xk']) sumx2 = sum((df['xk'])**2) sumy = sum(df['yk']) sumxy = sum(df['xk'] * df['yk']) print('\nSum of \nX: {:.4f}\nY: {:.4f}\nX^2: {:.4f}\nXY: {:.4f}'.format( sumx, sumy, sumx2, sumxy))
def test_value_counts_inferred(self):
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        self.assertEqual(s.nunique(), 4)

        # don't sort, have to sort after the fact as not sorting is
        # platform-dep
        hist = s.value_counts(sort=False)
        hist.sort()
        expected = Series([3, 1, 4, 2], index=list('acbd'))
        expected.sort()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        tm.assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(hist, expected)

        # bins
        self.assertRaises(TypeError,
                          lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a',
                    'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(
            s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
        self.assertEqual(s.nunique(), 3)

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())

        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00Z'])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np.array(['2010-01-01 00:00:00Z',
                             '2009-01-01 00:00:00Z',
                             '2008-09-09 00:00:00Z'],
                            dtype='datetime64[ns]')
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')
        tm.assert_series_equal(result, expected_s)

        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, 'datetime64[ns]')
        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or
                        unique[3].astype('int64') == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 3)
        self.assertEqual(s.nunique(dropna=False), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td)

        result = td.value_counts()
        expected_s = Series([6], index=[86400000000000])
        self.assertEqual(result.index.dtype, 'int64')
        tm.assert_series_equal(result, expected_s)

        # get nanoseconds to compare
        expected = np.array([86400000000000])
        self.assert_numpy_array_equal(td.unique(), expected)
        self.assertEqual(td.nunique(), 1)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2)
        result2 = td2.value_counts()
        self.assertEqual(result2.index.dtype, 'int64')
        tm.assert_series_equal(result2, expected_s)
        self.assert_numpy_array_equal(td.unique(), expected)
        self.assertEqual(td.nunique(), 1)
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

# read data
dataframe = pd.read_fwf('D:/1brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

# train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

# visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
tar_gz_file_name = 'GRGS_anomaly.tar.gz'
# remove the existing extracted files and freshly extract again
# call('rm GSM-* 2> /dev/null', shell=True)
# call('tar xvzf ' + tar_gz_file_name, shell=True)

my_file = open(names_file, 'r')
raw = my_file.read()
my_file.close()

# list of all the file names
names = raw.split()

# Original data is not delimited and should be separated from the
# following column numbers
mywidths = [8, 5, 3, 19, 19, 11, 11, 14, 14]
filename = 'GSM-2_2011113-2011122_0010_GRGS_0080_03v3.anomaly'
data = pd.read_fwf(filename, widths=mywidths, header=None, skiprows=3)
npway = np.genfromtxt(filename, delimiter=mywidths, skip_header=3)
np_data = np.asarray(data)
new = data.to_numpy()
nnew = new[:, 1:7]
print(npway[:, 1:7])
print(nnew)
np.savetxt('testfile.txt', npway[:, 1:7], delimiter=' ',
           fmt='%d %d %1.12e %1.12e %1.4e %1.4e')
print('saved the file : ' + filename)
        write_file.write(r.read())


merge_books("Tigrigna", ti_books)
merge_books("Amharic", am_books)
merge_books("English", en_books)

# Creating a parallel corpus
ti = pd.read_csv('Scrapped/Tigrigna/All.txt', delimiter="\n", header=None)
ti.columns = ["Tigrigna"]
en = pd.read_csv('Scrapped/English/All.txt', delimiter="\n", header=None)
en.columns = ["English"]
data = pd.concat([en, ti], axis=1)
print(data.head())
data.to_csv("en_ti.csv", index=False)

am = pd.read_fwf('Scrapped/Amharic/All.txt', delimiter="\n", header=None)
am.columns = ["Amharic"]
# reset 'data' dataframe
data = []
data = pd.concat([en, am], axis=1)
print(data.head())
data.to_csv("en_am.csv", index=False)
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# importing data set
train_set = pd.read_fwf('kmeans_data.txt', header=None)

# K-means

# Feature transformation (convert to polar coordinates)
def my_transform(x, y):
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    return (r, phi)


def my_inv_transform(r, phi):
    x = r * np.cos(phi)
    y = r * np.sin(phi)
    return (x, y)


# Transform the original data
for i in range(np.shape(train_set)[0]):
    x = train_set[0][i]
    y = train_set[1][i]
    (train_set[0][i], train_set[1][i]) = my_transform(x, y)

# initialize means
mu_1 = train_set[0][0]
mu_2 = train_set[0][1]
tofile_name = 'binary'  # name of the binary file to export
data.tofile(tofile_name)  # export the binary file
fromfile_data = np.fromfile(tofile_name, dtype='float32')  # read it back
print(fromfile_data)  # print the data

####################################################################
# 3. Read data with pandas read_csv, read_fwf and read_table
import pandas as pd  # import the pandas library

csv_data = pd.read_csv('csv_data.csv',
                       names=['col1', 'col2', 'col3', 'col4',
                              'col5'])  # read CSV data
print(csv_data)  # print the data

fwf_data = pd.read_fwf('fwf_data', widths=[5, 5, 5, 5],
                       names=['col1', 'col2', 'col3',
                              'col4'])  # read fixed-width data
print(fwf_data)  # print the data

table_data = pd.read_table('table_data.txt', sep=';',
                           names=['col1', 'col2', 'col3', 'col4',
                                  'col5'])  # read delimited data
print(table_data)  # print the data

####################################################################
# 2.2.2 Get operational data from Excel
import xlrd  # import the library

# open the file
def process_visits(self) -> None:
    col_breaks = [
        (0, 16), (16, 24), (24, 32), (32, 40), (40, 43), (43, 48),
        (48, 50), (50, 52), (52, 57), (57, 62), (62, 64), (64, 73),
        (73, 82), (82, 91), (91, 100), (100, 109), (109, 118), (118, 127),
        (127, 136), (136, 145), (145, 154), (154, 162), (162, 170),
        (170, 178), (178, 186), (186, 194), (194, 202), (202, 203),
        (203, 206), (206, 208), (208, 212), (212, 216), (216, 218),
        (218, 220), (220, 221), (221, 231), (231, 232)
    ]
    col_names = [
        'MemberId', 'ServiceDate', 'AdmissionDate', 'DischargeDate',
        'CoveredDays', 'CPT', 'CptMod1', 'CptMod2', 'HCPCS', 'CPT2',
        'Cpt2Mod', 'PrincipalIcdDiagnosis', 'IcdDiagnosis2',
        'IcdDiagnosis3', 'IcdDiagnosis4', 'IcdDiagnosis5', 'IcdDiagnosis6',
        'IcdDiagnosis7', 'IcdDiagnosis8', 'IcdDiagnosis9',
        'IcdDiagnosis10', 'PrincipalIcdProcedure', 'IcdProcedure2',
        'IcdProcedure3', 'IcdProcedure4', 'IcdProcedure5', 'IcdProcedure6',
        'IcdIdentifier', 'DRG', 'DischargeStatus', 'UbRevenue',
        'UbBillType', 'NumberOfTimes', 'CmsPlaceOfService', 'ClaimStatus',
        'ProviderId', 'SupplementalData'
    ]
    # every column is read as text, except the three date columns that are
    # parsed via parse_dates below
    col_types = {name: np.unicode for name in col_names}
    col_types.update({'ServiceDate': np.object,
                      'AdmissionDate': np.object,
                      'DischargeDate': np.object})

    self.log.info('Reading input file')
    df = pd.read_fwf(self.config.read_value('setup',
                                            'visit.input.filename'),
                     colspecs=col_breaks, names=col_names, dtype=col_types,
                     parse_dates=[1, 2, 3])
    for index, rows in df.iterrows():
        v = {
            'ServiceDate': self.process_date(rows.ServiceDate),
            'AdmissionDate': self.process_date(rows.AdmissionDate),
            'DischargeDate': self.process_date(rows.DischargeDate),
        }
        # every remaining column goes through the same missing-field check
        for name in col_names[4:]:
            v[name] = self.validate_missing_field(getattr(rows, name))
        v['AggregatedCodes'] = self.aggregate_codes(v)
        member_id = rows.MemberId
        if member_id not in self.visits:
            self.visits[member_id] = [v]
        else:
            self.visits[member_id].append(v)
def recalculate_avg_hours(file_paths, age_bins):
    '''
    --------------------------------------------------------------------
    Creates a dataframe of all of the requested months, recalculates
    the working hours variable, and calculates a weighted average
    number of hours worked for each age bin across the entire time
    period.
    --------------------------------------------------------------------
    INPUTS:
    age_bins   = (S,) vector, beginning cutoff ages for each age bin
    file_paths = list, location of file for each requested month

    OTHER FUNCTIONS AND FILES CALLED BY THIS FUNCTION: None

    OBJECTS CREATED WITHIN FUNCTION:
    names          = length 6 tuple, names for each column in data file
    colspecs       = length 6 tuple, tuples of indexes for each column
    list_months_df = list, dataframes for each month of data
    month_df       = dataframe, data read from data file
    df             = dataframe, concatenated dataframe of data from all
                     months
    TotWklyHours   = series, recalculated total weekly hours for each
                     observation
    df_hrs_age     = series, weighted average of weekly hours per age
                     bin

    FILES CREATED BY THIS FUNCTION: None

    RETURNS: df_hrs_age
    --------------------------------------------------------------------
    '''
    names = ('HWHHWGT', 'PRTAGE', 'PRTFAGE', 'PEHRUSL1', 'PEHRUSL2',
             'PEHRFTPT')
    colspecs = ((46, 56), (121, 123), (123, 124), (217, 219),
                (219, 221), (221, 223))
    list_months_df = []
    for filename in file_paths:
        month_df = pd.read_fwf(filename, colspecs=colspecs, header=None,
                               names=names, index_col=False)
        list_months_df.append(month_df)

    # concatenate all dataframes
    df = pd.concat(list_months_df)

    # Drop all observations that:
    # 1) have no hours in either response (PEHRUSL1=-1) and (PEHRUSL2=-1)
    # 2) have [(PEHRUSL1=-1), (PEHRUSL2=-4), and (PEHRFTPT!=1)] or
    #    [(PEHRUSL1=-4), (PEHRUSL2=-1), and (PEHRFTPT!=1)]
    # 3) have age that is top-coded (PRTFAGE=1)
    df = df[((df['PEHRUSL1'] >= 0) | (df['PEHRUSL2'] >= 0) |
             (df['PEHRFTPT'] == 1)) & (df['PRTFAGE'] == 0)]

    # Create empty total weekly hours series that has the index from df
    TotWklyHours = pd.Series(data=np.nan * np.ones(df.shape[0]),
                             index=df.index)

    # Assume that observations that report at least 35 hours of work in
    # the typical week (PEHRFTPT=1) but report either n/a hours (-1) or
    # varying hours (-4) have a supply of 35.0 hours per week
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] == 1)] = 35.0

    # Assume that observations that report at least 35 hours of work in
    # the typical week (PEHRFTPT=1) but report only positive hours in
    # job 1 (PEHRUSL1>=0) and report n/a or varying hours in job 2
    # (PEHRUSL2<0) have a supply of the maximum of PEHRUSL1 and 35.0
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(35.0, df['PEHRUSL1'])

    # Assume that observations that report at least 35 hours of work in
    # the typical week (PEHRFTPT=1) but report n/a or varying hours in
    # job 1 (PEHRUSL1<0) and report only positive hours in job 2
    # (PEHRUSL2>=0) have a supply of the maximum of PEHRUSL2 and 35.0
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(35.0, df['PEHRUSL2'])

    # Observations that report only positive hours in job 1
    # (PEHRUSL1>=0) and report n/a or varying hours in job 2
    # (PEHRUSL2<0) and do not report at least 35 hours of work in the
    # typical week (PEHRFTPT!=1) have hours given by PEHRUSL1
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL1']

    # Observations that report n/a or varying hours in job 1
    # (PEHRUSL1<0) and report only positive hours in job 2 (PEHRUSL2>=0)
    # and do not report at least 35 hours of work in the typical week
    # (PEHRFTPT!=1) have hours given by PEHRUSL2
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL2']

    # Observations that report positive hours in job 1 (PEHRUSL1>=0)
    # and positive hours in job 2 (PEHRUSL2>=0) and report at least 35
    # hours of work in the typical week (PEHRFTPT=1) have hours given by
    # the maximum of PEHRUSL1+PEHRUSL2 and 35.0
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(
                     35.0, df['PEHRUSL1'] + df['PEHRUSL2'])

    # Observations that report positive hours in job 1 (PEHRUSL1>=0)
    # and positive hours in job 2 (PEHRUSL2>=0) and do not report at
    # least 35 hours of work in the typical week (PEHRFTPT!=1) have
    # hours given by PEHRUSL1+PEHRUSL2
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL1'] + df['PEHRUSL2']

    # Add TotWklyHours to DataFrame
    df['TotWklyHours'] = TotWklyHours

    # mark and group according to bin
    if age_bins is not None:
        age_bins = np.append(age_bins, 80)
        age_bins = list(age_bins)
        df['age_bin'] = pd.cut(df['PRTAGE'], age_bins)
        # print('df HWHHWGT=0=', df['HWHHWGT'][df['HWHHWGT'] == 0].shape)
        df_hrs_age = \
            df.groupby('age_bin').apply(lambda x:
                                        np.average(x.TotWklyHours,
                                                   weights=x.HWHHWGT))
    # group according to age
    else:
        df_hrs_age = \
            df.groupby('PRTAGE').apply(lambda x:
                                       np.average(x.TotWklyHours,
                                                  weights=x.HWHHWGT))

    return df_hrs_age
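# A minimal usage sketch for recalculate_avg_hours (not from the
# original source): the CPS monthly extract paths and age-bin cutoffs
# below are hypothetical placeholders.
import numpy as np

cps_files = ['data/cps_2017_01.dat', 'data/cps_2017_02.dat']  # assumed paths
bin_starts = np.array([16, 25, 35, 45, 55, 65])  # beginning cutoff ages
avg_hours_by_bin = recalculate_avg_hours(cps_files, bin_starts)
print(avg_hours_by_bin)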
""" import numpy as np from keras.models import Sequential from keras.layers import Dense from keras.layers import LSTM from keras.layers.convolutional import Conv1D from keras.layers.convolutional import MaxPooling1D from keras.layers.embeddings import Embedding import pandas as pd from keras.preprocessing import text as keras_text, sequence as keras_seq from sklearn.model_selection import train_test_split from keras.layers import LSTM, GRU, Dropout #Preparing training data raw = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtrain_obfuscated.txt', header=None) xtrain_obfuscated = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtrain_obfuscated.txt', header=None) ytrain = pd.read_fwf(r'D:/sap/offline_challenge_to_send/ytrain.txt',header=None) xtrain_obfuscated['label']=ytrain[0] xtrain_obfuscated.rename(columns={0:'text'}, inplace=True) #Reading test file xtest_obfuscated = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtest_obfuscated.txt',header=None) xtest_obfuscated.rename(columns={0:'text'}, inplace=True) #One-hot encoding on training data xtrain_encoded = pd.get_dummies(xtrain_obfuscated, columns=['label']) #df_encoded_copy=df_encoded.copy() #List sentences train
def read_schedule(path):
    return pd.read_fwf(path, widths=[8, 1], names=["lookup", "schedule"],
                       header=None, dtype=str)
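# Minimal usage sketch for read_schedule (not from the source): a
# hypothetical fixed-width record whose first 8 characters are a lookup
# key and the 9th a schedule code, fed in via StringIO for illustration.
from io import StringIO

sample = StringIO("AAAA0001X\nBBBB0002Y\n")
schedule_df = read_schedule(sample)
# first row: lookup='AAAA0001', schedule='X'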
def test_value_counts_datetime64(self, klass):
    # GH 3002, datetime64[ns]
    # don't test names though
    txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                     'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                     'foofoo20080909PIE', 'foofoo20080909GUM'])
    f = StringIO(txt)
    df = pd.read_fwf(f, widths=[6, 8, 3],
                     names=["person_id", "dt", "food"],
                     parse_dates=["dt"])

    s = klass(df['dt'].copy())
    s.name = None
    idx = pd.to_datetime(['2010-01-01 00:00:00', '2008-09-09 00:00:00',
                          '2009-01-01 00:00:00'])
    expected_s = Series([3, 2, 1], index=idx)
    tm.assert_series_equal(s.value_counts(), expected_s)

    expected = np_array_datetime64_compat(['2010-01-01 00:00:00',
                                           '2009-01-01 00:00:00',
                                           '2008-09-09 00:00:00'],
                                          dtype='datetime64[ns]')
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
    else:
        tm.assert_numpy_array_equal(s.unique(), expected)

    assert s.nunique() == 3

    # with NaT
    s = df['dt'].copy()
    s = klass([v for v in s.values] + [pd.NaT])

    result = s.value_counts()
    assert result.index.dtype == 'datetime64[ns]'
    tm.assert_series_equal(result, expected_s)

    result = s.value_counts(dropna=False)
    expected_s[pd.NaT] = 1
    tm.assert_series_equal(result, expected_s)

    unique = s.unique()
    assert unique.dtype == 'datetime64[ns]'

    # numpy_array_equal cannot compare pd.NaT
    if isinstance(s, Index):
        exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
        tm.assert_index_equal(unique, exp_idx)
    else:
        tm.assert_numpy_array_equal(unique[:3], expected)
        assert pd.isna(unique[3])

    assert s.nunique() == 3
    assert s.nunique(dropna=False) == 4

    # timedelta64[ns]
    td = df.dt - df.dt + timedelta(1)
    td = klass(td, name='dt')

    result = td.value_counts()
    expected_s = Series([6], index=[Timedelta('1day')], name='dt')
    tm.assert_series_equal(result, expected_s)

    expected = TimedeltaIndex(['1 days'], name='dt')
    if isinstance(td, Index):
        tm.assert_index_equal(td.unique(), expected)
    else:
        tm.assert_numpy_array_equal(td.unique(), expected.values)

    td2 = timedelta(1) + (df.dt - df.dt)
    td2 = klass(td2, name='dt')
    result2 = td2.value_counts()
    tm.assert_series_equal(result2, expected_s)
import glob
import pandas as pd

# set directory to read files from
source_dir = "C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\gfz-data"
output_file = "C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\challenge1.csv"
file_list = glob.glob(source_dir + '/*.TAB')

# create an empty dataframe and list, then iterate through file_list,
# adding the valid rows from each file to a master dataframe
frame = pd.DataFrame()
list_ = []

# set column specs because pd.read_fwf reads fixed-width files
col_widths = [(0, 6), (8, 10), (11, 13), (14, 16), (17, 19), (20, 23),
              (24, 26), (27, 29), (30, 32), (42, 45)]
for filename_ in file_list:
    df = pd.read_fwf(filename_, colspecs=col_widths, header=None)
    df = df.dropna(axis=0, subset=[4])
    df = df.dropna(axis=0, subset=[8])
    list_.append(df)

'''
# Test on a single file
df = pd.read_fwf('C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\gfz-data\\kp9706.tab',
                 colspecs=col_widths, header=None)
#df = df[(df[8]>0)]
df = df.dropna(axis=0, subset=[4])
df = df.dropna(axis=0, subset=[8])
list_.append(df)
'''

frame = pd.concat(list_)

# provide column names since none are provided in the files
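# A hedged completion of the step above (not from the original source):
# the column names below are hypothetical placeholders matching the ten
# colspecs, not the names the original author used.
frame.columns = ['date', 'kp1', 'kp2', 'kp3', 'kp4', 'kp5', 'kp6',
                 'kp7', 'kp8', 'ap']
frame.to_csv(output_file, index=False)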
columns = [(0, 15), (16, 56), (57, 60), (61, 65), (66, 74), (75, 105),
           (106, 115), (116, 124), (125, 134), (135, 144), (145, 154),
           (155, 163), (164, 173), (174, 183), (184, 193), (194, 202),
           (203, 212), (213, 222), (223, 229), (230, 236), (237, 243),
           (244, 250), (251, 252)]

if not os.path.exists(exceldir):
    os.mkdir(exceldir)

for txtfile in os.listdir(indir):
    if txtfile.endswith(".txt") or txtfile.endswith(".TXT"):
        # back up the original, then strip null bytes and doubled newlines
        copyfile(indir + '/' + txtfile, indir + '/' + txtfile + '.ORI')
        fi = open(indir + '/' + txtfile, 'r')
        data = fi.read()
        fi.close()
        data = data.replace('\x00', '')
        data = data.replace('\x0A\x0A', '\x0A')
        #data = filter(lambda x: not re.match(r'^*$', x), data)
        fo = open(indir + '/' + txtfile, 'w')
        fo.write(data)
        fo.close()

for csvfile in os.listdir(indir):
    if csvfile.endswith(".txt") or csvfile.endswith(".TXT"):
        df = pd.read_fwf(indir + '/' + csvfile, colspecs=columns)
        df = df.fillna(' ')
        excelfile = exceldir + '/' + os.path.basename(csvfile) + '.xlsx'
        df.to_excel(excelfile, index=False)
import pandas as pa
from sklearn import linear_model
import matplotlib.pyplot as plt

dataframe = pa.read_fwf('linear_regression_demo/brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

bodyreg = linear_model.LinearRegression()
bodyreg.fit(x_values, y_values)

print("x_values")
print(x_values)
print("y_values")
print(y_values)

predict = bodyreg.predict(x_values)
print(predict)

plt.scatter(x_values, y_values)
plt.plot(x_values, predict)
plt.show()
from collections import Counter
import operator
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

"""# LOAD EVENTOS"""

txt_event = '/content/drive/My Drive/TFM/03_DATASETS/eventos_2.rpt'
widths = [10, 39, 12, 16, 12, 39, 20, 10, 41, 16, 50, 39, 17, 18, 39,
          41, 41, 41, 41, 41, 30, 39]
dfevent = pd.read_fwf(txt_event, widths=widths, header=1, index_col=None)
rowcl, colcl = dfevent.shape
# drop the trailing summary rows of the .rpt export
dfevent = dfevent[0:(rowcl - 3)]
new_header = ['TipoEvento', 'CodigoEvento', 'FechaEvento',
              'UsuarioEvento', 'HoraEvento', 'ClienteEvento',
              'CodigoPostalEvento', 'PaísEvento',
              'RepresentanteEvento', 'TipoPortesEvento',
              'FormaPagoEvento', 'PlazoPagoEvento',
              'SkuArticuloEvento', 'TipoArticuloEvento',
              'FamiliaArticuloEvento', 'SubfamiliaArticuloEvento',
              'CantidadArticuloEvento', 'AlmacenArticuloEvento',
              'TarifaArticuloEvento', 'DescuentoArticuloEvento',
              'MotivoEvento', 'CosteEvento']
dfevent.columns = new_header
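# A hedged sketch (not from the original source) of what the unused
# Counter/operator imports above suggest: frequency counts over a
# column, e.g. the most common event types sorted in descending order.
conteo = Counter(dfevent['TipoEvento'])
top_eventos = sorted(conteo.items(), key=operator.itemgetter(1),
                     reverse=True)
print(top_eventos[:5])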
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 14 12:31:13 2018

@author: Dipika
"""
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

# read data
dataframe = pd.read_fwf('C:\\Dipika\\Resumes\\Siraj\\week1_0\\brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

# train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

# visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
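# A minimal prediction sketch (not from the original source): estimate
# body weight for a hypothetical brain weight of 150, in the same units
# as the dataset's Brain column.
print(body_reg.predict(pd.DataFrame({'Brain': [150]})))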
def get_fw_csv_data(filename, widths, header=False, remote=False, **kwargs):
    if remote:
        # CSVReader, servermanager, and Table come from the ParaView
        # environment this module runs in
        theory = CSVReader(FileName=[filename])
        theory.HaveHeaders = 0
        theory.MergeConsecutiveDelimiters = 1
        theory.UseStringDelimiter = 0
        theory.DetectNumericColumns = 1
        theory.FieldDelimiterCharacters = ' '
        theory.UpdatePipeline()
        theory_client = servermanager.Fetch(theory)
        table = Table(theory_client)
        data = table.RowData
    else:
        import pandas as pd
        # read_fwf infers columns from widths, so no separator is needed
        if not header:
            data = pd.read_fwf(filename, header=None, widths=widths,
                               **kwargs)
        else:
            data = pd.read_fwf(filename, widths=widths, **kwargs)
    return data
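# Minimal local-mode usage sketch for get_fw_csv_data (not from the
# source): read a small fixed-width file with two 5-character columns
# via the pandas branch. The file name is a hypothetical placeholder.
data = get_fw_csv_data('theory_profile.dat', widths=[5, 5])
print(data.head())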