def get_coords_info(self):
    ds = xr.open_dataset(self.infile)
    for c in ds.coords:
        filename = os.path.join(self.coords_info, c.strip() + '.tabular')
        # avoid shadowing the pandas alias with the coordinate series
        coord_series = ds.coords[c].to_pandas()
        coord_series.index = range(len(coord_series))
        coord_series.to_csv(filename, header=False, sep='\t')
def mc_distribution_to_distribution(mc_distribution, bins=10**3 + 1, to_csv=False, csv_file_name=None):
    count, binEdges = np.histogram(mc_distribution, bins)
    bincenters = .5 * (binEdges[1:] + binEdges[:-1])
    probs = [i / sum(count) for i in count]
    relative_moves = [bincenter / mean(bincenters) for bincenter in bincenters]
    pct_moves = [relative_move - 1 for relative_move in relative_moves]
    distribution_info = {
        'State': np.array(range(len(count))),
        'Prob': probs,
        'Pct_Move': pct_moves,
        'Relative_Move': relative_moves
    }
    if to_csv:
        distribution_df = distribution_info_to_distribution(distribution_info).distribution_df
        distribution_df.to_csv(csv_file_name)
    return distribution_info_to_distribution(distribution_info)
def run(self):
    global funds_df
    while not self.ajax_queue.empty():
        fundcode = self.ajax_queue.get()
        url = '%s%s' % (BASE_URL, fundcode)
        response = self.session.get(url=url, headers=headers)
        fund_json = response.json()
        success_flag = fund_json['success']
        print(success_flag)
        print('=' * 30)
        fund_data = fund_json['data']
        fund_df = pd.DataFrame()
        if fund_data:
            fund_df = self.parse_data(fund_data)
            values = fund_df.loc[0].values
            gLock.acquire()
            # pd.concat([funds_df, fund_df], axis=0, ignore_index=True)
            # Approach 1: write with the csv library
            with open(FILE_PATH, 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(values)
            # Approach 2: write with the pandas library
            fund_df.to_csv(FILE_PATH, mode='a', header=False, index=False)
            gLock.release()
def save_sh():
    # A-share list
    stock_code_list = ts.get_stock_basics().index.tolist()
    stock_list = []
    # traverse every stock code in the list
    for code in stock_code_list:
        # get a single stock's data
        data_temp = ts.get_h_data(code, start='2008-01-01', end='2017-01-01')
        date_list = data_temp.index.tolist()
        print(data_temp)
        # handle the stock's daily data and insert it into the list
        for date in date_list:
            data_sinday = data_temp.loc[date]
            open_price = data_sinday['open']
            high_price = data_sinday['high']
            close_price = data_sinday['close']
            low_price = data_sinday['low']
            price_change = (close_price - open_price) / open_price
            # add data to the list
            stock_list.append({'code': code, 'date': date, 'open': open_price, 'high': high_price,
                               'low': low_price, 'close': close_price, 'change': price_change})
    data_all_sh = pd.DataFrame(stock_list)
    print(data_all_sh)
    # save data to file
    data_all_sh.to_csv(path)
def write_to_csv(df):
    path = r'E:\python\物联网云端数据处理实验-5-2018119161-廖瑞金\output.csv'
    # append without a header if the file already exists, otherwise also write the header row
    write_header = not os.path.exists(path)
    try:
        df.to_csv(path, mode='a', index=False, encoding="gbk", header=write_header)
        # print(df)
        print('Successful writing!')
        time.sleep(3)
    except Exception:
        print('error!')
def get_table_and_cols(TABLE, saveFile):
    # --------------------------------------
    try:
        if TABLE.startswith('$'):
            df = C.export_to_pandas('SELECT * FROM {table_name!q}', {'table_name': TABLE})
            df.to_csv(TABLE + '.csv', na_rep=0)
            pd_rows, pd_cols = df.shape
            log_and_print(f'EXPORTED {pd_rows} rows from table {TABLE}')
        else:
            myFile = C.export_to_file(saveFile, TABLE, export_params={"with_column_names": True})
            stmt = C.last_statement()
            log_and_print("EXPORTED " + str(stmt.rowcount()) + " rows from table " + TABLE
                          + " in " + str(stmt.execution_time) + " sec")
    except Exception as e:
        log_and_print("#######################################")
        log_and_print("ERROR: unable to READ table " + TABLE + ". Aborting with no action taken!")
        log_and_print("#######################################")
        log_and_print(e)
        sys.exit(12)
def create_result(task1_csv_path='results.csv'):
    # -------------------------
    # Argument
    # task1_csv_path: path of the task 1 csv
    # -------------------------
    dtf = pd.read_csv(task1_csv_path)
    field_rank = {
        'SELLER': 1,
        'ADDRESS': 2,
        'TIMESTAMP': 3,
        'TOTAL_COST': 4
    }
    res = []
    for _, row in dtf.iterrows():
        name = row['img_id'].split(".")[0]
        annot_path = os.path.join('result_txt', name + ".txt")
        image_path = os.path.join('upload', name + ".jpg")
        output_dict = get_submit_image(image_path, annot_path)
        field_value, field_name = get_output(output_dict)
        print(field_name)
        print(field_value)
        field = list(zip(field_value, field_name))
        field.sort(key=lambda x: field_rank[x[1]])
        res.append('|||'.join([str(x[0]) for x in field]))
    dtf['anno_texts'] = res
    if not os.path.exists("submit"):
        os.mkdir("submit")
    dtf.to_csv(os.path.join('submit', 'results.csv'))
def generate_challenge_run(model, test_data, run_file=None):
    """Generate the test predictions in the format of the runs for the challenge.
    Optionally write the run to a csv file.

    EXAMPLE:
        with open('experiments/random_forest_model.pkl', 'rb') as f:
            model = pickle.load(f)
        test_data = '../data/test_size1_noclc_scaled_pca.csv'
        run_file = 'runs/random_forest_run.csv'
        run_df = generate_challenge_run(model, test_data, run_file)
        display(run_df)
    """
    # load the test environmental data
    test_df = pd.read_csv(test_data, sep=';', header='infer', quotechar='"')
    # the resulting run dataframe
    run_df = pd.DataFrame(
        columns=['glc19TestOccId', 'glc19SpId', 'Rank', 'Probability'])
    # TO DO: FINISH IT
    # ...
    # save the run to a csv file
    run_df.to_csv(run_file, sep=';', index=False, quotechar='"')
def stock_data(code=None, start_time=None, end_time=None):
    path = ct.PATH % (os.getcwd(), code, time, ktype)
    pd.to_csv(os.getcwd())
def remove_stops_file(filepath):
    datafile = pd.read_csv(filepath)
    tweets_list = datafile['Tweet']
    filtered_tweets_list = []
    for tweet in tweets_list:
        filtered_tweet = remove_stops(tweet, stop_words_std)
        filtered_tweets_list.append(filtered_tweet)
    new_tweet_list_column_values = pd.Series(filtered_tweets_list)
    # insert the new column at the end of the frame
    datafile.insert(loc=len(datafile.columns), column='filtered_tweet', value=new_tweet_list_column_values)
    datafile.to_csv(filepath + 'filtered', index=False)
def main():
    with open('futmondo.html', 'r', encoding="utf8") as f:
        transfers = f.readlines()
    transfers = [x.strip() for x in transfers]
    transfers = [transform_data(t) for t in transfers]
    df = pd.DataFrame(transfers, columns=['Player', 'Date', 'Time', 'Amount', 'From', 'To'])
    df.to_csv('test.csv')
def getEntropyData(startIndex, type, termList, workList, year, meal):
    """Fetch the data."""
    stype, entropyDict = figureEntropy(startIndex, type, termList, workList, year, meal)
    df = DataFrame(entropyDict)
    print(df)
    result = 'D:/GraduationThesis/Data/' + stype + '_entropy.csv'
    df.to_csv(result)
def stringify_pandas(df):
    """
    :param df: a DataFrame
    :return: None; prints the frame as a pretty table
    """
    output = StringIO()
    df.to_csv(output)
    output.seek(0)  # rewind so prettytable reads from the start of the buffer
    pt = prettytable.from_csv(output)
    print(pt)
def store_best_attributes_for_label(nb, df, class_feature, cor_score=0.01):
    '''
    Store the best attributes in the corresponding file.

    Parameters
    ==========
    nb: int
        Label to use.
    + Same parameters as best_cor_attributes().
    '''
    # assumes best_cor_attributes() returns a DataFrame or Series
    relevant_features = best_cor_attributes(nb, df, class_feature, cor_score)
    relevant_features.to_csv(path_best_features(nb))
def fetchTimeTable(self, refresh=False):
    logging.debug("Fetching Time Table...")
    if not refresh:
        try:
            logging.debug("Loading time table.")
            self.timeTable = pd.read_csv("data/" + self.username + "_TimeTable.csv")
            return
        except Exception:
            logging.error("No time table found.")
            self.timeTable = None
    try:
        logging.debug("Loading driver...")
        driver = webdriver.Chrome()
        logging.info("Driver loaded.")
        driver.get(self.url)
        logging.debug("Finding elements...")
        user = driver.find_element_by_id("tbUserName")
        pswd = driver.find_element_by_id("tbPassword")
        logging.debug("Filling creds...")
        user.send_keys(self.username)
        pswd.send_keys(self.password)
        pswd.send_keys(Keys.RETURN)
        logging.info("Logged in.")
        logging.debug("Closing pop-ups...")
        pop_up = driver.find_element_by_id("popup_ok")
        pop_up.send_keys(Keys.RETURN)
        logging.debug("Fetching Time Table text...")
        tt = driver.find_element_by_id("divMytimetable")
        logging.info("Time Table fetched!")
        self.timeTable = self.parseRawTimeTable(tt)
        self.timeTable.to_csv("data/" + self.username + "_TimeTable.csv", index=False)
    except Exception as e:
        logging.exception(e)
        logging.error("Unable to fetch time table!")
    finally:
        driver.close()
def write(self, obj, filename):
    '''
    Writes an ebLink object to file.
    '''
    if filename is None:
        filename = 'links_' + datetime.today().strftime('%y%m%d-%H%M%S') + '.ebout'
    try:
        obj.to_csv(filename)
    except Exception:
        try:
            pd.DataFrame(obj).to_csv(filename)
        except Exception:
            with open(filename, 'w') as f:
                f.write(obj)
def write_csv(self, obj, name, mode="None", index=False, header=False, check_suffix=True):
    """Write obj to a csv file under the directory selected by mode."""
    if mode == "data":
        name = osp.join(self.rootdir, self.inputdir, name)
    elif mode == "model":
        name = osp.join(self.rootdir, self.pretraindir, name)
    elif mode == "result":
        name = osp.join(self.rootdir, self.outputdir, name)
    if check_suffix and name.split('.')[-1] != "csv":
        name += ".csv"
    obj.to_csv(name, index=index, header=header)
    if self.notify:
        self.notify.notify(text=f"Export : {name}")
def export_data(dt_from, dt_to, states=None, districts=None, crops=None, water_sources=None):
    def parse_string(s):
        # render an iterable of strings as a SQL tuple literal, e.g. (a ,b)
        r = '('
        for i in s:
            r += i + ' ,'
        r = r[:-2]
        r += ')'
        return r

    import pandas as pd
    import sqlalchemy as sql
    conn = sql.create_engine(
        'postgresql://*****:*****@ec2-54-221-220-59.compute-1.amazonaws.com/db70oouohkh4bj'
    )
    query = ('SELECT * FROM fact_table, weather_dim, crop_dim, location_dim '
             ' WHERE fact_table.location_id=location_dim.location_id '
             ' AND irrigation_date >= ' + '\'' + str(dt_from) + '\''
             ' AND irrigation_date <= ' + '\'' + str(dt_to) + '\'')
    if states is not None:
        query += ' AND state IN ' + parse_string(states)
    if districts is not None:
        query += ' AND district IN ' + parse_string(districts)
    if crops is not None:
        query += ' AND crop_name IN ' + parse_string(crops)
    if water_sources is not None:
        query += ' AND water_source IN ' + parse_string(water_sources)
    # run the query before releasing the engine
    result = pd.read_sql(query + ';', conn)
    conn.dispose()
    return result.to_csv()
def get_submission(model, loader, test_ids, device):
    all_preds = []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            score = model(x)
            prediction = score.float()
            all_preds += prediction.tolist()
    model.train()
    df = pd.DataFrame({
        "ID_code": test_ids.values,
        "target": np.array(all_preds)
    })
    df.to_csv("../data/santander/sub.csv", index=False)
def main():
    logging.getLogger().setLevel(logging.DEBUG)
    if not os.path.isdir(f'{NFLSTATS_DIR}/data'):
        os.mkdir(f'{NFLSTATS_DIR}/data')
    if not os.path.isdir(f'{NFLSTATS_DIR}/data/draft'):
        os.mkdir(f'{NFLSTATS_DIR}/data/draft')
    years = 1992, 2019 + 1  # 1992 is the first year that targets were recorded
    for year in range(*years):
        logging.info('scraping for year %s', year)
        url = f'https://www.pro-football-reference.com/years/{year}/draft.htm'
        page = urlopen(url)
        soup = BeautifulSoup(page, 'lxml')
        # select() (unlike find()) returns a list
        table_rows = soup.select('#drafts tr')
        players = get_players(table_rows, ignore_cols=['career_av', 'draft_av', 'college_id', 'college_link'])
        players['year'] = year
        fout = f'{NFLSTATS_DIR}/data/draft/class_{year}.csv'
        players.to_csv(fout, index=False)
def r1_ensemble():
    result_files = ['results_testb_vgg19.csv', 'results_testb_bcnn.csv', 'results_testb_vgg19_2.csv']
    results = []
    for result in result_files:
        results.append(pd.read_csv(os.path.join(output_dir, result)))
    img_names = []
    preds = []
    for i in range(len(results[0])):
        filename = results[0].loc[i]['filename']
        prob = 0
        for x in range(3):
            prob += weight[x] * results[x].loc[i]['probability']
        img_names.append(filename)
        preds.append(prob)
    preds = [round(x - 1e-4, 4) for x in preds]
    data = {'filename': img_names, 'probability': preds}
    ensemble_df = pd.DataFrame(data, columns=['filename', 'probability'])
    ensemble_df.to_csv('../output/results_testb.csv', index=False)
def add_new_data(JH_master, last_pull, JH_data_dir, root_dir):
    # identify csv's to add to the master frame
    JH_files = os.listdir(JH_data_dir)[1:-1]
    new_JH_files = JH_files[JH_files.index(last_pull):]
    new_JH_files = [JH_data_dir + '/' + x for x in new_JH_files]
    for i in new_JH_files:
        new_data = pd.read_csv(i)
        new_data['Last_Update'] = pd.to_datetime(
            new_data['Last_Update'], format='%Y-%m-%d %H:%M:%S')
        new_data['Last_Update'] = new_data['Last_Update'].dt.date
        JH_master = pd.concat([JH_master, new_data], ignore_index=True)
    # NOTE: output filename is a placeholder
    JH_master.to_csv(os.path.join(root_dir, 'JH_master.csv'))
def create_model():
    print("Begin Classification....")
    feature_csv = 'D:\\My Source Codes\\Projects-Python' \
                  '\\TextBaseEmotionDetectionWithEnsembleMethod\\NewDataset\\features6clL.csv'
    RFmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\\RF\\'
    DTmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\\DT\\'
    MLPmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\' \
                        'Models\\MLP\\'
    results_df = DataFrame(columns=('ModelType', 'ModelName', 'Score', 'F1-Score', 'ErrorRate', 'Feature-Count', 'Train-Size'))
    x, y = loaddata(feature_csv, 100)
    for i in range(1, 500):
        np.random.seed(42)
        indices = sample(range(1, x.shape[0]), 6000)
        test_size = int(0.1 * len(indices))
        X_train = x[indices[:-test_size]]
        Y_train = y[indices[:-test_size]]
        X_test = x[indices[-test_size:]]
        Y_test = y[indices[-test_size:]]

        ModelName = "Model_KNN_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = rf_model(RFmodel_save_csv + ModelName, X_train, Y_train, X_test, Y_test)
        results_df.loc[len(results_df)] = ["KNN", ModelName, Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=KNN, With Score Result " + str(Score) + " and Feature Count=" + str(100))

        ModelName = "Model_RF_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = dt_model(DTmodel_save_csv + ModelName, X_train, Y_train, X_test, Y_test)
        results_df.loc[len(results_df)] = ["Random Forest", ModelName, Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=Random Forest, With Score Result " + str(Score) + " and Feature Count=" + str(100))

        ModelName = "Model_MLP_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = mlp_model(MLPmodel_save_csv + ModelName, X_train, Y_train, X_test, Y_test)
        results_df.loc[len(results_df)] = ["MLP Neural Network", ModelName, Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=Neural Network, With Score Result " + str(Score) + " and Feature Count=" + str(100))

    results_df.to_csv("D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\\dataset.csv",
                      mode='a', header=True, index=False)
    print("End Classification...")
def write_to_csv(df):
    # append without a header if the file already exists, otherwise also write the header row
    write_header = not os.path.exists('output.csv')
    try:
        df.to_csv('output.csv', mode='a', index=False, encoding="gbk", header=write_header)
        # print(df)
        print('Successful writing!')
        time.sleep(3)
    except Exception:
        print('error!')
def measure_board(input_filename):
    tuner_set = pd.read_csv(input_filename)
    rows = []
    for i in range(len(tuner_set)):
        this_tuner = measure_tuner(tuner_set.iloc[i])
        local_pcb = local_pcb_height(tuner_set.iloc[i])
        tuner_depth = this_tuner[1] - local_pcb - pcb_thickness
        tuner_length = this_tuner[1] - this_tuner[0]
        # add the x, y, depth, and length values to the output data frame
        rows.append([tuner_set.iloc[i][0], tuner_set.iloc[i][1], tuner_depth, tuner_length])
    output = pd.DataFrame(rows)
    output_csv = output.to_csv()
    return output_csv
def main():
    dir = '../data'
    data = pandas.read_csv(join(dir, 'train.csv'))
    data = impute(data)
    data = normalize(data)
    print('-------- entire data set after normalize -------------')
    print(data.head(20))
    print(data.isnull().sum())
    train, validate, test = numpy.split(
        data.sample(frac=1), [int(.6 * len(data)), int(.8 * len(data))])
    X = train.values[:, 1:]
    y = train.values[:, 0:1]
    test_X = test.values[:, 1:]
    test_y = test.values[:, 0:1]
    print('-------- test data ---------')
    rt = pandas.read_csv(join(dir, 'test.csv'))
    passenger_ids = rt[['PassengerId']].values[:, 0]
    print(rt.head(20))
    print(rt.isnull().sum())
    rt = impute(rt)
    rt = normalize(rt)
    print('-------- test data after processing ---------')
    print(rt.head(20))
    print(rt.isnull().sum())
    prediction = build_model(X, y, test_X, test_y, rt.values)
    submission = pandas.DataFrame()
    submission['PassengerId'] = passenger_ids
    submission['Survived'] = prediction
    submission.to_csv(join(dir, 'submission.csv'), index=False)
def main():
    from_http = bool(int(sys.argv[1]))
    file_name = str(sys.argv[2])
    data = read_data(file_name=file_name, from_http=from_http)
    data = process_data(data)
    data.to_csv(get_dataset_directory() / "2018_processed_data.zip", compression="zip")
    save_name = str(sys.argv[3])
    if save_name == '2018_harmonized_shipper_sym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'symmetric'
    elif save_name == '2018_harmonized_shipper_asym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'asymmetric'
    elif save_name == '2018_shipper_harmonized_sym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'symmetric'
    elif save_name == '2018_shipper_harmonized_asym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'asymmetric'
    else:
        print('not recognized')
    corpus = create_corpus(bag_of_words, save_name, save=True)
    id2word = create_id2word(bag_of_words, save_name, save=True)
    num_topics = int(sys.argv[4])
    model = compute_lda(save_name, corpus, num_topics, id2word, alpha=alpha)
    # For visualization
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(model, corpus, dictionary, save_name, num_topics)
    # For the document-topic distribution
    document_topic_distribution(corpus, bag_of_words, model, save_name, num_topics, minimum_probability=0.10)
def two_layer_stack(train_x, train_y, test, mode="full", save_path="", save_name=""):
    if mode == "full":
        clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
        classifiers = [
            XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100),
            LGBMRegressor(num_leaves=31, max_depth=4, learning_rate=0.1, n_estimators=100),
            Lasso(alpha=0.1),
            Ridge(alpha=0.5)
        ]
        layer_1_train, layer_1_test = stack_layer(clf_names, classifiers, train_x, train_y, test)
    if mode == "save":
        clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
        classifiers = [
            XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, n_jobs=-1),
            LGBMRegressor(num_leaves=31, max_depth=4, learning_rate=0.1, n_estimators=100, n_jobs=-1),
            Lasso(alpha=0.1),
            Ridge(alpha=0.5)
        ]
        layer_1_train, layer_1_test = stack_layer(clf_names, classifiers, train_x, train_y, test)
        layer_1_train.to_csv(save_path + save_name)
        layer_1_test.to_csv(save_path + save_name)
    if mode == "read":
        layer_1_train = pd.read_csv(save_path + save_name)
        layer_1_test = pd.read_csv(save_path + save_name)
    clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
    classifiers = [
        XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100),
        Lasso(alpha=0.1)]
    _, layer_2_test = stack_layer(clf_names, classifiers, layer_1_train, train_y, layer_1_test)
    layer_2_test = np.average(layer_2_test, axis=1)
    return layer_2_test
def get_game_play_by_play(game_id, write_to_file_path):
    if os.path.isfile(write_to_file_path):
        return pd.read_csv(write_to_file_path)
    pbp_url = f'{NBA_BASE_URL}/playbyplayv2'
    headers = {'User-Agent': 'test'}
    params = {
        'EndPeriod': '4',
        'EndRange': '55800',
        'GameID': game_id,
        'RangeType': '2',
        'Season': '2019-20',
        'SeasonType': 'Regular Season',
        'StartPeriod': '1',
        'StartRange': '0',
    }
    response = requests.get(pbp_url, headers=headers, params=params)
    results = response.json()['resultSets']
    headers = results[0]['headers']
    plays = results[0]['rowSet']
    df = pd.DataFrame(plays, columns=headers)
    df.to_csv(write_to_file_path, index=False)
    return df
def prep_germany(cached=False):
    '''
    Pulls and preps the Germany Energy Consumption dataframe for exploration.
    if cached == False: collects the csv from the url
    if cached == True: pulls the already saved dataframe
    '''
    if cached == False:
        # url to opsd_germany_daily.csv
        url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
        # uses the pull_csv function from acquire.py to collect the dataset
        df = a.pull_csv(url)
        # caches the dataset as a csv
        df.to_csv('opsd_germany_daily.csv')
    # cached == True
    else:
        # pulls the cached csv
        df = pd.read_csv('opsd_germany_daily.csv')
    # Lowercase the columns and rename the 'wind+solar' column to 'wind_and_solar'
    df.columns = df.columns.str.lower()
    df.rename(columns={'wind+solar': 'wind_and_solar'}, inplace=True)
    # Convert date to datetime and set date as the index
    df.date = pd.to_datetime(df.date)
    df.set_index(df.date, inplace=True)
    # Create the month and year columns
    df['month'] = df.index.month
    df['year'] = df.index.year
    # Fill nulls with 0
    df.fillna(0, inplace=True)
    return df
# fields to ignore in the output
eliminate = [] if keep_original else list(fields_to_bucket)
if ignore:
    eliminate.extend(ignore)
columns = [column for column in columns if column not in eliminate]

# check if we have the necessary fields for complete PUMAs
if full_pumas:
    if ('STATEFIP' not in columns or 'PUMA' not in columns) or 'PUMASUPR' not in columns:
        raise ValueError("For the --full_pumas option to work, you need to have both 'PUMA' and 'STATEFIP' in your original data request.")

reader = pd.read_fwf(input_file, encoding='utf-8', dtype=str, header=None, names=names, colspecs=colspecs, chunksize=chunksize)
for dataframe in reader:
    if full_pumas:
        dataframe['PUMA'] = dataframe['PUMASUPR'] if 'PUMASUPR' in dataframe.columns else dataframe['STATEFIP'] + dataframe['PUMA']
    # append to the output file, writing the header only on the first chunk
    mode = 'a' if os.path.exists(output_file) else 'w'
    header = (mode == 'w')
    dataframe.to_csv(output_file, encoding='utf-8', sep='\t', mode=mode, header=header, index=False, quoting=QUOTE_NONE)
    total += len(dataframe)

print('Finished parsing {total} lines in {delta} seconds.'.format(total=total, delta=delta))
print('Wrote output to {output_file}.'.format(output_file=output_file))


def create_bucketer(bucket):
    def _bucketer(value):
        for definition in bucket:
            if definition['lower'] <= float(value) <= definition['upper']:
                return definition['label']
        return value
    return lru_cache(_bucketer)
def save_table(tab, pwd):
    import pandas as pd
    pd.to_pickle(tab, pwd)
    tab.to_csv(pwd)
url = "https://pypi.python.org/pypi/%s/json"

# Load functions
folder = "/home/vanessa/Documents/Dropbox/Code/Python/repofish/analysis/pypi"
packages = pandas.read_csv("%s/pypi.tsv" % folder, sep="\t", index_col=0)
meta_folder = "%s/packages" % (folder)
if not os.path.exists(meta_folder):
    os.mkdir(meta_folder)

# We will keep track of rows to drop
drop = []
for row in packages.iterrows():
    package_name = row[1].package
    output_file = "%s/%s.json" % (meta_folder, package_name)
    if not os.path.exists(output_file):
        time.sleep(1)
        print "parsing %s of %s" % (row[0], packages.shape[0])
        response = requests.get(url % package_name)
        if response.status_code == 200:
            save_json(response.json(), output_file)
        else:
            print "Error getting meta data for package %s" % (package_name)
            drop.append(row[0])

# Remove the packages we don't have meta data for
packages = packages.drop(drop)
packages.to_csv("%s/pypi_filtered.tsv" % folder, sep="\t")
""" Appends bad data that has tripped up my algorithm before. """ import pandas as pd df = pd.read_csv('good_data.csv') # Have some categories be missing. df = df[df.purpose != 'credit_card'] df = df[df.home_ownership != 'RENT'] df = df[df.grade != 'A'] # Populate with 0 values df.annual_inc.iloc[[2,3,45,12]] = 0 pd.to_csv('bad_data.csv')
import os, sys
from pdb import set_trace
from dectree import *
import pandas as pd

walk = os.walk


def explore(dir):
    datasets = []
    for (dirpath, dirnames, filenames) in walk(dir):
        datasets.append(dirpath)
    training = []
    testing = []
    for k in datasets[1:]:
        train = [[dirPath, fname] for dirPath, _, fname in walk(k)]
        test = [train[0][0] + '/' + train[0][1].pop(-1)]
        training.append([train[0][0] + '/' + p for p in train[0][1] if not p == '.DS_Store'])
        testing.append(test)
    return training, testing


train, test = explore('./')
data = [train[i] + test[i] for i in xrange(1, len(test))]
template = pd.read_csv(data[0][0], header=0).columns.get_values().tolist()
for i in data[1:]:
    for k in i:
        tmp = pd.read_csv(k)
        tmp.to_csv(path_or_buf=k, header=template, index=False)
set_trace()
pd.read_csv(data[1][0]).to_csv(data[1][0], header=template, index=False)
'''
Builds a model (TO BE DETERMINED).

Authors: Paul Hendricks
Date: 2015-12-15

inputs: train.h5 test.h5 sample_submission.csv
outputs: 01-model.h5
'''

# Load libraries
import pandas as pd

# Load data
prepped_path = './data/prepped/'
train = pd.read_hdf(prepped_path + 'train.h5', 'table')
test = pd.read_hdf(prepped_path + 'test.h5', 'table')
submission = pd.read_csv('./data/submission/sample_submission.csv')

# Write data out (placeholder: the model itself is still to be determined)
submission.to_csv('./data/models/01-model.h5')
import codecs
import pandas as pd
from spelling.features import levenshtein_distance as dist
from spelling.jobs import DistanceToNearestStem

df = pd.read_csv('data/aspell-dict.csv.gz', sep='\t', encoding='utf8')
job = DistanceToNearestStem()
df = job.run(df.word, dist)
# TODO: Merge the two aspell-dict files.
df.to_csv('data/aspell-dict-distances.csv.gz', index=False, sep='\t',
          encoding='utf8', compression='gzip')
# Filtering (does not modify x)
x.drop(1)            # by index label, does not modify x
x.drop(x.index[1])   # by position, does not modify x
x[0.1 != x.values]

'''
# Data import, export, and processing
'''
# Data import
pd.read_csv(file, encoding)
pd.read_table(file, names=[...], sep="", encoding)
pd.read_excel(file, sheetname, names, engine)   # engine='python' works around problems with Chinese paths
# Data export
df.to_csv(filepath, sep=",", index=True, header=True)   # index: write the row index; header: write the column names
# Deduplication
df.duplicated()             # flag duplicate rows, returns booleans
df.duplicated(subset)       # flag duplicates by the given column(s); subset may be omitted
df.drop_duplicates()        # drop duplicate rows
df.drop_duplicates(subset)  # drop duplicates by column
# Missing data
pd.read_csv(file, na_values)   # treat the given value(s) as NA when reading the file
df.isnull()                 # boolean DataFrame marking missing values
df[df.isnull()]             # filter to the rows/columns containing NA
df.dropna()                 # drop rows containing NA
df.fillna()                 # fill missing values
# Information extraction
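# A minimal round-trip sketch tying the calls above together (import, deduplicate,
# fill missing values, export); 'input.csv' and 'output.csv' are placeholder file names.
import pandas as pd

df = pd.read_csv('input.csv', encoding='utf-8')
df = df.drop_duplicates()    # drop duplicate rows
df = df.fillna(0)            # fill missing values with 0
df.to_csv('output.csv', sep=',', index=False, header=True)   # export without the row index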
#!/usr/bin/env python
import os
os.chdir("../UCS/")
import pandas as pd

UCS_mappings = pd.read_csv("UCS.mappings.csv", index_col=0, usecols=['barcode', 'TCGA.ID', 'histology'])
a = UCS_mappings
UCS_patient = pd.read_csv('../Minfi_Output/UCS_mset.csv', index_col=0)
UCS = UCS_patient.head().stack().head()
UCS
b = UCS_patient
c = b.transpose()
c.index.name = 'barcode'
d = c
final = pd.merge(a, d, on=a.index, how='outer')
finals = final.head()
finals.rename(columns={'key_0': 'barcode'}, inplace=True)
final['Sample_Type'] = ["Normal" if '-20A-' in col else "Tumor" for col in final['TCGA.ID']]
final.to_csv('UCS_df.csv')
### ### ### ### ### IMPORTING DATA ### ### ### ### ###
###############################################################
''' read csv '''
dframe = pd.read_csv('file_name.csv', header=None)
dframe = pd.read_csv('file_name.csv', header=None, nrows=20)
dframe = pd.read_table('file_name.txt', sep=';', header=None)
dframe.to_csv('output_file_name.csv')   # write csv

''' read html
pip install beautiful-soup
pip install html5lib
'''
url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'
from pandas import read_html
dframe_list = pd.io.html.read_html(url)  # read from url and put the data into a list of dataframe objects
dframe = dframe_list[0]

''' read excel