import time

import pandas as pd
from tqdm import tqdm

# `logger`, `DATA_PATH`, and `mem_ext` are module-level context defined elsewhere.


def transform(file):
    start = time.time()
    logger.info("============== Feature Selection start ==============")
    logger.info("[start] : {}".format(str(start)))
    logger.info("DATA_PATH : {}".format(DATA_PATH))

    ##########################################
    # Config
    ##########################################

    ##########################################
    # Feature Selection
    ##########################################
    # Read the `file` argument (the original read an unrelated module-level
    # `save_file`, leaving the parameter unused).
    df = pd.read_csv(file, index_col=0)
    tqdm.pandas()
    print(df.shape)

    # Split features from the target; "$$$" is an extra non-feature column.
    drop_col = ['target', "$$$"]
    x = df.drop(drop_col, axis=1)
    y = df['target']
    x.fillna(0, inplace=True)

    logger.info("type transform")
    x = x.astype(int)
    x = mem_ext(x)  # mem_ext: memory-optimisation helper defined elsewhere
    return x, y
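# Usage sketch (not from the original source): how transform() might be called.
# The CSV path is hypothetical; the file just needs the 'target' and "$$$"
# columns dropped above.
if __name__ == "__main__":
    x, y = transform("data/features.csv")
    print(x.shape, y.shape)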
import pandas as pd
import spacy
from sklearn.feature_extraction import text
from tqdm import tqdm

# remove_all_tables, get_readable_text, clean_text, and lemmatize are
# project helpers defined elsewhere.


def preprocess_column(cfg, df_data, column_name, do_lemmatize=True, no_stopwords=True):
    tqdm.pandas()
    # Drop rows where the target column is missing, on a copy to avoid
    # chained-assignment warnings.
    df_data = df_data[pd.notnull(df_data[column_name])].copy()
    df_data['temp'] = df_data[cfg.get(
        'postgres', 'column')].progress_apply(remove_all_tables)
    df_data['readable_text'] = df_data['temp'].progress_apply(get_readable_text)
    df_data.drop(['temp'], axis=1, inplace=True)
    df_data['processed_value'] = df_data['readable_text'].progress_apply(clean_text)
    if do_lemmatize:
        # spacy.load('en') is deprecated; load the small English model by name.
        parser = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        df_data['processed_value'] = df_data['processed_value'].progress_apply(
            lambda x: lemmatize(x, parser))
    if no_stopwords:
        df_data['processed_value'] = df_data['processed_value'].progress_apply(
            lambda x: ' '.join(
                word for word in x.split()
                if word not in text.ENGLISH_STOP_WORDS))
    df_data = df_data[pd.notnull(df_data[column_name])]
    return df_data
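# A minimal sketch of the lemmatize() helper assumed above (the real one is
# defined elsewhere in the project): run the text through the loaded spaCy
# pipeline and join the token lemmas back into a string.
def lemmatize(text_value, parser):
    doc = parser(text_value)
    return ' '.join(token.lemma_ for token in doc)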
from tqdm import tqdm


def genotype_to_iupac(geno, thresh=0.5, progress=True):
    # Collapse fuzzy per-strain allele calls to IUPAC codes, row by row.
    # NOTE: `thresh` is accepted but not forwarded here; fuzzy_allele_to_iupac
    # presumably applies its own default.
    if progress:
        tqdm.pandas()
        out = (geno.stack('strain')
                   .progress_apply(fuzzy_allele_to_iupac, axis='columns')
                   .unstack('strain'))
    else:
        out = (geno.stack('strain')
                   .apply(fuzzy_allele_to_iupac, axis='columns')
                   .unstack('strain'))
    return out
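# A hedged sketch of the fuzzy_allele_to_iupac() helper used above (the real
# implementation lives elsewhere): keep the alleles whose fraction in the row
# clears `thresh`, then map the surviving set to its IUPAC ambiguity code.
# Assumes each row is indexed by allele ('A', 'C', 'G', 'T').
IUPAC_CODES = {
    frozenset('A'): 'A', frozenset('C'): 'C',
    frozenset('G'): 'G', frozenset('T'): 'T',
    frozenset('AG'): 'R', frozenset('CT'): 'Y',
    frozenset('CG'): 'S', frozenset('AT'): 'W',
    frozenset('GT'): 'K', frozenset('AC'): 'M',
}


def fuzzy_allele_to_iupac(row, thresh=0.5):
    alleles = frozenset(a for a, frac in row.items() if frac >= thresh)
    return IUPAC_CODES.get(alleles, 'N')  # fall back to N for 0 or 3+ alleles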
# TQDM build
import os
from glob import glob

import librosa
import pandas as pd
from scipy.stats import skew
from tqdm import tqdm, tqdm_notebook


# Manual progress_apply shim for old pandas/tqdm versions that predate
# tqdm.pandas(): wrap the applied function so each call ticks the bar.
def tqdm_pandas(t):
    from pandas.core.frame import Series

    def inner(series, func, *args, **kwargs):
        t.total = series.size

        def wrapper(*args, **kwargs):
            t.update(1)
            return func(*args, **kwargs)

        result = series.apply(wrapper, *args, **kwargs)
        t.close()
        return result

    Series.progress_apply = inner


tqdm_pandas(tqdm_notebook())
tqdm.pandas(desc="my bar!")

# Feature engineering
SAMPLE_RATE = 22050

print(os.listdir(os.getcwd()))
tqdm.pandas()  # was `tqdm.pandas` without the call, which registers nothing

# glob keeps the full paths, unlike the os.listdir() calls it replaced.
train_files = glob(train_path + '*.wav')
test_files = glob(test_path + '*.wav')


def get_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
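# Usage sketch (assumption, not in the original snippet): feed every training
# file through get_feature() with the progress bar that tqdm.pandas() enables.
train_data = pd.DataFrame({'fname': train_files})
train_data['features'] = train_data['fname'].progress_apply(get_feature)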
ch.setLevel(logging.INFO)

# Create the formatter and add it to both handlers.
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

# Add the handlers to the logger.
logger.addHandler(fh)
logger.addHandler(ch)

# This is for Jupyter notebooks:
# from tqdm.notebook import tqdm_notebook
# tqdm_notebook.pandas()
# tqdm_func = tqdm_notebook

# This is for the terminal:
tqdm.pandas(desc="Progress")
tqdm_func = tqdm

# Params
engine = create_engine(config.DB_STR)
logger.info("Logging to get line")
engine.connect()  # fail fast if the database is unreachable

out_nodes_table_name = "outnodes"
out_edges_table_name = "outedges"
in_nodes_table_name = "innodes"
in_edges_table_name = "inedges"
cards_graphs_as_json_to_table = f"{config.CARDS_JSON_TNAME}_temp"
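# A minimal sketch (assumption, not shown in the original) of how the table
# names above might be used: bulk-load a DataFrame of nodes into the database.
import pandas as pd

nodes_df = pd.DataFrame({"id": [1, 2], "label": ["a", "b"]})  # hypothetical data
nodes_df.to_sql(out_nodes_table_name, engine, if_exists="replace", index=False)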
import glob
import hashlib
import json
import os
import pickle
import re
import shutil

import gin
import nussl
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()


@gin.configurable
def construct_dataframe(json_path, sep_audio_path, og_audio_path, cache_location):
    # Hash the full list of evaluation JSONs so the cache key changes whenever
    # the file set does.
    json_files = glob.glob(f"{json_path}/**/*.json", recursive=True)
    hash_file = hashlib.sha224(" ".join(json_files).encode('utf-8')).hexdigest()
    hash_file = os.path.join(cache_location, hash_file)  # was a redundant double join
    os.makedirs(cache_location, exist_ok=True)

    print(f"Writing or looking for {hash_file}")
    if os.path.exists(hash_file):
        with open(hash_file, 'rb') as f:
            df = pickle.load(f)
        return df

    df = nussl.evaluation.aggregate_score_files(json_files)
    df = df[df['source'] == 'vocals']
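# The function above is truncated; for reference, a self-contained sketch of
# the same hash-keyed pickle cache pattern it opens with (all names here are
# illustrative, not from the original).
def cached(key_material, cache_dir, compute):
    key = hashlib.sha224(key_material.encode('utf-8')).hexdigest()
    path = os.path.join(cache_dir, key)
    os.makedirs(cache_dir, exist_ok=True)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result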
import pandas as pd

try:
    # Pick the notebook-aware bar when running under Jupyter (zmqshell) and
    # the plain terminal bar otherwise. The original passed tqdm_notebook as
    # an argument to tqdm.pandas(), which is not how registration works.
    ipy_str = str(type(get_ipython()))
    if "zmqshell" in ipy_str:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm
except NameError:  # get_ipython() is undefined outside IPython
    from tqdm import tqdm

tqdm.pandas()


def read_cdwow(filename):
    df = (
        # read the data
        pd.read_csv(
            filename,
            names=["id", "date", "num_purchased", "dollars"],
            header=None,
            sep=r"\s+",
        )
        # format the dates as dates
        .pipe(lambda x: x.assign(date=pd.to_datetime(x.date, format="%Y%m%d")))
        # calculate the average cost per CD in the basket
        .pipe(lambda x: x.assign(price_per_cd=x.dollars / x.num_purchased)))
    return df
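# Usage sketch: the path is hypothetical; the loader expects the CDNOW-style
# whitespace-separated four-column transaction log parsed above.
df = read_cdwow("CDNOW_master.txt")
print(df.head())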
import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import functions  # project-local helpers

################
## Requirements
################
tqdm.pandas(
    desc="my bar!"
)  ## Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`

################
## Data Preparation
################
fileName1 = "/home/huiyangd/toxicSpans/data/2018_01_n_1000_32.csv"
fileName2 = "/home/huiyangd/toxicSpans/data/2018_01_p_1000_32.csv"

RC_2018_01_n_1000_32 = pd.read_csv(fileName1)
RC_2018_01_p_1000_32 = pd.read_csv(fileName2)

frames = [RC_2018_01_n_1000_32, RC_2018_01_p_1000_32]
RC_2018_01_combined_1000 = pd.concat(frames)
class_names = list(RC_2018_01_combined_1000.toxicity.unique())
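# A hedged sketch (not in the original) of the modeling step the imports above
# point to: TF-IDF features into a logistic regression, scored with one of the
# imported metrics. The text column name 'comment_text' is an assumption;
# 'toxicity' comes from the data prep above.
texts = RC_2018_01_combined_1000["comment_text"].astype(str)
labels = RC_2018_01_combined_1000["toxicity"]
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=0)
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)
print("accuracy:", accuracy_score(y_test, model.predict(X_test)))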