def test_pandas_groupby_apply():
    """Test pandas.DataFrame.groupby(...).progress_apply.

    Registers a ``leave=False`` bar before each of two grouped applies (one
    frame with default integer column labels, one with columns ``a``, ``b``
    and ``c``) and fails if a completed ``100%`` bar is found in the
    captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if the completed bar appears in the output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Only a missing optional dependency should skip the test; any other
        # failure raised while importing is a real error and must surface.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))

        # A bar is registered again before the second apply.
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        output = our_file.getvalue()
        if nexres in output:
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, output))
def run_bootstrap_per_ASV(input_file):
    """Apply ``process_row`` to every row of an ASV CSV and save the joined table.

    The input is read with ``pd.read_csv``, ``process_row`` is run row-wise
    behind a tqdm progress bar, the per-row results are left-joined back onto
    the input on ``ID`` and ``Site``, and the merged frame is written as CSV
    under ``./data``.

    Args:
        input_file: location of the ASV CSV, passed straight to ``pd.read_csv``.
    """
    print(f'Reading in ASV file - {input_file} and running with {number_of_experiments} simulations')
    asv_table = pd.read_csv(input_file)

    # Register a bar so that `progress_apply` is available on the frame.
    tqdm_pandas(tqdm())
    per_row = asv_table.progress_apply(process_row, axis=1)

    merged = asv_table.merge(per_row, how='left', on=['ID', 'Site'])
    output_path = f'./data/ASV_cultivation_bootstrapped_numbers_max_viability_{number_of_experiments}_simulations.joined.csv'
    merged.to_csv(output_path, index=False)
def loadandbuildindex2():
    """Load the song data, preprocess it and build the search indexes.

    Steps, in order:

    1. Load artists and songs with ``load_data`` and keep the first 1000 songs.
    2. Preprocess the ``Lyric`` column, then the ``SName`` column, saving the
       result to ``preprocessed_data.csv`` after each pass.
    3. Print one randomly chosen song before and after preprocessing
       (best effort: a failure here is reported and does not abort the build).
    4. Inner-join songs with artists on ``ALink`` == ``Link`` and hand the
       merged frame to ``createDataInvertIndex`` and ``build_index2``.
    5. Call ``load_embedding_WV`` and set the module-level ``default`` to the
       mean of the values in ``dictionary_fastWV``.
    """
    global default
    nums = 1000
    df_artists, df_en_songs = load_data()
    df_en_songs = df_en_songs.iloc[:nums, :]

    # -- after loading, create the corpus documents for the search system -- #
    # -- preprocess the data -- #
    tqdm_pandas(tqdm())
    clspreproccessed = ClassPreprocessed(text_column='Lyric', DRAREFREQ_words=True)
    df_new_songs = clspreproccessed.preprocessed(df_en_songs)

    # -- save -- #
    df_new_songs.to_csv('preprocessed_data.csv')
    print(df_new_songs['Lyric'][0])
    print('Most common words:\n')
    print(FREQWORDS)
    print('Most rare words:\n')
    print(RAREWORDS)

    # -- the song titles have to be preprocessed as well -- #
    clspreproccessed2 = ClassPreprocessed(text_column='SName', DRAREFREQ_words=False,
                                          stopwords=False, lemmatize=False)
    df_new_songs = clspreproccessed2.preprocessed(df_new_songs, verbose=1)
    df_new_songs.to_csv('preprocessed_data.csv')

    try:
        index = np.random.choice(df_new_songs.index, size=1)[0]
        print('before: \n')
        print(df_en_songs.loc[index, ['Lyric', 'SName']].values)
        print('after: \n')
        print(df_new_songs.loc[index, ['Lyric', 'SName']].values)
    except Exception:
        # The sample print-out is purely informational, so keep it best
        # effort, but do not swallow KeyboardInterrupt / SystemExit as a
        # bare `except:` would.
        print('Choice works bad!')
    print('all Done!')

    # -- create the inverted index -- #
    # -- We also have information about Artist, Popularity, Genre of each song -- #
    # -- We can use it to get more relevant results -- #
    # -- We should use the artist name, Genre and Popularity -- #
    # -- merge df_artists and the songs frame -- #
    merge = pd.merge(df_new_songs, df_artists, how='inner', left_on='ALink', right_on='Link')

    # -- We also have the Genre and Artist_name fields -- #
    # -- We should make use of them -- #
    # -- Group the songs by genre and artist -- #
    # -- Also build something like a database that holds the list of songs for each individual singer -- #
    createDataInvertIndex(merge)
    build_index2(merge, new_columns=True)

    # -- Fill our word-mapping dictionary -- #
    load_embedding_WV()

    # -- Default vector -- #
    default = sum(dictionary_fastWV.values()) / len(dictionary_fastWV)
def flatten_images_directory(self):
    """Copy each row's ``origin_path`` file to its ``destination_path``.

    Calls ``self.build_all_images_dir()`` first, then walks
    ``self.ref_dataset`` row by row behind a tqdm progress bar.
    """
    tqdm_pandas(tqdm())
    # NOTE(review): the stdlib `logging.Logger` spells this method `info`;
    # confirm that `logger` here is an object that really exposes `INFO(...)`.
    logger.INFO("Flattening images data")
    self.build_all_images_dir()

    def _copy_row(row):
        return copyfile(row['origin_path'], row['destination_path'])

    self.ref_dataset.progress_apply(_copy_row, axis=1)
def calculate_featureset3(dataframe):
    """Add the Word Mover's Distance columns ``wmd`` and ``norm_wmd``.

    Both columns are computed row-wise from ``question1`` and ``question2``
    behind a tqdm progress bar.

    Args:
        dataframe: frame with ``question1`` and ``question2`` columns; it is
            modified in place.

    Returns:
        The same ``dataframe`` object, with the two columns added.
    """
    # Word Mover's Distance
    tqdm_pandas(tqdm())

    def _row_wmd(row):
        return calc_wordmoversdist(row['question1'], row['question2'])

    def _row_norm_wmd(row):
        return calc_norm_wordmover(row['question1'], row['question2'])

    dataframe['wmd'] = dataframe.progress_apply(_row_wmd, axis=1)
    dataframe['norm_wmd'] = dataframe.progress_apply(_row_norm_wmd, axis=1)
    return dataframe
def estimate_viability_range_for_underrepresented_taxa():
    """Run ``process_viability`` on every row whose ``deviance`` is negative.

    Reads the joined bootstrap CSV for the current ``number_of_experiments``
    from ``./data``, keeps the rows with ``deviance < 0``, applies
    ``process_viability`` to each of them behind a tqdm progress bar and
    writes the result to a second CSV under ``./data``.
    """
    print('*** Calculating maximum viability to explain observed number of wells ***')
    bootstrapped = pd.read_csv(
        f'./data/ASV_cultivation_bootstrapped_numbers_max_viability_{number_of_experiments}_simulations.joined.csv'
    )

    is_below_expected = bootstrapped.deviance < 0
    lower_than_expected = bootstrapped[is_below_expected]

    tqdm_pandas(tqdm())
    viability = lower_than_expected.progress_apply(process_viability, axis=1)
    viability.to_csv(
        f'./data/estimate_viability_range_for_underrepresented_taxa_{number_of_experiments}_simulations.csv',
        index=False,
    )
def test_pandas_apply_args_deprecation():
    """Check the warning printed for ``progress_apply(func, *args)``.

    A positional extra argument is passed to ``DataFrame.progress_apply``;
    the captured bar output must mention ``TqdmDeprecationWarning`` together
    with the hint to use keyword arguments.
    """
    try:
        from tqdm import tqdm_pandas
    except ImportError as err:
        skip(str(err))

    expected_fragments = (
        "TqdmDeprecationWarning",
        "not supported",
        "keyword arguments instead",
    )

    with closing(StringIO()) as our_file:
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True, ncols=20))
        frame = pd.DataFrame(randint(0, 50, (500, 3)))
        frame.progress_apply(lambda x: None, 1)  # 1 shall cause a warning

        # Check deprecation message
        captured = our_file.getvalue()
        assert all(fragment in captured for fragment in expected_fragments)
def test_pandas_apply_args_deprecation():
    """Check the warning printed for ``progress_apply(func, *args)``.

    Skips when numpy, pandas or ``tqdm_pandas`` is unavailable. Otherwise a
    positional extra argument is passed to ``DataFrame.progress_apply`` and
    every expected fragment of the deprecation message must be present in
    the captured output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True, ncols=20))
        frame = pd.DataFrame(randint(0, 50, (500, 3)))
        frame.progress_apply(lambda x: None, 1)  # 1 shall cause a warning

        # Check deprecation message
        captured = our_file.getvalue()
        missing = [
            fragment
            for fragment in ("TqdmDeprecationWarning",
                             "not supported",
                             "keyword arguments instead")
            if fragment not in captured
        ]
        assert not missing
def test_pandas_apply():
    """Test pandas.DataFrame.progress_apply.

    Applies a no-op over a 1000x6 frame with a ``leave=True`` bar and expects
    ``/6`` to appear in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if ``/6`` is missing from the output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing; other import-time
        # failures are real errors and must not be hidden as a skip.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        df.progress_apply(lambda x: None)

        output = our_file.getvalue()
        if '/6' not in output:
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(
                '/6', output))
def test_pandas():
    """Test grouped ``progress_apply`` with ``leave=False``.

    Applies a no-op per group of a 1000x6 frame and fails if the completed
    bar ``100%|##########| 101/101`` shows up in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if the completed bar appears in the output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        #
        # An explicit check is used instead of `assert` inside a bare
        # `try/except`: `assert` is stripped under `python -O`, which would
        # make the test pass unconditionally.
        if '100%|##########| 101/101' in our_file.getvalue():
            raise AssertionError('Did not expect:\n\t100%|##########| 101/101')
def test_pandas_leave():
    """Test pandas with `leave=True`.

    Applies a no-op per group of a 1000x6 frame and expects the completed
    bar ``100%|##########| 101/101`` to remain in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if the completed bar is missing from the output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing; other import-time
        # failures are real errors and must not be hidden as a skip.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)

        exres = '100%|##########| 101/101'
        output = our_file.getvalue()
        if exres not in output:
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(
                exres, output))
def test_pandas_deprecation():
    """Test bar object instance as argument deprecation.

    Both legacy registration forms are exercised, ``tqdm_pandas(tqdm(...))``
    and ``tqdm_pandas(tqdm, ...)``; each must print a
    ``TqdmDeprecationWarning`` naming the form that was used.
    """
    try:
        from tqdm import tqdm_pandas
    except ImportError as err:
        skip(str(err))

    def _captured_after(register):
        # Register a bar writing into a fresh buffer, run one grouped apply
        # and hand back everything that was written.
        with closing(StringIO()) as our_file:
            register(our_file)
            frame = pd.DataFrame(randint(0, 50, (500, 3)))
            frame.groupby(0).progress_apply(lambda x: None)
            return our_file.getvalue()

    # Check deprecation message (bar instance passed in)
    captured = _captured_after(
        lambda buf: tqdm_pandas(
            tqdm(file=buf, leave=False, ascii=True, ncols=20)))
    assert "TqdmDeprecationWarning" in captured
    assert "instead of `tqdm_pandas(tqdm(...))`" in captured

    # Check deprecation message (bar class plus keyword arguments passed in)
    captured = _captured_after(
        lambda buf: tqdm_pandas(
            tqdm, file=buf, leave=False, ascii=True, ncols=20))
    assert "TqdmDeprecationWarning" in captured
    assert "instead of `tqdm_pandas(tqdm, ...)`" in captured
def test_pandas_leave():
    """Test grouped ``progress_apply`` with ``leave=True``.

    Applies a no-op per group of a 1000x6 frame and expects the completed
    bar ``100%|##########| 101/101`` to remain in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if the completed bar is missing; the message carries
            the full captured output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)

        # Explicit check rather than `assert` wrapped in a bare `except:`:
        # `assert` is stripped under `python -O`, which would make the test
        # pass unconditionally.
        output = our_file.getvalue()
        if '100%|##########| 101/101' not in output:
            raise AssertionError('\n'.join(('Expected:',
                                            '100%|##########| 101/101',
                                            'Got:', output)))
def test_pandas_leave():
    """Test grouped ``progress_apply`` with ``leave=True``.

    Applies a no-op per group of a 1000x6 frame and expects the completed
    bar ``100%|##########| 101/101`` to remain in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if the completed bar is missing; the message carries
            the full captured output.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing.
        raise SkipTest

    expected = '100%|##########| 101/101'
    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)

        # Explicit check rather than `assert` wrapped in a bare `except:`:
        # `assert` is stripped under `python -O`, which would make the test
        # pass unconditionally.
        output = our_file.getvalue()
        if expected not in output:
            raise AssertionError('\n'.join(
                ('Expected:', expected, 'Got:', output)))
def test_pandas_apply():
    """Test pandas.DataFrame[.series].progress_apply.

    Runs one frame-level and one series-level ``progress_apply`` with a
    ``leave=True`` bar each, and expects ``100%`` to appear at least twice
    in the captured output.

    Raises:
        SkipTest: if numpy, pandas or ``tqdm_pandas`` cannot be imported.
        AssertionError: if ``100%`` appears fewer than two times.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        # Skip only when an optional dependency is missing; other import-time
        # failures are real errors and must not be hidden as a skip.
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))

        # A bar is registered again before the second apply.
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        df.progress_apply(lambda x: None)
        tqdm_pandas(tqdm(file=our_file, leave=True, ascii=True))
        dfs.a.progress_apply(lambda x: None)

        output = our_file.getvalue()
        if output.count('100%') < 2:
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(
                '100% at least twice', output))
def test_pandas_deprecation():
    """Test bar object instance as argument deprecation.

    Skips when numpy, pandas or ``tqdm_pandas`` is unavailable. Otherwise
    both legacy registration forms are exercised and each must print a
    ``TqdmDeprecationWarning`` naming the form that was used.
    """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        raise SkipTest

    bar_options = dict(leave=False, ascii=True, ncols=20)

    def _register_instance(buf):
        tqdm_pandas(tqdm(file=buf, **bar_options))

    def _register_class(buf):
        tqdm_pandas(tqdm, file=buf, **bar_options)

    # Each case pairs a registration form with the hint it must trigger.
    cases = (
        (_register_instance, "instead of `tqdm_pandas(tqdm(...))`"),
        (_register_class, "instead of `tqdm_pandas(tqdm, ...)`"),
    )
    for register, hint in cases:
        with closing(StringIO()) as our_file:
            register(our_file)
            frame = pd.DataFrame(randint(0, 50, (500, 3)))
            frame.groupby(0).progress_apply(lambda x: None)
            # Check deprecation message
            assert "TqdmDeprecationWarning" in our_file.getvalue()
            assert hint in our_file.getvalue()
# Scale both time columns down by 1000 and truncate to integers.
# `map(int, ...)` returns a lazy iterator on Python 3, which cannot be
# assigned as a column; `.astype` truncates the same way `int()` does and
# works on both Python 2 and 3.
start_close_time['start_time'] = (
    start_close_time['start_time'] / 1000).astype('int64')
start_close_time['close_time'] = (
    start_close_time['close_time'] / 1000).astype('int64')

# Map every distinct app name to a numeric label (its position in the
# sorted unique array) and store that label as a string.
unique_app_name = np.unique(start_close_time['app_name'])
dict_label = dict(zip(unique_app_name, np.arange(len(unique_app_name))))
import time
start_close_time['app_name'] = start_close_time['app_name'].apply(
    lambda row: str(dict_label[row]))
del start_close_time['start_time'], start_close_time['close_time']

from tqdm import tqdm, tqdm_pandas
tqdm_pandas(tqdm())


def dealed_row(row):
    """Join the ``app_name`` values of one ``id`` group with single spaces."""
    return ' '.join(row['app_name'])


# One space-separated "document" of app labels per id; after `reset_index`
# the joined text lives in the column labelled 0.
data_feature = start_close_time.groupby('id').progress_apply(
    dealed_row).reset_index()
data_feature = pd.merge(data_all, data_feature, on='id', how='left')
del data_feature['id']

count_vec = CountVectorizer()
count_csr_basic = count_vec.fit_transform(data_feature[0])
tfidf_vec = TfidfVectorizer()
test_path = 'D:/data_science/kaggle_sound_classification/audio_test/' #TQDM build def tqdm_pandas(t): from pandas.core.frame import Series def inner(series, func, *args, **kwargs): t.total = series.size def wrapper(*args, **kwargs): t.update(1) return func(*args, **kwargs) result = series.apply(wrapper, *args, **kwargs) t.close() return result Series.progress_apply = inner tqdm_pandas(tqdm_notebook()) tqdm.pandas(desc="my bar!") #Feature engineering SAMPLE_RATE = 22050 from scipy.stats import skew print(os.listdir(os.getcwd())) tqdm.pandas train_files = os.listdir(train_path) test_files = os.listdir(test_path) train_files = glob(train_path + '*.wav') test_files = glob(test_path + '*.wav') SAMPLE_RATE = 22050 def get_feature(fname): #b,_ = librosa.load(fname, res_type = 'kaiser_fast') b,_ = librosa.load(fname, res_type = 'kaiser_fast')
import pandas as pd
from src.tools.tokenizer import tokenize, is_chinese, combine
import os
from os.path import join
from start_experiment import MANIFEST_FOLDER
from tqdm import tqdm, tqdm_pandas
from collections import Counter
import sentencepiece as spm
from start_experiment import MANIFEST_FOLDER
from tqdm import tqdm

# Register the tqdm class with pandas.
tqdm_pandas(tqdm)

# Corpus text files.
CORPUS = 'data/corpus.txt'
ENG_CORPUS = 'data/eng_corpus.txt'

# Manifest CSVs listed under the "CH" corpus name, as paths inside
# MANIFEST_FOLDER.
TRAIN_MANIFEST_LIST_FORCORPUS_CH = [
    join(MANIFEST_FOLDER, name)
    for name in (
        'aidatatang_200zh.csv',
        'AISHELL-2.csv',
        'c_500.csv',
        'ce_200.csv',
        'data_aishell.csv',
        'magic_data_train.csv',
        'magic_data_dev.csv',
        'magic_data_test.csv',
        'stcmds.csv',
        'prime.csv',
    )
]

# Manifest CSVs listed under the "EN" corpus name, as paths inside
# MANIFEST_FOLDER.
TRAIN_MANIFEST_LIST_FORCORPUS_EN = [
    join(MANIFEST_FOLDER, name)
    for name in (
        'libri_100.csv',
        'libri_360.csv',
        'libri_500.csv',
    )
]

# Single dev manifest.
DEV_MANIFEST_LIST_FORCORPUS = [join(MANIFEST_FOLDER, 'ce_20_dev.csv')]
# Fuzzy-match scores between question1 and question2 for the test set,
# computed through the dask frame `test_dd`.
# NOTE(review): `compute(get=...)` is the older dask spelling; newer dask
# releases select the scheduler with `scheduler=` — confirm against the
# pinned dask version.
test_df['fuzz_token_set_ratio'] = test_dd.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1, meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
test_df['fuzz_token_sort_ratio'] = test_dd.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1, meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
# Elapsed seconds since `start_time` (set outside the visible part).
print((time.time() - start_time))
del test_dd
###############################################################################
# Load the word2vec vectors; presumably `wmd` reads this module-level
# `model` — verify against its definition.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../input/GoogleNews-vectors-negative300.bin.gz', binary=True)
# A fresh bar with its own description and total is registered before every
# `progress_apply` call below.
tqdm_pandas(tqdm(desc="Train wmd:", total=len(train_df)))
train_df['wmd'] = train_df.progress_apply(
    lambda x: wmd(x['question1'], x['question2']), axis=1)
tqdm_pandas(tqdm(desc="Test wmd:", total=len(test_df)))
test_df['wmd'] = test_df.progress_apply(
    lambda x: wmd(x['question1'], x['question2']), axis=1)
del model
# The same vectors are loaded again and `init_sims(replace=True)` is called
# on this copy; presumably `norm_wmd` reads `norm_model` — verify.
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../input/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
tqdm_pandas(tqdm(desc="Train norm wmd:", total=len(train_df)))
train_df['norm_wmd'] = train_df.progress_apply(
    lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
tqdm_pandas(tqdm(desc="Test norm wmd:", total=len(test_df)))
# NOTE(review): the statement below continues beyond the visible part of the
# file; it is reproduced exactly as far as it is shown.
test_df['norm_wmd'] = test_df.progress_apply(
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_pandas

# Demo data: 100000 rows x 6 columns of random integers in [0, 100).
random_values = np.random.randint(0, 100, (100000, 6))
df = pd.DataFrame(random_values)

# Create and register a new `tqdm` instance with `pandas`
# (can use tqdm_gui, optional kwargs, etc.)
tqdm_pandas(tqdm())


def _square(x):
    return x**2


# Now you can use `progress_apply` instead of `apply`
df.progress_apply(_square)
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

""" Source code for `tqdm_pandas` (really simple!) """
# def tqdm_pandas(t):
#     from pandas.core.frame import DataFrame
#     def inner(df, func, *args, **kwargs):
#         t.total = groups.size // len(groups)
#         def wrapper(*args, **kwargs):
#             t.update(1)
#             return func(*args, **kwargs)
#         result = df.apply(wrapper, *args, **kwargs)
#         t.close()
#         return result
#     DataFrame.progress_apply = inner
import numpy as np
import pandas
import tqdm
from typing import Callable
from diyrex.cache import cacher

tqdm.tqdm_pandas(tqdm.tqdm)  # progress bars for groupby


def compute_implicit_ratings_func(signals: pandas.DataFrame, agg_func: Callable):
    """Aggregate signals per (user, item) pair into a normalized rating.

    Args:
        signals: frame with at least ``user`` and ``item`` columns.
        agg_func: called once per (user, item) group; its result is that
            pair's raw rating.

    Returns:
        A frame with ``user``, ``item`` and ``rating`` columns, where each
        rating is the raw value divided by the largest raw value.
    """
    raw = signals.groupby(['user', 'item']).progress_apply(agg_func)
    # normalize
    scaled = raw / raw.max()
    return scaled.to_frame(name='rating').reset_index()


def compute_implicit_ratings_1(signals):
    """rating is 1 if any interaction happened"""

    def _constant_one(group):
        return 1

    return compute_implicit_ratings_func(signals, _constant_one)


def compute_implicit_ratings_2(signals: pandas.DataFrame):
    """rating is the number of days with interactions"""

    def _distinct_days(group):
        return group['date'].dt.date.nunique()

    return compute_implicit_ratings_func(signals, _distinct_days)
import numpy as np
import pandas as pd
import tqdm
import pickle

progress_bar = tqdm.tqdm()
tqdm.tqdm_pandas(progress_bar)

#load data
from gym_splendor_code.envs.mechanics.abstract_observation import DeterministicObservation
from nn_models.utils.vectorizer import Vectorizer

raw_data_small = pd.read_pickle(
    '/home/tomasz/ML_Research/splendor/gym_open_ai-splendor/training_data/half_merged.pi'
)
# vectorizer = Vectorizer()


def obs_to_state(obs: DeterministicObservation):
    """Return the state rebuilt by ``obs.recreate_state()``."""
    state = obs.recreate_state()
    return state


# Turn every stored observation into a state, with a progress bar.
series_of_states = raw_data_small['observation'].progress_map(obs_to_state)
X_list = series_of_states.tolist()

# Keep only the largest multiple of 5 samples.
n = len(X_list)
m = (n // 5) * 5
X_list = X_list[:m]

# Targets as an (m, 1) column, cut to the same length as X_list.
values = raw_data_small['value'].tolist()[:m]
Y = np.array(values).reshape(m, 1)

# Row positions whose target has absolute value 1.
indices = [row for row, target in enumerate(Y) if abs(target) == 1]