# In[6]:
# Stanza NLP pipeline: tokenization, multi-word tokens, POS tagging and
# lemmatization. depparse is deliberately excluded from the processors.
import stanza

# One-time model download, toggled by a flag defined in an earlier cell.
if stanza_download:
    stanza.download(stanza_model_lang)

# IF NEEDED define 'pos_batch_size': 10000 in the next cell, config options.
config = {
    'processors': 'tokenize,mwt,pos,lemma',  # not using depparse
    'lang': stanza_model_lang,
}
nlp = stanza.Pipeline(**config)


# In[7]:
# Hook tqdm progress bars into pandas: after this, use progress_apply
# instead of apply to get a notebook progress bar.
import tqdm
from tqdm.notebook import tqdm_notebook

tqdm_notebook.pandas()


# In[8]:
# NLTK resources (download gated by a flag from an earlier cell) and the
# tokenizer/stemmer/stopword helpers used further down the notebook.
import nltk

if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


# In[9]:
import functions

# Directory where the transaction/gender data files are stored.
dataDir = 'C:\\Users\\Dmitry\\Downloads\\Python для анализа данных\\Python And Data Analysis\\data'

if __name__ == '__main__':
    # %matplotlib inline
    filterwarnings('ignore')

    # Load raw transactions and gender labels, plus model hyperparameters.
    (transactions_train,
     transactions_test,
     gender_train,
     gender_test) = functions.load_data(dataDir)
    params = functions.model_params()

    tqdm_notebook.pandas(desc="Progress:")

    def _build_features(transactions):
        # Aggregate basic per-client features (clients are the index).
        grouped = transactions.groupby(transactions.index)
        return grouped.progress_apply(functions.features_creation_basic)

    data_train = _build_features(transactions_train)
    data_test = _build_features(transactions_test)

    # Align the gender labels with the training features.
    target = data_train.join(gender_train, how='inner')['gender']

    functions.cv_score(params, data_train, target)

    # Number of trees for XGBoost is important to define after results
    # on cross-validation.
    clf, submission = functions.fit_predict(
        params, 70, data_train, data_test, target)