# In[6]:

import stanza

# Fetch the Stanza model files only when the flag (set in an earlier cell) asks for it.
if stanza_download:
    stanza.download(stanza_model_lang)

# Pipeline configuration.  The 'depparse' processor is deliberately omitted.
# If needed, add 'pos_batch_size': 10000 to this dict in the next cell.
config = dict(processors='tokenize,mwt,pos,lemma', lang=stanza_model_lang)
nlp = stanza.Pipeline(**config)

# In[7]:

import tqdm
from tqdm.notebook import tqdm_notebook
# Register the notebook-flavoured progress bar with pandas; after this call,
# DataFrame/Series/GroupBy gain a .progress_apply method that shows a bar.
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# to use tqdm in pandas use progress_apply instead of apply

# In[8]:

import nltk

# One-time corpus downloads, gated by a flag defined in an earlier cell.
if nltk_download:
    for resource in ('stopwords', 'punkt'):
        nltk.download(resource)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# In[9]:
# Example #2
import functions

import os

# Directory where the transaction data files are stored.
# NOTE(review): this was a hardcoded absolute Windows path, which breaks the
# notebook on any other machine.  It can now be overridden via the DATA_DIR
# environment variable; the default is unchanged for backward compatibility.
dataDir = os.environ.get(
    'DATA_DIR',
    'C:\\Users\\Dmitry\\Downloads\\Python для анализа данных\\Python And Data Analysis\\data')

if __name__ == '__main__':
    # Fix: `filterwarnings` is not imported anywhere in the visible part of
    # this file, so a fresh Restart-and-Run-All would raise NameError here.
    from warnings import filterwarnings

    # %matplotlib inline
    filterwarnings('ignore')

    # Load train/test transactions and the gender labels for each split
    # (exact schema is defined by functions.load_data).
    transactions_train, transactions_test, gender_train, gender_test = functions.load_data(
        dataDir)

    # Model hyper-parameters, defined in functions.py.
    params = functions.model_params()

    # Progress bar for the slow per-group feature aggregation below.
    tqdm_notebook.pandas(desc="Progress:")

    # Build basic features by grouping transactions on the index
    # (presumably the client id — TODO confirm against functions.load_data).
    data_train = transactions_train.groupby(
        transactions_train.index).progress_apply(
            functions.features_creation_basic)
    data_test = transactions_test.groupby(
        transactions_test.index).progress_apply(
            functions.features_creation_basic)

    # Inner join keeps only clients that have a known gender label.
    target = data_train.join(gender_train, how='inner')['gender']
    functions.cv_score(params, data_train, target)

    # Number of trees for XGBoost is important to define after results on
    # cross-validation; named constant instead of a magic number in the call.
    N_TREES = 70
    clf, submission = functions.fit_predict(params, N_TREES, data_train, data_test,
                                            target)