def load_text_features(train, test, helper_data_path, tf_idf=True):
    """Merge precomputed NLP features (and optionally TF-IDF SVD features)
    onto the train/test frames.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame; must contain an 'item_id' column.
    test : pd.DataFrame or None
        Test frame (skipped when None); must contain 'item_id' when given.
    helper_data_path : str
        Directory holding the helper feature archives.
    tf_idf : bool
        When True, also concatenate the precomputed TF-IDF SVD columns
        loaded via the module-level `load_df` helper.

    Returns
    -------
    tuple of (train, test) with the feature columns added.
    """
    print('Loading text features...')

    # Columns kept from each pickled NLP frame inside the zip archives
    # (Stephan's nlp enrichment).
    nlp_columns = [
        'item_id', 'title_word_count',
        'description_non_regular_chars_ratio', 'description_word_count',
        'merged_params_word_count', 'description_sentence_count',
        'description_words/sentence_ratio', 'title_capital_letters_ratio',
        'description_capital_letters_ratio', 'title_non_regular_chars_ratio',
        'title_num_of_newrow_char', 'description_num_of_newrow_char',
        'title_num_adj', 'title_num_nouns', 'title_adj_to_len_ratio',
        'title_noun_to_len_ratio', 'description_num_adj',
        'description_num_nouns', 'description_adj_to_len_ratio',
        'description_noun_to_len_ratio', 'title_first_noun_stemmed',
        'title_second_noun_stemmed', 'title_third_noun_stemmed',
        'description_first_noun_stemmed', 'description_second_noun_stemmed',
        'description_third_noun_stemmed', 'title_first_adj_stemmed',
        'title_second_adj_stemmed', 'title_third_adj_stemmed',
        'description_first_adj_stemmed', 'description_second_adj_stemmed',
        'description_third_adj_stemmed', 'title_sentiment',
        'description_sentiment'
    ]

    def get_df(filename):
        """Read every pickle inside the zip archive and stack them into one frame."""
        frames = []
        with zipfile.ZipFile(path.join(helper_data_path, filename), 'r') as zip_ref:
            for name in zip_ref.namelist():
                # `with` closes each member handle (the original leaked them).
                with zip_ref.open(name) as member:
                    nlp_df = pd.read_pickle(member)
                frames.append(nlp_df[nlp_columns])
        # Single concat avoids the quadratic cost of concatenating in a loop;
        # an empty archive yields None, matching the original behavior.
        return pd.concat(frames) if frames else None

    train = train.merge(get_df('train_NLP_enriched.zip'), on='item_id', how='left')
    if test is not None:
        test = test.merge(get_df('test_NLP_enriched.zip'), on='item_id', how='left')

    # tf-idf SVD columns are positionally aligned, hence axis=1 concat.
    if tf_idf:
        print('loading tfidf features...')
        train = pd.concat(
            [train, load_df(helper_data_path, 'train_tfidf_svd.csv.gz')], axis=1)
        if test is not None:
            test = pd.concat(
                [test, load_df(helper_data_path, 'test_tfidf_svd.csv.gz')], axis=1)

    print('Done loading text features.')
    gc.collect()
    return train, test
def add_aggregated_features(train, test, helper_data_path):
    """Attach precomputed aggregated features to both frames.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame.
    test : pd.DataFrame or None
        Test frame; skipped when None (consistent with `load_text_features`).
    helper_data_path : str
        Directory holding 'aggregated_features.csv.gz'.

    Returns
    -------
    tuple of (train, test) with aggregated features merged in by
    `add_aggregated_features_inner`.
    """
    aggregated_features = load_df(helper_data_path, 'aggregated_features.csv.gz')
    train = add_aggregated_features_inner(train, aggregated_features)
    # Guard added: sibling loaders accept test=None; the original crashed here.
    if test is not None:
        test = add_aggregated_features_inner(test, aggregated_features)
    return train, test
def load_text_features(train, test, helper_data_path, tf_idf=True):
    """Merge precomputed NLP features, optional TF-IDF SVD features, and
    simple text-count features onto the train/test frames.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame; must contain 'item_id', 'title' and 'description'
        columns (the latter two are used by the count features).
    test : pd.DataFrame or None
        Test frame (skipped when None); same column requirements as train.
    helper_data_path : str
        Directory holding the helper feature archives.
    tf_idf : bool
        When True, also concatenate the precomputed TF-IDF SVD columns
        loaded via the module-level `load_df` helper.

    Returns
    -------
    tuple of (train, test) with the feature columns added.
    """
    print('Loading text features...')

    # Columns kept from each pickled NLP frame inside the zip archives
    # (Stephan's nlp enrichment).
    nlp_columns = [
        'item_id', 'title_word_count',
        'description_non_regular_chars_ratio', 'description_word_count',
        'merged_params_word_count', 'description_sentence_count',
        'description_words/sentence_ratio', 'title_capital_letters_ratio',
        'description_capital_letters_ratio', 'title_non_regular_chars_ratio',
        'title_num_of_newrow_char', 'description_num_of_newrow_char',
        'title_num_adj', 'title_num_nouns', 'title_adj_to_len_ratio',
        'title_noun_to_len_ratio', 'description_num_adj',
        'description_num_nouns', 'description_adj_to_len_ratio',
        'description_noun_to_len_ratio', 'title_first_noun_stemmed',
        'title_second_noun_stemmed', 'title_third_noun_stemmed',
        'description_first_noun_stemmed', 'description_second_noun_stemmed',
        'description_third_noun_stemmed', 'title_first_adj_stemmed',
        'title_second_adj_stemmed', 'title_third_adj_stemmed',
        'description_first_adj_stemmed', 'description_second_adj_stemmed',
        'description_third_adj_stemmed', 'title_sentiment',
        'description_sentiment'
    ]

    def get_df(filename):
        """Read every pickle inside the zip archive and stack them into one frame."""
        frames = []
        with zipfile.ZipFile(path.join(helper_data_path, filename), 'r') as zip_ref:
            for name in zip_ref.namelist():
                # `with` closes each member handle (the original leaked them).
                with zip_ref.open(name) as member:
                    nlp_df = pd.read_pickle(member)
                frames.append(nlp_df[nlp_columns])
        # Single concat avoids the quadratic cost of concatenating in a loop;
        # an empty archive yields None, matching the original behavior.
        return pd.concat(frames) if frames else None

    train = train.merge(get_df('train_NLP_enriched.zip'), on='item_id', how='left')
    if test is not None:
        test = test.merge(get_df('test_NLP_enriched.zip'), on='item_id', how='left')

    # tf-idf SVD columns are positionally aligned, hence axis=1 concat.
    if tf_idf:
        print('loading tfidf features...')
        train = pd.concat(
            [train, load_df(helper_data_path, 'train_tfidf_svd.csv.gz')], axis=1)
        if test is not None:
            test = pd.concat(
                [test, load_df(helper_data_path, 'test_tfidf_svd.csv.gz')], axis=1)

    def more_text_count_features(df):
        """Add unique-word counts, punctuation counts, and unique-word
        ratios (vs. the NLP word counts merged above) to df in place."""
        # Hoisted: build the punctuation set once, not per row.
        punctuation = set(string.punctuation)
        for col in ['description', 'title']:
            df['num_unique_words_' + col] = df[col].apply(
                lambda text: len(set(text.split())))
        df['num_desc_punct'] = df['description'].apply(
            lambda text: sum(1 for ch in text if ch in punctuation))
        df['words_vs_unique_title'] = (
            df['num_unique_words_title'] / df['title_word_count'] * 100)
        df['words_vs_unique_description'] = (
            df['num_unique_words_description'] / df['description_word_count'] * 100)
        return df

    train = more_text_count_features(train)
    # Guard added: every other use of `test` in this function checks for
    # None, but the original crashed here when test was None.
    if test is not None:
        test = more_text_count_features(test)

    print('Done loading text features.')
    gc.collect()
    return train, test