def get_feature_lda(n_topics):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    with timed_bolck('Gen_ALL_docs'):
        data = get_raw_data()
        app_docs = get_split_words(data[['app_des']])
        docs = app_docs.jieba_txt.apply(lambda val: ','.join(val))

    with timed_bolck('CountVec'):
        # Bag-of-words counts over the jieba-segmented descriptions
        cv = CountVectorizer(max_df=0.85, stop_words=[',', '[', ']', '(', ')'])
        cntTf = cv.fit_transform(docs)

    with timed_bolck(f'Cal LDA#{n_topics}'):
        lda = LatentDirichletAllocation(n_components=n_topics,
                                        learning_offset=50.,
                                        random_state=666)
        docres = lda.fit_transform(cntTf)

    # One column per topic, indexed by app_id
    return pd.DataFrame(docres,
                        columns=[f'fea_lda_{i}' for i in range(n_topics)],
                        index=data.app_id)
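# Hedged usage sketch (not part of the original module): how the LDA topic features
# might be joined with the TF-IDF block on the shared app_id index. The topic count
# of 20 is an illustrative assumption, not the project's actual setting.
def example_combine_lda_tfidf(n_topics=20):
    fea_lda = get_feature_lda(n_topics)
    fea_tfidf = get_tfidf_all()
    return fea_lda.join(fea_tfidf, how='left')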
def get_feature_seq_input_sentences():
    data = get_raw_data()

    with timed_bolck('for df cut txt to words'):
        jieba = get_jieba()
        input_sentences = [
            list(jieba.cut(str(text), cut_all=True))
            for text in data.app_des.values.tolist()
        ]

        word2id = get_word2id()

        # Encode input words as id sequences, dropping out-of-vocabulary tokens
        X = [[word2id[word] for word in sentence if word in word2id]
             for sentence in input_sentences]

        # Find the maximum number of words in a sentence (used as padding length)
        max_words = 0
        for sentence in input_sentences:
            if len(sentence) > max_words:
                max_words = len(sentence)
        logger.info(f'max_words={max_words}')

    with timed_bolck('X pad_sequences'):
        from keras.preprocessing.sequence import pad_sequences
        X = pad_sequences(X, max_words)

    return pd.DataFrame(X, index=data.app_id).add_prefix('seq_')
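# Hedged sketch (not part of the original pipeline): one way the padded id sequences
# returned above could feed a downstream classifier, assuming a plain Keras
# Embedding + LSTM model. vocab_size, max_words, n_classes and the layer sizes are
# illustrative assumptions.
def example_build_seq_classifier(vocab_size, max_words, n_classes):
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense
    model = Sequential([
        # +1 leaves room for the 0 used as padding by pad_sequences
        Embedding(input_dim=vocab_size + 1, output_dim=128, input_length=max_words),
        LSTM(64),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model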
def get_tfidf_all():
    with timed_bolck('Gen_ALL_docs'):
        data = get_raw_data()
        app_docs = get_split_words(data[['app_des']])
        docs = app_docs.jieba_txt.apply(lambda val: ','.join(val))

    with timed_bolck('CountVec'):
        from sklearn.feature_extraction.text import CountVectorizer
        cv = CountVectorizer(max_df=0.85, stop_words=[',', '[', ']', '(', ')'])
        word_count_vector = cv.fit_transform(docs)
        # Quick sanity peek at the learned vocabulary
        logger.debug(list(cv.vocabulary_.keys())[:10])

    with timed_bolck('TFIDF'):
        from sklearn.feature_extraction.text import TfidfTransformer
        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

    with timed_bolck('Gen Sparse TFIDF'):
        df = pd.SparseDataFrame(tf_idf_vector,
                                columns=cv.get_feature_names(),
                                index=data.app_id)

    return df
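# Hedged compatibility note (assumption about the runtime environment): pd.SparseDataFrame
# was deprecated in pandas 0.25 and removed in 1.0. On a newer pandas, an equivalent
# sparse construction would look roughly like this sketch.
def example_sparse_tfidf_frame(tf_idf_vector, feature_names, index):
    return pd.DataFrame.sparse.from_spmatrix(tf_idf_vector,
                                             columns=feature_names,
                                             index=index)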
def _get_embed_by_bert(X):
    with timed_bolck(f'Prepare train model'):
        from keras_bert import load_trained_model_from_checkpoint
        model = load_trained_model_from_checkpoint(
            config_path,
            checkpoint_path,
            training=True,
            seq_len=SEQ_LEN,
        )
        #model.summary(line_length=120)

        from tensorflow.python import keras
        from keras_bert import AdamWarmup, calc_train_steps

        # Cut the pre-trained model at the NSP-Dense layer: its output serves as a
        # fixed-length sentence embedding.
        inputs = model.inputs[:2]
        dense = model.get_layer('NSP-Dense').output
        model = keras.models.Model(inputs, dense)  #.summary()

    with timed_bolck(f'try to gen embed DF{len(X)}'):
        input1_col = [col for col in X.columns if str(col).startswith('bert_')]
        # train_x, train_y = filter_short_desc(train_x, train_y)

        input1 = X.loc[:, input1_col]   # .astype(np.float32)
        input2 = np.zeros_like(input1)  # .astype(np.int8)

        logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}')

        label2id, id2label = get_label_id()

        from keras_bert import get_custom_objects
        import tensorflow as tf

        with tf.keras.utils.custom_object_scope(get_custom_objects()):
            res_list = []
            partition_len = 5000
            # Predict in chunks to keep memory usage bounded
            for sn in tqdm(range(1 + len(X) // partition_len), 'gen embedding'):
                tmp = X.iloc[sn * partition_len:(sn + 1) * partition_len]
                # print('\nbegin tmp\n', tmp.iloc[:3,:3].head())
                res = model.predict([
                    tmp.loc[:, input1_col],
                    np.zeros_like(tmp.loc[:, input1_col])
                ])
                res = pd.DataFrame(res, index=tmp.index).add_prefix('embd_bert')
                # print('\nend tmp\n', res.iloc[:3, :3].head())
                res_list.append(res)

            res = pd.concat(res_list)

    return res
def get_embed_wordvec_file():
    fname = bert_wv
    if os.path.exists(fname):
        return fname
    else:
        # Embeddings for the class names (keyed by type_id and by type_name) ...
        type_id = Bert_Embed._get_embed_from_type_name()
        app_type = get_app_type()
        app_type = app_type.drop_duplicates('type_name')
        app_type = app_type.set_index('type_id')

        type_name = type_id.copy()
        type_name = pd.merge(type_name, app_type, how='right',
                             left_index=True, right_index=True)
        type_name = type_name.set_index('type_name')

        type_all = pd.concat([type_id, type_name])
        #del type_all['type_id']

        # ... plus embeddings for the app descriptions themselves
        app_desc = Bert_Embed._get_embed_from_app_desc()
        data = pd.concat([type_all, app_desc])

        with timed_bolck(f'Save data#{data.shape} records to :{fname}'):
            # Write in word2vec text format: a "<n_rows> <n_cols>" header line, then
            # one token followed by its vector per line.
            np.savetxt(fname,
                       data.reset_index().values,
                       delimiter=" ",
                       header="{} {}".format(len(data), len(data.columns)),
                       comments="",
                       fmt=["%s"] + ["%.6f"] * len(data.columns))

        return fname
def wrapper(*args, **kwargs):
    # Inner function of a memory-reduction decorator: `fn` is the wrapped feature builder.
    val = fn(*args, **kwargs)
    with timed_bolck(f'Reduce_Mem({fn.__name__}:{ex_type_name(val)})'):
        if isinstance(val, (pd.DataFrame,)):
            val = _reduce_mem_usage(val, verbose=True)
        elif isinstance(val, tuple) and all(
                [isinstance(df, (pd.DataFrame, pd.Series)) for df in val]):
            val = tuple([_reduce_mem_usage(df, verbose=True) for df in val])
        else:
            logger.warning(f'The return type for fun#{fn.__name__} is:{type(val)}')
    return val
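# Hedged sketch (the enclosing decorator is not part of this excerpt): the wrapper above
# reads like the inner function of a memory-reduction decorator applied to the feature
# builders, so its outer shape would presumably resemble the example below. The name
# reduce_mem_example is hypothetical.
def reduce_mem_example(fn):
    import functools

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        val = fn(*args, **kwargs)
        if isinstance(val, pd.DataFrame):
            val = _reduce_mem_usage(val, verbose=True)
        return val

    return wrapper

# usage (hypothetical):
# @reduce_mem_example
# def get_some_feature():
#     ...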
def get_app_des_2_ids(data):
    # One app_id can map to multiple type_id rows, so dedupe on the extended id first
    data = data.drop_duplicates(['app_id_ex'])
    with timed_bolck(f'str to bert format for DF:{data.shape}'):
        # Convert each description to BERT token ids; every element is
        # (total_token_count, comma-joined id string)
        ids = np.array(
            list([get_ids_from_text(text) for text in data.app_des.values.tolist()]))
        data['ids_lens_total'] = ids[:, 0].astype(int)
        data['ids'] = ids[:, 1]
    return data
def get_feature_bert_wv():
    with timed_bolck(f'Read wv by gensim'):
        fname = Bert_Embed.get_embed_wordvec_file()
        import gensim
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=False)

    raw_bert = get_feature_bert(SEQ_LEN)
    label2id, id2label = get_label_id()

    # One column per class: word-vector distance between the app embedding and the
    # class-name embedding in the shared space written by get_embed_wordvec_file()
    df = pd.DataFrame(np.zeros((len(raw_bert), num_classes)),
                      columns=label2id.keys(),
                      index=raw_bert.index)

    for col in tqdm(df.columns, desc=f'Cal distance for DF:{df.shape}'):
        df[col] = pd.Series(df.index).apply(
            lambda id_ex_bin: word_vectors.distance(id_ex_bin, col)).values

    return df
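# Hedged follow-up sketch (not in the original module): get_feature_bert_wv() yields one
# word-vector distance per class, so a naive zero-shot prediction is simply the column
# with the smallest distance for each app.
def example_predict_from_wv_distance():
    dist = get_feature_bert_wv()
    return dist.idxmin(axis=1)  # predicted label per app_id_ex_bin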
def get_feature_bert(seq_len):
    raw = get_raw_data()
    #print('====0', len(raw.loc[raw.app_id == 'BA915EC5E4CB0884C08C8DD9E9F1FD8F']))
    data = get_app_des_2_ids(raw)
    #print('====1', len(data.loc[data.app_id=='BA915EC5E4CB0884C08C8DD9E9F1FD8F']))
    data = split_app_des(data, seq_len)

    # Expand the comma-joined token ids into one column per position, padded with 0
    bert = data.ids.str.split(',', expand=True).add_prefix('bert_').fillna(0).astype(int)

    with timed_bolck(f'Join bert#{bert.shape} and raw#{raw.shape} data'):
        old_shape = bert.shape
        bert['app_id'] = data.app_id.values
        bert['app_id_ex'] = data.app_id_ex.values
        bert['app_id_ex_bin'] = data.app_id_ex_bin.values
        bert['ids_lens_bin'] = data.ids_lens_bin.values
        bert['ids_lens_total'] = data.ids_lens_total.values
        bert['bin'] = data.bin.values

        if 'app_des' in raw:
            del raw['app_des']
            del raw['app_id']

        bert = pd.merge(bert, raw, how='left', on=['app_id_ex'])
        bert.index = bert.app_id_ex_bin
        logger.info(
            f'Merge extend shape from {old_shape}, {raw.shape} to {bert.shape}')

    # How often the last position is still the 0 pad tells how much of seq_len is used
    padding_analysis = bert.loc[:, f'bert_{seq_len-1}'].value_counts().sort_index()
    logger.info(f'padding_analysis(bert_{seq_len-1}):\n{padding_analysis}')

    return bert.sort_values(['app_id_ex_bin'], ascending=False)
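# Hedged end-to-end sketch (assumed wiring, not the project's actual entry point): the
# bert_* id columns produced by get_feature_bert() are what _get_embed_by_bert()
# consumes, so a minimal embedding pass over all apps could look like this. In the full
# project _get_embed_by_bert may live on the Bert_Embed helper rather than being a
# module-level function.
def example_bert_embedding_pipeline(seq_len=SEQ_LEN):
    bert_ids = get_feature_bert(seq_len)   # one row per app_id_ex_bin, bert_* columns
    embed = _get_embed_by_bert(bert_ids)   # embd_bert* columns, same index
    return embed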