Example #1
def get_feature_lda(n_topics):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    with timed_bolck('Gen_ALL_docs'):
        data = get_raw_data()

        app_docs = get_split_words(data[['app_des']])

        docs = app_docs.jieba_txt.apply(lambda val: ','.join(val))

    with timed_bolck('CountVec'):

        cv = CountVectorizer(max_df=0.85, stop_words=[',', '[', ']', '(', ')'])
        cntTf = cv.fit_transform(docs)

    with timed_bolck(f'Cal LDA#{n_topics}'):

        lda = LatentDirichletAllocation(n_components=n_topics,
                                        learning_offset=50.,
                                        random_state=666)
        docres = lda.fit_transform(cntTf)

    return pd.DataFrame(docres,
                        columns=[f'fea_lda_{i}' for i in range(n_topics)],
                        index=data.app_id)
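
A minimal usage sketch (hypothetical call; the topic count 20 is illustrative, not taken from the source):

# Build LDA topic features over all app descriptions, indexed by app_id.
lda_features = get_feature_lda(n_topics=20)
print(lda_features.shape)               # (n_apps, 20)
print(list(lda_features.columns[:3]))   # ['fea_lda_0', 'fea_lda_1', 'fea_lda_2']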
Example #2
def get_feature_seq_input_sentences():
    data = get_raw_data()
    with timed_bolck('for df cut txt to words'):
        jieba = get_jieba()
        input_sentences = [
            list(jieba.cut(str(text), cut_all=True))
            for text in data.app_des.values.tolist()
        ]

    word2id = get_word2id()

    # Encode input words as id sequences (words missing from word2id are dropped)
    X = [[word2id[word] for word in sentence if word in word2id]
         for sentence in input_sentences]
    max_words = 0  # maximum number of words in a sentence
    # Find the longest sentence to use as the padding length
    for sentence in input_sentences:
        if len(sentence) > max_words:
            max_words = len(sentence)
            # logger.debug(f'max_words={max_words}')
    logger.info(f'max_words={max_words}')

    with timed_bolck('X pad_sequences'):
        from keras.preprocessing.sequence import pad_sequences
        X = pad_sequences(X, max_words)

    return pd.DataFrame(X, index=data.app_id).add_prefix('seq_')
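
For reference, a self-contained sketch of what pad_sequences does here (standard Keras API; the toy id sequences are made up):

from keras.preprocessing.sequence import pad_sequences

toy = [[3, 7, 2], [5]]                # word-id sequences of unequal length
print(pad_sequences(toy, maxlen=4))   # pre-pads with 0 up to maxlen:
                                      # [[0 3 7 2]
                                      #  [0 0 0 5]]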
Example #3
def get_tfidf_all():
    with timed_bolck('Gen_ALL_docs'):
        data = get_raw_data()

        app_docs = get_split_words(data[['app_des']])

        docs = app_docs.jieba_txt.apply(lambda val: ','.join(val))

    with timed_bolck('CountVec'):

        from sklearn.feature_extraction.text import CountVectorizer
        cv = CountVectorizer(max_df=0.85, stop_words=[',', '[', ']', '(', ')'])

        word_count_vector = cv.fit_transform(docs)
    with timed_bolck('TFIDF'):
        from sklearn.feature_extraction.text import TfidfTransformer

        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

    with timed_bolck('Gen Sparse TFIDF'):
        df = pd.SparseDataFrame(tf_idf_vector,
                                columns=cv.get_feature_names(),
                                index=data.app_id)

    return df
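
Note that pd.SparseDataFrame was removed in pandas 1.0; on current pandas/scikit-learn a hedged equivalent of the last block (reusing the tf_idf_vector, cv and data built above) could be:

# Assumes the objects built earlier in the function; get_feature_names_out needs scikit-learn >= 1.0.
df = pd.DataFrame.sparse.from_spmatrix(tf_idf_vector,
                                       columns=cv.get_feature_names_out(),
                                       index=data.app_id)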
Example #4
    def _get_embed_by_bert(X):
        with timed_bolck(f'Prepare train model'):

            from keras_bert import load_trained_model_from_checkpoint

            model = load_trained_model_from_checkpoint(
                config_path,
                checkpoint_path,
                training=True,
                seq_len=SEQ_LEN,
            )
            #model.summary(line_length=120)

            from tensorflow.python import keras
            from keras_bert import AdamWarmup, calc_train_steps
            inputs = model.inputs[:2]
            dense = model.get_layer('NSP-Dense').output
            model = keras.models.Model(inputs, dense)  #.summary()

        with timed_bolck(f'try to gen embed DF{len(X)}'):
            input1_col = [
                col for col in X.columns if str(col).startswith('bert_')
            ]
            # train_x, train_y = filter_short_desc(train_x, train_y)

            input1 = X.loc[:, input1_col]  # .astype(np.float32)
            input2 = np.zeros_like(input1)  # .astype(np.int8)

            logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}')

            label2id, id2label = get_label_id()
            from keras_bert import get_custom_objects
            import tensorflow as tf
            with tf.keras.utils.custom_object_scope(get_custom_objects()):
                res_list = []
                partition_len = 5000
                n_parts = -(-len(X) // partition_len)  # ceiling division, avoids an empty tail chunk
                for sn in tqdm(range(n_parts), 'gen embedding'):
                    tmp = X.iloc[sn * partition_len:(sn + 1) * partition_len]
                    # print('\nbegin tmp\n', tmp.iloc[:3,:3].head())
                    res = model.predict([
                        tmp.loc[:, input1_col],
                        np.zeros_like(tmp.loc[:, input1_col])
                    ])
                    res = pd.DataFrame(res,
                                       index=tmp.index).add_prefix('embd_bert')
                    # print('\nend tmp\n', res.iloc[:3, :3].head())
                    res_list.append(res)

                res = pd.concat(res_list)

        return res
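
The prediction loop above is a plain chunking pattern to keep memory bounded; a self-contained sketch of the same idea with a stand-in predict function (all names here are illustrative):

import numpy as np
import pandas as pd

def predict_in_chunks(X, predict_fn, partition_len=5000):
    """Run predict_fn over fixed-size row slices of X and concatenate the results."""
    n_parts = -(-len(X) // partition_len)  # ceiling division
    parts = []
    for sn in range(n_parts):
        tmp = X.iloc[sn * partition_len:(sn + 1) * partition_len]
        parts.append(pd.DataFrame(predict_fn(tmp.values), index=tmp.index))
    return pd.concat(parts)

demo = pd.DataFrame(np.random.rand(12, 4))
print(predict_in_chunks(demo, lambda a: a * 2, partition_len=5).shape)  # (12, 4)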
Example #5
    def get_embed_wordvec_file():
        fname = bert_wv
        if os.path.exists(fname):
            return fname
        else:
            type_id = Bert_Embed._get_embed_from_type_name()

            app_type = get_app_type()
            app_type = app_type.drop_duplicates('type_name')
            app_type = app_type.set_index('type_id')

            type_name = type_id.copy()
            type_name = pd.merge(type_name,
                                 app_type,
                                 how='right',
                                 left_index=True,
                                 right_index=True)
            type_name = type_name.set_index('type_name')

            type_all = pd.concat([type_id, type_name])
            #del type_all['type_id']

            app_desc = Bert_Embed._get_embed_from_app_desc()

            data = pd.concat([type_all, app_desc])

            with timed_bolck(f'Save data#{data.shape} records to :{fname}'):
                np.savetxt(fname,
                           data.reset_index().values,
                           delimiter=" ",
                           header="{} {}".format(len(data), len(data.columns)),
                           comments="",
                           fmt=["%s"] + ["%.6f"] * len(data.columns))

            return fname
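
The np.savetxt call writes a plain-text word2vec file: a "<count> <dim>" header followed by one "<key> <floats...>" row per vector, which is what Example #8 later loads with gensim. A tiny round-trip sketch with made-up keys:

import numpy as np
import pandas as pd
import gensim

toy = pd.DataFrame(np.random.rand(2, 3), index=['type_a', 'app_b'])
np.savetxt('toy_wv.txt', toy.reset_index().values, delimiter=' ',
           header=f'{len(toy)} {len(toy.columns)}', comments='',
           fmt=['%s'] + ['%.6f'] * len(toy.columns))

kv = gensim.models.KeyedVectors.load_word2vec_format('toy_wv.txt', binary=False)
print(kv['type_a'])  # the 3-dimensional vector written above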
Example #6
 def wrapper(*args, **kwargs):
     val = fn(*args, **kwargs)
     with timed_bolck(f'Reduce_Mem({fn.__name__}:{ex_type_name(val)})'):
         if isinstance(val, pd.DataFrame):
             val = _reduce_mem_usage(val, verbose=True)
         elif isinstance(val, tuple) and all(isinstance(df, (pd.DataFrame, pd.Series)) for df in val):
             val = tuple(_reduce_mem_usage(df, verbose=True) for df in val)
         else:
             logger.warning(f'The return type for fun#{fn.__name__} is:{type(val)}')
     return val
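
This wrapper is presumably the inner function of a decorator; a hedged, self-contained sketch of how such a decorator could be assembled and applied (the float32 downcast stands in for the module's _reduce_mem_usage helper):

import functools
import numpy as np
import pandas as pd

def reduce_mem(fn):
    """Decorator sketch: shrink DataFrame results returned by fn."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        val = fn(*args, **kwargs)
        if isinstance(val, pd.DataFrame):
            for col in val.select_dtypes('float64'):   # stand-in for _reduce_mem_usage
                val[col] = val[col].astype(np.float32)
        return val
    return wrapper

@reduce_mem
def toy_features():
    return pd.DataFrame({'a': np.random.rand(3)})

print(toy_features().dtypes)  # a: float32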
Example #7
def get_app_des_2_ids(data):
    data = data.drop_duplicates(['app_id_ex'])
    with timed_bolck(f'str to bert format for DF:{data.shape}'):
        # One app_id can map to multiple type_id values
        ids = np.array([
            get_ids_from_text(text)
            for text in data.app_des.values.tolist()
        ])

    data['ids_lens_total'] = ids[:, 0].astype(int)

    data['ids'] = ids[:, 1]

    return data
Example #8
def get_feature_bert_wv():
    with timed_bolck(f'Read wv by gensim'):
        fname = Bert_Embed.get_embed_wordvec_file()
        import gensim
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
            fname, binary=False)
        raw_bert = get_feature_bert(SEQ_LEN)
        label2id, id2label = get_label_id()
        df = pd.DataFrame(np.zeros((len(raw_bert), num_classes)),
                          columns=label2id.keys(),
                          index=raw_bert.index)

    for col in tqdm(df.columns, desc=f'Calc distance for DF:{df.shape}'):
        df[col] = pd.Series(df.index).apply(
            lambda id_ex_bin: word_vectors.distance(id_ex_bin, col)).values

    return df
Example #9
def get_feature_bert(seq_len):

    raw = get_raw_data()
    #print('====0', len(raw.loc[raw.app_id == 'BA915EC5E4CB0884C08C8DD9E9F1FD8F']))
    data = get_app_des_2_ids(raw)
    #print('====1', len(data.loc[data.app_id=='BA915EC5E4CB0884C08C8DD9E9F1FD8F']))
    data = split_app_des(data, seq_len)

    bert = data.ids.str.split(
        ',', expand=True).add_prefix('bert_').fillna(0).astype(int)

    with timed_bolck(f'Join bert#{bert.shape} and raw#{raw.shape} data'):
        old_shape = bert.shape

        bert['app_id'] = data.app_id.values
        bert['app_id_ex'] = data.app_id_ex.values
        bert['app_id_ex_bin'] = data.app_id_ex_bin.values

        bert['ids_lens_bin'] = data.ids_lens_bin.values
        bert['ids_lens_total'] = data.ids_lens_total.values

        bert['bin'] = data.bin.values
        if 'app_des' in raw: del raw['app_des']
        del raw['app_id']
        bert = pd.merge(bert, raw, how='left', on=['app_id_ex'])

        bert.index = bert.app_id_ex_bin
        logger.info(
            f'Merge extend shape from {old_shape}, {raw.shape} to {bert.shape}'
        )

    padding_analysis = bert[f'bert_{seq_len-1}'].value_counts().sort_index()
    logger.info(f'padding_analysis(bert_{seq_len-1}):\n{padding_analysis}')
    return bert.sort_values(['app_id_ex_bin'], ascending=False)
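
The ids.str.split(..., expand=True) step turns one comma-separated string of token ids per row into fixed-width integer columns, zero-padded on the right; a small self-contained sketch with toy ids:

import pandas as pd

ids = pd.Series(['101,2769,3342', '101,88'])
print(ids.str.split(',', expand=True).add_prefix('bert_').fillna(0).astype(int))
#    bert_0  bert_1  bert_2
# 0     101    2769    3342
# 1     101      88       0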