Example no. 1
    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))
Example no. 2
def create_dictionary(model=None, text=None):
    if model is not None and text is not None:
        gensim_dic = Dictionary()
        gensim_dic.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2index = {v: k + 1 for k, v in gensim_dic.items()}
        w2vec = {word: model[word] for word in w2index.keys()}

        def word2id(text):  # convert tokenized text to dictionary indices and pad
            data = []
            for te in text:
                word_2_id = []
                for word in te:
                    try:
                        word_2_id.append(w2index[word])
                    except KeyError:
                        # tokens not in the dictionary get index 0
                        word_2_id.append(0)
                data.append(word_2_id)
            return data

        text = word2id(text)
        text = sequence.pad_sequences(text, maxlen=100)

        return w2index, w2vec, text
    else:
        print('data is None')
Example no. 3
def test_dictionaries():
    dictionary = Dictionary(TOKEN_SETS)
    # it maps tokens to numeric indices:
    assert list(dictionary.items()) == [(0, 'all'), (1, 'kings'), (2, 'men'),
                                        (3, 'the'), (4, 'ate'), (5, 'hens'),
                                        (6, 'and'), (7, 'got'), (8, 'sleep'),
                                        (9, 'they'), (10, 'tired'), (11, 'to'),
                                        (12, 'until'), (13, 'went'),
                                        (14, 'zzz')]
    assert dictionary.token2id == {
        'all': 0,
        'kings': 1,
        'men': 2,
        'the': 3,
        'ate': 4,
        'hens': 5,
        'and': 6,
        'got': 7,
        'sleep': 8,
        'they': 9,
        'tired': 10,
        'to': 11,
        'until': 12,
        'went': 13,
        'zzz': 14
    }
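As a follow-on to the assertions above, here is a minimal hedged sketch (reusing the same dictionary built from TOKEN_SETS) of how doc2bow turns a tokenized document into (token_id, count) pairs:

# Hedged sketch reusing the dictionary above; doc2bow returns (token_id, count)
# pairs sorted by token id, following the token2id mapping shown in the asserts.
bow = dictionary.doc2bow(['the', 'kings', 'ate', 'the', 'hens'])
assert bow == [(1, 1), (3, 2), (4, 1), (5, 1)]  # kings=1, the=3 (twice), ate=4, hens=5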
Example no. 4
def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: p_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
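A typical next step, shown in fuller form in later examples on this page, is to turn the two returned mappings into an embedding matrix; a minimal sketch, assuming p_model is a trained gensim Word2Vec model:

import numpy as np

w2indx, w2vec = create_dictionaries(p_model)
# Row 0 is left as the zero vector for padding / out-of-vocabulary words.
embedding_weights = np.zeros((len(w2indx) + 1, p_model.vector_size))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]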
Example no. 5
def create_dictionaries(model=None, combined=None):
    if (model is not None) and (combined is not None):
        gensim_dic = Dictionary()
        gensim_dic.doc2bow(model.wv.vocab.keys(), allow_update=True)

        w2index = {v: k + 1 for k, v in gensim_dic.items()}  # word -> index
        w2vec = {word: model[word] for word in w2index.keys()}  # word -> vector

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2index[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # inputs must all be the same length, so sentences are truncated or zero-padded
        combined = sequence.pad_sequences(combined, maxlen=100)

        return w2index, w2vec, combined
    else:
        print('data is None')
Example no. 6
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # words with frequency below 10 map to 0, hence k + 1
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10, (k->v) => (v->k)
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10, (word -> model[word])

        def parse_dataset(combined):  # closure, used only here
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # words with frequency below 10 map to 0
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        # word indices per sentence; words with frequency below 10 get index 0 (maxlen is defined outside this snippet)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 7
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
            1- Creates a word to index mapping
            2- Creates a word to vector mapping
            3- Transforms the Training and Testing Dictionaries
    '''
    maxlen = 100
    if (combined is not None) and model is not None:
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10

        def parse_dataset(combined):
            '''
            Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        combined=parse_dataset(combined)
        combined=sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided')
Example no. 8
def create_dictionaries(model=None, sen_lst=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (sen_lst is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # vectors of all words with frequency above 10

        def parse_dataset(sen_lst):
            ''' Words become integers
            '''
            data = []
            for sentence in sen_lst:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(sen_lst)
        global MAX_LEN
        # word indices per sentence; words with frequency below 10 get index 0
        combined = sequence.pad_sequences(combined, maxlen=MAX_LEN)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 9
def create_dictionaries(data=None, model=None):
    if (data is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(dataset):
            ''' Words become integers
            '''
            for key in dataset.keys():
                txt = dataset[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                dataset[key] = new_txt
            return dataset
        data = parse_dataset(data)
        return w2indx, w2vec, data
    else:
        print('No data provided...')
Example no. 10
def create_dictionaries(model=None, combined=None):
    """
    返回索引,单词向量矩阵和具有统一长度和索引的句子
    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # words that have a word vector get a non-zero index
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # gather the corresponding vectors into a word -> vector mapping
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # use Keras' pad_sequences to make all sentences the same length (max_len is defined outside this snippet)
        combined = sequence.pad_sequences(combined, maxlen=max_len)
        return w2indx, w2vec, combined
    else:
        logging.warning('No data provided...')
Example no. 11
def test_streaming():
    generator = token_stream(NOVELS_DIRPATH)
    # it can be constructed via a generator:
    dictionary = Dictionary(generator)
    token_items = list(dictionary.items())
    assert len(token_items) == 1969
    assert token_items[0:4] == [(0, 'a'), (1, 'about'), (2, 'accommodate'),
                                (3, 'admire')]
Example no. 12
def test_statistical_trimming():
    dictionary = Dictionary(TOKEN_SETS)
    # no_below and no_above work like min_df and max_df, except that:
    #   + no_below: an absolute number of documents
    #   + no_above: a fraction of documents
    dictionary.filter_extremes(no_below=2, no_above=0.99)
    # it excludes terms not meeting the filter conditions:
    assert list(dictionary.items()) == [(0, 'kings'), (1, 'the')]
    assert dictionary.token2id == {'kings': 0, 'the': 1}
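After filter_extremes the ids are compacted, as the assertions above show, and doc2bow simply drops tokens that were filtered out; a small hedged check reusing the trimmed dictionary:

# 'ate' and 'zzz' were removed by the filter, so only 'kings' (id 0) and 'the' (id 1) remain.
assert dictionary.doc2bow(['the', 'kings', 'ate', 'zzz']) == [(0, 1), (1, 1)]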
Example no. 13
    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))

        # XXX Do we want list results from the dict members in Py3 too?
        if not PY3:
            self.assertTrue(isinstance(d.items(), list))
            self.assertTrue(isinstance(d.keys(), list))
            self.assertTrue(isinstance(d.values(), list))
Example no. 14
def word2vec_init(model=None):  # load the word2vec model and build the corresponding embedding matrix
    global word2vec, word2idx, embed_weight
    model = Word2Vec.load(
        './model/word2vec') if not model else model  # load the saved model if none was passed in

    dic = Dictionary()
    dic.doc2bow(model.wv.vocab.keys(), allow_update=True)
    word2idx = {token: idx + 1 for idx, token in dic.items()}
    word2vec = {word: model[word] for word in dic.values()}

    embed_weight = zeros((len(word2idx) + 1, embed_dim))
    for word, idx in word2idx.items():
        embed_weight[idx, :] = word2vec[word]  # embedding matrix; row 0 stays the zero vector
Example no. 15
def create_dictionaries(model=None, text=None):
    """Function does a number of jobs:
            1- Creates a word to index mapping
            2- Creates a word to vector mapping
            3- Transforms the Training and Testing Dictionaries
    """
    maxlen = 100  # truncation/padding length
    if (text is not None) and model is not None:
        # behaves like a Python dict: the keys are the words, the values are their unique integer IDs
        gensim_dict = Dictionary()

        # doc2bow() counts the occurrences of each unique word, converts words to integer ids
        # and returns the result as a sparse vector
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

        # index of each word (k -> id, v -> word); all words with frequency above 10
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}

        # word vector of each word with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}

        # turn text into numbers
        def parse_dataset(text):
            """
            Words become integers
            """
            data = []
            for sentence in text:
                # print(sentence)  # the tokenized sentence
                new_txt = []
                for word in sentence:
                    try:
                        # print(word)  # a single token
                        new_txt.append(w2indx[word])
                        # print(w2indx[word])  # its index
                    except KeyError:
                        new_txt.append(0)
                # print(new_txt)  # the sentence as an index sequence
                data.append(new_txt)
            return data

        text = parse_dataset(text)

        # truncate or pad all sequences to the same length
        text = sequence.pad_sequences(text, maxlen=maxlen)
        return w2indx, w2vec, text
    else:
        print('No data provided')
Example no. 16
def _tfidf_gensim(table,
                  input_col,
                  output_col_name="sparse_vectors",
                  tf_weighing='n',
                  df_weighing='t',
                  document_normalization='c'):

    out_table = table.copy()
    tokens = out_table[input_col]
    smartirs = tf_weighing + df_weighing + document_normalization

    dictionary = Dictionary(tokens)
    word_count_vector_list = [dictionary.doc2bow(text) for text in tokens]

    tfidf_model = TfidfModel(word_count_vector_list, smartirs=smartirs)
    tfidf_vector_list = [*tfidf_model[word_count_vector_list]]

    sparse_matrix = corpus2csc(tfidf_vector_list,
                               num_terms=len(dictionary.token2id)).T

    rb = BrtcReprBuilder()

    dictionary_data = [[
        index, word, tfidf_model.dfs[index], tfidf_model.idfs[index]
    ] for index, word in dictionary.items()]
    dictionary_table = pd.DataFrame(data=dictionary_data,
                                    columns=['index', 'word', 'count', 'idf'])
    dictionary_table = dictionary_table.sort_values(["count"],
                                                    ascending=[False])

    rb.addMD(
        strip_margin("""
    | ## TFIDF Result
    | ### Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(dictionary_table))))

    out_table[output_col_name] = csr_matrix_to_sparse_vector_json_list(
        sparse_matrix)

    model = _model_dict('tfidf_model')
    model['dictionary_table'] = dictionary_table
    model['dictionary'] = dictionary
    model['tfidf_model'] = tfidf_model
    model['input_col'] = input_col
    model['output_col_name'] = output_col_name
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
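The smartirs argument used above concatenates one SMART code each for term-frequency weighting, document-frequency weighting and normalization; a self-contained hedged sketch with the common 'ntc' scheme (raw tf, idf, cosine normalization):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [['human', 'interface'], ['interface', 'user'], ['user', 'response', 'time']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

tfidf = TfidfModel(corpus, smartirs='ntc')  # n = raw tf, t = idf, c = cosine normalization
weights = tfidf[corpus[0]]                  # list of (token_id, weight) pairs for the first document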
Example no. 17
def generate_id2wec(model_path):
    """
       :param word2vec_model: 词向量模型位置
       :return: dictionary文字编号填入w2id,二维列表词向量embedding_weights
    """
    model = Word2Vec.load(model_path)
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2id = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: model[word] for word in w2id.keys()}  # word vectors
    n_vocabs = len(w2id) + 1
    embedding_weights = np.zeros((n_vocabs, 100))
    for w, index in w2id.items():  # starting from index 1, fill the matrix with word vectors
        embedding_weights[index, :] = w2vec[w]
    return w2id, embedding_weights
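The embedding_weights matrix built here is normally passed to an embedding layer; a hedged sketch using the Keras API already used elsewhere on this page (the model path is hypothetical):

from keras.layers import Embedding

w2id, embedding_weights = generate_id2wec('word2vec.model')  # hypothetical path to a saved Word2Vec model
embedding_layer = Embedding(input_dim=embedding_weights.shape[0],  # vocabulary size + 1 (row 0 = padding)
                            output_dim=100,                        # matches the 100-dim vectors above
                            weights=[embedding_weights],
                            mask_zero=True,
                            trainable=False)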
Example no. 18
def load_data_and_to_vector(window_size=10, n_dims=100, pad_max_length=200):
    '''
    :param window_size: word2vec window size
    :param n_dims: word-vector dimensionality
    :param pad_max_length: padded sequence length
    :return:
    '''
    words = np.load('../../data/company/tokenized.npy')
    y = np.load('../../data/company/label.npy')
    test = np.load('../../data/company/test.npy')

    total = np.concatenate((words, test))

    w2v = Word2Vec(size=n_dims, window=window_size, workers=4, min_count=1)
    w2v.build_vocab(total)
    w2v.train(total, total_examples=w2v.corpus_count, epochs=w2v.iter)

    _dict = Dictionary()
    # doc2bow maps the vocabulary onto this dictionary (allow_update=True adds new tokens)
    _dict.doc2bow(w2v.wv.vocab.keys(), allow_update=True)
    # w2index is a dict of {word: index} and w2vector is a dict of {word: vector (np.array)}
    w2index = {v: k + 1 for k, v in _dict.items()}  # word indices
    w2vector = {word: w2v[word] for word in w2index.keys()}  # word vectors

    # convert token sequences to index sequences
    _sequence = []
    for _s in words:
        _sequence.append([w2index[w] for w in _s])

    _tests = []
    for _s in test:
        _tests.append([w2index[w] for w in _s])

    padded_words = sequence.pad_sequences(_sequence, maxlen=pad_max_length)
    padded_test = sequence.pad_sequences(_tests, maxlen=pad_max_length)

    # ------- embed start -------
    n_symbols = len(w2index) + 1  # + 1 because index 0 is reserved for padding
    # every index in the table maps to an n_dims-dimensional vector
    embedding_weights = np.zeros((n_symbols, n_dims))
    # fill in the vectors
    for word, index in w2index.items():
        embedding_weights[index, :] = w2vector[word]
    # -------- embed end -------

    return n_symbols, embedding_weights, pad_max_length, padded_words, y, padded_test
Example no. 19
def get_word2idx(corpus, w2i_path, keep_tokens, token_limit):

    if iom.check_exists(w2i_path):

        logger.info("Found dictionary! Loading...")
        word2id = iom.load_pickle(w2i_path)

    else:
        logger.info("Dictionary not found! Creating...")
        id2word = Dictionary(corpus, prune_at=2000000)
        # filter out too freq/infreq words
        id2word.filter_extremes(keep_n=token_limit,
                                no_below=2,
                                keep_tokens=keep_tokens)
        word2id = {v: k for k, v in id2word.items()}
        iom.save_pickle(word2id, w2i_path)

    return word2id
Example no. 20
def create_dictionaries(model=None, X=None):
    """Create the word dictionary and return each word's index, its word vector, and the index sequence for each sentence.
       Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    """
    if (X is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2v_ind = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2v_ind.keys()}  # vectors of all words with frequency above 10
        X = parse_dataset(X, w2v_ind)
        # word indices per sentence; words with frequency below 10 get index 0
        X = sequence.pad_sequences(X, maxlen=setting.VOCABULARY_MAXLEN)
        return w2v_ind, w2vec, X
    else:
        print('No data provided...')
Example no. 21
def word2vector(X_train):
    """训练词向量"""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    wv_model = Word2Vec(X_train,
                        size=wv_size,
                        window=6,
                        sg=1,
                        min_count=5,
                        workers=multiprocessing.cpu_count(),
                        iter=10)

    gensim_dict = Dictionary()  # build the word dictionary
    gensim_dict.doc2bow(wv_model.wv.vocab.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: wv_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
Example no. 22
def create_eta(topic_defs: [TopicDef], etadict: corpora.Dictionary,
               ntopics: int) -> np.ndarray:
    # create a (ntopics, nterms) matrix and fill with 1
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=1)
    for topic_idx, topic_def in enumerate(
            topic_defs):  # for each word in the list of priors
        for word in topic_def.words:
            keyindex = [
                index for index, term in etadict.items() if term == word
            ]  # find word in dict
            if (len(keyindex) > 0):  # if it's in the dictionary
                eta[topic_idx,
                    keyindex[0]] = 1e10  # put a large number in there
            else:
                print(
                    f'create_eta: word "{word}" of topic {topic_def.name} not found in dictionary'
                )
    eta = np.divide(
        eta,
        eta.sum(axis=0))  # normalize so probabilities sum to 1 over all topics
    return eta
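The eta matrix produced above is intended as a per-topic word prior for gensim's LdaModel; a minimal hedged usage sketch, assuming bow_corpus is a list of doc2bow vectors built with the same etadict and topic_defs/ntopics are defined as above:

from gensim.models import LdaModel

eta = create_eta(topic_defs, etadict, ntopics)
# LdaModel accepts eta as a (num_topics, num_terms) array of Dirichlet priors.
lda = LdaModel(corpus=bow_corpus, id2word=etadict, num_topics=ntopics, eta=eta, passes=10)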
Example no. 23
        if i > 0 and (i % 10000) == 0:
            print(i)

        # If you wanna test something..
        if 0 < maxdoc <= i:
            break

        sentence = [line.split()]
        dictionary.add_documents(sentence)
        i += 1

dictionary.filter_extremes()
print("Extracting terms...")
with open(path + 'terms.csv', 'wb') as out:
    csvw = csv.writer(out)
    for item in dictionary.items():
        row = list()
        row.append(str(item[0]))
        row.append(item[1].encode('utf-8'))

        csvw.writerow(row)

print("Writing word-sentence Matrix ... ")
with open(path + 'bow.imat.txt', 'wb') as out:
    with open(corpus_path, "r") as corpus_file:
        csvw = csv.writer(out)
        i = 0
        for line in corpus_file:
            sentence = line.split()
            bow = dictionary.doc2bow(sentence)
            for word in bow:
Example no. 24
lda = LdaModel(common_corpus, num_topics=50, passes=100)


#%%
aaa = CoherenceModel(lda, texts=datagensim, dictionary=dct, coherence='c_npmi', window_size=40, topn=5)
aaa.get_coherence()


#%% Building from https://github.com/akashgit/autoencoding_vi_for_topic_models
import pickle as pk

dataAkash = np.load(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\train.txt.npy', encoding="bytes")
dataAkashTest = np.load(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\test.txt.npy', encoding="bytes")
dct = pk.load(open(r'C:\Users\Matteo\Desktop\autoencoding_vi_for_topic_models-master\autoencoding_vi_for_topic_models-master\data\20news_clean\vocab.pkl', "rb"))
inv_dct = {v: k for k, v in dct.items()}

# build text document
dataAkashText = []

i = 0
for d in dataAkash:
    tmp = []
    for w in d:
        tmp += [inv_dct[w]]
    i += 1
    dataAkashText += [tmp]
dataAkashTextTest = []
i = 0
for d in dataAkashTest:
    tmp = []
Example no. 25
                          if not token.is_punct and not nlp.vocab[str(token)].is_stop \
                                   and ((not str(token).startswith('ne_') and len(str(token)) >= min_word_char_num) or \
                                        (str(token).startswith('ne_') and len(str(token)) >= min_word_char_num + 3))]
        processed_news_list.append(processed_news)

    #processed_news_list = [news.split() for news in news_with_NE]

    dictionary = Dictionary(processed_news_list)
    # remove words with too few document frequency
    dictionary.filter_extremes(no_below=min_doc_tf)
    bow_news = [dictionary.doc2bow(doc) for doc in processed_news_list]
    bow_news = [news for news in bow_news if len(news)>0]

    # find the ids of the ne
    dict_token2id = dictionary.token2id
    dict_id2token = dict(dictionary.items())
    tokens = list(dict_token2id.keys())
    ne_tokens = [token for token in tokens if token.startswith('ne_')]
    ne_token_ids = [dict_token2id[token] for token in ne_tokens]
    ne_token_ids = set(ne_token_ids)

    # ne term weighting
    # add max token frequency tuple in documents
    bow_news = [news + [(-1, max([t[1] for t in news]))] for news in bow_news]

    bow_news = [news + [relationNum(news, ne_token_ids, dict_id2token)] for news in bow_news]

    bow_news = [[(t[0], t[1]+news[-2][1] * news[-1][t[0]]) if t[0] in ne_token_ids else (t[0], t[1]) \
                  for t in news[:-2]] for news in bow_news]

#     dictionary.save(os.path.join(data_dir, 'ne8_%s_%s_%s_weighting.dict'%(topn_concepts, gamma,lambd)))
Example no. 26
def train_val_test(dataset: pd.DataFrame, dictionary: Dictionary,
                   test_size: float, val_size: float) -> Dict[str, Any]:

    # Make train val test index
    num_docs = len(dataset)
    vaSize = int(np.floor(val_size * num_docs))
    tsSize = int(np.floor(test_size * num_docs))
    trSize = int(num_docs - vaSize - tsSize)
    idx_permute = np.random.permutation(num_docs).astype(int)
    print('Reading data....')

    # Make sure our text column is of type list
    dataset['text'] = dataset['text'].apply(lambda x: x.split(' '))
    word2id = dict([(w, j) for j, w in dictionary.items()])
    id2word = dict([(j, w) for j, w in dictionary.items()])

    # Remove words not in train_data
    print('Starting vocabulary : {}'.format(len(dictionary)))

    vocab = list(dictionary)

    docs_tr = [[
        word2id[w] for w in dataset['text'][idx_permute[idx_d]] if w in word2id
    ] for idx_d in range(trSize)]
    timestamps_tr = pd.DataFrame(
        dataset['timeslice'][idx_permute[range(trSize)]])
    idx_tr = idx_permute[range(trSize)]

    docs_ts = [[
        word2id[w] for w in dataset['text'][idx_permute[idx_d + trSize]]
        if w in word2id
    ] for idx_d in range(tsSize)]
    timestamps_ts = pd.DataFrame(dataset['timeslice'][idx_permute[range(
        trSize, trSize + tsSize)]])
    idx_ts = idx_permute[range(trSize, trSize + tsSize)]

    docs_va = [[
        word2id[w]
        for w in dataset['text'][idx_permute[idx_d + trSize + tsSize]]
        if w in word2id
    ] for idx_d in range(vaSize)]
    timestamps_va = pd.DataFrame(dataset['timeslice'][idx_permute[range(
        tsSize + trSize, num_docs)]])
    idx_va = idx_permute[range(tsSize + trSize, num_docs)]

    print(
        '  Number of documents in train set : {} [this should be equal to {} and {}]'
        .format(len(docs_tr), trSize, len(timestamps_tr)))
    print(
        '  Number of documents in test set : {} [this should be equal to {} and {}]'
        .format(len(docs_ts), tsSize, len(timestamps_ts)))
    print(
        '  Number of documents in validation set: {} [this should be equal to {} and {}]'
        .format(len(docs_va), vaSize, len(timestamps_va)))

    # Split the test set into 2 halves: the first contains the first half of the words in each document,
    # and the second contains the second half. These will be used to compute test completion perplexity.

    print('Splitting test documents in 2 halves...')
    docs_ts_h1 = [[w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1]
                  for doc in docs_ts]
    docs_ts_h2 = [[w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1]
                  for doc in docs_ts]

    print('Creating lists of words...')

    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)
    words_ts_h1 = create_list_words(docs_ts_h1)
    words_ts_h2 = create_list_words(docs_ts_h2)
    words_va = create_list_words(docs_va)

    print('  Total number of words used in train set : ', len(words_tr))
    print('  Total number of words used in test set : ', len(words_ts))
    print(
        '  Total number of words used in test first half (first half of document words): ',
        len(words_ts_h1))
    print(
        '  Total number of words used in test second half (second half of document words): ',
        len(words_ts_h2))
    print('  Total number of words used in val set : ', len(words_va))

    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)
    n_docs_ts_h1 = len(docs_ts_h1)
    n_docs_ts_h2 = len(docs_ts_h2)
    n_docs_va = len(docs_va)

    # Get doc indices
    print('Getting doc indices...')

    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)
    doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
    doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
    doc_indices_va = create_doc_indices(docs_va)

    print('Creating bow representation...')

    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
    bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1,
                           len(vocab))
    bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2,
                           len(vocab))
    bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))

    print(' Train bag of words shape : {}'.format(bow_tr.shape))
    print(' Test bag of words shape : {}'.format(bow_ts.shape))
    print(' Test set 1 bag of words shape : {}'.format(bow_ts_h1.shape))
    print(' Test set 2 bag of words shape : {}'.format(bow_ts_h2.shape))
    print(' Val bag of words shape : {}'.format(bow_va.shape))

    print('\nMost important words in train BOW : \n')
    print(get_most_important_words(bow_tr, id2word))
    print('\nMost important words in val BOW : \n')
    print(get_most_important_words(bow_va, id2word))
    print('\nMost important words in test BOW : \n')
    print(get_most_important_words(bow_ts, id2word))
    print('\nDone splitting data.')

    return dict(BOW_train=bow_tr,
                BOW_test=bow_ts,
                BOW_test_h1=bow_ts_h1,
                BOW_test_h2=bow_ts_h2,
                BOW_val=bow_va,
                timestamps_train=timestamps_tr,
                timestamps_test=timestamps_ts,
                timestamps_val=timestamps_va,
                train_vocab_size=len(vocab),
                train_num_times=len(np.unique(timestamps_tr['timeslice'])),
                idx_train=idx_tr,
                idx_test=idx_ts,
                idx_val=idx_va)
Example no. 27
metadata = pd.read_csv("..\\data\\absrecord.csv")
print(len(metadata['filename'].values))
fullvocab = []

from preprocessor import preprocess, flatten

for record in range(len(metadata)):
    # print(100*record/len(metadata))
    fullvocab.append(preprocess(str(metadata.iloc[record]['body']))[0])
print(fullvocab)
maindict = Dictionary(fullvocab)
i = 0
fulldict = []
for document in fullvocab:
    temp = []
    print(100 * i / len(fullvocab))
    i += 1
    document = list(sorted(set(document)))
    for token in document:
        if token in maindict.token2id:
            key = maindict.token2id[token]
            temp.append({"id": key, "name": token})
            # print({"id": key, "name": token})
    fulldict.append(temp)

b = metadata['filename'].values
print(fulldict)
a = pd.DataFrame({'keywords': fulldict})
metadata = metadata.join(a)  # attach the keywords column (DataFrame.append does not modify metadata in place)
metadata.to_csv("..\\data\\keywords.csv")
Example no. 28
if __name__ == '__main__':

    common_texts = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'],
        ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'],
        ['graph', 'minors', 'survey']
    ]

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    # This step populates id2token (iterating items() forces the reverse mapping to be built)
    for k, v in common_dictionary.items():
        pass
    id2word = common_dictionary.id2token
    ctm = CTMModel(common_corpus, num_topics=3, id2word=id2word)
    print("done")
    ## Larger Test

    do_process = True

    if do_process:
        import nltk
        nltk.download('wordnet')
        medical_df = get_transcription_data()
        docs = numpy.array(medical_df['transcription'])
        # Use LDA to preprocess - later make a base class and refactor.
        lda = LDAAnalysis(docs)