Example No. 1
def test_lookups():
    dictionary = Dictionary(TOKEN_SETS)
    # it can perform token inclusion and position lookups:
    assert dictionary.doc2idx(
        TOKEN_SETS[0]) == [0, 3, 1, 2]  # ["all", "the", "kings", "men"]
    assert dictionary.doc2idx(NEW_TOKENS) == [
        0, 1, -1, -1
    ]  # ["all", "kings", "queens", "jacks"]
Example No. 2
def generate_walks(G, n_walks, walk_len):
    walks = build_random_walk_corpus(G, n_walks, walk_len)
    # Now we have a Gensim Dictionary to work with
    dictionary = Dictionary(walks)
    # Convert docs to indices in the dictionary
    return dictionary, [
        dictionary.doc2idx(w)
        for w in tqdm(walks, desc='Converting to indices')
    ]
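build_random_walk_corpus is defined elsewhere; a plausible sketch, assuming a networkx-style graph and uniform random walks (tokens are stringified because Dictionary expects string tokens):

import random

def build_random_walk_corpus(G, n_walks, walk_len):
    """For every node, run n_walks uniform random walks of at most walk_len steps."""
    walks = []
    for _ in range(n_walks):
        for start in G.nodes():
            walk = [start]
            while len(walk) < walk_len:
                neighbors = list(G.neighbors(walk[-1]))
                if not neighbors:
                    break
                walk.append(random.choice(neighbors))
            walks.append([str(node) for node in walk])
    return walks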
Example No. 3
def load_data(data_file):
    data_file = Path(data_file)
    pairs = []
    with data_file.open() as fin:
        for line in fin:
            a, b = line.strip().split('\t')
            pairs.append((a, b))
    d = Dictionary(pairs)
    pairs = np.asarray([d.doc2idx(pair) for pair in pairs])
    return d, pairs
Example No. 4
def build_token_corpus(file_path: str, dictionary: Dictionary) -> list:
    """build data corpus convert each word to id and append EOS

    Arguments:
        file_path {str} -- data file path
        dictionary {Dictionary} -- prebuilt word-to-id dictionary

    Returns:
        list -- articles with words replaced by dictionary indices
    """

    with open(file_path, mode='r', encoding='utf-8') as fp:
        data = json.load(fp)

    data_size = len(data)
    logging.info("data have {} articles".format(data_size))

    for idx, article in enumerate(data):
        article_content = article[ARTICLE_CONTENT]
        article_content = jieba_tokenize(article_content)
        content_token = dictionary.doc2idx(article_content)
        article_title = article[ARTICLE_TITLE]
        article_title = jieba_tokenize(article_title)
        title_token = dictionary.doc2idx(article_title)
        article[ARTICLE_CONTENT] = content_token
        article[ARTICLE_TITLE] = title_token

        for questions_obj in article[QUESTIONS]:
            question = questions_obj[QUESTION]
            question = jieba_tokenize(question)
            question_token = dictionary.doc2idx(question)
            questions_obj[QUESTION] = question_token
            answer = questions_obj[ANSWER]
            answer = jieba_tokenize(answer)
            answer_token = dictionary.doc2idx(answer)
            questions_obj[ANSWER] = answer_token

        if idx % 100 == 0:
            percent = idx / data_size
            logging.info("finish {}% of data".format(percent * 100))

    return data
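jieba_tokenize is an external helper; a minimal sketch of it, assuming the jieba segmenter:

import jieba

def jieba_tokenize(text: str) -> list:
    """Segment Chinese text with jieba and drop whitespace-only tokens."""
    return [token for token in jieba.cut(text) if token.strip()]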
Example No. 5
def load_data(filename):
    # Load articles from file
    articles, _ = LoadArticles(filename, verbose=False, split=True)

    # Create vocab dictionary for articles
    dct = Dictionary(articles)
    # dct.filter_extremes(no_below=5, no_above=500, keep_n=100000)
    dct.filter_extremes(no_below=5, no_above=500, keep_n=50000)

    # convert words to indices
    # we make UNK the highest index
    vocab_size = len(dct)
    articles = [dct.doc2idx(a, unknown_word_index=vocab_size) for a in articles]

    return articles, dct
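Because unknown words are mapped to vocab_size here, any lookup table built downstream needs len(dct) + 1 rows; a short usage sketch (the file name and embedding width are placeholders):

import numpy as np

articles, dct = load_data("articles.txt")                 # hypothetical input file
embeddings = np.random.normal(size=(len(dct) + 1, 100))   # last row serves as UNK
first_doc_vectors = embeddings[articles[0]]               # every index, including UNK, is a valid row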
Example No. 6
def extract_important_words_tfidf(documents, threshold):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]

    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    new_documents = []

    for i, doc in enumerate(documents):
        important_words_index = get_important_words(corpus_tfidf[i], threshold)
        new_doc = []
        for term in doc:
            if dictionary.doc2idx([term])[0] in important_words_index:
                new_doc.append(term)
        new_documents.append(new_doc)

    return new_documents  # list of list of terms
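get_important_words is not shown; a minimal sketch consistent with how it is used above, returning the set of term ids whose TF-IDF weight clears the threshold:

def get_important_words(doc_tfidf, threshold):
    """doc_tfidf is a list of (term_id, weight) pairs from the TfidfModel."""
    return {term_id for term_id, weight in doc_tfidf if weight >= threshold}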
Example No. 7
def tokenize_files(df_path):
    data = pd.read_csv(df_path, na_filter=False)['language'].values
    cleaned_docs = [
        tokenize_doc(f) for f in tqdm(data, desc='Tokenizing Docs')
    ]
    # Now we have a Gensim Dictionary to work with
    dictionary = Dictionary(cleaned_docs)
    # Drop tokens that appear in fewer than 10 documents or in more than 75% of documents
    dictionary.filter_extremes(no_below=10, no_above=0.75)
    # Convert docs to indices
    indexed_docs = [
        dictionary.doc2idx(d)
        for d in tqdm(cleaned_docs, desc='Converting to indices')
    ]
    # Remove out of vocab tokens
    return dictionary, [[
        t for t in d if t != -1
    ] for d in tqdm(indexed_docs, desc="Removing out-of-vocab tokens")]
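tokenize_doc is defined elsewhere; a plausible stand-in using gensim's own tokenizer:

from gensim.utils import simple_preprocess

def tokenize_doc(text):
    """Lowercase, strip accents and punctuation, and split into tokens."""
    return simple_preprocess(str(text), deacc=True)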
Example No. 8
def make_tf_time_series(tweets_time_series, keep_only_common_words=True):
    tweets_time_series = break_up_sentences(tweets_time_series)
    tweets_dict = Dictionary(tweets_time_series)
    bow_time_series = [
        tweets_dict.doc2bow(tweets) for tweets in tweets_time_series
    ]
    tf_time_series = [
        make_term_frequency(time_step) for time_step in bow_time_series
    ]
    tf_time_series = [[(tweets_dict.get(tup[0]), tup[1]) for tup in time_step]
                      for time_step in tf_time_series]
    if keep_only_common_words:
        tweets_dict.filter_extremes(no_below=len(tweets_time_series),
                                    no_above=1)
        tf_time_series = [[
            tup for tup in time_step if tweets_dict.doc2idx([tup[0]])[0] != -1
        ] for time_step in tf_time_series]
    return tf_time_series
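break_up_sentences and make_term_frequency are external helpers; a sketch of make_term_frequency consistent with its use above (normalizing the BoW counts of one time step into relative frequencies):

def make_term_frequency(bow_time_step):
    """bow_time_step is a list of (token_id, count) pairs for one time step."""
    total = sum(count for _, count in bow_time_step)
    if total == 0:
        return []
    return [(token_id, count / total) for token_id, count in bow_time_step]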
Example No. 9
    def postagging(self, word_seq):
        N = self.freq_obj.get_hidden_stage()
        T = len(word_seq)
        word_seq = [_.lower() for _ in word_seq]
        dct = Dictionary([word_seq])
        word2idx = dct.token2id
        id2word = {v: k for k, v in word2idx.items()}
        observed = np.array(dct.doc2idx(word_seq))
        emission_prob_matrix = np.zeros((N, T))
        for word in word2idx.keys():
            for tag in self.tag_set:
                if tag != 'START':
                    emission_prob_matrix[
                        HMMTagger.get_tagid(tag),
                        word2idx[word]] = self.freq_obj.get_emission_prob(
                            word.lower(), tag)

        path = self.hmm_obj.viterbi(observed,
                                    self.freq_obj.get_transition_matrix(),
                                    emission_prob_matrix,
                                    self.freq_obj.get_initial_distribution())
        return [(id2word[observed[i]], HMMTagger.idx2tag[path[0][i]])
                for i in range(len(observed))]
Example No. 10
    # strip the matched characters from the document (regex and the enclosing loop over d are defined earlier in the original script)
    test = regex.sub('', d)
    if len(test) > 100:
        datagensim += [[i for i in test.split(" ") if len(i) > 3]]

dct = Dictionary(datagensim)
dct.filter_extremes(no_below=2, no_above=0.9)
dct.compactify()
X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:

    idx = dct.doc2idx(d)
    dC = [d[i] for i in range(len(d)) if idx[i] > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean += [dC]
    bow += [tmp]
    for key, value in tmp:
        X[key, i] = value
    i += 1

datagensim = datagensimClean
#%% Test data
#with open(r'C:\Users\Matteo\Documents\Git\aLDA\data\wikitext-2-raw\wiki.test.raw', encoding="utf8") as file:
#    dataRaw = file.read()
#
#dataRaw = stem(dataRaw)
#data = dataRaw.replace('= = = = ','+ + + +')
Example No. 11
n_hidden = 128

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max()

rpad_blank = lambda size: (lambda s: s.ljust(size, ' '))

que = df['question'].map(rpad_blank(q_maxlen))
ans = df['answer'].map(rpad_blank(a_maxlen))

dic = Dictionary([list(' '.join(df.values.flatten()))])
dic.save_as_text(f'{data_file}.dic')

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]

x = np.array([one_hot(q) for q in que])
y = np.array([one_hot(a) for a in ans])

model = Sequential()

# encoder
model.add(LSTM(n_hidden, input_shape=(q_maxlen, len(dic))))

# decoder
model.add(RepeatVector(a_maxlen))
model.add(LSTM(n_hidden, return_sequences=True))

model.add(TimeDistributed(Dense(len(dic))))
model.add(Activation('softmax'))
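The listing stops after the model definition; a typical next step, not shown in the source, would be something like:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=100, batch_size=64, validation_split=0.1)  # placeholder hyperparameters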
Example No. 12
class SkipGramDataset(Dataset):

    def __init__(self, args):
        self.args = args
        self.dictionary = None
        self.examples = []
        self.name = ''

    def __getitem__(self, index):
        return self._example_to_tensor(*self.examples[index])

    def __len__(self):
        return len(self.examples)

    def save(self, examples_path, dict_path):
        print('Saving Dataset Examples...')
        torch.save({
             'examples': self.examples,
        }, examples_path)
        print('Saving Dataset Dictionary...')
        self.dictionary.save(dict_path)
        print('Saved Dataset!')

    def load(self, examples_path, dict_path):
        print('Loading Dataset Examples...')
        self.examples = torch.load(examples_path)['examples']
        print('Loading Dataset Dictionary...')
        self.dictionary = Dictionary().load(dict_path)
        print('Loaded Saved Dataset!')

    def generate_examples_serial(self):
        """
        Generates examples with no multiprocessing - straight through!
        :return: None - updates class properties
        """
        # Now we have a Gensim Dictionary to work with
        self._build_dictionary()
        # Drop tokens that appear in fewer than 10 documents or in more than 75% of documents
        self.dictionary.filter_extremes(no_below=10, no_above=0.75)

        self.examples = []
        for file in tqdm(self.load_files(), desc="Generating Examples (serial)"):
            file = self.dictionary.doc2idx(file)
            self.examples.extend(self._generate_examples_from_file(file))

    def load_files(self):
        """
        Sets self.files as a list of tokenized documents!
        :returns: List of files
        """
        # Needs to be implemented by child class
        raise NotImplementedError

    def _build_dictionary(self):
        """
        Creates a Gensim Dictionary
        :return: None - modifies self.dictionary
        """
        print("Building Dictionary...")
        self.dictionary = Dictionary(self.load_files())

    def _generate_examples_from_file(self, file):
        """
        Generate all examples from a file within window size
        :param file: File from self.files
        :returns: List of examples
        """

        examples = []
        for i, token in enumerate(file):
            if token == -1:
                # Out of dictionary token
                continue

            # Generate context tokens for the current token
            context_words = self._generate_contexts(i, file)

            # Form Examples:
            # center, context - follows form: (input, target)
            new_examples = [(token, ctxt) for ctxt in context_words if ctxt != -1]

            # Add to class
            examples.extend(new_examples)
        return examples

    def _generate_contexts(self, token_idx, tokenized_doc):
        """
        Generate Token's Context Words
        Generates all the context words within the window size defined
        during initialization around token.

        :param token_idx: Index at which center token is found in tokenized_doc
        :param tokenized_doc: List - Document broken into tokens
        :returns: List of context words
        """
        contexts = []
        # Iterate over each position in window
        for w in range(-self.args.window_size, self.args.window_size + 1):
            context_pos = token_idx + w

            # Make sure current center and context are valid
            is_outside_doc = context_pos < 0 or context_pos >= len(tokenized_doc)
            center_is_context = token_idx == context_pos

            if is_outside_doc or center_is_context:
                # Not valid - skip to next window position
                continue

            contexts.append(tokenized_doc[context_pos])
        return contexts

    def _example_to_tensor(self, center, target):
        """
        Takes raw example and turns it into tensor values

        :params example: Tuple of form: (center word, document id)
        :params target: String of the target word
        :returns: A tuple of tensors
        """
        center, target = torch.tensor([int(center)]), torch.tensor([int(target)])
        return center, target
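load_files must be supplied by a child class; a hypothetical subclass that reads one whitespace-tokenized document per .txt file (args.data_dir is an assumed attribute):

import os

class TextDirDataset(SkipGramDataset):
    """Hypothetical child class: one tokenized document per .txt file in args.data_dir."""

    def load_files(self):
        docs = []
        for name in sorted(os.listdir(self.args.data_dir)):
            if name.endswith('.txt'):
                with open(os.path.join(self.args.data_dir, name), encoding='utf-8') as f:
                    docs.append(f.read().split())
        return docs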
Example No. 13
n_hidden = int(sys.argv[5])

BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

a_maxlen = df['answer'].map(len).max() + 2

ans = df['answer'].map(lambda a: f'{BOS}{a}{EOS}')

dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(list(d))], np.zeros(
        (size - len(d), len(dic)))))

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]
sum_one_hot = lambda s: np.add.reduce(one_hot(s))

x1 = np.array([sum_one_hot(q) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(shape=(len(dic), ))
enc_outputs = Dense(n_hidden, activation='relu')(enc_inputs)

enc_states = [enc_outputs]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
Example No. 14
limit = int('9' * digit) + 1

input_format = lambda f: f.ljust(input_digit, ' ')
output_format = lambda f: f.ljust(output_digit, ' ')

input_formula = lambda d: input_format(f"{d[0]}+{d[1]}")

ds = np.random.randint(0, limit, size = (n, 2))

data = [input_formula(d) for d in ds]
labels = [output_format(f"{sum(d)}") for d in ds]

dic = Dictionary([list('0123456789+ ')])

one_hot = lambda ds: np.array([np.eye(len(dic))[dic.doc2idx(list(d))] for d in ds])

x = one_hot(data)
y = one_hot(labels)


model = Sequential()

# encoder
model.add(LSTM(128, input_shape=(input_digit, len(dic))))

# decoder
model.add(RepeatVector(output_digit))
model.add(LSTM(128, return_sequences = True))

model.add(TimeDistributed(Dense(len(dic))))
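The listing is cut off here; mirroring Example No. 11 above, the assumed continuation would add the output activation and train the model:

model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x, y, epochs=100, batch_size=128)  # placeholder hyperparameters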
Example No. 15
remove_text='https?'
vocab=vocab.str.replace(remove_text,'')
vocab=vocab.str.findall('[a-zA-Z]+')
stopword=stopwords.words('english')

vocab=vocab.apply(lambda x: [w.lower() for w in x if w.lower() not in stopword])
vocab=vocab.apply(lambda x: [stemmer.stem(w) for w in x])
vocab[:10]

dictionary=Dictionary(vocab)

train.text=vocab.iloc[:7613]

test.text=vocab[7613:].reset_index().text

train.text=train.text.apply(lambda x: dictionary.doc2idx(x))
train_text=pad_sequences(train.text)
train_target=train.target.values

from gensim.models.word2vec import Word2Vec

import inspect
word2vec=Word2Vec(vocab,size=32)
#inspect.signature(Word2Vec)

vocab[:10]

word_vectors=word2vec.wv.vectors

word_vectors.shape
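The test split is tokenized but never indexed or padded in the snippet; an assumed counterpart to the training transform above would be:

test.text = test.text.apply(lambda x: dictionary.doc2idx(x))
test_text = pad_sequences(test.text, maxlen=train_text.shape[1])  # align with the training length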
Example No. 16
class BilingualPreprocessor:
    def __init__(self, is_training=False):
        self.ja_dictionary = Dictionary([['<PAD>'], ['<BeginOfEncode>'],
                                         ['<BOS>'], ['<EOS>'], ['<UNK>']])
        self.en_dictionary = Dictionary([['<PAD>'], ['<BeginOfEncode>'],
                                         ['<BOS>'], ['<EOS>'], ['<UNK>']])
        self.is_training = is_training

    def register_ja_texts(self, texts: List[List[str]]):
        if self.is_training:
            self.ja_dictionary.add_documents(texts)

    def register_en_texts(self, texts: List[List[str]]):
        if self.is_training:
            self.en_dictionary.add_documents(texts)

    @property
    def ja_eos_index(self):
        return self.ja_dictionary.token2id['<EOS>']

    @property
    def en_eos_index(self):
        return self.en_dictionary.token2id['<EOS>']

    @property
    def ja_unknown_word_index(self):
        return self.ja_dictionary.token2id['<UNK>']

    @property
    def en_unknown_word_index(self):
        return self.en_dictionary.token2id['<UNK>']

    @property
    def ja_begin_of_encode_index(self):
        return self.ja_dictionary.token2id['<BeginOfEncode>']

    @property
    def en_begin_of_encode_index(self):
        return self.en_dictionary.token2id['<BeginOfEncode>']

    @property
    def ja_vocab_count(self):
        return len(self.ja_dictionary)

    @property
    def en_vocab_count(self):
        return len(self.en_dictionary)

    def doc2idx_ja(self, texts):
        return self.ja_dictionary.doc2idx(
            texts, unknown_word_index=self.ja_unknown_word_index)

    def doc2idx_en(self, texts):
        return self.en_dictionary.doc2idx(
            texts, unknown_word_index=self.en_unknown_word_index)

    def save(self, file_path):
        with open(file_path, 'wb') as f:
            dill.dump(self, f)

    @classmethod
    def load(cls, file_path):
        with open(file_path, 'rb') as f:
            preprocessor = dill.load(f)
        assert isinstance(preprocessor,
                          cls), 'Loaded a class different from {}'.format(cls)

        return preprocessor
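A short usage sketch of the preprocessor (the example sentences and file name are made up):

preprocessor = BilingualPreprocessor(is_training=True)
preprocessor.register_en_texts([['hello', 'world', '<EOS>']])
preprocessor.register_ja_texts([['こんにちは', '世界', '<EOS>']])

ids = preprocessor.doc2idx_en(['hello', 'there', '<EOS>'])  # 'there' maps to the <UNK> index
preprocessor.save('preprocessor.dill')                      # hypothetical path
restored = BilingualPreprocessor.load('preprocessor.dill')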
Example No. 17
with open('reviews_using_dataset.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for i, row in enumerate(readCSV):
        if i == 0:
            continue
        if i < 100:
            reviewTexts_train.append(row[0])
            sentiment_train.append(int(row[1]))
        else:
            # rows 100 and beyond go to the test split
            reviewTexts_test.append(row[0])
            sentiment_test.append(int(row[1]))

from gensim.corpora import Dictionary
dct = Dictionary([sent.strip().split() for sent in reviewTexts_train])

doc_idx = [
    dct.doc2idx(reviewTexts_train[i].strip().split())
    for i in range(len(reviewTexts_train))
]
print(doc_idx[0])
#train_bow = dct.doc2bow([sent.strip().split() for sent in reviewTexts_train])

#print (len(train_bow))

from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(doc_idx, maxlen=max_words)
#X_test = sequence.pad_sequences(X_test, maxlen=max_words)
Example No. 18
class Dataset(object):
    '''
    Create dataset for training supervised model
    '''
    def __init__(self, config):
        self.config = config
        self.train_data = None
        self.test_data = None
        self.val_data = None
        self.vocab = None
        self.word_embeddings = None

    def get_pandas_df(self, filename):
        '''
        Load the data into Pandas.DataFrame object
        This will be used to convert data to torchtext object
        '''
        with open(filename, 'r', encoding='utf-8') as datafile:
            data = [line.strip().split(' ', maxsplit=1) for line in datafile]
            data_text = list(map(lambda x: x[1], data))
            data_label = list(map(lambda x: x[0], data))

        full_df = pd.DataFrame({"text": data_text, "label": data_label})
        return full_df

    def load_data(self,
                  train_file,
                  test_file,
                  dataname,
                  embed_file=None,
                  val_file=None):
        '''
        Loads the data from files   
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data

        Inputs:
            embed_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''
        # load embeddings
        voc_file = dataname + '_vocab.txt'
        new_embed = dataname + '_embed.pkl'
        train_X, train_Y = read_labeled(train_file)
        test_X, test_Y = read_labeled(test_file)
        val_X = None
        val_Y = None
        if val_file:
            val_X, val_Y = read_labeled(val_file)
        else:
            sp = int(len(train_X) * 0.8)
            train_X, val_X = (train_X[:sp], train_X[sp:])
            train_Y, val_Y = (train_Y[:sp], train_Y[sp:])
        train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X]
        test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X]
        val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X]

        if os.path.isfile(voc_file):
            self.vocab = Dictionary.load_from_text(voc_file)
        else:
            self.vocab = Dictionary(train_X)
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
            self.vocab.save_as_text(voc_file)
        # transform words to indices (unknown words map to 1, i.e. '<unk>')
        train_X = [self.vocab.doc2idx(x, 1) for x in train_X]
        test_X = [self.vocab.doc2idx(x, 1) for x in test_X]
        val_X = [self.vocab.doc2idx(x, 1) for x in val_X]
        # load precomputed embeddings if present, otherwise build them from embed_file
        if os.path.isfile(new_embed):
            self.word_embeddings = torch.load(new_embed)
        else:
            embeds = Vectors(embed_file,
                             unk_init=lambda x: torch.Tensor(
                                 np.random.normal(scale=0.6, size=(x.size()))))
            self.word_embeddings = weight_matrix(self.vocab, embeds)
            torch.save(self.word_embeddings, new_embed)
        self.train_data = (train_X, train_Y)
        self.test_data = (test_X, test_Y)
        self.val_data = (val_X, val_Y)

        print("Loaded {} training examples".format(len(train_X)))
        print("Loaded {} test examples".format(len(test_X)))
        print("Loaded {} validation examples".format(len(val_X)))

    def train_iterator(self):
        return batch_iter(*self.train_data, self.config.batch_size)

    def test_iterator(self):
        return batch_iter(*self.test_data, self.config.batch_size, False)

    def val_iterator(self):
        return batch_iter(*self.val_data, self.config.batch_size, False)
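read_labeled, doc_padding, batch_iter and weight_matrix are external helpers; a plausible sketch of doc_padding, consistent with the '<pad>' token (id 0) registered above:

def doc_padding(tokens, max_sen_len, pad_token='<pad>'):
    """Truncate or right-pad a token list to exactly max_sen_len items."""
    tokens = tokens[:max_sen_len]
    return tokens + [pad_token] * (max_sen_len - len(tokens))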
Example No. 19
def extract_and_save_biterm(fname,
                            embed_size=300,
                            min_count=5,
                            max_percent=0.5,
                            iteration=200):
    '''
    simple preprocessing of biterm

    A biterm is an unordered words pair
    Biterm is drawn from documents not from the whole corpus
    '''

    docs = read_corpus(fname, labeled=False, tokens_only=True)
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character, and remove stop words
    docs = [[
        token for token in doc if len(token) > 1 and token not in STOP_WORDS
    ] for doc in docs]

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_count, no_above=max_percent)
    dictionary.compactify()
    # encode: keep only tokens that survived the dictionary filtering
    docs = [[token for token in doc if token in dictionary.token2id]
            for doc in docs]

    # Remove docs that contain fewer than 2 unique words
    docs = [doc for doc in docs if len(set(doc)) > 1]
    model = gensim.models.Word2Vec(docs,
                                   workers=4,
                                   size=embed_size,
                                   iter=100,
                                   min_count=2)

    docs = [dictionary.doc2idx(doc) for doc in docs]

    biterms = {}
    i = 0
    doc_bitems = []
    for doc in docs:
        d_bi = {}
        doc = sorted(doc)
        for x in range(len(doc) - 1):
            for y in range(x + 1, len(doc)):
                if doc[x] == doc[y]:
                    continue
                biterm = (doc[x], doc[y])
                idx = 0
                if biterm not in biterms:
                    biterms[biterm] = i
                    idx = i
                    i += 1
                else:
                    idx = biterms[biterm]
                if idx in d_bi:
                    d_bi[idx] += 1
                else:
                    d_bi[idx] = 1
        doc_bitems.append(d_bi)
    fname = os.path.basename(fname)
    fname = fname.split('.')[0]
    dirc = os.path.join(os.getcwd(), 'Data', 'unsupervised')
    if not os.path.exists(dirc):
        os.makedirs(dirc)

    embeddings = {}
    for key, token in dictionary.iteritems():
        embeddings[key] = model.wv[token]

    dictionary.save(os.path.join(dirc, fname + '_dic.pkl'))
    biterms = dict([key, val] for val, key in biterms.items())

    with open(os.path.join(dirc, fname + '_bit.pkl'), 'wb') as f:
        pickle.dump(biterms, f)
    with open(os.path.join(dirc, fname + '_doc_bit.pkl'), 'wb') as f:
        pickle.dump(doc_bitems, f)
    with open(os.path.join(dirc, fname + '_emb.pkl'), 'wb') as f:
        pickle.dump(embeddings, f)
    with open(os.path.join(dirc, fname + '_doc.pkl'), 'wb') as f:
        pickle.dump(docs, f)
Example No. 20
from gensim.models import word2vec
from sklearn import decomposition

data_file = sys.argv[1]
pca_num = int(sys.argv[2])

limit_value = 0.1

sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

x = np.zeros((len(sentences), len(dic)))

for i, d in enumerate(sentences):
    x[np.ix_([i], dic.doc2idx(d))] = 1

pca = decomposition.PCA(n_components = pca_num, random_state = 1)

nx = pca.fit_transform(x)

print(sum(pca.explained_variance_ratio_))

print(nx)

for i, pc in enumerate(pca.components_):

    ids = np.where((pc >= limit_value) | (pc <= -limit_value))
    items = [dic[id] for id in ids[0]]

    for r in sorted(zip(items, pc[ids]), key = lambda x: -x[1]):
        print(r)  # assumed body; the listing is truncated here
Example No. 21
class Index(object):
    """define an index instance along with its associated methods"""
    def __init__(self, stops, minsize=3):
        """initialize index variables"""
        self.ix = None
        self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize)
        self.umls = umls.UMLSLookup()
        self.term_dict = {}
        self.token2cuis = {}
        self.concept_dict = {"__NULL__": 0}
        self.synsets = {}

    def get_doc_ids(self, corpus_path, corpus_name):
        """get doc ids from corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        return [docno for docno, doc in docs]

    def only_digits(self, token):
        """check whether input token contains only digits and/or punctuation"""
        return all(char.isdigit() or char in string.punctuation
                   for char in token)

    def preprocess_text(self, text, tags=False, remove_digits=True):
        """preprocess text: tokenize docs, lowerize text, remove words with length < min_size, remove tags, remove only-digits tokens and remove stopwords"""
        if tags:  # remove tags
            text = strip_tags(text)
        if remove_digits:  # tokenize and remove digits-only tokens
            text = [
                token.text for token in self.tokenizer(text)
                if not self.only_digits(token.text)
            ]
        else:  # tokenize and keep digits-only tokens
            text = [token.text for token in self.tokenizer(text)]
        # return preprocessed doc
        return text

    def preprocess_corpus(self, corpus_path, corpus_name, out_corpus, out_ids):
        """preprocess corpus: apply preprocess_text to each doc within corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        # tokenize docs
        print("pre processing docs...")
        #pproc_corpus = [self.preprocess_text(doc) for docno, doc in docs]
        pproc_corpus = []
        doc_ids = []
        # iterate over docs and store pre processed docs and docnos
        for docno, doc in docs:
            pproc_corpus.append(self.preprocess_text(doc))
            doc_ids.append(docno)
        print("pre processing finished!")
        # store pproc_corpus
        print("store pre processed corpus in {}".format(out_corpus))
        with open(out_corpus, 'w') as outf:
            json.dump(pproc_corpus, outf)
        # store docnos
        print("store doc_ids in {}".format(out_ids))
        with open(out_ids, 'w') as outf:
            json.dump(doc_ids, outf)
        # return pproc_corpus and doc_ids
        return pproc_corpus, doc_ids

    def load_pproc_corpus(self, fname):
        """load stored pre processed corpus"""
        with open(fname, 'r') as inf:
            pproc_corpus = json.load(inf)
        return pproc_corpus

    def load_doc_ids(self, fname):
        """load stored doc ids"""
        with open(fname, 'r') as inf:
            doc_ids = json.load(inf)
        return doc_ids

    def index_corpus(self, pproc_corpus, fname):
        """index pre processed corpus using gensim dictionary - fast doc2bow, doc2idx conversion"""
        self.ix = Dictionary(pproc_corpus)
        self.ix.save_as_text(fname)
        return True

    def load_index(self, fname):
        """load stored index"""
        self.ix = Dictionary.load_from_text(fname)
        return True

    def build_term_dict(self,
                        pproc_corpus,
                        fname,
                        dict_size=131072,
                        remove_digits=True,
                        min_df=2,
                        max_df=0.5):
        """create term dictionary"""
        ttf = {}
        # filter terms with df lower than min_df or df ratio greater than max_df and store their ttf
        for doc in tqdm(pproc_corpus):
            # get doc in bow format
            bow = self.ix.doc2bow(doc)
            for idx, tf in bow:
                if self.ix.dfs[idx] >= min_df and self.ix.dfs[
                        idx] / self.ix.num_docs <= max_df:
                    if idx in ttf:
                        ttf[idx] += tf
                    else:
                        ttf[idx] = tf
        # convert ttf dict into counter and keep dict_size most frequent terms
        count = Counter(ttf).most_common(dict_size)
        # create term dict - two-levels encoding (i.e. self.term_dict[self.ix.token2id[token]])
        for idx, ttf in count:
            self.term_dict[idx] = len(self.term_dict)
        # store term dictionary
        with open(fname, 'w') as outf:
            json.dump(self.term_dict, outf)
        return True

    def load_term_dict(self, fname):
        """load term dictionary"""
        with open(fname, 'r') as inf:
            self.term_dict = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.term_dict = {
            int(ix_term): dict_term
            for ix_term, dict_term in self.term_dict.items()
        }
        return True

    def get_pos2token(self, text):
        """split text into tokens and return {pos: [token, ["__NULL__"]]}"""
        pos2token = {}
        # split on whitespace; the text has already been pre-processed
        tokens = text.split()
        # set text index
        index = text.index
        running_offset = 0
        # loop over tokens
        for token in tokens:
            token_offset = index(token, running_offset)
            token_len = len(token)
            # update running offset
            running_offset = token_offset + token_len
            pos2token[token_offset] = [self.ix.token2id[token], ["__NULL__"]
                                       ]  # note: ["__NULL__"] is for later use
        return pos2token

    def associate_token2cuis(self, pos2token, terms_candidate_cuis):
        """return list of (token, [cui1, cui2, ...]) pairs given token position and candidate concepts"""
        for term_cuis in terms_candidate_cuis:
            # get positional information
            start = term_cuis[0]['start']
            # check whether 'start' matches with any pos2token key
            if start in pos2token:
                # update ["__NULL__"] with candidate cuis
                pos2token[start][1] = [concept['cui'] for concept in term_cuis]
        # return pos2token values only - i.e. (term, [cui1, cui2, ...]) pairs
        return list(pos2token.values())

    def map_token2cuis(self, fname, threshold=1.0, stypes_fname=None):
        """map candidate cuis to each token in the index"""
        terms_str = ' '.join(list(self.ix.token2id.keys()))
        # split term_str into substrings of length <= 999999 - max length allowed by scipy parser
        substrs = wrap(terms_str,
                       width=999999,
                       break_long_words=False,
                       break_on_hyphens=False)
        if stypes_fname is not None:  # load user-specified UMLS semantic types
            print("user-specified UMLS semantic types for QuickUMLS enabled")
            semtypes = ','.join(safir_utils.load_semtypes(stypes_fname))
        else:  # keep default QuickUMLS semantic types
            semtypes = None
        # initialize QuickUMLS server
        server = QuickUMLS(window=1, threshold=threshold, semtypes=semtypes)
        server.launch_quickumls()
        # initialize concept matcher
        matcher = get_quickumls_client()
        token2cuis = []
        # extract concepts from substrs
        for substr in substrs:
            terms_candidate_cuis = matcher.match(substr)
            # get position dict: {pos: [token, ["__NULL__"]]} given substr
            pos2token = self.get_pos2token(substr)
            # associate each token with its candidate concepts
            token2cuis += self.associate_token2cuis(pos2token,
                                                    terms_candidate_cuis)
        # close connection with QuickUMLS server
        server.close_quickumls()
        # store token2cuis as dict
        self.token2cuis = dict(token2cuis)
        # store token2cuis
        with open(fname, 'w') as outf:
            json.dump(self.token2cuis, outf)
        return True

    def load_token2cuis(self, fname):
        """load token2cuis"""
        with open(fname, 'r') as inf:
            self.token2cuis = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.token2cuis = {
            int(token): cuis
            for token, cuis in self.token2cuis.items()
        }
        return True

    def update_concept_dict(self, cui):
        """update concept dictionary"""
        if cui in self.concept_dict:
            return True
        else:
            self.concept_dict[cui] = len(self.concept_dict)
            return True

    def load_concept_dict(self, fname):
        """load concept dictionary"""
        with open(fname, 'r') as inf:
            self.concept_dict = json.load(inf)
        return True

    def update_synsets(self, cui, idx):
        """update synonyms set"""
        if self.concept_dict[
                cui] in self.synsets:  # add term to set of synonyms for the given cui
            self.synsets[self.concept_dict[cui]].add(self.term_dict[idx])
            return True
        elif self.concept_dict[cui] != self.concept_dict[
                "__NULL__"]:  # initialize set of synsets for given cui
            self.synsets[self.concept_dict[cui]] = {self.term_dict[idx]}
            return True
        else:  # do not update synsets
            return False

    def load_synsets(self, fname):
        """load synsets"""
        with open(fname, 'r') as inf:
            self.synsets = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.synsets = {int(cui): syns for cui, syns in self.synsets.items()}
        return True

    def get_sense_pairs(self):
        """return senses as (term, cui) 2-dim np array"""
        syns = [
            list(itertools.product(self.synsets[cui], [cui]))
            for cui in self.synsets
        ]
        synp = [list(itertools.combinations(syn, 2)) for syn in syns]
        return np.array(list(itertools.chain.from_iterable(synp)))

    def s_wsd(self, doc, table_name, query=False):
        """shallow word-sense disambiguation: disambiguate polysemous terms based on shallow word-concept connectivity within UMLS"""
        doc_cuis = {}
        # convert doc into doc2idx format
        doc2idx = self.ix.doc2idx(doc)
        # get cuis from doc tokens
        for idx in doc2idx:
            if idx in self.token2cuis and self.token2cuis[idx] != ["__NULL__"]:
                for cui in self.token2cuis[idx]:
                    if cui in doc_cuis:  # increase cui count
                        doc_cuis[cui] += 1
                    else:  # initialize cui count
                        doc_cuis[cui] = 1
        # perform shallow word-sense disambiguation
        enc_doc = []
        for idx in doc2idx:
            if idx in self.term_dict:  # disambiguate only for terms contained within self.term_dict
                max_edges = 0  # relative maximum connections (edges)
                if len(self.token2cuis[idx]) == 1:  # monosemous term
                    ref_cui = self.token2cuis[idx][0]
                    if not query:  # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append(
                        [self.term_dict[idx], self.concept_dict[ref_cui]])
                else:  # polysemous term
                    candidates = []
                    # loop over candidate concepts
                    for subj_cui in self.token2cuis[idx]:
                        num_edges = 0  # number of edges
                        if doc_cuis[
                                subj_cui] == 1:  # subj_cui is only associated with current term (idx)
                            obj_cuis = list(
                                set(doc_cuis.keys()).difference({subj_cui}))
                        else:  # subj_cui is associated with other terms in the doc too
                            obj_cuis = list(doc_cuis.keys())
                        num_edges += self.umls.compute_num_edges(
                            subj_cui, obj_cuis, table_name)
                        # verify connectivity
                        if num_edges > max_edges:
                            # set candidates to subj_cui
                            candidates = [subj_cui]
                            # update max_edges
                            max_edges = num_edges
                        else:
                            # append subj_cui to candidates
                            candidates.append(subj_cui)
                    # keep head candidate - when disambiguation is not complete, it allows to get the most likely concept based on QuickUMLS ordering
                    ref_cui = candidates[0]
                    if not query:  # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append(
                        [self.term_dict[idx], self.concept_dict[ref_cui]])
            else:  # term oov
                continue
        return enc_doc

    def encode_corpus(self,
                      pproc_corpus,
                      corpus_name,
                      ecorpus_fname,
                      t2c_fname,
                      cdict_fname,
                      syn_fname,
                      threshold=0.7,
                      stypes_fname=None):
        """perform semantic indexing and encode corpus"""
        print("map UMLS concepts to (indexed) tokens")
        self.map_token2cuis(t2c_fname,
                            threshold=threshold,
                            stypes_fname=stypes_fname)
        # get UMLS concepts mapped to (indexed) tokens
        ix_concepts = {
            cui
            for cuis in self.token2cuis.values() for cui in cuis
            if cui != "__NULL__"
        }
        # create sql table to store relations between concepts associated to indexed tokens - allows for fast accessing compared to MRREL table
        print(
            "create table to store UMLS relations between concepts associated to (indexed) tokens - fast access is enabled by indexes"
        )
        self.umls.restrict_to_ix_concepts(ix_concepts, corpus_name)
        # create indexes to speed up requests
        self.umls.create_index("CUI1_" + corpus_name, ["CUI1"],
                               corpus_name)  # create index for subject column
        self.umls.create_index("CUI2_" + corpus_name, ["CUI2"],
                               corpus_name)  # create index for object column
        self.umls.create_index(
            "CUI1_CUI2_" + corpus_name, ["CUI1", "CUI2"],
            corpus_name)  # create multicolumn index (subj, obj)
        # encode corpus
        print("disambiguate polysemous tokens and encode corpus")
        enc_corpus = [
            self.s_wsd(doc, corpus_name, query=False)
            for doc in tqdm(pproc_corpus)
        ]
        # store synsets as dict of lists - enables json encoding
        self.synsets = {cui: list(syns) for cui, syns in self.synsets.items()}
        # store semantic data and encoded corpus
        with open(ecorpus_fname, 'w') as outf:
            json.dump(enc_corpus, outf)
        with open(cdict_fname, 'w') as outf:
            json.dump(self.concept_dict, outf)
        with open(syn_fname, 'w') as outf:
            json.dump(self.synsets, outf)
        # return encoded corpus
        return enc_corpus

    def load_enc_corpus(self, fname):
        """load encoded corpus"""
        with open(fname, 'r') as inf:
            enc_corpus = json.load(inf)
        return enc_corpus

    def preprocess_query(self, query):
        """pre process query"""
        pproc_query = self.preprocess_text(query)
        return pproc_query

    def encode_query(self, pproc_query, corpus_name):
        """disambiguate polysemous terms and encode query"""
        enc_query = self.s_wsd(pproc_query, corpus_name, query=True)
        if not enc_query:
            print("query does not contain known terms")
            return None
        else:
            return np.array(enc_query)

    def project_query(self,
                      query,
                      corpus_name,
                      word_embs,
                      proj_weights,
                      concept_embs=None):
        """project encoded query into dense vector of size [1, doc_embs]"""
        enc_query = self.encode_query(self.preprocess_query(query),
                                      corpus_name)
        if enc_query is None:
            return None
        else:
            if concept_embs is None:  # only terms are considered
                return np.matmul(proj_weights,
                                 np.mean(word_embs[enc_query[:, 0]], axis=0))
            else:  # terms + concepts are considered (i.e. senses)
                return np.matmul(
                    proj_weights,
                    np.mean(np.add(word_embs[enc_query[:, 0]],
                                   concept_embs[enc_query[:, 1]]),
                            axis=0))

    def semantic_search(self, doc_ids, docs, query_ids, queries,
                        ranking_folder, ranking_name):
        """perform search over queries using neural semantic models and return ranking"""
        doc_ids = np.array(doc_ids)
        print("compute similarities between docs and queries")
        similarities = cosine_similarity(docs, queries)
        out = open(ranking_folder + '/' + ranking_name + '.txt', 'w')
        for i in tqdm(range(similarities.shape[1])):
            rank = np.argsort(-similarities[:, i])[:1000]
            docs_rank = doc_ids[rank]
            qid = query_ids[i]
            if qid.isdigit():
                # cast to int and back to str - avoids storing topic ids as '0##' instead of '##'
                qid = str(int(qid))
            for j in range(len(docs_rank)):
                out.write('%s %s %s %d %f %s\n' %
                          (qid, 'Q0', docs_rank[j], j,
                           similarities[rank[j]][i], ranking_name))
        out.close()
        return True
Example No. 22
mecab = MeCab.Tagger("-Owakati")

# Words to exclude from the dictionary
words_blacklist = [
    ">>",  # chat annotation marker
    "some_agent",
    "\u3000",  # full-width space
    "。",
    "、",
]

dct = Dictionary()
# Read the CSV file
df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"])
# Tokenize each sentence, split on spaces, and drop the trailing newline token
wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1])
# Add the tokenized sentences to the dictionary
dct.add_documents(wakati_df)

# Get the dictionary ids of the blacklisted words
words_blacklist_id = dct.doc2idx(words_blacklist)
# Remove them from the dictionary
dct.filter_tokens(bad_ids=words_blacklist_id)
#dct.filter_n_most_frequent(600)

# Save the dictionary
dct.save(os.path.join(filedir, ".".join([filename, "dict"])))

# Print the dictionary contents and vocabulary size
print(dct.token2id)
print(len(dct.token2id))
Example No. 23
tokens = list()
for text in texts:
    tokens.append(simple_preprocess(text))

# Vectorize the text samples into a 2D integer tensor.

MAX_NUM_WORDS = 10000  # 2 words reserved: 0=pad, 1=oov
MAX_SEQUENCE_LENGTH = 1000

dictionary = Dictionary(tokens)
dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=MAX_NUM_WORDS - 2)

word_index = dictionary.token2id
print('Found %s unique tokens.' % len(word_index))

data = [dictionary.doc2idx(t) for t in tokens]

# Truncate and pad sequences.

data = [i[:MAX_SEQUENCE_LENGTH] for i in data]
data = np.array([
    np.pad(i, (0, MAX_SEQUENCE_LENGTH - len(i)),
           mode='constant',
           constant_values=-2) for i in data
],
                dtype=int)
data = data + 2  # shift so that pad (-2) becomes 0 and OOV (-1) becomes 1

print('Shape of data tensor:', data.shape)
print('Length of label vector:', len(labels))
Example No. 24
# read the stop-word list
file = codecs.open('stopwords.dic','r','utf-8')
stoplist = [line.strip() for line in file]
# read the dataset
file = codecs.open('data.dat','r','utf-8')
doc_set = [document.strip() for document in file]

texts = [] 
for i in doc_set: 
    raw = i.lower().strip()
    tokens = jieba.cut(raw)    
    stemmed_tokens = [word.strip() for word in tokens]
    stopped_tokens = [word for word in stemmed_tokens if word not in stoplist and len(word) > 1 and not re.search('[0-9]', word)]  
    texts.append(stopped_tokens)
dictionary = Dictionary(texts)
corpus =[dictionary.doc2idx(text) for text in texts]
corpus1=sequence.pad_sequences(corpus,maxlen=77)
# note: cross_validation was replaced by model_selection in newer scikit-learn releases
trainset, testset= cross_validation.train_test_split(corpus1, test_size=0.2, random_state=0)
n_topics=10
random_state=0
n_iter=10

class AMC:   
    def __init__(self, n_topics, n_iter, alpha=0.1, eta=0.01, random_state=None,
                 refresh=10):
        self.n_topics = n_topics
        self.n_iter = n_iter
        self.alpha = alpha
        self.eta = eta
        # if random_state is None, check_random_state(None) does nothing
        # other than return the current numpy RandomState
Example No. 25
class Vocab:
    def __init__(self):
        self.dictionary = Dictionary()
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'
        self.dictionary.dfs[-1] = 0

    def set(self, corpus, prune_at=2000000):
        self.dictionary.add_documents(corpus, prune_at)

    def prune(self, **kwargs):
        # pruning is best applied after all updates; otherwise tokens dropped
        # here but seen again in later update docs will produce wrong counts
        if self.dictionary.dfs == {}:
            raise ValueError('no vocab to filter; build vocab first')
        no_below = kwargs.get('no_below', 5)
        no_above = kwargs.get('no_above', 0.7)
        keep_n = kwargs.get('keep_n', 100000)
        keep_tokens = kwargs.get('keep_tokens', None)
        if keep_tokens:
            keep_tokens.append('<UNK>')
        else:
            keep_tokens = ['<UNK>']
        preprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.filter_extremes(no_below, no_above, keep_n,
                                        keep_tokens)
        postprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.dfs[-1] = preprune_count - postprune_count
        # add UNK back (gets pruned due to 0 initial val)
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'

    def update(self, docs, prune_at=2000000):
        self.set(docs, prune_at)

    def transform(self, docs, transform_to='ids', with_unk=True):
        if transform_to == 'ids':
            for doc in docs:
                yield self.dictionary.doc2idx(doc)
        elif transform_to == 'bow':
            for doc in docs:
                if with_unk:
                    yield self.doc2bow(doc)
                else:
                    yield self.dictionary.doc2bow(doc)
        else:
            raise ValueError('unknown transformation format')

    def fit_transform(self,
                      docs,
                      transform_to='ids',
                      prune_at=2000000,
                      filter_vocab=False,
                      **kwargs):
        self.set(docs, prune_at)
        if filter_vocab:
            self.prune(**kwargs)
        yield from self.transform(docs, transform_to)

    def merge(self, other):
        self.dictionary.merge_with(other)

    def save(self, fname, as_text=False, sort_by_word=False):
        if as_text:
            self.dictionary.save_as_text(fname, sort_by_word)
        else:
            self.dictionary.save(fname)

    def load(self, fname, from_text=False):
        if from_text:
            self.dictionary = Dictionary.load_from_text(fname)
        else:
            self.dictionary = Dictionary.load(fname)

    def __len__(self):
        return len(self.dictionary)

    def __iter__(self):
        return iter(self.dictionary)

    def keys(self):
        return list(self.dictionary.token2id.values())

    def __str__(self):
        return str(self.dictionary)

    def __getitem__(self, tokenid):
        return self.dictionary[tokenid]

    def doc2bow(self, document):
        # note: slight variation to BoW format conversion from gensim
        # to allow '<UNK>' tokens
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of unicode tokens on input, not a single string"
            )

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            if w in self.dictionary.token2id:
                counter[self.dictionary.token2id[w]] += 1
            else:
                counter[-1] += 1

        # return tokenids, in ascending id order
        counter = sorted(iteritems(counter))
        return counter
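A brief usage sketch of the Vocab wrapper (the toy corpus is made up):

corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
vocab = Vocab()
ids = list(vocab.fit_transform(corpus, transform_to='ids'))
bows = list(vocab.transform([['the', 'unseen', 'cat']], transform_to='bow'))
# unseen tokens are counted under id -1, i.e. '<UNK>'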
Example No. 26
BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max() + 2

ans = df['answer'].map(lambda a: f'{BOS}{a}{EOS}')

dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(list(d))], np.zeros(
        (size - len(d), len(dic)))))

x1 = np.array([padding_one_hot(q, q_maxlen) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(batch_shape=(None, q_maxlen, len(dic)))
enc_outputs = Dense(n_hidden)(Flatten()(Dense(n_hidden)(enc_inputs)))

enc_states = [enc_outputs]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
dec_inputs = Input(shape=(None, len(dic)))
dec_outputs, _ = decoder(dec_inputs, initial_state=enc_states)
Example No. 27
'''
token2id : dict of (str, int) - token -> tokenId
id2token : dict of (int, str)
dfs : dict of (int, int)
'''
# dct is assumed to be a gensim Dictionary built earlier in the original script
dct.token2id  # inspect the token-to-id mapping (0 1 2 3 4 5 6)

dct.dfs      # document frequency of each token

dct.num_pos  # number of tokens processed so far
dct.num_nnz  # similar to num_pos (number of non-zero entries in the BoW matrix)

dct.add_documents([['cat','bird','cute'],['动物','植物','panda']])  # add new documents
dct.token2id  # inspect the updated mapping

dct.doc2idx(['this','cat','is','cute'])  # look up token ids in the dictionary; unknown tokens return -1

# convert to a sparse BoW vector
dct.doc2bow(['this','is','a','cute','cat'],  # 8 (cute) and 9 (cat) each appear once
            return_missing=True,  # also return words missing from the dictionary
            allow_update=True     # add missing words, updating the dictionary in place
            )
dct.token2id
Example No. 28
                 names=('keyword', 'sentence'),
                 dtype='object')

keywords = [k.split(' ') for k in df['keyword'].values]
sentences = [[BOS] + s.split(' ') + [EOS] for s in df['sentence'].values]

q_maxlen = np.max([len(q) for q in keywords])
a_maxlen = np.max([len(a) for a in sentences])

print(f'question max size: {q_maxlen}, answer max size: {a_maxlen}')

dic = Dictionary(keywords + sentences)
dic.save(f'{dest_file_prefix}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(d)], np.zeros((size - len(d), len(dic)))))

x1 = np.array([padding_one_hot(q, q_maxlen) for q in keywords])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in sentences])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

encoder = GRU(n_hidden, return_state=True)
enc_inputs = Input(shape=(None, len(dic)))
enc_outputs, enc_h = encoder(enc_inputs)

enc_states = [enc_h]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
dec_inputs = Input(shape=(None, len(dic)))
dec_outputs, _ = decoder(dec_inputs, initial_state=enc_states)