    def testMerge(self):
        d = Dictionary(self.texts)
        f = Dictionary(self.texts[:3])
        g = Dictionary(self.texts[3:])

        f.merge_with(g)
        self.assertEqual(sorted(d.token2id.keys()), sorted(f.token2id.keys()))
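
These examples all exercise gensim's Dictionary.merge_with, which folds another id-to-token mapping into the dictionary in place (tokens already present keep their existing ids, new tokens receive fresh ones) and returns a VocabTransform that can re-map documents expressed in the other mapping's ids. A minimal standalone sketch of the property the test above asserts; the toy corpus is illustrative, since the test's self.texts fixture is not shown here:

from gensim.corpora import Dictionary

texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"],
         ["graph", "trees"],
         ["graph", "minors", "trees"],
         ["minors", "survey"]]

full = Dictionary(texts)        # vocabulary built from the whole corpus
head = Dictionary(texts[:3])    # vocabulary of the first three documents
tail = Dictionary(texts[3:])    # vocabulary of the remaining documents

transformer = head.merge_with(tail)   # merges in place, returns a VocabTransform

# merging the two partial vocabularies reproduces the full vocabulary
assert sorted(full.token2id.keys()) == sorted(head.token2id.keys())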
Example #3
def create_dictionary(path):
    # Build one vocabulary for a corpus organised on disk as <path>/<year>/<month>/.
    dictionary = Dictionary()
    for year in os.listdir(path):
        for month in os.listdir(os.path.join(path, year)):
            # one temporary dictionary per month of documents, merged into the global one
            dict_temp = Dictionary(ReadFilesDir(os.path.join(path, year, month)))
            dictionary.merge_with(dict_temp)
            print(month)
    return dictionary
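
A hypothetical call, assuming the corpus is laid out as <path>/<year>/<month>/ directories of text files and that ReadFilesDir (defined elsewhere in the source module) yields tokenised documents for a directory:

# hypothetical layout: news_corpus/2019/01/*.txt, news_corpus/2019/02/*.txt, ...
dictionary = create_dictionary("news_corpus")
print(len(dictionary), "unique tokens collected across all months")
dictionary.save("news_corpus.dict")   # persist the merged vocabulary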
Example #4
def load_input(dataset_path):
    # dataset_path: directory with one sub-directory per class tag; files are split ~80/20 into train/test
    xy_train = []
    xy_test = []

    (x_train, y_train), (x_test, y_test) = ([], []), ([], [])

    for tag in filter(lambda x: x[0] != '.', listdir(dataset_path)):
        path = dataset_path + "/" + tag
        num_files = len([
            f for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])
        k = 0
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            k += 1
            with open(path + "/" + file, "r") as f:
                # last element of the token array is the tag
                tokens = np.array((clean_str(f.read()) + " " + tag).split())
            if k < num_files * 0.8:
                xy_train.append(tokens)
            else:
                xy_test.append(tokens)

    vocab_train = Dictionary(xy_train)
    vocab_test = Dictionary(xy_test)
    vocab_train.merge_with(vocab_test)

    for xy in xy_train:
        y = xy[-1]
        y_train.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_train.append(words)

    for xy in xy_test:
        y = xy[-1]
        y_test.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_test.append(words)

    return (np.array(x_train),
            np.array(y_train)), (np.array(x_test),
                                 np.array(y_test)), vocab_train
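
A hypothetical call, assuming dataset_path holds one sub-directory per class tag; the split above is roughly 80/20 per tag, taken in directory-listing order:

# hypothetical layout: reviews/pos/*.txt, reviews/neg/*.txt
(x_train, y_train), (x_test, y_test), vocab = load_input("reviews")
print(len(x_train), "training docs,", len(x_test), "test docs,", len(vocab), "vocabulary entries")

Example #5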
def training():

	train_gs = ["train/STS2012-en-train/STS.gs.MSRpar.txt", "train/STS2012-en-train/STS.gs.MSRvid.txt",
"train/STS2012-en-train/STS.gs.SMTeuroparl.txt"]

	train_input = ["train/STS2012-en-train/STS.input.MSRpar.txt","train/STS2012-en-train/STS.input.MSRvid.txt",
"train/STS2012-en-train/STS.input.SMTeuroparl.txt"]

	train_align = ["trainalign/2012/STS.alignment.MSRpar.txt", "trainalign/2012/STS.alignment.MSRvid.txt",
"trainalign/2012/STS.alignment.SMTeuroparl.txt"]

	dictionary = Dictionary([])
	features = []
	labels = []
	aligns = []
	for i in range(len(train_input)):
		sentencesA, sentencesB, idfDict, aligns, NEs_A, NEs_B, spw_A, spw_B = read_sentences(train_input[i],train_align[i])
		features += sentence_vector_similarity(sentencesA, sentencesB, embeddings, idfDict, aligns, NEs_A, NEs_B, spw_A, spw_B)
		dictionary.merge_with(Dictionary(sentencesA+sentencesB))
		# read gold standard
		with open(train_gs[i], "rb") as f:
			labels += map(float, f.read().strip().split())
	corpus_A = []
	corpus_B = []
	for i in range(len(train_input)):
		sentencesA, sentencesB, _, _, _, _, _, _ = read_sentences(train_input[i],train_align[i])
		for doc in sentencesA:
			corpus_A.append(dictionary.doc2bow(doc))
		for doc in sentencesB:
			corpus_B.append(dictionary.doc2bow(doc))
	NUM_TPC = 14
	topicModel = LdaModel(corpus_A+corpus_B, num_topics = NUM_TPC)
	assert len(corpus_A)==len(corpus_B)==len(features) == len(labels)
	for i in xrange(len(corpus_A)):

		vectorA = numpy.zeros(NUM_TPC)
		vectorB = numpy.zeros(NUM_TPC)
		for j,prob in topicModel[corpus_A[i]]:
			vectorA[j] = prob
		for j,prob in topicModel[corpus_B[i]]:
			vectorB[j] = prob
		if numpy.linalg.norm(vectorA) == 0 or numpy.linalg.norm(vectorB) == 0:
			features[i].append(0.)
		else:
			features[i].append(cosine_similarity(vectorA, vectorB))

	# train model
	# model = MLPRegressor(hidden_layer_sizes = (100,100), max_iter = 10000,
	# 									activation = 'logistic')
	model = Ridge()
	model.fit(features, labels)
	return model,topicModel, dictionary
Example #6
def get_dict():
    global PAD_token
    global SOS_token
    global EOS_token

    dct = Dictionary([['<PAD>']])
    default_dct = Dictionary([DEFAULT_TOKENS])
    dct.merge_with(default_dct)

    PAD_token = dct.token2id['<PAD>']
    SOS_token = dct.token2id['<SOS>']
    EOS_token = dct.token2id['<EOS>']

    return dct
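
A hypothetical use, assuming DEFAULT_TOKENS is something like ['<PAD>', '<SOS>', '<EOS>'] (the actual constant is defined elsewhere in the source module). Because merge_with keeps the ids already present in dct, '<PAD>' retains id 0 and the other special tokens get the next free ids:

DEFAULT_TOKENS = ['<PAD>', '<SOS>', '<EOS>']   # hypothetical value

dct = get_dict()
print(PAD_token, SOS_token, EOS_token)   # 0 for '<PAD>', plus the ids assigned during the merge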
Example #7
class Sentences:
    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs

        self.tp = TextProcessing(dir='')

        self.dictionary = Dictionary()
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def __iter__(self):
        logging.info("Loading corpus in file %s" % self.corpus_file)

        i = 0
        for line in open(self.corpus_file, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add the line (treated as a single document) to the dictionary.
            # merge_with updates self.dictionary in place and returns a
            # VocabTransform, so its return value must not be assigned back.
            d2 = Dictionary([stemmed_tokens])
            self.dictionary.merge_with(d2)

            # stream the cleaned line as one document
            yield stemmed_tokens

            # count number of documents and break if > num_docs
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)
Example #8
def load_inputTrainingTest(training_path, test_path):
    xy_train = []
    xy_test = []

    (x_train, y_train), (x_test, y_test) = ([], []), ([], [])

    for tag in filter(lambda x: x[0] != '.', listdir(training_path)):
        path = training_path + "/" + tag
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            with open(path + "/" + file, "r") as f:
                xy_train.append(np.array(
                    (clean_str(f.read()) + " " +
                     tag).split()))  # last element of collection is the tag

    for tag in filter(lambda x: x[0] != '.', listdir(test_path)):
        path = test_path + "/" + tag
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            with open(path + "/" + file, "r") as f:
                xy_test.append(np.array(
                    (clean_str(f.read()) + " " +
                     tag).split()))  # last element of collection is the tag

    vocab_train = Dictionary(xy_train)
    vocab_test = Dictionary(xy_test)
    vocab_train.merge_with(vocab_test)

    for xy in xy_train:
        y = xy[-1]
        y_train.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_train.append(words)

    for xy in xy_test:
        y = xy[-1]
        y_test.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_test.append(words)

    return (np.array(x_train),
            np.array(y_train)), (np.array(x_test),
                                 np.array(y_test)), vocab_train
Example #9
    def compile_gensim_vocab(self, tf_vectorizer, vocabulary_outpath):
        '''
        Extract the vocabulary from a fitted sklearn CountVectorizer and save it
        in Gensim's Dictionary format.
        '''
        print('\nCreating and saving vocabulary from sklearn CountVectorizer using '
              'Gensim Dictionary')
        start = datetime.now()

        sklearn_vocab = tf_vectorizer.vocabulary_

        vocabulary_gensim = {}
        for key, val in sklearn_vocab.items():
            vocabulary_gensim[val] = key
        vocabulary = Dictionary()
        vocabulary.merge_with(vocabulary_gensim)

        vocabulary.save(vocabulary_outpath)

        end = datetime.now()
        print("   Time taken: {}".format(end - start))

        return vocabulary
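
merge_with accepts any plain id-to-token mapping, not only another Dictionary, which is why inverting sklearn's token-to-id vocabulary_ into an ordinary dict is enough here. A minimal end-to-end sketch of the same idea outside the class (the toy documents are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary

docs = ["the cat sat on the mat", "the dog chased the cat"]
tf_vectorizer = CountVectorizer()
tf_vectorizer.fit(docs)

# invert sklearn's token -> id mapping into the id -> token mapping merge_with expects
id2token = {idx: token for token, idx in tf_vectorizer.vocabulary_.items()}
vocabulary = Dictionary()
vocabulary.merge_with(id2token)

print(vocabulary.token2id)   # every CountVectorizer feature is now a Dictionary entry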
Example #10
    validation_extra_features.append(
        [feats + token_feat for token_feat in token_feats])

logging.info('Extra features created')

dictionary = Dictionary([["<OOV>", "<PAD>"]])

x_train = [[remove_duplicates_char(token.lower_) for token in doc]
           for doc in train_docs]
train_dictionary = Dictionary(x_train)

train_selected_dictionary = Dictionary(
    [[remove_duplicates_char(token.lower_) for token in doc]
     for doc in train_selected_docs])
train_dictionary.filter_extremes(no_above=0.6, no_below=10)
dictionary.merge_with(train_selected_dictionary)
dictionary.merge_with(train_dictionary)
dictionary.save(join(stg.MODELS_DIR, 'rnn_spacy_tokens_dict'))

x_train_indexed = [[
    dictionary.token2id.get(remove_duplicates_char(token.lower_), 0)
    for token in doc
] for doc in train_docs]
x_validation_indexed = [[
    dictionary.token2id.get(remove_duplicates_char(token.lower_), 0)
    for token in doc
] for doc in validation_docs]

if ARGS.load_embedding_matrix == 'y':
    embedding_matrix = joblib.load(
        filename=join(stg.MODELS_DIR, 'embedding_matrix'))
Example #11
# imports required by this snippet; the source module may obtain these elsewhere
from collections import defaultdict

from gensim.corpora import Dictionary
from six import iteritems, string_types


class Vocab:
    def __init__(self):
        self.dictionary = Dictionary()
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'
        self.dictionary.dfs[-1] = 0

    def set(self, corpus, prune_at=2000000):
        self.dictionary.add_documents(corpus, prune_at)

    def prune(self, **kwargs):
        # it is best to prune only after all updates are done; otherwise tokens
        # dropped during pruning but seen again in later update() calls will
        # produce wrong counts
        if self.dictionary.dfs == {}:
            raise ValueError('no vocab to filter; build vocab first')
        no_below = kwargs.get('no_below', 5)
        no_above = kwargs.get('no_above', 0.7)
        keep_n = kwargs.get('keep_n', 100000)
        keep_tokens = kwargs.get('keep_tokens', None)
        if keep_tokens:
            keep_tokens.append('UNK')
        else:
            keep_tokens = ['UNK']
        preprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.filter_extremes(no_below, no_above, keep_n,
                                        keep_tokens)
        postprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.dfs[-1] = preprune_count - postprune_count
        # add UNK back (gets pruned due to 0 initial val)
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'

    def update(self, docs, prune_at=2000000):
        self.dictionary.add_documents(docs, prune_at)

    def transform(self, docs, transform_to='ids', with_unk=True):
        if transform_to == 'ids':
            for doc in docs:
                yield self.dictionary.doc2idx(doc)
        elif transform_to == 'bow':
            for doc in docs:
                if with_unk:
                    yield self.doc2bow(doc)
                else:
                    yield self.dictionary.doc2bow(doc)
        else:
            raise ValueError('unknown transformation format')

    def fit_transform(self,
                      docs,
                      transform_to='ids',
                      prune_at=2000000,
                      filter_vocab=False,
                      **kwargs):
        self.set(docs, prune_at)
        if filter_vocab:
            self.prune(**kwargs)
        yield from self.transform(docs, transform_to)

    def merge(self, other):
        self.dictionary.merge_with(other)

    def save(self, fname, as_text=False, sort_by_word=False):
        if as_text:
            self.dictionary.save_as_text(fname, sort_by_word)
        else:
            self.dictionary.save(fname)

    def load(self, fname, from_text=False):
        if from_text:
            self.dictionary = Dictionary.load_from_text(fname)
        else:
            self.dictionary = Dictionary.load(fname)

    def __len__(self):
        return len(self.dictionary)

    def __iter__(self):
        return iter(self.dictionary)

    def keys(self):
        return list(self.dictionary.token2id.values())

    def __str__(self):
        return str(self.dictionary)

    def __getitem__(self, tokenid):
        return self.dictionary[tokenid]

    def doc2bow(self, document):
        # note: slight variation to BoW format conversion from gensim
        # to allow '<UNK>' tokens
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of unicode tokens on input, not a single string"
            )

        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            if w in self.dictionary.token2id:
                counter[self.dictionary.token2id[w]] += 1
            else:
                counter[-1] += 1

        # return tokenids, in ascending id order
        counter = sorted(iteritems(counter))
        return counter
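
A short usage sketch of the Vocab wrapper on a toy corpus. The with_unk path is the reason to prefer vocab.doc2bow over the raw gensim call: words missing from the vocabulary are counted under the reserved id -1 instead of being dropped:

corpus = [["neural", "networks", "learn", "representations"],
          ["representations", "matter"],
          ["networks", "of", "neurons"]]

vocab = Vocab()
vocab.set(corpus)

ids = list(vocab.transform(corpus, transform_to='ids'))    # one list of token ids per document
bows = list(vocab.transform(corpus, transform_to='bow'))   # (id, count) pairs, <UNK> included

print(vocab.doc2bow(["neural", "networks", "win"]))   # "win" is out of vocabulary -> counted as (-1, 1)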