Example #1
File: vsm.py  Project: tttthomasssss/wort
class VSMVectorizer(BaseEstimator, VectorizerMixin):
	def __init__(self, window_size, weighting='ppmi', min_frequency=0, lowercase=True, stop_words=None, encoding='utf-8',
				 max_features=None, preprocessor=None, tokenizer=None, analyzer='word', binary=False, sppmi_shift=0,
				 token_pattern=r'(?u)\b\w\w+\b', decode_error='strict', strip_accents=None, input='content',
				 ngram_range=(1, 1), cds=1., dim_reduction=None, svd_dim=None, svd_eig_weighting=1, random_state=1105,
				 context_window_weighting='constant', add_context_vectors=True, word_white_list=set(),
				 subsampling_rate=None, cache_intermediary_results=False, cache_path='~/.wort_data/model_cache',
				 log_level=logging.INFO, log_file=None):
		"""
		TODO: documentation...
		:param window_size:
		:param weighting:
		:param min_frequency:
		:param lowercase:
		:param stop_words:
		:param encoding:
		:param max_features:
		:param preprocessor:
		:param tokenizer:
		:param analyzer:
		:param binary:
		:param sppmi_shift:
		:param token_pattern:
		:param decode_error:
		:param strip_accents:
		:param input:
		:param ngram_range:
		:param random_state:
		:param cds:
		:param dim_reduction:
		:param svd_dim:
		:param svd_eig_weighting:
		:param context_window_weighting: weighting of the context window under consideration (must be either "constant", "harmonic", "distance" or "aggressive")
		:param add_context_vectors:
		:param word_white_list:
		:param subsampling_rate:
		:param cache_intermediary_results:
		:param cache_path:
		:param log_level:
		:param log_file:
		:return:
		"""

		# Support for asymmetric context windows
		if (isinstance(window_size, tuple)):
			if (len(window_size) > 1):
				self.l_window_size = window_size[0]
				self.r_window_size = window_size[1]
			else:
				self.l_window_size = window_size[0]
				self.r_window_size = window_size[0]
		else:
			self.l_window_size = window_size
			self.r_window_size = window_size
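		# e.g. window_size=5 yields a symmetric 5-token window on either side, whereas
		# window_size=(2, 5) uses 2 tokens of left context and 5 tokens of right context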

		self.weighting = weighting
		self.min_frequency = min_frequency
		self.lowercase = lowercase
		self.stop_words = stop_words
		self.encoding = encoding
		self.max_features = max_features
		self.preprocessor = preprocessor
		self.tokenizer = tokenizer
		self.analyzer = analyzer
		self.binary = binary
		self.sppmi_shift = sppmi_shift
		self.token_pattern = token_pattern
		self.decode_error = decode_error
		self.strip_accents = strip_accents
		self.input = input
		self.ngram_range = ngram_range
		self.context_window_weighting = context_window_weighting
		self.random_state = random_state
		self.cds = cds
		self.svd_dim = svd_dim
		self.svd_eig_weighting = svd_eig_weighting
		self.dim_reduction = dim_reduction
		self.add_context_vectors = add_context_vectors
		self.word_white_list = word_white_list
		self.subsampling_rate = subsampling_rate
		self.cache_intermediary_results = cache_intermediary_results
		if (cache_path is not None and cache_path.startswith('~')):
			cache_path = os.path.expanduser(cache_path)
		if (not os.path.exists(cache_path)):
			os.makedirs(cache_path)
		self.cache_path = cache_path

		self.inverted_index_ = {}
		self.index_ = {}
		self.p_w_ = None
		self.vocab_count_ = 0
		self.token_count_ = 0
		self.M_ = None
		self.T_ = None
		self.density_ = 0.

		self.config_registry_ = ConfigRegistry(path=cache_path, min_frequency=self.min_frequency, lowercase=self.lowercase,
											   stop_words=self.stop_words, encoding= self.encoding, max_features=self.max_features,
											   preprocessor=self.preprocessor, tokenizer=self.tokenizer, analyzer=self.analyzer,
											   token_pattern=self.token_pattern, decode_error=self.decode_error,
											   strip_accents=self.strip_accents, input=self.input, ngram_range=self.ngram_range,
											   random_state=self.random_state, subsampling_rate=self.subsampling_rate,
											   wort_white_list=self.word_white_list, window_size=window_size,
											   context_window_weighting=self.context_window_weighting, binary=binary,
											   weighting=weighting, cds=cds, sppmi_shift=sppmi_shift)
		self.io_handler_ = IOHandler(cache_path=cache_path, log_file=log_file, log_level=log_level)
		self.io_handler_.setup_logging()

	def _delete_from_vocab(self, W, idx):
		W = np.delete(W, idx)

		for i in idx:
			item = self.index_[i]
			del self.inverted_index_[item]
			del self.index_[i]

		return W

	def fit_vocabulary(self, raw_documents, analyser=None):
		if (analyser is None):
			analyser = self.build_analyzer()

		n_vocab = -1
		w = array.array('i')
		white_list_idx = set()

		# Extract vocabulary
		for doc in tqdm(raw_documents):
			for feature in analyser(doc):
				idx = self.inverted_index_.get(feature, n_vocab+1)

				# Build vocab
				if (idx > n_vocab):
					n_vocab += 1
					self.inverted_index_[feature] = n_vocab
					w.append(1)
				else:
					w[idx] += 1

				# Build white_list index
				if (feature in self.word_white_list):
					white_list_idx.add(idx)

		# n_vocab was used as a 0-based index (the first item got index 0 while the counter was initialised to -1),
		# so it has to be incremented by 1 to reflect the true vocab count
		n_vocab += 1
		logging.info('Finished Extracting vocabulary! n_vocab={}'.format(n_vocab))

		W = np.array(w, dtype=np.uint64)
		self.index_ = dict(zip(self.inverted_index_.values(), self.inverted_index_.keys()))

		logging.info('Filtering extremes...')
		# Filter extremes
		if (self.min_frequency > 1):
			idx = np.where(W < self.min_frequency)[0]

			if (len(self.word_white_list) > 0): # Take word_white_list into account - TODO: is there a better way?
				idx = np.array(list(set(idx.tolist()) - white_list_idx))

			W = self._delete_from_vocab(W, idx)

			n_vocab -= len(idx)

		# Max Features Filter
		if (self.max_features is not None and self.max_features < n_vocab):
			idx = np.argpartition(-W, self.max_features)[self.max_features:]

			if (len(self.word_white_list) > 0): # Take word_white_list into account - TODO: is there a better way?
				idx = np.array(list(set(idx.tolist()) - white_list_idx))

			W = self._delete_from_vocab(W, idx)

			n_vocab -= len(idx)

		# Subsampling TODO: this can certainly be optimised
		token_count = W.sum()
		if (self.subsampling_rate is not None):
			rnd = np.random.RandomState(self.random_state)
			t = self.subsampling_rate * token_count

			cand_idx = np.where(W > t)[0] # idx of words exceeding the threshold

			P = 1 - np.sqrt(t / W) # `word2vec` subsampling formula: discard probability 1 - sqrt(t / count)
			R = rnd.rand(W.shape[0])

			subsample_idx = np.where(R <= P)[0] # idx of words randomly drawn for removal

			idx = np.intersect1d(cand_idx, subsample_idx) # only discard words that actually exceed the threshold

			if (len(self.word_white_list) > 0): # Take word_white_list into account - TODO: is there a better way?
				idx = np.array(list(set(idx.tolist()) - white_list_idx))

			W = self._delete_from_vocab(W, idx)

			n_vocab -= len(idx)
		logging.info('Finished Filtering extremes! n_vocab={}; n_tokens={}'.format(n_vocab, token_count))

		self.p_w_ = W / token_count
		self.vocab_count_ = n_vocab
		self.token_count_ = token_count

		# Watch out when rebuilding the index, `self.index_` needs to be built _before_ `self.inverted_index_`
		# to reflect the updated `W` array
		self.index_ = dict(zip(range(n_vocab), self.index_.values()))
		self.inverted_index_ = dict(zip(self.index_.values(), self.index_.keys()))

	def fit_cooccurrence_matrix(self, raw_documents, analyser=None):
		if (analyser is None):
			analyser = self.build_analyzer()
		# TODO: This needs optimisation
		# https://en.wikipedia.org/wiki/Feature_hashing#Feature_vectorization_using_the_hashing_trick
		# http://datascience.stackexchange.com/questions/9918/optimizing-co-occurrence-matrix-computation
		# The construction of the co-occurrence matrix can also be chunked, the size of the vocabulary
		# is known in advance (as is the number of tokens), so the construction below, which is memory heavy
		# could be chunked into several bits to ease the memory hunger of the loops a bit
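		# A (hypothetical) chunked variant could, for instance, build a partial csr_matrix every
		# N documents and sum the partial matrices: csr addition keeps the result sparse, and the
		# final shape (vocab_count x vocab_count) is already known at this point.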
		logging.info('Constructing co-occurrence matrix...')
		# Incrementally construct coo matrix (see http://www.stefanoscerra.it)
		# This can be parallelised (inverted_index is shared and immutable and the rest is just a matrix)
		rows = array.array('I') #rows = array.array('i')
		cols = array.array('I') #cols = array.array('i')
		data = array.array('I' if self.context_window_weighting == 'constant' else 'f')

		if (isinstance(self.context_window_weighting, Callable)):
			window_weighting_fn = self.context_window_weighting
		else:
			window_weighting_fn = getattr(context_weighting, '{}_window_weighting'.format(self.context_window_weighting))

		for doc in tqdm(raw_documents):
			buffer = array.array('i')
			for feature in analyser(doc):
				if (feature in self.inverted_index_):
					buffer.append(self.inverted_index_[feature])

			# Track co-occurrences
			l = len(buffer)
			for i in range(l):
				# Backward co-occurrences
				for distance, j in enumerate(range(max(i-self.l_window_size, 0), i), 1):
					rows.append(buffer[i])
					cols.append(buffer[j])
					data.append(window_weighting_fn(-distance, self.l_window_size)) # The -distance is a bit of an ugly hack to support non-symmetric weighting

				# Forward co-occurrences
				for distance, j in enumerate(range(i+1, min(i+self.r_window_size+1, l)), 1):
					rows.append(buffer[i])
					cols.append(buffer[j])
					data.append(window_weighting_fn(distance, self.r_window_size))

		# TODO: This is still a bit of a bottleneck
		#		Either cythonize the shit
		#		Or chunk it up and create several sparse arrays that get added (?)
		logging.info('Numpyifying co-occurrence data...')
		data = np.array(data, dtype=np.uint8 if self.context_window_weighting == 'constant' else np.float64, copy=False)
		rows = np.array(rows, dtype=np.uint32, copy=False)
		cols = np.array(cols, dtype=np.uint32, copy=False)

		logging.info('Creating sparse matrix...')
		# Create a csr_matrix straight away!!!
		dtype = np.uint64 if self.context_window_weighting == 'constant' else np.float64
		self.M_ = sparse.csr_matrix((data.astype(dtype), (rows, cols)), shape=(self.vocab_count_, self.vocab_count_))
		logging.info('M.shape={}'.format(self.M_.shape))

		# Apply Binarisation
		if (self.binary):
			self.M_ = self.M_.minimum(1)

	def fit_pmi_matrix(self):
		logging.info('Applying {} weight transformation...'.format(self.weighting))

		self.T_ = sparse.lil_matrix(self.M_.shape, dtype=np.float64)

		logging.info('Applying CDS...')

		# Marginals for context (with optional context distribution smoothing)
		p_c = self.p_w_ ** self.cds if self.cds != 1 else self.p_w_

		'''
		PMI is the log of the joint probability of w and c divided by the product of their marginals
		PMI = log(P(w, c) / (P(c) * P(w)))

		The joint probability can be expressed as a conditional probability via the chain rule
		P(w, c) = P(c | w) * P(w)
		P(w, c) = P(w | c) * P(c)

		Plugging this into the PMI calculation results in
		PMI = log(P(c | w) * P(w) / (P(w) * P(c)))

		This allows P(w) (or P(c), depending on how the chain rule is applied) to be eliminated
		PMI = log(P(c | w) / P(c))
		'''
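		# Tiny worked example (illustrative numbers only): if w and c co-occur 10 times out of
		# 1000 co-occurrence tokens, P(w, c) = 0.01; with P(w) = 0.05 and P(c) = 0.04,
		# PMI = log(0.01 / (0.05 * 0.04)) = log(5). Equivalently, P(c | w) = 0.01 / 0.05 = 0.2
		# and PMI = log(0.2 / 0.04) = log(5) - the same value, which is what the code below exploits.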

		logging.info('Calculating PMI the new and fancy way...')

		# Need the conditional probability P(c | w) and the marginal P(c), but need to maintain the sparsity structure of the matrix
		# Doing it this way, keeps the matrices sparse: http://stackoverflow.com/questions/3247775/how-to-elementwise-multiply-a-scipy-sparse-matrix-by-a-broadcasted-dense-1d-arra
		P_w = sparse.lil_matrix(self.M_.shape, dtype=np.float64)
		P_c = sparse.lil_matrix(self.M_.shape, dtype=np.float64)
		P_w.setdiag(1 / self.M_.sum(axis=1))
		P_c.setdiag(1 / p_c)

		logging.info('type(P_w)={}; P_w.shape={}; type(P_c)={}; P_c.shape={}'.format(type(P_w), P_w.shape, type(P_c), P_c.shape))

		'''
		(P_w * self.M_) calculates the conditional probability P(c | w) vectorised and rowwise while keeping the matrices sparse
		Multiplication by P_c (which contains the reciprocal 1 / p_c values), achieves the division by P(c)
		'''
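		# Concretely: P_w is a diagonal matrix holding 1 / row-sums of M, so (P_w * M) rescales each
		# row of M to sum to 1 (i.e. P(c | w) per row); right-multiplying by the diagonal P_c then
		# divides every column c by p_c, all without densifying anything.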
		PMI = (P_w * self.M_) * P_c

		logging.info('type(PMI)={}; PMI.shape={}'.format(type(PMI), PMI.shape))

		# Perform log on the nonzero elements of PMI
		data = np.log(PMI.data)
		rows, cols = PMI.nonzero()

		logging.info('Applying the PMI option')
		# TODO: with the new & optimised PMI variant, some of the assets required by the other PMI options need to calculated
		# TODO: explicitely, hence that needs to be supported properly
		# ...apply the PMI variant (e.g. PPMI, SPPMI, PLMI or PNPMI)
		if (isinstance(self.weighting, Callable)):
			fn_feat_transformation = self.weighting
		else:
			fn_feat_transformation = getattr(feature_transformation, '{}_transformation'.format(self.weighting))

		PMI = fn_feat_transformation(sparse.csr_matrix((data, (rows, cols)), shape=self.M_.shape, dtype=np.float64), None, self.p_w_, p_c)
		logging.info('after weight option, type(PMI)={}, PMI.shape={}'.format(type(PMI), PMI.shape))

		# Apply shift
		if (self.sppmi_shift is not None and self.sppmi_shift > 0):
			logging.info('Applying shift={}...'.format(self.sppmi_shift))
			rows, cols = PMI.nonzero()
			data = np.full(rows.shape, self.sppmi_shift, dtype=np.float64)
			PMI -= sparse.csr_matrix((data, (rows, cols)), shape=PMI.shape, dtype=np.float64)
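			# With sppmi_shift = log(k) this corresponds to the SPPMI weighting of Levy & Goldberg,
			# i.e. max(PMI - log(k), 0) once the maximum(0) threshold below is applied; only the
			# explicitly stored nonzero cells are shifted here, which preserves sparsity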

		logging.info('Applying the threshold [type(PMI)={}]...'.format(type(PMI)))
		# Apply threshold
		self.T_ = PMI.maximum(0)
		logging.info('PMI ALL DONE [type(self.T_)={}]'.format(type(self.T_)))

		self.density_ = len(self.T_.nonzero()[0]) / (self.T_.shape[0] * self.T_.shape[1])
		logging.info('Returning [density={}]...'.format(self.density_))

	def fit_dimensionality_reduction(self):
		# Apply SVD
		if (self.dim_reduction == 'svd'):
			logging.info('Applying SVD with dimensionality={}...'.format(self.svd_dim))
			Ut, S, Vt = sparsesvd(self.T_.tocsc() if sparse.issparse(self.T_) else sparse.csc_matrix(self.T_), self.svd_dim)

			# Perform Context Weighting
			S = sparse.csr_matrix(np.diag(S ** self.svd_eig_weighting))

			W = sparse.csr_matrix(Ut.T).dot(S)
			V = sparse.csr_matrix(Vt.T).dot(S)
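			# svd_eig_weighting controls the exponent on the singular values (W = U * S^p);
			# p = 0.5 together with adding the context vectors is a common choice in the
			# PPMI + SVD literature (e.g. Levy, Goldberg & Dagan, 2015)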

			# Add context vectors
			if (self.add_context_vectors):
				self.T_ = W + V
			else:
				self.T_ = W

	def fit(self, raw_documents, y=None):

		# Shameless copy/paste from Radim's word2vec tutorial, no generators matey, need multi-pass!!!
		if (raw_documents is not None):
			if (isinstance(raw_documents, GeneratorType)):
				raise TypeError('You can\'t pass a generator as the raw_documents argument. Try an iterator.')

		analyser = self.build_analyzer()

		##### FIT VOCABULARY
		vocab_folder = self.config_registry_.vocab_cache_folder()
		if (vocab_folder is not None and vocab_folder != ''):
			# Load cached resources
			logging.info('Loading cached vocabulary resources from {}...'.format(os.path.join(self.cache_path, vocab_folder)))
			self.p_w_ = self.io_handler_.load_p_w(vocab_folder)
			self.vocab_count_ = self.io_handler_.load_vocab_count(vocab_folder)
			self.index_ = self.io_handler_.load_index(vocab_folder)
			self.inverted_index_ = self.io_handler_.load_inverted_index(vocab_folder)
			logging.info('Cache loaded!')
		else:
			# Create vocabulary
			logging.info('Fitting vocabulary...')
			self.fit_vocabulary(raw_documents=raw_documents, analyser=analyser)
			logging.info('Vocabulary fitted!')

			# Cache vocabulary
			if (self.cache_intermediary_results):
				sub_folder = self.config_registry_.register_vocab()
				logging.info('Storing vocabulary cache to folder {}...'.format(sub_folder))
				self.io_handler_.save_index(self.index_, sub_folder)
				self.io_handler_.save_inverted_index(self.inverted_index_, sub_folder)
				self.io_handler_.save_vocab_count(self.vocab_count_, sub_folder)
				self.io_handler_.save_p_w(self.p_w_, sub_folder)
				logging.info('Cache stored!')
		##################

		##### FIT CO-OCCURRENCE MATRIX
		cooc_folder = self.config_registry_.cooccurrence_matrix_folder()
		if (cooc_folder is not None and cooc_folder != ''):
			logging.info('Loading cached co-occurrence matrix resources from {}...'.format(os.path.join(self.cache_path, cooc_folder)))
			self.M_ = self.io_handler_.load_cooccurrence_matrix(cooc_folder)
			logging.info('Cache loaded!')
		else:
			logging.info('Fitting co-occurrence matrix...')
			self.fit_cooccurrence_matrix(raw_documents=raw_documents, analyser=analyser)
			logging.info('Co-occurrence matrix fitted!')

			# Cache co-occurrence matrix
			if (self.cache_intermediary_results):
				sub_folder = self.config_registry_.register_cooccurrence_matrix()
				logging.info('Storing co-occurrence matrix cache to folder {}...'.format(sub_folder))
				self.io_handler_.save_cooccurrence_matrix(self.M_, sub_folder)
				logging.info('Cache stored!')
		##################

		##### FIT PMI FEATURE TRANSFORMATION
		pmi_folder = self.config_registry_.pmi_matrix_folder()
		if (pmi_folder is not None and pmi_folder != ''):
			logging.info('Loading cached PMI matrix resources from {}...'.format(os.path.join(self.cache_path, pmi_folder)))
			self.T_ = self.io_handler_.load_pmi_matrix(pmi_folder)
			logging.info('Cache loaded!')
		else:
			logging.info('Fitting PMI matrix...')
			self.fit_pmi_matrix()
			logging.info('PMI matrix fitted!')

			# Cache PMI matrix
			if (self.cache_intermediary_results):
				sub_folder = self.config_registry_.register_pmi_matrix()
				logging.info('Storing PMI matrix cache to folder {}...'.format(sub_folder))
				self.io_handler_.save_pmi_matrix(self.T_, sub_folder)
				logging.info('Cache stored!')
		##################

		##### FIT DIMENSIONALITY REDUCTION
		logging.info('Fitting dimensionality reduction...')
		self.fit_dimensionality_reduction()
		logging.info('Dimensionality reduction fitted!')
		##################

		return self

	def transform(self, raw_documents, as_matrix=False, oov='zeros'):
		'''

		:param raw_documents:
		:param as_matrix:
		:param oov: handling of OOV entries; "ignore" doesn't return anything for an OOV item, "random" returns a random vector, "zeros" (default) returns a vector of zeros and "ones" returns a vector of ones.
		:return:
		'''
		analyser = self.build_analyzer()

		if (isinstance(oov, Callable)):
			oov_fn = oov
		else:
			oov_fn = getattr(oov_handler, '{}_oov_handler'.format(oov))

		l = []
		# Peek if a list or a string are passed
		if (isinstance(raw_documents, list)):
			for doc in raw_documents:
				d = []
				for feature in analyser(doc):
					if (feature in self):
						d.append(self[feature])
					else:
						if (oov != 'ignore'):
							d.append(oov_fn((1, self.get_vector_size()), self.T_.dtype, self.density_, self.random_state))
				l.append(d)

			# Convert list of lists of sparse vectors to list of sparse matrices (scipy doesn't support sparse tensors afaik)
			if (as_matrix):
				ll = []
				for l_doc in l:
					X = l_doc.pop(0)
					for x in l_doc:
						X = sparse.vstack((X, x))
					ll.append(X)
				return ll
		else:
			for feature in analyser(raw_documents):
				if (feature in self):
					l.append(self[feature])
				else:
					if (oov != 'ignore'):
						l.append(oov_fn((1, self.get_vector_size()), self.T_.dtype, self.density_, self.random_state))

			# Convert list of sparse vectors to sparse matrix
			if (as_matrix):
				X = l.pop(0)
				for x in l:
					X = sparse.vstack((X, x))
				return X

		return l

	def fit_transform(self, raw_documents, y=None, as_matrix=False, oov='zeros'):
		self.fit(raw_documents)
		return self.transform(raw_documents, as_matrix=as_matrix, oov=oov)

	def to_dict(self):
		d = {}
		nnz_col_idx = 1 if sparse.issparse(self.T_) else 0
		for i in self.index_.keys():
			feature_dict = {}
			for col_idx in self.T_[i].nonzero()[nnz_col_idx]:
				if (self.index_[col_idx] != self.index_[i]): # Avoid self co-occurrences
					feature_dict[self.index_[col_idx]] = self.T_[i, col_idx]
			d[self.index_[i]] = feature_dict

		return d

	def get_matrix(self):
		return self.T_

	def get_index(self):
		return self.index_

	def get_inverted_index(self):
		return self.inverted_index_

	def get_vector_size(self):
		return self.T_.shape[1]

	def __getitem__(self, item):
		return self.T_[self.inverted_index_[item]]

	def __contains__(self, item):
		return item in self.inverted_index_

	@classmethod
	def load_from_file(cls, path, as_dict=False):
		model = cls(window_size=0, cache_path=path)
		model.T_ = model.io_handler_.load_pmi_matrix('')
		model.index_ = model.io_handler_.load_index('')
		model.inverted_index_ = model.io_handler_.load_inverted_index('')
		model.p_w_ = model.io_handler_.load_p_w('')

		return model

	def save_to_file(self, path, as_dict=False):
		# If as_dict=True, call to_dict on self.T_ prior to serialisation
		# Store a few type infos in a metadata file, e.g. the type of self.T_
		# Get all params as well
		self.io_handler_.save_pmi_matrix(self.T_, sub_folder='', base_path=path)
		self.io_handler_.save_index(self.index_, sub_folder='', base_path=path)
		self.io_handler_.save_inverted_index(self.inverted_index_, sub_folder='', base_path=path)
		self.io_handler_.save_p_w(self.p_w_, sub_folder='', base_path=path)
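
A minimal usage sketch (the toy corpus is made up and the `from wort.vsm import VSMVectorizer` import path is an assumption based on the file/project shown above):

from wort.vsm import VSMVectorizer

corpus = [
	'the quick brown fox jumps over the lazy dog',
	'the lazy dog sleeps all day',
]

# Symmetric 5-token window with the default PPMI weighting
vsm = VSMVectorizer(window_size=5)
vsm.fit(corpus)

# Per-word lookup via __contains__ / __getitem__
if ('dog' in vsm):
	v = vsm['dog']  # sparse 1 x vocab_count PPMI row vector

# Transform documents into lists of vectors (or stacked sparse matrices)
X = vsm.transform(corpus, as_matrix=True, oov='zeros')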