def create_partition_function(self, f_w2v, f_h5):
    print("Building the partition function")

    # Load the model from disk
    M = load_w2vec()
    words = M.index2word

    ZT = []
    INPUT_ITR = tqdm.tqdm(words)

    # Compute the partition function for each word
    for w in INPUT_ITR:
        UE = self.energy(M.syn0, M[w])
        z = compute_partition_stats(UE)
        ZT.append(z)

    # Save the partition function to disk
    # (special care needed for h5py unicode strings)
    dt = h5py.special_dtype(vlen=unicode)

    with h5py.File(f_h5, 'w') as h5:
        h5.create_dataset("words", (len(words),),
                          dtype=dt,
                          data=[w.encode('utf8') for w in words])
        h5.attrs['vocab_N'] = len(words)
        h5['Z'] = ZT

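# Neither self.energy nor compute_partition_stats is defined in this
# section. Below is a minimal, self-contained sketch of the same idea,
# assuming the energy of a word is its dot product with every embedding
# vector and the reported statistic is a log-sum-exp over those energies.
# These names and formulas are assumptions, not the pipeline's definitions.

import numpy as np

def _sketch_energy(syn0, v):
    # Dot product of one word vector against the whole vocabulary
    return syn0.dot(v)

def _sketch_partition_stats(UE):
    # Log of the summed exponentiated energies, computed stably
    shift = UE.max()
    return shift + np.log(np.exp(UE - shift).sum())

# Toy vocabulary: 1000 words, 64-dimensional embeddings
_syn0 = np.random.normal(size=(1000, 64))
_Z = [_sketch_partition_stats(_sketch_energy(_syn0, w)) for w in _syn0]
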
def __init__(self, downsample_weights=None, *args, **kwargs):
    """
    Initialize the class, loading the word2vec model.
    If any words are given a downsample weight, they are applied here.

    Args:
        downsample_weights: dict mapping a word to its downsample weight
        *args: DOCUMENTATION_UNKNOWN
        **kwargs: DOCUMENTATION_UNKNOWN
    """

    # Load the model from disk
    self.M = load_w2vec()

    # Build the dictionary, and a mapping from word2index
    self.shape = self.M.wv.syn0.shape
    self.vocab = dict(zip(self.M.wv.index2word, range(self.shape[0])))

    self.DSW = np.ones(shape=len(self.vocab), dtype=float)

    # Guard against the default of no downsample weights
    if downsample_weights is None:
        downsample_weights = {}

    for word, weight in downsample_weights.items():

        if not self.check_word_vector(word):
            msg = "Downsample word '{}' not found in dictionary"
            logger.warning(msg.format(word))
            continue

        vec = self.get_word_vector(word)
        scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))
        scale = np.clip(scale, 0, 1)
        self.DSW *= scale

    # Make sure nothing has been set yet
    self.V = self._ref = None

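# The downsample scale applied above is exp(-weight * E.dot(v)) clipped to
# [0, 1]: words similar to a downsampled word are suppressed, unrelated
# words are left near 1. A standalone illustration on a toy embedding
# matrix (all names and numbers here are placeholders):

import numpy as np

_rng = np.random.default_rng(0)
_E = _rng.normal(size=(5, 8))                     # 5 words x 8 dimensions
_E /= np.linalg.norm(_E, axis=1, keepdims=True)   # unit-normalize rows

_v = _E[2]        # pretend word index 2 was given a downsample weight
_weight = 2.0

_scale = np.clip(np.exp(-_weight * _E.dot(_v)), 0, 1)
# _scale[2] is the smallest multiplier; dissimilar words stay at 1.0
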
def __init__(self, *args, **kwargs):
    super(affinity_mapping, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = load_w2vec()
    self.shape = self.M.wv.syn0.shape

    # Set parallel option
    self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])

    self.damping = float(kwargs["damping"])

    # Create the affinity file if it doesn't already exist
    if not os.path.exists(kwargs["f_affinity"]):
        h5 = h5py.File(kwargs["f_affinity"], 'w')
        h5.close()

    self.h5 = h5py.File(kwargs["f_affinity"], 'r+')

    # Expose the damping factor and model as module-level globals
    global damping, M
    damping = self.damping
    M = self.M

    self.vocab_n = len(M.wv.index2word)
    M.word2index = dict(zip(M.wv.index2word, range(self.vocab_n)))

    # Increment this as we find more clusters
    self.cluster_n = 0

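# The clustering step itself is not shown in this section. A damping
# factor and a precomputed similarity matrix are the usual inputs to
# affinity propagation; a hedged sketch of that pattern with scikit-learn
# (not the pipeline's actual clustering call) looks like this:

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

_X, _ = make_blobs(n_samples=20, centers=2, random_state=0)

# Negative squared Euclidean distance is the canonical AP similarity
_S = -np.square(_X[:, None, :] - _X[None, :, :]).sum(axis=-1)

_ap = AffinityPropagation(damping=0.75, affinity="precomputed",
                          random_state=0)
_labels = _ap.fit_predict(_S)
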
def __init__(self, *args, **kwargs):
    '''
    Computes various measures of central tendency of a document.
    For Z_X scores, the raw word tokens are summed over the partition
    function. For I_X scores, the same statistics are computed over
    the similarity of all word pairs for words with the top 10% Z
    values.

    This will precompute the partition function if it doesn't exist.
    '''
    cfg_embed = simple_config.load()["embedding"]
    cfg_score = simple_config.load()["score"]

    f_w2v = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_embed["w2v_embedding"]["f_db"],
    )

    f_partition_function = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_score["document_log_probability"]["f_partition_function"],
    )

    if not os.path.exists(f_partition_function):
        self.create_partition_function(f_w2v, f_partition_function)

    self.Z = self.load_partition_function(f_partition_function)
    self.scores = []

    val = cfg_score["document_log_probability"]["intra_document_cutoff"]
    self.intra_document_cutoff = float(val)

    self.model = load_w2vec()

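# The keys read above imply a config layout roughly like the following.
# This is a hypothetical sketch: the file names and the cutoff value are
# placeholders, and the real structure is whatever simple_config.load()
# parses.

_example_config = {
    "embedding": {
        "output_data_directory": "data_embeddings",
        "w2v_embedding": {"f_db": "w2v.gensim"},
    },
    "score": {
        "document_log_probability": {
            "f_partition_function": "partition_function.h5",
            "intra_document_cutoff": 0.10,
        },
    },
}
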
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = load_w2vec()
    self.shape = self.M.wv.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.wv.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    # self._PARALLEL = kwargs["_PARALLEL"]

    if "negative_weights" in kwargs:
        NV = []
        for word, weight in kwargs["negative_weights"].items():

            if not self.check_word_vector(word):
                msg = "Negative weight word '{}' not found in dictionary"
                print(msg.format(word))
                continue

            vec = self.get_word_vector(word)
            scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))

            # Don't oversample, max out weights to unity
            scale[scale > 1] = 1.0
            NV.append(scale)

        self.negative_weights = np.array(NV).T.sum(axis=1)
    else:
        self.negative_weights = np.ones(vocab_n, dtype=float)

    # Save the target column to compute
    self.target_column = simple_config.load()["target_column"]

    # Make sure nothing has been set yet
    self.V = self._ref = None

    # Set the variables for reduced representation
    config_score = simple_config.load()["score"]
    self.compute_reduced = config_score["compute_reduced_representation"]

    if self.compute_reduced:
        sec = config_score['reduced_representation']
        self.reduced_n_components = sec['n_components']

    self.h5py_args = {"compression": "gzip"}

def describe_clusters(self, **kwargs):
    W = load_w2vec()

    meta_clusters = self.load_centroid_dataset("meta_centroids")
    n_clusters = meta_clusters.shape[0]

    # Find the closest words to each centroid
    all_words = []
    for i in range(n_clusters):
        v = meta_clusters[i]

        # Higher dot product against the centroid means closer
        sim = W.syn0.dot(v)
        idx = np.argsort(sim)[::-1][:10]

        words = [W.index2word[k].replace('PHRASE_', '') for k in idx]
        all_words.append(u' '.join(words))

    return np.array(all_words)

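# Note that W.syn0.dot(v) ranks words by raw dot product, while gensim's
# similar_by_vector ranks by cosine similarity of the normalized vectors;
# the two orderings can differ when vector norms vary. A toy comparison
# of the two rankings (every name here is a placeholder):

import numpy as np

_rng = np.random.default_rng(1)
_syn0 = _rng.normal(size=(100, 16))
_index2word = ["word_{}".format(i) for i in range(100)]
_v = _syn0[7]

_top_dot = np.argsort(_syn0.dot(_v))[::-1][:10]
_cos = _syn0.dot(_v) / (np.linalg.norm(_syn0, axis=1) * np.linalg.norm(_v))
_top_cos = np.argsort(_cos)[::-1][:10]

# Compare [_index2word[k] for k in _top_dot] against
#         [_index2word[k] for k in _top_cos]
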
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = load_w2vec()
    self.shape = self.M.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    # self._PARALLEL = kwargs["_PARALLEL"]

    # Load the negative weights
    if "negative_weights" in kwargs:
        neg_W = kwargs["negative_weights"]
        self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
        self.neg_vec = dict((k, self.get_word_vector(k)) for k in neg_W)
    else:
        self.neg_W = {}
        self.neg_vec = {}

    # Save the target column to compute
    self.target_column = simple_config.load()["target_column"]

    # Make sure nothing has been set yet
    self.V = self._ref = None

    # Set the variables for reduced representation
    config_score = simple_config.load()["score"]
    self.compute_reduced = config_score["compute_reduced_representation"]

    if self.compute_reduced:
        sec = config_score['reduced_representation']
        self.reduced_n_components = sec['n_components']

def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    return load_w2vec()

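# Typical usage, assuming the returned object is a gensim word2vec model
# (as the surrounding code's use of wv.syn0 and index2word suggests);
# the query word below is a placeholder and must exist in the vocabulary.
#
#     M = load_embeddings()
#     M.wv.syn0.shape                       # (vocab_size, embedding_dim)
#     M.wv.most_similar("protein", topn=5)  # nearest neighbours of a word
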
def analyze_metacluster_from_config(config):
    '''
    Performs analysis on metaclusters to return descriptive
    information and statistics.

    Args:
        config: a config file
    '''

    score_method = config["metacluster"]["score_method"]

    config = config["postprocessing"]
    topn_words_returned = config["topn_words_returned"]

    save_dest = config['output_data_directory']
    os.system('mkdir -p {}'.format(save_dest))

    model = uds.load_w2vec()
    ORG = uds.load_ORG_data(config["master_columns"])

    MC = uds.load_metacluster_data()
    C = MC["meta_centroids"]

    DV = uds.load_document_vectors(score_method)

    # Fix any zero vectors with random ones
    dim = DV["docv"].shape[1]
    idx = np.where(np.linalg.norm(DV["docv"], axis=1) == 0)[0]
    for i in idx:
        vec = np.random.uniform(size=(dim,))
        vec /= np.linalg.norm(vec)
        DV["docv"][i] = vec

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])

    if config["compute_dispersion"]:
        logger.info("Computing intra-document dispersion.")
        dist = _compute_dispersion_matrix(DV["docv"], MC["meta_labels"])

        # Compute the linkage and the dispersion order
        linkage = hierarchy.linkage(dist, method='average')
        d_idx = hierarchy.dendrogram(linkage, no_plot=True)["leaves"]
    else:
        # If dispersion is not calculated, set d_idx to the cluster index
        d_idx = np.sort(labels)

    V = DV["docv"]

    data = []
    for cx, cluster_id in zip(C, labels):
        idx = MC["meta_labels"] == cluster_id

        item = {}
        item["counts"] = idx.sum()
        item["avg_centroid_distance"] = _compute_centroid_dist(V[idx], cx)

        if config["compute_dispersion"]:
            item["intra_document_dispersion"] = dist[cluster_id, cluster_id]
        else:
            item["intra_document_dispersion"] = -1

        # Compute the closest words to the centroid
        desc = ' '.join(
            w for w, _ in
            model.wv.similar_by_vector(cx, topn=topn_words_returned))
        item["word2vec_description"] = desc

        data.append(item)

    df = pd.DataFrame(data, index=labels)
    df.index.name = "cluster_id"
    df["dispersion_order"] = d_idx

    cols = [
        "dispersion_order",
        "counts",
        "avg_centroid_distance",
        "intra_document_dispersion",
        "word2vec_description",
    ]
    df = df[cols]

    f_csv = os.path.join(save_dest, "cluster_desc.csv")
    df.to_csv(f_csv, index_label="cluster_id")

    logger.info("Computing master-label spreadsheets.")
    cluster_lookup = dict(zip(df.index, df.dispersion_order.values))

    ORG["cluster_id"] = MC["meta_labels"]
    ORG["dispersion_order"] = -1
    for i, j in cluster_lookup.items():
        idx = ORG["cluster_id"] == i
        ORG.loc[idx, "dispersion_order"] = j

    special_cols = ["_ref", "cluster_id", "dispersion_order"]
    cols = [x for x in ORG.columns if x not in special_cols]
    ORG = ORG[special_cols + cols]

    f_csv = os.path.join(save_dest, "cluster_master_labels.csv")
    ORG.to_csv(f_csv, index=False)

    # Output the result to stdout
    print(df)

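# _compute_dispersion_matrix and _compute_centroid_dist are not defined in
# this section. Below is a minimal sketch of a dispersion matrix that is
# consistent with how it is used above (the diagonal read as intra-cluster
# dispersion, the full matrix fed to hierarchical linkage); the statistic
# actually used by the pipeline may differ.

import numpy as np
from scipy.spatial.distance import cdist

def _sketch_dispersion_matrix(docv, labels):
    # Mean pairwise cosine distance between the documents of each pair of
    # clusters; entry [i, i] is the dispersion within cluster i.
    ids = np.unique(labels)
    D = np.zeros((len(ids), len(ids)))
    for a, i in enumerate(ids):
        for b, j in enumerate(ids):
            D[a, b] = cdist(docv[labels == i],
                            docv[labels == j],
                            metric="cosine").mean()
    return D

# Toy example: 30 documents in 3 clusters
_docv = np.random.default_rng(0).normal(size=(30, 8))
_labels = np.random.default_rng(1).integers(0, 3, size=30)
_D = _sketch_dispersion_matrix(_docv, _labels)
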