def create_partition_function(self, f_w2v, f_h5):
        print("Building the partition function")

        # Load the model from disk
        M = load_w2vec()

        words = M.index2word
        ZT = []
        INPUT_ITR = tqdm.tqdm(words)

        # Compute the partition function for each word
        for w in INPUT_ITR:
            UE = self.energy(M.syn0, M[w])
            z = compute_partition_stats(UE)
            ZT.append(z)

        # Save the partition function to disk
        # (special care needed for h5py unicode strings)
        dt = h5py.special_dtype(vlen=str)  # vlen=unicode under Python 2

        with h5py.File(f_h5, 'w') as h5:

            h5.create_dataset("words", (len(words),),
                              dtype=dt,
                              data=[w.encode('utf8') for w in words])

            h5.attrs['vocab_N'] = len(words)
            h5['Z'] = ZT
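The loop above depends on two helpers that this snippet does not define: self.energy and compute_partition_stats. A minimal standalone sketch follows; the names and call signatures come from the call sites above, but the bodies are assumptions (a dot-product energy and a logsumexp partition value).

import numpy as np
from scipy.special import logsumexp

def energy(syn0, v):
    # Dot product of the word vector v against every row of the embedding
    # matrix. In the snippet above this is a method (self.energy); it is
    # shown standalone here for clarity.
    return syn0.dot(v)

def compute_partition_stats(UE):
    # Stable log of the partition sum, log(sum_w exp(E_w)). Assumed behavior;
    # only the name comes from the call site above.
    return logsumexp(np.asarray(UE, dtype=float))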
Example #2
    def __init__(self, downsample_weights=None, *args, **kwargs):
        """
        Initialize the class, loading the word2vec model. If any words are
        given a downsample weight then they are applied here.

        Args:
            *args: DOCUMENTATION_UNKNOWN
            **kwargs: DOCUMENTATION_UNKNOWN
        """

        # Load the model from disk
        self.M = load_w2vec()

        # Build the dictionary, and a mapping from word2index
        self.shape = self.M.wv.syn0.shape
        self.vocab = dict(zip(self.M.wv.index2word, range(self.shape[0])))

        self.DSW = np.ones(shape=len(self.vocab), dtype=float)

        for word, weight in (downsample_weights or {}).items():

            if not self.check_word_vector(word):
                msg = "Downsample word '{}' not found in dictionary"
                logger.warning(msg.format(word))
                continue

            vec = self.get_word_vector(word)
            scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))
            scale = np.clip(scale, 0, 1)

            self.DSW *= scale

        # Make sure nothing has been set yet
        self.V = self._ref = None
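check_word_vector and get_word_vector are called above but defined elsewhere in the class. A plausible sketch, assuming they are thin lookups against the self.vocab mapping built in __init__ (only the names come from the call sites; the bodies are guesses):

    def check_word_vector(self, word):
        # True if the word is present in the embedding vocabulary
        return word in self.vocab

    def get_word_vector(self, word):
        # Return the embedding row for the word
        return self.M.wv.syn0[self.vocab[word]]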
Example #3
    def __init__(self, *args, **kwargs):
        super(affinity_mapping, self).__init__(*args, **kwargs)

        # Load the model from disk
        self.M = load_w2vec()
        self.shape = self.M.wv.syn0.shape

        # Set parallel option
        self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])

        self.damping = float(kwargs["damping"])

        # Open the affinity file for read/write, creating it if it does not exist
        self.h5 = h5py.File(kwargs["f_affinity"], 'a')

        # Module-level globals let parallel workers reuse the model and damping
        global damping, M

        damping = self.damping
        M = self.M

        self.vocab_n = len(M.wv.index2word)

        M.word2index = dict(zip(M.wv.index2word, range(self.vocab_n)))

        # Increment this as we find more clusters
        self.cluster_n = 0
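The module-level globals damping and M set above suggest that worker functions outside the class reuse them when _PARALLEL is enabled. A hypothetical worker sketch is shown below; the use of scikit-learn's AffinityPropagation is an assumption based on the class name and the damping parameter, not something shown in this example.

from sklearn.cluster import AffinityPropagation

def _affinity_worker(word_indices):
    # Hypothetical worker: cluster a subset of word vectors using the
    # module-level globals set in __init__, so that each parallel worker
    # avoids reloading the word2vec model.
    X = M.wv.syn0[word_indices]
    S = X.dot(X.T)  # similarity matrix from raw dot products
    ap = AffinityPropagation(damping=damping, affinity='precomputed')
    return ap.fit_predict(S)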
Example #4
    def __init__(self, *args, **kwargs):
        '''
        Computes various measures of central tendency of a document.
        For Z_X scores, the raw word tokens are summed over the partition
        function. For I_X scores, the same statistics are computed over
        the similarity of all word pairs for words with top 10% Z values.
        This will precompute the partition function if it doesn't exist.
        '''
        cfg_embed = simple_config.load()["embedding"]
        cfg_score = simple_config.load()["score"]

        f_w2v = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_embed["w2v_embedding"]["f_db"],
        )

        f_partition_function = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_score["document_log_probability"]["f_partition_function"],
        )

        if not os.path.exists(f_partition_function):
            self.create_partition_function(f_w2v, f_partition_function)

        self.Z = self.load_partition_function(f_partition_function)
        self.scores = []

        val = cfg_score["document_log_probability"]["intra_document_cutoff"]
        self.intra_document_cutoff = float(val)

        self.model = load_w2vec()
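load_partition_function is called above but not shown in this listing. A minimal sketch, assuming it simply reads back the "words" and "Z" datasets written by create_partition_function in the first example; the dict return type is a guess.

    def load_partition_function(self, f_h5):
        import h5py  # local import so the sketch is self-contained

        with h5py.File(f_h5, 'r') as h5:
            words = [w.decode('utf8') if isinstance(w, bytes) else w
                     for w in h5['words'][:]]
            Z = h5['Z'][:]

        return dict(zip(words, Z))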
Example #5
    def __init__(self, *args, **kwargs):
        super(generic_document_score, self).__init__(*args, **kwargs)

        # Load the model from disk
        self.M = load_w2vec()
        self.shape = self.M.wv.syn0.shape

        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.wv.index2word, range(vocab_n)))

        # Set parallel option (currently does nothing)
        # self._PARALLEL = kwargs["_PARALLEL"]

        if "negative_weights" in kwargs:
            NV = []
            for word, weight in kwargs["negative_weights"].items():

                if not self.check_word_vector(word):
                    msg = "Negative weight word '{}' not found in dictionary"
                    print(msg.format(word))
                    continue

                vec = self.get_word_vector(word)
                scale = np.exp(-float(weight) * self.M.wv.syn0.dot(vec))

                # Don't oversample, max out weights to unity
                scale[scale > 1] = 1.0
                NV.append(scale)

            self.negative_weights = np.array(NV).T.sum(axis=1)

        else:
            self.negative_weights = np.ones(vocab_n, dtype=float)

        # Save the target column to compute
        self.target_column = simple_config.load()["target_column"]

        # Make sure nothing has been set yet
        self.V = self._ref = None

        # Set the variables for reduced representation
        config_score = simple_config.load()["score"]
        self.compute_reduced = config_score["compute_reduced_representation"]

        if self.compute_reduced:
            sec = config_score['reduced_representation']
            self.reduced_n_components = sec['n_components']

        self.h5py_args = {"compression": "gzip"}
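The negative-weight block above damps every vocabulary word in proportion to its similarity with the flagged word, scale = exp(-weight * syn0.dot(vec)), capped at one. A tiny standalone illustration with made-up vectors (all numbers here are invented):

import numpy as np

syn0 = np.array([[1.0, 0.0],   # nearly identical to the flagged vector
                 [0.5, 0.5],   # moderately similar
                 [0.0, 1.0]])  # orthogonal
vec = np.array([1.0, 0.0])     # vector of the negatively weighted word
weight = 2.0

scale = np.exp(-weight * syn0.dot(vec))
scale[scale > 1] = 1.0         # cap at unity, as in the snippet above
print(scale)                   # approximately [0.135, 0.368, 1.0]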
Example #6
    def describe_clusters(self, **kwargs):

        W = load_w2vec()

        meta_clusters = self.load_centroid_dataset("meta_centroids")
        n_clusters = meta_clusters.shape[0]

        # Find the closest items to each centroid
        all_words = []

        for i in range(n_clusters):
            v = meta_clusters[i]

            dist = W.syn0.dot(v)
            idx = np.argsort(dist)[::-1][:10]

            words = [W.index2word[j].replace('PHRASE_', '') for j in idx]

            all_words.append(u' '.join(words))

        return np.array(all_words)
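describe_clusters ranks the vocabulary by a raw dot product against each centroid. If the loaded model exposes the newer KeyedVectors interface (model.wv, as in the later examples of this listing), the same query can be written with gensim's similar_by_vector, which ranks by cosine similarity; a hypothetical variant:

    def describe_clusters_gensim(self, **kwargs):
        # Hypothetical variant of the method above using gensim's
        # similar_by_vector (cosine similarity) instead of a raw dot product.
        W = load_w2vec()
        meta_clusters = self.load_centroid_dataset("meta_centroids")

        all_words = []
        for v in meta_clusters:
            pairs = W.wv.similar_by_vector(v, topn=10)
            words = [w.replace('PHRASE_', '') for w, _ in pairs]
            all_words.append(u' '.join(words))

        return np.array(all_words)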
Example #7
    def __init__(self, *args, **kwargs):
        super(generic_document_score, self).__init__(*args, **kwargs)

        # Load the model from disk
        self.M = load_w2vec()
        self.shape = self.M.syn0.shape

        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

        # Set parallel option (currently does nothing)
        # self._PARALLEL = kwargs["_PARALLEL"]

        # Load the negative weights
        if "negative_weights" in kwargs:
            neg_W = kwargs["negative_weights"]
            self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
            self.neg_vec = dict((k, self.get_word_vector(k))
                                for k, v in neg_W.items())
        else:
            self.neg_W = {}
            self.neg_vec = {}

        # Save the target column to compute
        self.target_column = simple_config.load()["target_column"]

        # Make sure nothing has been set yet
        self.V = self._ref = None

        # Set the variables for reduced representation
        config_score = simple_config.load()["score"]
        self.compute_reduced = config_score["compute_reduced_representation"]

        if self.compute_reduced:
            sec = config_score['reduced_representation']
            self.reduced_n_components = sec['n_components']
Example #8
def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    return load_w2vec()
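A minimal usage sketch for the wrapper above; the query word is invented and the .wv interface is assumed from the other examples in this listing.

M = load_embeddings()
print(M.wv.most_similar('science', topn=5))  # hypothetical query word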
Example #9
def analyze_metacluster_from_config(config):
    '''
    Analyzes the metaclusters and writes descriptive information and
    statistics to CSV.

    Args:
        config: a loaded configuration dictionary
    '''

    score_method = config["metacluster"]["score_method"]
    config = config["postprocessing"]
    topn_words_returned = config["topn_words_returned"]

    save_dest = config['output_data_directory']
    os.makedirs(save_dest, exist_ok=True)

    model = uds.load_w2vec()
    ORG = uds.load_ORG_data(config["master_columns"])

    MC = uds.load_metacluster_data()
    C = MC["meta_centroids"]

    DV = uds.load_document_vectors(score_method)

    # Fix any zero vectors with random ones
    dim = DV["docv"].shape[1]
    idx = np.where(np.linalg.norm(DV["docv"], axis=1) == 0)[0]
    for i in idx:
        vec = np.random.uniform(size=(dim, ))
        vec /= np.linalg.norm(vec)
        DV["docv"][i] = vec

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])

    if config["compute_dispersion"]:
        logger.info("Computing intra-document dispersion.")
        dist = _compute_dispersion_matrix(DV["docv"], MC["meta_labels"])

        # Compute the linkage and the order
        linkage = hierarchy.linkage(dist, method='average')
        d_idx = hierarchy.dendrogram(linkage, no_plot=True)["leaves"]

    else:
        # If dispersion is not calculated, fall back to the sorted cluster labels
        d_idx = np.sort(labels)

    #

    V = DV["docv"]
    data = []
    for cx, cluster_id in zip(C, labels):
        idx = MC["meta_labels"] == cluster_id

        item = {}
        item["counts"] = idx.sum()
        item["avg_centroid_distance"] = _compute_centroid_dist(V[idx], cx)

        if config["compute_dispersion"]:
            item["intra_document_dispersion"] = dist[cluster_id, cluster_id]
        else:
            item["intra_document_dispersion"] = -1

        # Compute closest words to the centroid
        desc = ' '.join(
            w for w, _ in
            model.wv.similar_by_vector(cx, topn=topn_words_returned))
        item["word2vec_description"] = desc

        data.append(item)

    df = pd.DataFrame(data, index=labels)

    df.index.name = "cluster_id"
    df["dispersion_order"] = d_idx

    cols = [
        "dispersion_order", "counts", "avg_centroid_distance",
        "intra_document_dispersion", "word2vec_description"
    ]

    df = df[cols]

    f_csv = os.path.join(save_dest, "cluster_desc.csv")
    df.to_csv(f_csv, index_label="cluster_id")

    logger.info("Computing master-label spreadsheets.")
    cluster_lookup = dict(zip(df.index, df.dispersion_order.values))
    ORG["cluster_id"] = MC["meta_labels"]
    ORG["dispersion_order"] = -1

    for i, j in cluster_lookup.items():
        idx = ORG["cluster_id"] == i
        ORG.loc[idx, "dispersion_order"] = j

    special_cols = ["_ref", "cluster_id", "dispersion_order"]
    cols = [x for x in ORG.columns if x not in special_cols]

    ORG = ORG[special_cols + cols]

    f_csv = os.path.join(save_dest, "cluster_master_labels.csv")
    ORG.to_csv(f_csv, index=False)

    print(df)  # Output the result to stdout
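_compute_centroid_dist and _compute_dispersion_matrix are private helpers that do not appear in this listing. A minimal sketch of the centroid-distance helper, assuming it returns the average cosine distance between a cluster's document vectors and its centroid (only the name and arguments come from the call site; the body is a guess):

import numpy as np

def _compute_centroid_dist(X, centroid):
    # Average cosine distance between each row of X and the centroid.
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True)
    cn = centroid / np.linalg.norm(centroid)
    return float(np.mean(1.0 - Xn.dot(cn)))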