def load(self):
    """
    Make sure the vector data is in memory, loading it on first use.

    Does nothing when `self.small_frame` is already populated. Otherwise
    the frame is read from `self.vector_filename` (unless `self.frame` was
    supplied already), its labels are coerced to '/c/en/...' form when they
    don't look like ConceptNet URIs, its rows are sorted by label, and
    `self.k`, `self.small_k` and `self.small_frame` are filled in. Finally
    the trie is rebuilt via `self._build_trie()`.

    Raises MissingVectorSpace if an OSError occurs while loading.
    """
    already_loaded = self.small_frame is not None
    if already_loaded:
        return

    try:
        if self.frame is None:
            self.frame = load_hdf(self.vector_filename)

        uses_conceptnet_uris = self.frame.index[1].startswith('/c/')
        if not uses_conceptnet_uris:
            # The labels are bare terms rather than ConceptNet URIs. Treat
            # them as English and only prepend the English language tag,
            # with no other normalization, so that the vectors are
            # evaluated exactly as they were provided.
            self.finder = None
            self.frame.index = ['/c/en/' + term for term in self.frame.index]

        index_is_sorted = self.frame.index.is_monotonic_increasing
        if not index_is_sorted:
            self.frame = self.frame.sort_index()

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Keep an independent copy of the leading `small_k` columns.
        self.small_frame = self.frame.iloc[:, :self.small_k].copy()
    except OSError:
        raise MissingVectorSpace(
            "Couldn't load the vector space %r. Do you need to build or "
            "download it?" % self.vector_filename
        )

    self._build_trie()
def load(self):
    """
    Make sure the vector data is in memory, loading it on first use.

    Does nothing when `self.small_frame` is already populated. Otherwise
    the frame is read from `self.vector_filename` (unless `self.frame` was
    supplied already), its labels are coerced to '/c/en/...' form when they
    don't look like ConceptNet URIs, its rows are sorted by label, and
    `self.k`, `self.small_k` and `self.small_frame` are filled in. Finally
    the trie is rebuilt via `self._build_trie()`.

    Raises MissingVectorSpace if an OSError occurs while loading.
    """
    already_loaded = self.small_frame is not None
    if already_loaded:
        return

    try:
        if self.frame is None:
            self.frame = load_hdf(self.vector_filename)

        uses_conceptnet_uris = self.frame.index[1].startswith('/c/')
        if not uses_conceptnet_uris:
            # The labels are bare terms rather than ConceptNet URIs. Treat
            # them as English and only prepend the English language tag,
            # with no other normalization, so that the vectors are
            # evaluated exactly as they were provided.
            self.frame.index = ['/c/en/' + term for term in self.frame.index]

        index_is_sorted = self.frame.index.is_monotonic_increasing
        if not index_is_sorted:
            self.frame = self.frame.sort_index()

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Keep an independent copy of the leading `small_k` columns.
        self.small_frame = self.frame.iloc[:, :self.small_k].copy()
    except OSError:
        raise MissingVectorSpace(
            "Couldn't load the vector space %r. Do you need to build or "
            "download it?" % self.vector_filename
        )

    self._build_trie()
def load(self):
    """
    Make sure the vector data is in memory, loading it on first use.

    Does nothing when `self.small_frame` is already populated. Otherwise
    the frame is read from `self.vector_filename` (unless `self.frame` was
    supplied already), `self.standardized` records whether the first label
    already looks like a ConceptNet URI, and `self.k`, `self.small_k` and
    `self.small_frame` are filled in.

    Raises MissingVectorSpace if an OSError occurs while loading.
    """
    already_loaded = self.small_frame is not None
    if already_loaded:
        return

    try:
        if self.frame is None:
            self.frame = load_hdf(self.vector_filename)

        # FIXME: is self.standardized used for anything?
        self.standardized = self.frame.index[0].startswith('/c/')
        if not self.standardized:
            # The labels are bare terms rather than ConceptNet URIs. Treat
            # them as English and only prepend the English language tag,
            # with no other normalization, so that the vectors are
            # evaluated exactly as they were provided.
            self.finder = None
            self.frame.index = ['/c/en/' + term for term in self.frame.index]

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Keep an independent copy of the leading `small_k` columns.
        self.small_frame = self.frame.iloc[:, :self.small_k].copy()
    except OSError:
        raise MissingVectorSpace(
            "Couldn't load the vector space %r. Do you need to build or "
            "download it?" % self.vector_filename
        )
def graph_comparison(table_filename, out_filename='data/stats/eval-graph.png'):
    """
    Draw a grouped bar chart comparing embedding systems across evaluations.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by evaluation
    name, and must provide 'acc', 'low' and 'high' columns. One bar is drawn
    per (system, evaluation) pair at height 'acc', with an error bar running
    from 'low' to 'high'.

    The chart is written to `out_filename` as a 300-dpi image; the default
    keeps the path this function has always written to.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    width = 0.15
    evals = ['men3000', 'rw', 'mturk', 'ws353', 'story-cloze', 'sat-analogies']
    eval_labels = [
        'MEN-3000', 'Rare Words', 'MTurk-771', 'WS353', 'Story Cloze',
        'SAT analogies',
    ]
    colors = [props['color'] for props in plt.rcParams['axes.prop_cycle']]
    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('LexVec', 'data/raw/vectors/lexvec.no-header.vectors.gz'),
        ('ConceptNet PPMI', 'data/vectors/conceptnet-55-ppmi.h5'),
        ('ConceptNet Numberbatch 16.09', 'data/vectors/numberbatch.h5'),
    ]

    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(12, 6))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 1.0)
    ax.legend([name for (name, _) in systems])
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    plt.ylabel('Evaluation score', fontsize='x-large')
    plt.savefig(out_filename, dpi=300)
def graph_comparison(table_filename, out_filename):
    """
    Draw a grouped bar chart comparing embedding systems across evaluations.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by evaluation
    name, and must provide 'acc', 'low' and 'high' columns. One bar is drawn
    per (system, evaluation) pair at height 'acc', with an error bar running
    from 'low' to 'high'.

    The chart is written to `out_filename` at 300 dpi.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = ['men3000', 'rw', 'mturk', 'ws353', 'semeval-2a-en']
    eval_labels = [
        'MEN-3000',
        'Rare Words',
        'MTurk-771',
        'WordSim-353',
        'SemEval 2017-2a',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]
    systems = [
        (
            'word2vec Google News',
            'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz',
        ),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch', 'data/vectors/numberbatch.h5'),
    ]

    # Each evaluation's group of bars occupies 0.84 of the unit spacing.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel(
        'Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})', fontsize='x-large'
    )
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
def load_any_embeddings(filename):
    """
    Load an embedding file, picking the loader from the filename's extension.

    '.bin.gz' files go to `load_word2vec_bin` and any other '.gz' file goes
    to `load_glove`, both with 1000000 as their second argument (presumably
    a cap on the number of rows -- confirm against those loaders). '.h5'
    files go to `load_hdf`.

    Raises ValueError for any other extension.
    """
    # '.bin.gz' has to be tested before the more general '.gz'.
    if filename.endswith('.bin.gz'):
        return load_word2vec_bin(filename, 1000000)
    if filename.endswith('.gz'):
        return load_glove(filename, 1000000)
    if filename.endswith('.h5'):
        return load_hdf(filename)
    raise ValueError("Can't recognize file extension of %r" % filename)
def graph_bias_comparison(table_filename, out_filename):
    """
    Draw a grouped bar chart comparing embedding systems on bias measures.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by bias
    evaluation name, and must provide 'bias', 'low' and 'high' columns. One
    bar is drawn per (system, evaluation) pair at height 'bias', with an
    error bar running from 'low' to 'high'.

    The chart is written to `out_filename` at 300 dpi.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = [
        'gender', 'beliefs', 'ethnicity-coarse', 'ethnicity-fine',
        'ethnicity-names',
    ]
    eval_labels = [
        'Gender bias', 'Religious bias', 'Ethnic bias (coarse)',
        'Ethnic bias (fine)', 'Bias from names',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]
    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5'),
    ]

    # Each evaluation's group of bars occupies 0.84 of the unit spacing.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['bias']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 0.4)
    ax.set_yticks(np.arange(0.0, 0.5, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel('Correlation with stereotypes', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
def __init__(self, dirname):
    """
    Read the exported files of a model trained with ConceptNet's
    implementation of Semantic Matching Energy from the directory `dirname`.

    Three files are expected there:

    - `terms-similar.h5`: the (|V| x 300) matrix of term embeddings. The
      name reflects that using the embeddings directly, with no relation
      applied to them, is meant to represent the SimilarTo relation.
    - `relations.h5`: the (24 x 10) matrix of relation embeddings.
    - `assoc.npy`: a 3-tensor of shape (10 x 300 x 300) relating two term
      embeddings and a relation embedding. Multiplying two of those vectors
      by the tensor along the appropriate dimensions yields a prediction
      for the third.
    """
    model_dir = pathlib.Path(dirname)

    def located(name):
        # The loaders are given plain string paths, not Path objects.
        return str(model_dir / name)

    self.rel_embeddings = load_hdf(located('relations.h5'))
    self.term_embeddings = load_hdf(located('terms-similar.h5'))
    self.assoc_tensor = np.load(located('assoc.npy'))
def graph_comparison(table_filename, out_filename):
    """
    Draw a grouped bar chart comparing embedding systems across evaluations.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by evaluation
    name, and must provide 'acc', 'low' and 'high' columns. One bar is drawn
    per (system, evaluation) pair at height 'acc', with an error bar running
    from 'low' to 'high'.

    The chart is written to `out_filename` at 300 dpi.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = ['men3000', 'rw', 'mturk', 'ws353', 'semeval-2a-en']
    eval_labels = [
        'MEN-3000', 'Rare Words', 'MTurk-771', 'WordSim-353', 'SemEval 2017-2a',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]
    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5'),
    ]

    # Each evaluation's group of bars occupies 0.84 of the unit spacing.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel(
        'Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})', fontsize='x-large'
    )
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
def read_embedding_vocabularies(filenames):
    """
    Return the union of the vocabularies of several embedding files.

    Each name in `filenames` is read with `load_hdf` (so the files are
    assumed to be HDF5 files containing dataframes), and the index of the
    loaded frame is taken as that file's vocabulary. An empty collection of
    filenames yields an empty index.
    """
    vocabulary = pd.Index([])
    for path in filenames:
        vocabulary = vocabulary.union(load_hdf(path).index)
    return vocabulary
def _load_vectors():
    """
    Build the vector space used for Codenames from the packaged mini frame.

    Rows of `data/mini.h5` are kept when their label starts with '/c/en/',
    contains neither '_' nor '#', and the text after the '/c/en/' prefix has
    a Zipf frequency in English above 3.0. Every word listed in
    `data/codenames-words.txt` is then added (as a standardized English URI)
    if it was not selected already, so that all Codenames words are
    represented.

    Returns a VectorSpaceWrapper around the selected rows, cast to 'f' and
    passed through `l2_normalize_rows`.
    """
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label
        for label in frame.index
        if label.startswith('/c/en/')
        and '_' not in label
        and '#' not in label
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]

    # Make sure all the words in Codenames are represented. The word list is
    # read inside a `with` block so the file handle is closed promptly.
    words_path = resource_filename('codenames', 'data/codenames-words.txt')
    with open(words_path) as words_file:
        wordlist = [standardized_uri('en', line.strip()) for line in words_file]

    # Membership is tested against a set; scanning the list of selections
    # once per Codenames word would be needlessly quadratic.
    already_selected = set(selections)
    additions = [word for word in wordlist if word not in already_selected]
    selections += additions

    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)
def graph_comparison(table_filename, out_filename):
    """
    Draw a grouped, hatched bar chart comparing embedding systems across
    evaluations.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by evaluation
    name, and must provide 'acc', 'low' and 'high' columns. One bar is drawn
    per (system, evaluation) pair at height 'acc', with an error bar running
    from 'low' to 'high'. Each system gets its own hatch pattern in addition
    to its color.

    The chart is written to `out_filename` at 300 dpi.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    # One hatch pattern per entry in `systems`, in the same order.
    patterns = ["/", "\\", "//", "\\\\", " "]
    width = 0.15
    evals = ['men3000', 'rw', 'mturk', 'ws353', 'story-cloze', 'sat-analogies']
    eval_labels = [
        'MEN-3000', 'Rare Words', 'MTurk-771', 'WS353', 'Story Cloze',
        'SAT analogies',
    ]
    colors = [props['color'] for props in plt.rcParams['axes.prop_cycle']]
    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('LexVec: enWP + NewsCrawl', 'data/raw/vectors/lexvec.no-header.vectors.gz'),
        ('ConceptNet-PPMI', 'data/precomputed/vectors/conceptnet-55-ppmi.h5'),
        ('ConceptNet Numberbatch', 'data/precomputed/vectors/numberbatch.h5'),
    ]

    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width, hatch=patterns[i], color=colors[i],
            yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    plt.ylabel('Evaluation score', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
def read_embedding_vocabularies(filenames):
    """
    Return the union of the vocabularies of several embedding files.

    Each name in `filenames` is read with `load_hdf`, and the index of the
    loaded frame is taken as that file's vocabulary. An empty collection of
    filenames yields an empty index.
    """
    vocabulary = pd.Index([])
    for path in filenames:
        vocabulary = vocabulary.union(load_hdf(path).index)
    return vocabulary
def graph_bias_comparison(table_filename, out_filename):
    """
    Draw a grouped bar chart comparing embedding systems on bias measures.

    `table_filename` is an HDF5 file readable by `load_hdf`. The table is
    looked up by vector file path on index level 0 and then by bias
    evaluation name, and must provide 'bias', 'low' and 'high' columns. One
    bar is drawn per (system, evaluation) pair at height 'bias', with an
    error bar running from 'low' to 'high'.

    The chart is written to `out_filename` at 300 dpi.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = [
        'gender',
        'beliefs',
        'ethnicity-coarse',
        'ethnicity-fine',
        'ethnicity-names',
    ]
    eval_labels = [
        'Gender bias',
        'Religious bias',
        'Ethnic bias (coarse)',
        'Ethnic bias (fine)',
        'Bias from names',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]
    systems = [
        (
            'word2vec Google News',
            'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz',
        ),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5'),
    ]

    # Each evaluation's group of bars occupies 0.84 of the unit spacing.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))
    fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['bias']
        # Matplotlib reads a two-row `yerr` as [lower extents, upper
        # extents], so the distance down to 'low' has to come first.
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 0.4)
    ax.set_yticks(np.arange(0.0, 0.5, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel('Correlation with stereotypes', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)