def main(outfile=''):
    """Evaluate pre-trained embeddings and dump the collected results as JSON.

    For every (manifold, dimension) combination the word2vec-format text file
    ``./emb150txt/<manifold><dim>/<manifold><dim>.txt`` is loaded, passed to
    ``ReconstructionEvaluation`` together with the output of
    ``generate_data()``, and the value returned by ``evaluate()`` is stored
    under the key ``<manifold><dim>``.

    :param outfile: path of the JSON file that receives the results; it is
        opened for writing only after all evaluations have finished.
    """
    manifolds = ['euclidean',
                 # 'transe',
                 'poincare', 'lorentz']
    dimensions = [5, 10, 20, 50, 100, 200]
    results = {}
    for mani in manifolds:
        for dim in dimensions:
            # The same "<manifold><dim>" token names the result key, the
            # directory and the file, so format it once.
            key_json = '%s%d' % (mani, dim)
            txtfile = './emb150txt/%s/%s.txt' % (key_json, key_json)
            print('loading %s...' % txtfile)
            keyvalues = PoincareKeyedVectors.load_word2vec_format(
                txtfile,
                fvocab=None,
                binary=False,
                encoding='utf8',
                unicode_errors='strict',
                limit=None,
                datatype=np.float64,
            )
            # NOTE(review): generate_data() is re-run for every embedding;
            # it could be hoisted if it is deterministic - confirm.
            pr = generate_data()
            results[key_json] = ReconstructionEvaluation(pr, keyvalues).evaluate()
    with open(outfile, 'w') as fp:
        json.dump(results, fp)
def __init__(self, ns):
    """
    Set up the ontology graph, the class/index lookup tables and the
    Poincare key-vectors for one namespace.

    :param ns: ['molecular_function', 'biological_process', 'cellular_component']
    """
    self._aspect = aspect = GoAspect(ns)
    self._graph = G = get_ontology_graph(ns)

    # Reverse topological order; the first entry is used as the root.
    classes = list(nx.topological_sort(G))
    classes.reverse()
    self.root = root = classes[0]

    # NOTE(review): the bfs() result is replaced immediately below; the call
    # is kept because its side effects (if any) are not visible here.
    self._levels = self.bfs(root)

    # Group nodes by their shortest-path distance to the root.
    self._levels = levels = {}
    for node, lvl in nx.shortest_path_length(G, target=root).items():
        levels.setdefault(lvl, []).append(node)

    self._mlb = MultiLabelBinarizer().fit([classes])
    self.go2ix = {go: i for i, go in enumerate(classes)}
    self.ix2go = dict(enumerate(classes))

    # NOTE(review): `dim` and `num_epochs` are not defined in this method;
    # presumably module-level settings - confirm.
    emb_fname = os.path.join('%s/%s-poincare-dim%d-epochs%d.emb' % (DATA_ROOT, aspect, dim, num_epochs))
    # Reuse the saved vectors when the file exists, otherwise fall back to
    # embedding(ns, emb_fname).
    self._kv = (
        PoincareKeyedVectors.load(emb_fname)
        if os.path.exists(emb_fname)
        else embedding(ns, emb_fname)
    )
def __init__(self, params):
    """Load the lookup resources and text-cleaning patterns named in `params`.

    :param params: mapping that must provide the keys 'wiki_path',
        'wiki_vectors_path', 'node2vec_path', 'n', 'projection_path',
        'poincare_path' and 'language'. It is also forwarded unchanged to
        the parent constructor.
    """
    super().__init__(params)
    self.wiktionary = self.__get_wiktionary(params['wiki_path'])
    self.wiki_model = KeyedVectors.load_word2vec_format(params['wiki_vectors_path'], binary=False)
    self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
    # Assigned once; the original repeated this assignment further down.
    self.n = params['n']
    # NOTE(review): self.w2v_data is not set in this method; presumably the
    # parent constructor provides it - confirm.
    self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])
    self.poincare_model = PoincareKeyedVectors.load_word2vec_format(params["poincare_path"], binary=False)

    # Matches a parenthesised fragment (non-greedy).
    self.delete_bracets = re.compile(r"\(.+?\)")

    # Raw strings keep the regex text identical while avoiding the invalid
    # "\-" escape sequence in a normal string literal.
    # NOTE(review): the range `A-z` also covers the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), and `А-я` does not include 'ё'/'Ё'. Both
    # are left unchanged to preserve current matching - confirm intent.
    if params['language'] == 'ru':
        self.pattern = re.compile(r"[^А-я \-]")
    else:
        self.pattern = re.compile(r"[^A-z \-]")
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Load a pre-trained Poincare embedding from disk.

    With `word2vec_format` set, the file is read through
    `PoincareKeyedVectors.load_word2vec_format`; otherwise it is read with
    `PoincareModel.load` and the model's `kv` attribute is returned.

    :param path: location of the pre-trained Poincare embedding file
    :param word2vec_format: read the file as word2vec format (default: True)
    :param binary: passed to the word2vec loader as `binary`; not used on the other path (default: False)
    :return: the loaded Poincare embedding
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
def main(poincare=''):
    """Attach pre-trained vectors to a PoincareModel, pickle it, reload it and train.

    The vectors are read from the word2vec-format text file `poincare`. The
    model is saved to 'w2v_poincare.pickle' before training and to
    'w2v_poincare_after_train.pickle' after training.
    """
    from gensim.models.poincare import PoincareModel

    # Model with no training relations; its key-vectors are replaced below.
    model = PoincareModel([], size=300, dtype=np.float64)
    model.kv = PoincareKeyedVectors.load_word2vec_format(
        poincare,
        fvocab=None,
        binary=False,
        encoding='utf8',
        unicode_errors='strict',
        limit=None,
        datatype=np.float64,
    )
    model.save('w2v_poincare.pickle', pickle_protocol=4)

    # Round-trip through the pickle, then train the reloaded copy.
    reloaded = PoincareModel.load('w2v_poincare.pickle')
    reloaded.train(
        10000,
        batch_size=10,
        print_every=1,
        check_gradients_every=None,
    )
    reloaded.save('w2v_poincare_after_train.pickle', pickle_protocol=4)
def poincare_distance_heatmap(origin_point, x_range=(-1.0, 1.0), y_range=(-1.0, 1.0), num_points=100):
    """Create a heatmap of Poincare distances from `origin_point` for each point (x, y),
    where x and y lie in `x_range` and `y_range` respectively, with `num_points` points
    chosen uniformly in both ranges.

    Parameters
    ----------
    origin_point : tuple (float, float)
        (x, y) from which distances are to be measured and plotted.
    x_range : tuple (float, float)
        Range for x-axis from which to choose `num_points` points.
    y_range : tuple (float, float)
        Range for y-axis from which to choose `num_points` points.
    num_points : int
        Number of points to choose from `x_range` and `y_range`.

    Notes
    -----
    Points outside the unit circle are ignored, since the Poincare distance is defined
    only for points inside the circle boundaries (exclusive of the boundary).

    Returns
    -------
    :class:`plotly.graph_objs.Figure`
        Plotly figure that contains plot

    """
    epsilon = 1e-8  # Can't choose (-1.0, -1.0) or (1.0, 1.0), distance undefined
    x_range, y_range = list(x_range), list(y_range)
    if x_range[0] == -1.0 and y_range[0] == -1.0:
        x_range[0] += epsilon
        y_range[0] += epsilon
    # The upper corner lives at index 1 of each range; the previous code
    # tested and adjusted index 0, so this guard could never do its job.
    if x_range[1] == 1.0 and y_range[1] == 1.0:
        x_range[1] -= epsilon
        y_range[1] -= epsilon

    x_axis_values = np.linspace(x_range[0], x_range[1], num=num_points)
    # Sample the y axis from `y_range`; it was previously built from
    # `x_range`, which silently ignored the `y_range` argument.
    y_axis_values = np.linspace(y_range[0], y_range[1], num=num_points)
    x, y = np.meshgrid(x_axis_values, y_axis_values)
    # One (x, y) row per grid node, in row-major order of the mesh.
    all_points = np.column_stack((x.ravel(), y.ravel()))

    # Keep only points strictly inside the unit circle.
    norms = np.linalg.norm(all_points, axis=1)
    all_points = all_points[norms < 1]

    origin_point = np.array(origin_point)
    all_distances = PoincareKeyedVectors.poincare_dists(origin_point, all_points)

    distances = go.Scatter(
        x=all_points[:, 0],
        y=all_points[:, 1],
        mode='markers',
        marker=dict(
            # Marker size is a numeric property, so pass a number rather
            # than a numeric string.
            size=9,
            color=all_distances,
            colorscale='Viridis',
            showscale=True,
            # A plain dict avoids the deprecated `go.ColorBar` class.
            colorbar=dict(
                title='Poincare Distance'
            ),
        ),
        text=[
            'Distance from (%.2f, %.2f): %.2f' % (origin_point[0], origin_point[1], d)
            for d in all_distances],
        name='',  # To avoid the default 'trace 0'
    )

    origin = go.Scatter(
        x=[origin_point[0]],
        y=[origin_point[1]],
        name='Distance from (%.2f, %.2f)' % (origin_point[0], origin_point[1]),
        mode='markers+text',
        marker=dict(
            size=10,
            color='rgb(200, 50, 50)'
        )
    )

    layout = go.Layout(
        width=900,
        height=800,
        showlegend=False,
        title='Poincare Distances from (%.2f, %.2f)' % (origin_point[0], origin_point[1]),
        hovermode='closest',
    )

    return go.Figure(data=[distances, origin], layout=layout)
def setUp(self):
    """Load the binary word2vec-format file 'poincare_vectors.bin' into `self.vectors`."""
    fixture_path = datapath('poincare_vectors.bin')
    self.vectors = PoincareKeyedVectors.load_word2vec_format(fixture_path, binary=True)
def setUp(self):
    """Populate `self.vectors` from the binary fixture resolved by `datapath`."""
    vectors_file = datapath('poincare_vectors.bin')
    self.vectors = PoincareKeyedVectors.load_word2vec_format(
        vectors_file,
        binary=True,
    )
def __init__(self, params):
    """Initialise the parent class, then load the Poincare vectors and store `n`.

    :param params: mapping providing 'poincare_path' (a word2vec-format text
        file) and 'n'. It is also forwarded unchanged to the parent
        constructor.
    """
    super().__init__(params)
    poincare_path = params["poincare_path"]
    self.poincare_model = PoincareKeyedVectors.load_word2vec_format(
        poincare_path,
        binary=False,
    )
    self.n = params["n"]
from gensim.models.poincare import (
    PoincareModel,
    PoincareKeyedVectors,
    PoincareRelations,
)
from gensim.viz.poincare import poincare_distance_heatmap
from tensorflow.keras.layers import Embedding

# Tab-delimited relations file used as training data (machine-specific path).
wordnet_mamal_file_path = '/Users/pankaj/dev/git/smu/nlp337/data/mamals.tsv'
relations = PoincareRelations(wordnet_mamal_file_path, delimiter='\t')

# Two-dimensional model, no burn-in, trained for two epochs.
model = PoincareModel(
    train_data=relations,
    size=2,
    burn_in=0,
)
model.train(epochs=2, print_every=500)

# Stand-alone key-vectors container; not connected to `model` above.
pcv = PoincareKeyedVectors(vector_size=20)

# Distance heatmap measured from the origin; the returned figure is not
# stored.
poincare_distance_heatmap(
    (0, 0),
    x_range=(-1.0, 1.0),
    y_range=(-1.0, 1.0),
    num_points=100,
)