示例#1
0
def plot_embedding_dimension(emb, dim, charpoints):
  """Plot one embedding dimension as a strip of character labels and save it.

  Every codepoint in `charpoints` is drawn as a text label whose x position
  is its value along dimension `dim` of `emb`. The y position carries no
  information: it is random jitter inside the visible band, after which
  `adjust_text` moves the labels vertically to reduce overlap.

  Args:
    emb: embedding array indexed by codepoint, then by dimension.
    dim: zero-based index of the dimension to plot (the title shows it
      one-based).
    charpoints: sequence of integer codepoints to draw.

  The image is written to 'vis/d<type><dim>.png', where <type> is the only
  entry of the module-level ALLOWED_TYPES, or empty when it has several
  entries. The 'vis' directory is not created here.
  """
  fig = plt.figure(figsize=(20,4))
  try:
    # Values of the selected characters along this dimension; they set the
    # horizontal extent of the plot.
    values = emb[charpoints][:,dim]
    plt.axis([np.min(values), np.max(values), .3, .7])
    plt.yticks([])
    plt.ylabel('')
    plt.title('Dimension {}'.format(dim+1))

    texts = []
    for cp in charpoints:
      # Random starting height in [.4, .6) so labels do not all begin
      # stacked on the same line.
      y = .4 + random.random() * .2
      t = plt.text(emb[cp][dim], y, charify(cp))
      color = char_color(chr(cp))
      t.set_bbox(dict(color=color, alpha=.5, boxstyle='round'))
      texts.append(t)

    # Only vertical moves are allowed: the x coordinate is the data.
    adjust_text(texts,
      only_move={'text': 'y'},
      force_text=14.5,
      expand_points=(1.2, 1.2),
      lim=5000,
    )

    fname = 'vis/d{}{}.png'.format(
      ALLOWED_TYPES[0] if len(ALLOWED_TYPES) == 1 else '',
      dim)
    plt.savefig(fname, bbox_inches='tight')
  finally:
    # Clearing a figure keeps it registered with pyplot, so calling this
    # function once per dimension would accumulate open figures. Closing
    # releases it, even when plotting or saving fails.
    plt.close(fig)
示例#2
0
def analogy(a, b, j, k=3, targ=None):
  """Print the characters that best complete "a is to b, as j is to ___".

  The query point is embedding[j] + (embedding[b] - embedding[a]); its `k`
  nearest neighbours in the module-level `nbrs` index are printed one per
  line as "<char>:<distance>".

  Args:
    a, b, j: single characters; each is converted to its codepoint with
      ord() and used to index the module-level `embedding`.
    k: number of neighbours to print.
    targ: optional single character. When given (and non-empty), the
      Euclidean distance from its embedding to the query point is printed
      as well.

  Returns:
    None; the results are only printed.
  """
  a, b, j = ord(a), ord(b), ord(j)
  e = embedding
  answer = e[j] + (e[b] - e[a])
  dist, idxs = nbrs.kneighbors([answer], k)
  # Neighbour indices refer to rows of the fitted matrix, which are mapped
  # back to codepoints through `charpoints`.
  for d, i in zip(dist.ravel(), idxs.ravel()):
    # Parenthesised single-argument print behaves the same on Python 2
    # and Python 3.
    print("{}:{:.1f}".format(charify(charpoints[i]), d))
  if targ:
    target_vector = e[ord(targ)]
    print("Distance to {}: {:.1f}".format(targ,
      scipy.spatial.distance.euclidean(target_vector, answer)))
示例#3
0
def pprint_char(c):
    """Return a display string for the single character `c`.

    Characters whose codepoint equals one of the special VOCAB entries are
    shown as a short marker ('^', '$', '_', '<BOS>', '<EOS>'). A literal
    '^', '$' or '_' is prefixed with a backslash so it cannot be mistaken
    for one of those markers. Everything else is delegated to
    common.charify.
    """
    code = ord(c)
    # Checked in this order; the first VOCAB entry that matches wins.
    markers = (
        ('bow', '^'),
        ('eow', '$'),
        ('pad', '_'),
        ('bos', '<BOS>'),
        ('eos', '<EOS>'),
    )
    for key, symbol in markers:
        if code == VOCAB[key]:
            return symbol
    if c in '^$_':
        return '\\' + c
    return common.charify(c)
示例#4
0
def nn(vec):
  """Print the three nearest neighbours of the vector `vec`.

  Queries the module-level `nbrs` index and prints one line per neighbour
  in the form "<char>:<distance>". Neighbour indices are mapped back to
  codepoints through the module-level `charpoints`. Returns None.
  """
  dist, idxs = nbrs.kneighbors([vec], 3)
  for d, i in zip(dist.ravel(), idxs.ravel()):
    # Parenthesised single-argument print behaves the same on Python 2
    # and Python 3.
    print("{}:{:.1f}".format(charify(charpoints[i]), d))
示例#5
0
  return scipy.spatial.distance.euclidean(a, b)

def nn(vec):
  """Print the three nearest neighbours of the vector `vec`.

  Queries the module-level `nbrs` index and prints one line per neighbour
  in the form "<char>:<distance>". Neighbour indices are mapped back to
  codepoints through the module-level `charpoints`. Returns None.
  """
  dist, idxs = nbrs.kneighbors([vec], 3)
  for d, i in zip(dist.ravel(), idxs.ravel()):
    # Parenthesised single-argument print behaves the same on Python 2
    # and Python 3.
    print("{}:{:.1f}".format(charify(charpoints[i]), d))
  

def an(abj, k=3, target=None):
  """Shorthand for analogy() that takes its characters as one iterable.

  `abj` is unpacked into analogy's positional arguments, so a three-character
  string works as well as a tuple. `k` and `target` are forwarded as
  analogy's `k` and `targ` keyword arguments. Returns whatever analogy()
  returns.
  """
  options = {'k': k, 'targ': target}
  return analogy(*abj, **options)

# Script body: select a set of characters, fit a nearest-neighbour index over
# their embedding vectors, and print each character's closest neighbours.

# Character classes to include. Alternative covering every class:
#ALLOWED_TYPES = ['digit', 'uppercase', 'lowercase', 'meta', 'punctuation']
ALLOWED_TYPES = ['uppercase', 'lowercase']

# Codepoints 0-127 whose char_type() is one of the allowed classes.
charpoints = [i for i in range(128) if char_type(i) in ALLOWED_TYPES]
embedding = get_embedding()

# Embedding vectors of the digit characters '0'-'9', keyed by integer value.
NUM = {}
for i in range(10):
  NUM[i] = embedding[ord(str(i))]

# Embedding rows of the selected codepoints. Row order follows `charpoints`,
# so a neighbour index returned below is also an index into `charpoints`.
X = embedding[charpoints]
# One more neighbour than N_NEIGHBS is requested because the first match is
# dropped as a self-match when printing (see the [1:] slice below).
nbrs = NearestNeighbors(n_neighbors=N_NEIGHBS+1, algorithm='brute').fit(X)
distances, indices = nbrs.kneighbors(X)

# One output line per character: the character, a tab, then its neighbours
# as "<char>:<distance>". These are Python 2 print statements: a trailing
# comma suppresses the newline and the bare `print` ends the line.
for i, cp in enumerate(charpoints):
  print charify(cp) + '\t',
  # Slicing the zip() result relies on Python 2, where zip returns a list.
  for i2, ddist in zip(indices[i], distances[i])[1:]: # skip self-matches
    print '{}:{:.1f} '.format(charify(charpoints[i2]), ddist),
  print
示例#6
0
    x_max[0] + x_pad * 2,
    x_min[1] - y_pad,
    x_max[1] + y_pad * 2,
])
# Remove the tick marks from both axes.
plt.xticks([])
plt.yticks([])

# Accumulators for the plotting loop: xs/ys receive the coordinates of the
# points drawn as text; texts presumably collects the label objects.
texts, xs, ys = [], [], []
for i, charpoint in enumerate(charpoints):
    if HIDE_OTHER_TYPES and char_type(charpoint) not in ALLOWED_TYPES:
        continue
    pt = X_sne[i]
    if TEXT_MODE:
        char = charify(charpoint)
        xs.append(pt[0])
        ys.append(pt[1])
        t = plt.text(
            pt[0],
            pt[1],
            char,
            fontdict={'size': 14},
            ha="center",
            va="center",
        )
        if COLORIZE:
            color = char_color(chr(charpoint))
            t.set_bbox(
                dict(
                    color=color,
示例#7
0
import sys
import string
import common

# Input path: first command-line argument, defaulting to '../data/news.txt'.
try:
    fname = sys.argv[1]
except IndexError:
    fname = '../data/news.txt'

# byte_counts[v] is the number of times byte value v occurs in the file.
byte_counts = [0] * 256
# Binary mode so every raw byte is counted exactly once. Text mode may
# translate newlines and, on Python 3, decodes to characters whose ord()
# can exceed 255.
with open(fname, 'rb') as f:
    # Read in large chunks instead of issuing one read call per byte.
    for chunk in iter(lambda: f.read(65536), b''):
        # Iterating a bytearray yields integers on both Python 2 and 3.
        for byte in bytearray(chunk):
            byte_counts[byte] += 1

# One tab-separated line per byte value: value, count, printable form.
print('\n'.join('{}\t{}\t{}'.format(i, byte_counts[i], common.charify(i))
                for i in range(256)))