def load_instances(instance_strs, word_vectors):
    """Parse training examples and tally their weighted internal nodes.

    Args:
      instance_strs: iterable of strings; each string is one training example
      word_vectors: handed unchanged to Instance.parse_from_str (described by
        the original author as an instance of vec.wordvector)

    Returns:
      A pair (instances, total_internal_node) where
        instances: list of the parsed Instance objects, in input order
        total_internal_node: sum over all instances of
          (len(instance.words) - 1) * instance.freq
    """
    instances = [Instance.parse_from_str(text, word_vectors)
                 for text in instance_strs]
    # Every instance contributes one node fewer than it has words, weighted
    # by the instance's frequency.
    total_internal_node = sum((len(parsed.words) - 1) * parsed.freq
                              for parsed in instances)
    return instances, total_internal_node
def load_instances(instance_strs, word_vectors):
    """Build Instance objects from raw strings and count internal nodes.

    Args:
      instance_strs: iterable of strings; each string is one training example
      word_vectors: forwarded as-is to Instance.parse_from_str (described by
        the original author as an instance of vec.wordvector)

    Returns:
      A pair (instances, total_internal_node) where
        instances: list of the parsed Instance objects, in input order
        total_internal_node: frequency-weighted count of internal nodes
          summed over every parsed instance
    """
    parsed = [Instance.parse_from_str(text, word_vectors)
              for text in instance_strs]

    internal_nodes = 0
    for example in parsed:
        # A phrase of n words is combined n-1 times before a single phrase
        # vector remains, so it has n-1 internal nodes.
        merges = len(example.words) - 1
        internal_nodes += merges * example.freq

    return parsed, internal_nodes
print >> stderr, 'load RAE parameters...' theta = unpickle(theta_file) rae = RecursiveAutoencoder.build(theta, embsize) total_cost = 0 total_instance_num = 0 total_internal_node_num = 0 print '='*63 print '%20s %20s %20s' % ('all', 'avg/node', 'internal node') print '-'*63 with Reader(phrases_file) as reader, Writer(output_file) as writer: for phrase in reader: instance = Instance.parse_from_str(phrase, word_vectors) words_embedded = word_vectors[instance.words] root_node, cost = rae.forward(words_embedded) # print "root node: ",root_node # vec = root_node.p.T[0] # convert n*1 vector to common vector vec=[] for node in root_node: vec.append(node.p.T[0]) # for nodes in root_node: # print "n:",nodes.p.shape # continue # writer.write(' '.join([str(vec[i]) for i in range(vec.size)])) for j in range(len(vec)): v=vec[j] writer.write(' '.join([str(v[i]) for i in range(v.size)])) if not j==(len(vec)-1):
print >> stderr, 'load RAE parameters...' theta = unpickle(theta_file) rae = RecursiveAutoencoder.build(theta, embsize) total_cost = 0 total_instance_num = 0 total_internal_node_num = 0 print '=' * 63 print '%20s %20s %20s' % ('all', 'avg/node', 'internal node') print '-' * 63 with Reader(phrases_file) as reader, Writer(output_file) as writer: for phrase in reader: instance = Instance.parse_from_str(phrase, word_vectors) words_embedded = word_vectors[instance.words] root_node, cost = rae.forward(words_embedded) vec = root_node.p.T[0] # convert n*1 vector to common vector writer.write(' '.join([str(vec[i]) for i in range(vec.size)])) writer.write('\n') internal_node_num = len(instance.words) - 1 if internal_node_num > 0: print '%20.8f, %20.8f, %18d' % (cost, cost / internal_node_num, internal_node_num) else: print '%20.8f, %20.8f, %18d' % (cost, cost, 0) total_cost += cost total_instance_num += 1