def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
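A minimal sketch (not part of the original script) of how the (words, vectors, options) triple returned by process_options() could be fed to scipy k-means; it assumes the surrounding module provides argparser(), and the kmeans2 call and printed sample are illustrative only.

import sys
import scipy.cluster.vq

# Illustrative only: cluster the (optionally whitened) vectors returned by
# process_options() and show a few word-to-cluster assignments.
words, vectors, options = process_options(sys.argv[1:])
centroids, labels = scipy.cluster.vq.kmeans2(vectors, options.k)
for word, label in list(zip(words, labels))[:10]:
    print('%s\t%d' % (word, label))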
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # Whitening should be implemented in wvlib so that it can be used
        # together with approximate similarity.
        if options.approximate:
            raise NotImplementedError()
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)

    wordsets = read_wordsets(options.wordset)
    if not enough_data(wordsets):
        return 1

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank).normalize()
    w2v = wv.word_to_vector_mapping()

    word_count, oov_count = 0, 0
    filtered_wordsets = {}
    for k, wordset in wordsets.items():
        filtered = []
        for w in wordset:
            if w in w2v:
                filtered.append(w)
            else:
                logging.warning('ignoring out-of-vocabulary word "%s"' % w)
                oov_count += 1
            word_count += 1
        if filtered:
            filtered_wordsets[k] = filtered
        else:
            logging.warning('wordset %s empty after OOV filtering, removing' % k)
    wordsets = filtered_wordsets

    if not enough_data(wordsets):
        return 1

    results = []
    for n1, n2 in combinations(wordsets.keys(), 2):
        result = compare_sets(wordsets[n1], n1, wordsets[n2], n2, w2v, options)
        if result is not None:
            results.append(result)

    if not options.quiet:
        print('out of vocabulary %d/%d (%.2f%%)' %
              (oov_count, word_count, 100.*oov_count/word_count),
              file=sys.stderr)

    if results:
        print('OVERALL AVERAGE (macro):\t%.2f%%\t(%.2f%% OOV)' %
              (100*sum(results)/len(results), 100.*oov_count/word_count))
    else:
        print('All comparisons failed!', file=sys.stderr)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = process_args(argv[1:])

    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # TODO: remove irrelevant options
    options = process_args(argv[1:])

    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
def main():
    if not (os.path.isfile(model_path)):
        # Train a word2vec model
        sentences = LineSentence(token_path)
        model = Word2Vec(sentences, size=300, window=4, min_count=10)
        model.wv.save_word2vec_format('model.txt', binary=False)
        del model
    else:
        # Generate sentences for each unsimplified sentence
        wv = wvlib.load(model_path)
        f = open('test.txt', 'r')
        for line in f:
            sent = word_tokenize(line)
            sent_len = len(sent)
            # Loop through length of sentence
            for i in range(0, len(sent)):
                # Get nearest neighbors for each word
                for j in range(0, NEIGHBOR_NUM):
                    nearest = wv.nearest(sent[i])[j]
                    # Absolute cut-off for cosine distance
                    if (nearest[1] >= DISTANCE_NUM):
                        # Filter open-class words
                        word_tag = nltk.pos_tag(nltk.word_tokenize(sent[i]))
                        neighbor_tag = nltk.pos_tag(nltk.word_tokenize(nearest[0]))
                        open_class = {'NN', 'NNS', 'RB', 'RBR', 'RBS',
                                      'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                                      'JJ', 'JJR', 'JJS'}
                        if word_tag[0][1] in open_class and neighbor_tag[0][1] in open_class:
                            word = []
                            word.append(nearest[0])    # word
                            word.append(nearest[1])    # distance score
                            sent.append(word)
            heap = []
            # Generate all possible sentence candidates; each candidate is
            # scored by the average of its word similarity scores
            for sent_gen in itertools.permutations(sent, sent_len):
                temp = []
                score = 0
                for word in sent_gen:
                    if isinstance(word, list):
                        score += word[1]
                        temp.append(word[0])
                    else:
                        score += 1
                        temp.append(word)
                average = score / sent_len
                # Ignore reordering of original words
                if (average != 1):
                    heapq.heappush(heap, (average, temp))
            # Output top CANDIDATE_NUM sentences
            for k in heapq.nlargest(CANDIDATE_NUM, heap):
                candidate = ' '.join(k[1])
                print candidate, k[0]
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)

    wordsets = read_wordsets(options.wordset)
    if not enough_data(wordsets):
        return 1

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank).normalize()
    w2v = wv.word_to_vector_mapping()

    word_count, oov_count = 0, 0
    filtered_wordsets = {}
    for k, wordset in wordsets.items():
        filtered = []
        for w in wordset:
            if w in w2v:
                filtered.append(w)
            else:
                logging.warn('ignoring out-of-vocabulary word "%s"' % w)
                oov_count += 1
            word_count += 1
        if filtered:
            filtered_wordsets[k] = filtered
        else:
            logging.warn('wordset %s empty after OOV filtering, removing' % k)
    wordsets = filtered_wordsets

    if not enough_data(wordsets):
        return 1

    results = []
    for n1, n2 in combinations(wordsets.keys(), 2):
        result = compare_sets(wordsets[n1], n1, wordsets[n2], n2, w2v, options)
        if result is not None:
            results.append(result)

    if not options.quiet:
        print >> sys.stderr, 'out of vocabulary %d/%d (%.2f%%)' % \
            (oov_count, word_count, 100.*oov_count/word_count)

    if results:
        print 'OVERALL AVERAGE (macro):\t%.2f%%\t(%.2f%% OOV)' % \
            (100*sum(results)/len(results), 100.*oov_count/word_count)
    else:
        print >> sys.stderr, 'All comparisons failed!'
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = process_args(argv[1:])

    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception as e:
        print('Error: %s' % str(e), file=sys.stderr)
        return 1

    return query_loop(wv, options, process_query, query_count=3)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)

    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
def get_nearest(vectors, queries, nncount=100, options=None):
    nearest = {}
    wv = wvlib.load(vectors).normalize()
    for query in queries:
        words = query.split()
        v = query_vector(wv, words)
        if v is not None:
            if options is None or not options.approximate:
                word_sim = wv.nearest(v, n=nncount, exclude=words)
            else:
                word_sim = wv.approximate_nearest(v, n=nncount, exclude=words,
                                                  evalnum=10*nncount)
            nearest[query] = [ws[0] for ws in word_sim]
        else:
            nearest[query] = []    # out of vocabulary
    return nearest
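A hypothetical usage sketch for get_nearest() above; the vector file name and query strings are placeholders, and query_vector() is assumed to be defined in the same module.

# Placeholder path and queries; get_nearest() itself is defined above.
neighbors = get_nearest('vectors.bin', ['dog', 'cell membrane'], nncount=10)
for query, words in neighbors.items():
    print('%s:\t%s' % (query, ' '.join(words)))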
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = argparser().parse_args(argv[1:])

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')

    wv = wvlib.load(options.input, options.input_format,
                    max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    wv.save(options.output, vector_format=options.vector_format)
    return 0
def main():
    wv = wvlib.load(model_path).normalize()
    fp = open('test.txt', 'r')
    for line in fp:
        sent = word_tokenize(line)
        sent.insert(0, '<s>')
        sent.append('</s>')
        prev = '<s>'
        # The FST
        f = fst.new()
        for i in range(1, len(sent) - 1):
            if (sent[i] in wv.vocab):
                f.add_arc(prev, sent[i], prev, sent[i], -1)
                for j in range(0, NEIGHBOR_NUM):
                    nearest = wv.nearest(sent[i])[j]
                    # Absolute cut-off for cosine distance
                    if (nearest[1] >= DISTANCE_NUM):
                        # Filter open-class words
                        word_tag = nltk.pos_tag(nltk.word_tokenize(sent[i]))
                        neighbor_tag = nltk.pos_tag(nltk.word_tokenize(nearest[0]))
                        if word_tag[0][1] in open_class and neighbor_tag[0][1] in open_class:
                            f.add_arc(prev, sent[i], prev, nearest[0], -nearest[1])
            else:
                f.add_arc(prev, sent[i], prev, prev, -1)
            # Update prev
            prev = sent[i]
        # Final state
        f.add_arc(prev, '</s>', '<epsilon>', '<epsilon>', 0)
        f.set_start('<s>')
        f.set_final('</s>')
        f.printf()
        sp = fst.shortest_path_list(f, CANDIDATE_NUM)
        for path in sp:
            print("%.2f\t%s" % (path[0], path[2]))
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
import sys

sys.path.insert(0, "wvlib-master/")

import wvlib

vec = wvlib.load('PubMed-and-PMC-ri.tar.gz')
vec.save_bin('PubMed-and-PMC-ri.bin')
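A possible follow-up sketch: load the converted binary and list nearest neighbours for a query word. The load(), normalize() and nearest() calls mirror their use elsewhere in these snippets; the query word and result count are arbitrary examples.

# 'protein' is an arbitrary example query, not from the original script.
wv = wvlib.load('PubMed-and-PMC-ri.bin').normalize()
for word, sim in wv.nearest('protein')[:10]:
    print('%s\t%.4f' % (word, sim))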
def main():
    if not (os.path.isfile(model_path)):
        # Train a word2vec model
        sentences = LineSentence(token_path)
        model = Word2Vec(sentences, size=300, window=4, min_count=10)
        model.wv.save_word2vec_format('model.txt', binary=False)
        del model
    else:
        # Generate sentences for each unsimplified sentence
        wv = wvlib.load(model_path).normalize()
        f = open('test.txt', 'r')
        line_num = sum(1 for _ in f)
        f.seek(0)
        count = 0
        # Build lattice graph for every sentence
        for line in f:
            count += 1
            print "Sentence", count, "of", line_num
            sent = word_tokenize(line)
            G = nx.DiGraph()
            G.add_node("START")
            temp = []
            words = []
            # Loop through length of sentence
            for i in range(0, len(sent)):
                node = sent[i] + '*' + str(i) + '*'    # Unique identifier for nodes
                G.add_node(node)
                # Connect edges
                if (i == 0):
                    G.add_edge("START", node, weight=-1)
                else:
                    prev_node = sent[i - 1] + '*' + str(i - 1) + '*'
                    G.add_edge(prev_node, node, weight=-1)
                    for w in range(0, len(words)):
                        G.add_edge(words[w], node, weight=-1)
                # "Save" nodes to connect back to the edges
                temp = words
                words = []
                # Get nearest neighbors for each word
                if (sent[i] in wv.vocab):
                    for j in range(0, NEIGHBOR_NUM):
                        nearest = wv.nearest(sent[i])[j]
                        # Absolute cut-off for cosine distance
                        if (nearest[1] >= DISTANCE_NUM):
                            # Filter open-class words
                            word_tag = nltk.pos_tag(nltk.word_tokenize(sent[i]))
                            neighbor_tag = nltk.pos_tag(nltk.word_tokenize(nearest[0]))
                            open_class = {'NN', 'NNS', 'RB', 'RBR', 'RBS',
                                          'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                                          'JJ', 'JJR', 'JJS'}
                            if word_tag[0][1] in open_class and neighbor_tag[0][1] in open_class:
                                neighbor_node = nearest[0] + '*' + str(j) + '*'    # Unique identifier
                                words.append(neighbor_node)
                                G.add_node(neighbor_node)
                                # Connect edges
                                if (i == 0):
                                    G.add_edge("START", neighbor_node,
                                               weight=-round(nearest[1], 5))
                                else:
                                    G.add_edge(prev_node, neighbor_node,
                                               weight=-round(nearest[1], 5))
                                    for t in range(0, len(temp)):
                                        G.add_edge(temp[t], neighbor_node,
                                                   weight=-round(nearest[1], 5))
            # Add END node
            G.add_node("END")
            G.add_edge(node, "END", weight=-1)
            for w in range(0, len(words)):
                G.add_edge(words[w], "END", weight=-1)

            out = open('output.txt', 'a+')    # separate handle; keep the input handle f intact
            candidate_list = []
            # Write sentence candidates from the lattice to file
            # We find the shortest path because we use negative weights
            for path in k_shortest_paths(G, "START", "END", CANDIDATE_NUM):
                H = G.subgraph(path)    # Candidate sentences only
                candidate = []
                if (len(k_shortest_paths(G, "START", "END", CANDIDATE_NUM)) == 1):
                    # sentences with no candidates
                    for c in range(0, CANDIDATE_NUM):
                        candidate_list.append('-------')
                    continue
                if (-(len(sent) + 1) != nx.shortest_path_length(H, "START", "END",
                                                                weight='weight')):
                    for word in path:
                        # remove unique identifiers
                        candidate.append(re.sub(r'\*.*\*', '', word))
                    candidate = [c.decode('utf-8') for c in candidate]
                    candidate = ' '.join(candidate[1:-1])
                    candidate_list.append(candidate)
                else:
                    candidate_list.append('-------')
                # # Draw sub-lattice
                # pos = nx.spring_layout(H)
                # new_labels = dict(map(lambda x: ((x[0], x[1]), str(x[2]['weight'])), H.edges(data=True)))
                # nx.draw_networkx(H, pos=pos, font_weight='bold', font_size=15, edge_color='g')
                # nx.draw_networkx_edge_labels(H, pos=pos, font_weight='bold', width=4, edge_labels=new_labels)
                # nx.draw_networkx_edges(H, pos, with_labels=True, width=2, arrows=False)
                # plt.show()
            for s in range(0, len(candidate_list)):
                out.write(candidate_list[s].encode('utf-8') + '\n')
            out.close()