Example #1
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
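        # default heuristic: k ≈ sqrt(n/2) for n words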
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #2
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words()) / 2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #3
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening would need to be implemented in wvlib to be supported
        # together with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
Example #4
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening would need to be implemented in wvlib to be supported
        # together with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
Example #5
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)

    wordsets = read_wordsets(options.wordset)

    if not enough_data(wordsets):
        return 1

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank).normalize()
    w2v = wv.word_to_vector_mapping()

    word_count, oov_count = 0, 0
    filtered_wordsets = {}
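    # drop out-of-vocabulary words from each word set, tracking OOV counts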
    for k, wordset in wordsets.items():
        filtered = []
        for w in wordset:
            if w in w2v:
                filtered.append(w)
            else:
                logging.warning('ignoring out-of-vocabulary word "%s"' % w)
                oov_count += 1
            word_count += 1
        if filtered:
            filtered_wordsets[k] = filtered
        else:
            logging.warning('wordset %s empty after OOV filtering, removing' %
                            k)
    wordsets = filtered_wordsets

    if not enough_data(wordsets):
        return 1

    results = []
    for n1, n2 in combinations(wordsets.keys(), 2):
        result = compare_sets(wordsets[n1], n1, wordsets[n2], n2, w2v, options)
        if result is not None:
            results.append(result)

    if not options.quiet:
        print('out of vocabulary %d/%d (%.2f%%)' % \
              (oov_count, word_count, 100.*oov_count/word_count),
              file=sys.stderr)

    if results:
        print('OVERALL AVERAGE (macro):\t%.2f%%\t(%.2f%% OOV)' % \
              (100*sum(results)/len(results), 100.*oov_count/word_count))
    else:
        print('All comparisons failed!', file=sys.stderr)
Example #6
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = process_args(argv[1:])
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
Example #7
def main(argv=None):
    if argv is None:
        argv = sys.argv
    # TODO: remove irrelevant options
    options = process_args(argv[1:])
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = process_args(argv[1:])
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
Example #9
def main():
    if not (os.path.isfile(model_path)):
        # Train a word2vec model
        sentences = LineSentence(token_path)
        model = Word2Vec(sentences, size=300, window=4, min_count=10)
        model.wv.save_word2vec_format('model.txt', binary=False)
        del model
    else:
        # Generate sentences for each unsimplified sentence
        wv = wvlib.load(model_path)
        f = open('test.txt', 'r')

        for line in f:
            sent = word_tokenize(line)
            sent_len = len(sent)
            # Loop through length of sentence
            for i in range(0, len(sent)):
                # Get nearest neighbors for each word
                for j in range(0, NEIGHBOR_NUM):
                    nearest = wv.nearest(sent[i])[j]
                    # Absolute cut-off for cosine distance
                    if (nearest[1] >= DISTANCE_NUM):
                        # Filter open-class words
                        word_tag = nltk.pos_tag(nltk.word_tokenize(sent[i]))
                        neighbor_tag = nltk.pos_tag(
                            nltk.word_tokenize(nearest[0]))
                        open_class = {
                                'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'VBD',
                                'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS'
                        }
                        if (word_tag[0][1] in open_class
                                and neighbor_tag[0][1] in open_class):
                            word = []
                            word.append(nearest[0])  # word
                            word.append(nearest[1])  # distance score
                            sent.append(word)
            heap = []
            # Generate all possible sentence candidates by combining and averaging word vectors
            # Each sentence is the average of its words
            for sent_gen in itertools.permutations(sent, sent_len):
                temp = []
                score = 0
                for word in sent_gen:
                    if isinstance(word, list):
                        score += word[1]
                        temp.append(word[0])
                    else:
                        score += 1
                        temp.append(word)
                average = score / sent_len
                # Ignore reordering of original words
                if (average != 1):
                    heapq.heappush(heap, (average, temp))
            # Output top CANDIDATE_NUM sentences
            for k in heapq.nlargest(CANDIDATE_NUM, heap):
                candidate = ' '.join(k[1])
                print candidate, k[0]
Example #10
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)

    wordsets = read_wordsets(options.wordset)

    if not enough_data(wordsets):
        return 1

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank).normalize()
    w2v = wv.word_to_vector_mapping()

    word_count, oov_count = 0, 0
    filtered_wordsets = {}
    for k, wordset in wordsets.items():
        filtered = []
        for w in wordset:
            if w in w2v:
                filtered.append(w)
            else:
                logging.warn('ignoring out-of-vocabulary word "%s"' % w)
                oov_count += 1
            word_count += 1
        if filtered:
            filtered_wordsets[k] = filtered
        else:
            logging.warn('wordset %s empty after OOV filtering, removing' % k)
    wordsets = filtered_wordsets
            
    if not enough_data(wordsets):
        return 1

    results = []
    for n1, n2 in combinations(wordsets.keys(), 2):
        result = compare_sets(wordsets[n1], n1, wordsets[n2], n2, w2v, options)
        if result is not None:
            results.append(result)

    if not options.quiet:
        print >> sys.stderr, 'out of vocabulary %d/%d (%.2f%%)' % \
            (oov_count, word_count, 100.*oov_count/word_count)

    if results:
        print 'OVERALL AVERAGE (macro):\t%.2f%%\t(%.2f%% OOV)' % \
            (100*sum(results)/len(results), 100.*oov_count/word_count)
    else:
        print >> sys.stderr, 'All comparisons failed!'
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv
    options = process_args(argv[1:])
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception as e:
        print('Error: %s' % str(e), file=sys.stderr)
        return 1
    return query_loop(wv, options, process_query, query_count=3)
Example #12
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1    
Example #13
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])

    if options.quiet:
        logging.getLogger().setLevel(logging.ERROR)
    try:
        wv = wvlib.load(options.vectors, max_rank=options.max_rank)
        wv = wv.normalize()
    except Exception, e:
        print >> sys.stderr, 'Error: %s' % str(e)
        return 1
Example #14
def get_nearest(vectors, queries, nncount=100, options=None):
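    """Map each query string to its nncount nearest words (exact search, or
    approximate if options.approximate is set); out-of-vocabulary queries
    map to an empty list."""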
    nearest = {}
    wv = wvlib.load(vectors).normalize()
    for query in queries:
        words = query.split()
        v = query_vector(wv, words)
        if v is not None:
            if options is None or not options.approximate:
                word_sim = wv.nearest(v, n=nncount, exclude=words)
            else:
                word_sim = wv.approximate_nearest(v, n=nncount, exclude=words,
                                                  evalnum=10*nncount)
            nearest[query] = [ws[0] for ws in word_sim]
        else:
            nearest[query] = [] # out of vocabulary
    return nearest
Example #15
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')

    wv = wvlib.load(options.input, options.input_format,
                    max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    wv.save(options.output, vector_format=options.vector_format)

    return 0
Example #16
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options = argparser().parse_args(argv[1:])
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')

    wv = wvlib.load(options.input, options.input_format,
                    max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    wv.save(options.output, vector_format=options.vector_format)

    return 0
Example #17
def main():
    wv = wvlib.load(model_path).normalize()
    fp = open('test.txt', 'r')

    for line in fp:
        sent = word_tokenize(line)
        sent.insert(0,'<s>')
        sent.append('</s>')
        prev = '<s>'

        # The FST
        f = fst.new()

        for i in range(1, len(sent) - 1):
            if (sent[i] in wv.vocab):
                f.add_arc(prev, sent[i], prev, sent[i], -1)
                for j in range(0, NEIGHBOR_NUM):
                    nearest = wv.nearest(sent[i])[j]
                    # Absolute cut-off for cosine distance
                    if (nearest[1] >= DISTANCE_NUM):
                        # Filter open-class words
                        word_tag = nltk.pos_tag(nltk.word_tokenize(sent[i]))
                        neighbor_tag = nltk.pos_tag(nltk.word_tokenize(nearest[0]))
                        if (word_tag[0][1] in open_class
                                and neighbor_tag[0][1] in open_class):
                            f.add_arc(prev, sent[i], prev, nearest[0], -nearest[1])
            else:
                f.add_arc(prev, sent[i], prev, prev, -1)

            # Update prev
            prev = sent[i]

        # Final state
        f.add_arc(prev, '</s>', '<epsilon>', '<epsilon>', 0)

        f.set_start('<s>')
        f.set_final('</s>')
                            
        f.printf()
        sp = fst.shortest_path_list(f, CANDIDATE_NUM)
        for path in sp:
            print("%.2f\t%s" % (path[0], path[2]))
Example #18
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #19
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #20
import sys

sys.path.insert(0, "wvlib-master/")

import wvlib

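# load the packaged word vectors and re-save them in wvlib's binary format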
vec = wvlib.load('PubMed-and-PMC-ri.tar.gz')
vec.save_bin('PubMed-and-PMC-ri.bin')
Example #21
def main():
    if not (os.path.isfile(model_path)):
        # Train a word2vec model
        sentences = LineSentence(token_path)
        model = Word2Vec(sentences, size=300, window=4, min_count=10)
        model.wv.save_word2vec_format('model.txt', binary=False)
        del model
    else:
        # Generate sentences for each unsimplified sentence
        wv = wvlib.load(model_path).normalize()
        f = open('test.txt', 'r')
        line_num = sum(1 for _ in f)
        f.seek(0)
        count = 0
        # Build lattice graph for every sentence
        for line in f:
            count += 1
            print "Sentence", count, "of", line_num
            sent = word_tokenize(line)
            G = nx.DiGraph()
            G.add_node("START")
            temp = []
            words = []
            # Loop through length of sentence
            for i in range(0, len(sent)):
                node = sent[i] + '*' + str(
                    i) + '*'  # Unique identifier for nodes
                G.add_node(node)
                # Connect edges
                if (i == 0):
                    G.add_edge("START", node, weight=-1)
                else:
                    prev_node = sent[i - 1] + '*' + str(i - 1) + '*'
                    G.add_edge(prev_node, node, weight=-1)
                    for w in range(0, len(words)):
                        G.add_edge(words[w], node, weight=-1)
                # "Save" nodes to connect back to the edges
                temp = words
                words = []
                # Get nearest neighbors for each word
                if (sent[i] in wv.vocab):
                    for j in range(0, NEIGHBOR_NUM):
                        nearest = wv.nearest(sent[i])[j]
                        # Absolute cut-off for cosine distance
                        if (nearest[1] >= DISTANCE_NUM):
                            # Filter open-class words
                            word_tag = nltk.pos_tag(nltk.word_tokenize(
                                sent[i]))
                            neighbor_tag = nltk.pos_tag(
                                nltk.word_tokenize(nearest[0]))
                            open_class = {
                                    'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB',
                                    'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ',
                                    'JJR', 'JJS'
                            }
                            if (word_tag[0][1] in open_class
                                    and neighbor_tag[0][1] in open_class):
                                neighbor_node = nearest[0] + '*' + str(
                                    j) + '*'  # Unique identifier for nodes
                                words.append(neighbor_node)
                                G.add_node(neighbor_node)
                                # Connect edges
                                if (i == 0):
                                    G.add_edge("START",
                                               neighbor_node,
                                               weight=-round(nearest[1], 5))
                                else:
                                    G.add_edge(prev_node,
                                               neighbor_node,
                                               weight=-round(nearest[1], 5))
                                    for t in range(0, len(temp)):
                                        G.add_edge(
                                            temp[t],
                                            neighbor_node,
                                            weight=-round(nearest[1], 5))

            # Add END node
            G.add_node("END")
            G.add_edge(node, "END", weight=-1)
            for w in range(0, len(words)):
                G.add_edge(words[w], "END", weight=-1)

            f = open('output.txt', 'a+')
            candidate_list = []

            # Write sentence candidates from the lattice to file
            # We find the shortest path because we use negative weights
            for path in k_shortest_paths(G, "START", "END", CANDIDATE_NUM):
                H = G.subgraph(path)
                # Candidate sentences only
                candidate = []
                if (len(k_shortest_paths(
                        G, "START", "END",
                        CANDIDATE_NUM)) == 1):  # sentences with no candidates
                    for c in range(0, CANDIDATE_NUM):
                        candidate_list.append('-------')
                    continue
                if (-(len(sent) + 1) != nx.shortest_path_length(
                        H, "START", "END", weight='weight')):
                    for word in path:  # remove unique identifiers
                        candidate.append(re.sub(r'\*.*\*', '', word))
                    candidate = [c.decode('utf-8') for c in candidate]
                    candidate = ' '.join(candidate[1:-1])
                    candidate_list.append(candidate)
                else:
                    candidate_list.append('-------')

                # # Draw sub-lattice
                # pos = nx.spring_layout(H)
                # new_labels = dict(map(lambda x:((x[0],x[1]), str(x[2]['weight'])), H.edges(data=True)))
                # nx.draw_networkx(H, pos=pos, font_weight='bold', font_size=15, edge_color='g')
                # nx.draw_networkx_edge_labels(H, pos=pos, font_weight='bold', width=4, edge_labels=new_labels)
                # nx.draw_networkx_edges(H, pos, with_labels=True, width=2, arrows=False)
                # plt.show()

            for sent in range(0, len(candidate_list)):
                f.write(candidate_list[sent].encode('utf-8') + '\n')
            f.close()