示例#1
0
def retrieval_by_doclength(X_train,
                           Y_train,
                           X_test,
                           Y_test,
                           len_test,
                           fraction=0.001,
                           len_bin=600,
                           multilabel=False):
    """Retrieval precision at a fixed cutoff, bucketed by query document length.

    Rows of both matrices are L2-normalized (via ``unitmatrix``) so the
    dot product equals cosine similarity. For each query, precision over
    the top ``fraction`` of the training set is computed and recorded in
    the first length bin the query fits into.

    Args:
        X_train, X_test: 2-D feature matrices, one document per row.
        Y_train, Y_test: labels aligned with the matrix rows.
        len_test: per-query document lengths, aligned with ``X_test``.
        fraction: fraction of the training set retrieved per query.
        len_bin: unused; kept for backward compatibility with callers.
        multilabel: forwarded to ``hit`` when comparing labels.

    Returns:
        List of ``(bin_upper_bound, mean_precision)`` pairs sorted by bin.
    """
    X_train = unitmatrix(X_train)  # normalize rows -> dot == cosine
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(list)
    n_queries = len(X_test)
    # max(1, ...) guards against ntop == 0 (ZeroDivisionError) when the
    # training set is small relative to `fraction`.
    ntop = max(1, int(fraction * len(X_train)))
    bins = [100, 120, 150, 200, 300, 1000, 1500, 2000, 4000]

    for idx in range(n_queries):
        retrieval_idx = score[idx].argsort()[::-1]
        n_hits = len([
            i for i in retrieval_idx[:ntop]
            if hit(Y_train[i], Y_test[idx], multilabel)
        ])
        pr = float(n_hits) / ntop
        # Assign the query to the first bin its length fits under;
        # NOTE: queries longer than the last bin are silently dropped.
        for upper in bins:
            if len_test[idx] < upper:
                precisions[upper].append(pr)
                break
    # (Removed leftover `import pdb; pdb.set_trace()` debugging hook,
    # which halted every call here.)
    precisions = dict((x, sum(y) / len(y)) for x, y in precisions.items())

    return sorted(precisions.items(), key=lambda d: d[0])
示例#2
0
def retrieval_perlabel(X_train,
                       Y_train,
                       X_test,
                       Y_test,
                       fractions=[0.01, 0.5, 1.0]):
    """Macro-averaged retrieval precision per label at several cutoffs.

    Per-query precisions are accumulated per ``(fraction, label)`` pair,
    divided by each label's query count, then averaged over labels — so
    every label contributes equally regardless of its frequency.

    Args:
        X_train, X_test: 2-D feature matrices, one document per row.
        Y_train, Y_test: labels aligned with the matrix rows
            (``Y_test`` must support ``.tolist()``, e.g. a numpy array).
        fractions: database fractions at which precision is evaluated.

    Returns:
        List of ``(fraction, macro_precision)`` pairs sorted by fraction.
    """
    X_train = unitmatrix(X_train)  # normalize rows -> dot == cosine
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(dict)
    label_counter = Counter(Y_test.tolist())

    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]
        label = Y_test[idx]
        for fr in fractions:
            # max(1, ...) guards against ntop == 0 (ZeroDivisionError)
            # when the training set is small relative to `fr`.
            ntop = max(1, int(fr * len(X_train)))
            n_hits = len([
                i for i in retrieval_idx[:ntop] if Y_train[i] == label
            ])
            pr = float(n_hits) / ntop
            # dict.get replaces the original bare `except:`, which
            # silently swallowed *any* error, not just missing keys.
            precisions[fr][label] = precisions[fr].get(label, 0.) + pr

    new_pr = {}
    for fr, per_label in precisions.items():
        avg_pr = 0.
        for label, pr in per_label.items():
            avg_pr += pr / label_counter[label]
        new_pr[fr] = avg_pr / len(label_counter)

    return sorted(new_pr.items(), key=lambda d: d[0])
示例#3
0
def retrieval(X_train,
              Y_train,
              X_test,
              Y_test,
              fractions=[0.01, 0.5, 1.0],
              multilabel=False):
    """Mean retrieval precision over all queries at several cutoffs.

    Rows are L2-normalized so the dot product equals cosine similarity.
    The raw feature matrices are released as soon as the score matrix is
    built, to keep peak memory down.

    Returns a list of ``(fraction, mean_precision)`` pairs sorted by
    fraction.
    """
    db_size = len(X_train)
    n_queries = len(X_test)
    score = unitmatrix(X_test).dot(unitmatrix(X_train).T)
    X_train = None  # free the (possibly large) raw matrices early
    X_test = None
    totals = defaultdict(float)

    for q in range(n_queries):
        ranked = score[q].argsort()[::-1]  # best match first
        target = Y_test[q]
        for fr in fractions:
            ntop = int(fr * db_size)
            n_hits = len([
                i for i in ranked[:ntop] if hit(Y_train[i], target, multilabel)
            ])
            totals[fr] += float(n_hits) / ntop

    averaged = dict((fr, acc / n_queries) for fr, acc in totals.items())
    return sorted(averaged.items(), key=lambda kv: kv[0])
def translate_words(model, query, vocab, revocab, topn=10):
    """Word-analogy lookup: ``query[0] - query[1] + query[2]``.

    Builds the analogy vector from the model's (row-normalized) first
    weight matrix and returns the ``topn`` most similar words.

    Args:
        model: object exposing ``get_weights()``; index 0 is the
            word-embedding matrix.
        query: sequence of exactly three words present in ``vocab``.
        vocab: word -> row-index mapping.
        revocab: row-index -> word mapping.
        topn: number of neighbours to return.
    """
    emb = unitmatrix(model.get_weights()[0])  # normalize rows
    a, b, c = (emb[vocab[w]] for w in query)
    sims = (a - b + c).dot(emb.T)
    best = sims.argsort()[::-1][:topn]
    return [revocab[i] for i in best]
示例#5
0
def get_similar_words(model, query_id, vocab, topn=10):
    """Return the ``topn`` words closest (by cosine) to ``query_id``.

    Args:
        model: object exposing ``get_weights()``; index 0 is the
            word-embedding matrix.
        query_id: row index of the query word's embedding.
        vocab: row-index -> word mapping used for the result.
        topn: number of neighbours to return (includes the query word
            itself, which always ranks first).
    """
    emb = unitmatrix(model.get_weights()[0])  # normalize rows
    sims = emb[query_id].dot(emb.T)
    best = sims.argsort()[::-1][:topn]
    return [vocab[i] for i in best]
def test(args):
    """Encode a corpus with a trained autoencoder and run the analyses
    selected on ``args``: dump doc codes, and optionally save topics,
    word clouds, sample similar words, word analogies, and a topic
    distinctness score.
    """
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    # Materialize the key list: deleting entries while iterating a dict
    # view raises RuntimeError on Python 3.
    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]  # free memory as we go
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    # print() with a single argument is valid on both Python 2 and 3.
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=50)
        print_topics(topics_strength)
        save_chinese_topics_strength(topics_strength, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', 'net', 'purchase',
            'position', 'management', 'loss', 'salary', 'stockholder', 'due', 'business', 'transaction', 'govern', 'trading',
            'tax', 'march', 'april', 'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)

        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space']
        words = []
        for each in queries:
            if each in vocab:  # skip out-of-vocabulary queries
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))
    if args.calc_distinct:
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
示例#7
0
def calc_pairwise_cosine(model):
    """Mean and standard deviation of the pairwise angles (in radians)
    between the column-normalized vectors of the model's first weight
    matrix.

    Returns:
        ``(mean_angle, std_angle)`` as numpy scalars.
    """
    W = unitmatrix(model.get_weights()[0], axis=0)  # unit-length columns
    n_cols = W.shape[1]
    # Unit columns => dot product is the cosine; arccos gives the angle.
    angles = [
        np.arccos(W[:, i].dot(W[:, j]))
        for i in range(n_cols)
        for j in range(i + 1, n_cols)
    ]
    return np.mean(angles), np.std(angles)
示例#8
0
def calc_pairwise_dev(model):
    """Root-mean-square cosine over all distinct column pairs of the
    model's first weight matrix — i.e. the average squared deviation
    from orthogonality (a 90-degree angle)."""
    W = unitmatrix(model.get_weights()[0], axis=0)  # unit-length columns
    n_cols = W.shape[1]
    # Unit columns => dot product is the cosine between the pair.
    sq_sum = sum(
        W[:, i].dot(W[:, j]) ** 2
        for i in range(n_cols)
        for j in range(i + 1, n_cols)
    )
    # 2 * sum / (n * (n-1)) averages over the n*(n-1)/2 unordered pairs.
    return np.sqrt(2. * sq_sum / n_cols / (n_cols - 1))
def test(args):
    """Load a trained LDA model and run the analyses selected on
    ``args``: optionally save word clouds, topics, and a topic
    distinctness score.
    """
    corpus = load_corpus(args.corpus)
    vocab, docs = corpus['vocab'], corpus['docs']
    doc_bow = {}
    # Materialize the key list: deleting entries while iterating a dict
    # view raises RuntimeError on Python 3.
    for k in list(docs.keys()):
        bows = []
        # .items() works on both Python 2 and 3 (the original
        # .iteritems() is Python-2-only).
        for idx, count in docs[k].items():
            bows.append((int(idx), count))
        doc_bow[k] = bows
        del docs[k]  # free memory as we go

    lda = load_model(args.load_model)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company', 'service',
            'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus',
            'shareholder', 'income', 'financial', 'net', 'purchase',
            'position', 'management', 'loss', 'salary', 'stockholder', 'due',
            'business', 'transaction', 'govern', 'trading', 'tax', 'march',
            'april', 'june', 'july'
        ]

        weights = lda.state.get_lambda()
        weights = unitmatrix(weights.T)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)

        # print() with a single argument is valid on both Python 2 and 3.
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.save_topics:
        topics_prob = show_topics_prob(lda)
        save_topics_prob(topics_prob, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.calc_distinct:
        sd = calc_pairwise_dev(lda)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
示例#10
0
def test(args):
    """Encode a corpus with a trained autoencoder and run the analyses
    selected on ``args``: dump doc codes, and optionally save topics,
    word clouds, interactively sample similar words, run word
    analogies, and compute a topic distinctness score.
    """
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]  # free memory as we go
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company', 'service',
            'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus',
            'shareholder', 'income', 'financial', 'net', 'purchase',
            'position', 'management', 'loss', 'salary', 'stockholder', 'due',
            'business', 'transaction', 'govern', 'trading', 'tax', 'march',
            'april', 'june', 'july'
        ]
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)

        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        # Interactive loop: read a query line from stdin, show similar
        # words for each in-vocabulary query term.
        revocab = revdict(vocab)
        while True:
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            if not query:
                break  # EOF: readline() returns '' — avoid spinning forever
            query = re.sub(r'[^\w\s-]', ' ',
                           query)  # remove punctuations except hyphen
            query_words = []
            for word in query.lower().split():  # convert to lowercase
                if word not in stopwords.words('english'):  # remove stop words
                    query_words.append(word)

            words = []
            for each in query_words:
                if each in vocab:  # skip OOV words (vocab[each] would raise)
                    words.append(
                        get_similar_words(ae, vocab[each], revocab, topn=11))
            # Save once per query — the original wrote the file (and
            # printed) inside the loop, rewriting it after every word.
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))
    if args.calc_distinct:
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)