def retrieval_by_doclength(X_train, Y_train, X_test, Y_test, len_test, fraction=0.001, len_bin=600, multilabel=False):
    """Compute average retrieval precision bucketed by test-document length.

    Ranks every training document against each test query by cosine
    similarity, measures precision over the top ``fraction`` of the
    database, and averages the per-query precisions within document-length
    bins.

    Args:
        X_train: 2-D array of training document vectors (one row per doc).
        Y_train: labels aligned with X_train rows.
        X_test: 2-D array of query document vectors.
        Y_test: labels aligned with X_test rows.
        len_test: per-query document lengths, aligned with X_test rows.
        fraction: fraction of the training set used as the retrieval cutoff.
        len_bin: unused; kept for backward compatibility with callers.
        multilabel: passed through to hit() to select label-match semantics.

    Returns:
        List of (bin_upper_bound, mean_precision) pairs sorted by bound.
    """
    X_train = unitmatrix(X_train)  # normalize rows so dot product == cosine
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(list)
    n_queries = len(X_test)
    # Guard: a tiny database times a small fraction could truncate to 0 and
    # divide by zero below.
    ntop = max(1, int(fraction * len(X_train)))
    bins = [100, 120, 150, 200, 300, 1000, 1500, 2000, 4000]
    for idx in range(n_queries):
        retrieval_idx = score[idx].argsort()[::-1]  # indices by descending similarity
        n_hits = len([i for i in retrieval_idx[:ntop]
                      if hit(Y_train[i], Y_test[idx], multilabel)])
        pr = float(n_hits) / ntop
        # Assign this query's precision to the first bin whose upper bound
        # exceeds its document length.
        for upper in bins:
            if len_test[idx] < upper:
                precisions[upper].append(pr)
                break
    # NOTE(review): removed a leftover `import pdb; pdb.set_trace()`
    # debugger breakpoint that halted every call here.
    precisions = dict((x, sum(y) / len(y)) for x, y in precisions.items())
    return sorted(precisions.items(), key=lambda d: d[0])
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
    """Compute macro-averaged retrieval precision (averaged per label).

    For each retrieval fraction, accumulates per-query precision by the
    query's label, normalizes by the label's frequency, and averages over
    labels so rare labels weigh the same as common ones.

    Args:
        X_train: 2-D array of training document vectors.
        Y_train: labels aligned with X_train rows (single-label).
        X_test: 2-D array of query document vectors.
        Y_test: labels aligned with X_test rows; must support .tolist().
        fractions: retrieval cutoffs, each a fraction of the database size.

    Returns:
        List of (fraction, macro_avg_precision) pairs sorted by fraction.
    """
    X_train = unitmatrix(X_train)  # normalize so dot product == cosine
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    # defaultdict(float) replaces the original bare `except:` fallback,
    # which silently swallowed every exception — not just the intended
    # KeyError on a first-seen label.
    precisions = defaultdict(lambda: defaultdict(float))
    label_counter = Counter(Y_test.tolist())
    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]  # descending similarity
        for fr in fractions:
            # Guard against a zero cutoff (tiny database * small fraction).
            ntop = max(1, int(fr * len(X_train)))
            pr = float(
                len([i for i in retrieval_idx[:ntop]
                     if Y_train[i] == Y_test[idx]])) / ntop
            precisions[fr][Y_test[idx]] += pr
    new_pr = {}
    for fr, val in precisions.items():
        avg_pr = 0.
        for label, pr in val.items():
            avg_pr += pr / label_counter[label]  # mean precision for this label
        new_pr[fr] = avg_pr / len(label_counter)  # mean over labels (macro)
    return sorted(new_pr.items(), key=lambda d: d[0])
def retrieval(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0], multilabel=False):
    """Compute micro-averaged retrieval precision at several cutoffs.

    Ranks the training database against each query by cosine similarity
    and averages per-query precision over all queries, once per fraction.

    Args:
        X_train: 2-D array of training document vectors.
        Y_train: labels aligned with X_train rows.
        X_test: 2-D array of query document vectors.
        Y_test: labels aligned with X_test rows.
        fractions: retrieval cutoffs, each a fraction of the database size.
        multilabel: passed through to hit() to select label-match semantics.

    Returns:
        List of (fraction, mean_precision) pairs sorted by fraction.
    """
    db_size = len(X_train)
    n_queries = len(X_test)
    X_train = unitmatrix(X_train)  # normalize so dot product == cosine
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    # Drop the normalized copies early; only the score matrix is needed now.
    X_train = None
    X_test = None
    totals = defaultdict(float)
    for q in range(n_queries):
        ranked = score[q].argsort()[::-1]  # descending similarity
        target = Y_test[q]
        for frac in fractions:
            cutoff = int(frac * db_size)
            n_hits = len([i for i in ranked[:cutoff]
                          if hit(Y_train[i], target, multilabel)])
            totals[frac] += float(n_hits) / cutoff
    averaged = dict((frac, total / n_queries) for frac, total in totals.items())
    return sorted(averaged.items(), key=lambda d: d[0])
def translate_words(model, query, vocab, revocab, topn=10):
    """Solve a word-analogy query on the model's first weight matrix.

    Forms the vector query[0] - query[1] + query[2] in the normalized
    embedding space and returns the words closest to it by dot product.

    Args:
        model: object exposing get_weights(); first entry is the embedding.
        query: sequence of three words, e.g. ['father', 'man', 'woman'].
        vocab: word -> row-index mapping into the embedding.
        revocab: row-index -> word mapping (inverse of vocab).
        topn: number of nearest words to return.

    Returns:
        List of the topn words nearest the analogy vector.
    """
    emb = unitmatrix(model.get_weights()[0])  # normalize rows
    analogy = emb[vocab[query[0]]] - emb[vocab[query[1]]] + emb[vocab[query[2]]]
    similarity = analogy.dot(emb.T)
    nearest = similarity.argsort()[::-1][:topn]
    return [revocab[i] for i in nearest]
def get_similar_words(model, query_id, vocab, topn=10):
    """Return the topn words nearest a given word in embedding space.

    Args:
        model: object exposing get_weights(); first entry is the embedding.
        query_id: row index of the query word in the embedding.
        vocab: row-index -> word mapping used to decode results.
        topn: number of nearest words to return (includes the query itself,
            since a word is maximally similar to itself).

    Returns:
        List of the topn nearest words.
    """
    emb = unitmatrix(model.get_weights()[0])  # normalize rows
    similarity = emb[query_id].dot(emb.T)
    nearest = similarity.argsort()[::-1][:topn]
    return [vocab[i] for i in nearest]
def test(args): corpus = load_corpus(args.input) vocab, docs = corpus['vocab'], corpus['docs'] n_vocab = len(vocab) doc_keys = docs.keys() X_docs = [] for k in doc_keys: X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0)) del docs[k] X_docs = np.r_[X_docs] ae = load_ae_model(args.load_model) doc_codes = ae.predict(X_docs) dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output) print 'Saved doc codes file to %s' % args.output if args.save_topics: topics_strength = get_topics_strength(ae, revdict(vocab), topn=50) print_topics(topics_strength) # save_topics_strength(topics_strength, args.save_topics) save_chinese_topics_strength(topics_strength, args.save_topics) # topics = get_topics(ae, revdict(vocab), topn=10) # write_file(topics, args.save_topics) print 'Saved topics file to %s' % args.save_topics if args.word_clouds: queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', 'award', 'risk', 'security', 'bank', 'company', 'service', 'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', 'net', 'purchase', 'position', 'management', 'loss', 'salary', 'stockholder', 'due', 'business', 'transaction', 'govern', 'trading', 'tax', 'march', 'april', 'june', 'july'] weights = ae.get_weights()[0] weights = unitmatrix(weights) # normalize word_cloud(weights, vocab, queries, save_file=args.word_clouds) print 'Saved word clouds file to %s' % args.word_clouds if args.sample_words: revocab = revdict(vocab) queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space'] words = [] for each in queries: if each in vocab: words.append(get_similar_words(ae, vocab[each], revocab, topn=11)) write_file(words, args.sample_words) print 'Saved sample words file to %s' % args.sample_words if args.translate_words: revocab = revdict(vocab) queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']] for each in queries: print each print translate_words(ae, each, 
vocab, revocab, topn=10) if args.calc_distinct: # mean, std = calc_pairwise_cosine(ae) # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi) sd = calc_pairwise_dev(ae) print 'Average squared deviation from 0 (90 degree): %s' % sd
def calc_pairwise_cosine(model):
    """Mean and std of pairwise angles (radians) between topic vectors.

    Normalizes the columns of the model's first weight matrix to unit
    length and measures the angle between every pair of columns.

    Args:
        model: object exposing get_weights(); first entry is the weight
            matrix whose columns are topic vectors.

    Returns:
        Tuple (mean_angle, std_angle) in radians over all column pairs.
    """
    weights = model.get_weights()[0]
    weights = unitmatrix(weights, axis=0)  # unit-length columns
    n = weights.shape[1]
    angles = []
    for i in range(n):
        for j in range(i + 1, n):
            # Clip: rounding can push a unit-vector dot product slightly
            # outside [-1, 1], which would make arccos return NaN.
            cos_ij = np.clip(weights[:, i].dot(weights[:, j]), -1.0, 1.0)
            angles.append(np.arccos(cos_ij))
    return np.mean(angles), np.std(angles)
def calc_pairwise_dev(model):
    """Root-mean-square cosine similarity between distinct topic vectors.

    With unit-length columns, the cosine of a pair is its dot product;
    squaring measures deviation from orthogonality (90 degrees), so a
    smaller value means more distinct topics.

    Args:
        model: object exposing get_weights(); first entry is the weight
            matrix whose columns are topic vectors.

    Returns:
        Scalar RMS of pairwise cosines over all n*(n-1)/2 column pairs.
    """
    weights = model.get_weights()[0]
    weights = unitmatrix(weights, axis=0)  # unit-length columns
    n = weights.shape[1]
    off_diag_sq = sum(
        weights[:, i].dot(weights[:, j]) ** 2
        for i in range(n)
        for j in range(i + 1, n)
    )
    return np.sqrt(2. * off_diag_sq / n / (n - 1))
def test(args): corpus = load_corpus(args.corpus) vocab, docs = corpus['vocab'], corpus['docs'] doc_bow = {} for k in docs.keys(): bows = [] for idx, count in docs[k].iteritems(): bows.append((int(idx), count)) doc_bow[k] = bows del docs[k] lda = load_model(args.load_model) # generate_doc_codes(lda, doc_bow, args.output) # print 'Saved doc codes file to %s' % args.output if args.word_clouds: queries = [ 'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', 'award', 'risk', 'security', 'bank', 'company', 'service', 'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', 'net', 'purchase', 'position', 'management', 'loss', 'salary', 'stockholder', 'due', 'business', 'transaction', 'govern', 'trading', 'tax', 'march', 'april', 'june', 'july' ] weights = lda.state.get_lambda() weights = unitmatrix(weights.T) # normalize word_cloud(weights, vocab, queries, save_file=args.word_clouds) print 'Saved word clouds file to %s' % args.word_clouds if args.save_topics: topics_prob = show_topics_prob(lda) save_topics_prob(topics_prob, args.save_topics) # topics = show_topics(lda) # write_file(topics, args.save_topics) print 'Saved topics file to %s' % args.save_topics if args.calc_distinct: # mean, std = calc_pairwise_cosine(lda) # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi) sd = calc_pairwise_dev(lda) print 'Average squared deviation from 0 (90 degree): %s' % sd
def test(args):
    """Run a trained autoencoder over a corpus and export doc codes
    (always), plus topics, word clouds, interactive similar-word queries,
    word analogies, and a topic-distinctness score behind CLI flags.

    Python 3 section. The --sample_words branch is an interactive
    read-eval loop on stdin; it runs until the process is interrupted.
    """
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)
    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]  # free each raw doc as soon as it is vectorized
    X_docs = np.r_[X_docs]
    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)
    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)
    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital',
            'asset', 'bonus', 'shareholder', 'income', 'financial', 'net',
            'purchase', 'position', 'management', 'loss', 'salary',
            'stockholder', 'due', 'business', 'transaction', 'govern',
            'trading', 'tax', 'march', 'april', 'june', 'july'
        ]
        weights = unitmatrix(ae.get_weights()[0])  # normalize rows
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)
    if args.sample_words:
        revocab = revdict(vocab)
        # Hoisted out of the loop: the original rebuilt the NLTK stopword
        # list for every word; a set also makes membership O(1).
        stop_words = set(stopwords.words('english'))
        while True:
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            # Remove punctuation except hyphen.
            query = re.sub(r'[^\w\s-]', ' ', query)
            query_words = [w for w in query.lower().split()
                           if w not in stop_words]
            words = []
            for each in query_words:
                # Skip out-of-vocabulary words instead of raising KeyError
                # (matches the guard used by the non-interactive variant).
                if each in vocab:
                    words.append(
                        get_similar_words(ae, vocab[each], revocab, topn=11))
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))
    if args.calc_distinct:
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)