def build_graph(filename, TOPN, A_name, indice2word_name, annoy=False, dim=100, tree_num=20):
    """Build a TOPN-nearest-neighbour word similarity graph from a word2vec model,
    then save the adjacency matrix as sparse CSR and the index-to-word mapping as a pickle."""
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}
    A = dok_matrix((V, V), dtype=np.float32)
    if annoy:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic, annoy_index=annoy_index)
    else:
        add_neighbors(A, TOPN, model, word_indice_dic)
    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_name, "wb"))
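# Most snippets in this file call save_sparse_csr / load_sparse_csr helpers that are
# defined elsewhere in their respective projects. Below is a minimal sketch of what such
# a pair typically looks like (an assumed implementation based on the common np.savez
# recipe of storing the CSR components; the real helpers in each project may differ).
import numpy as np
import scipy.sparse as sp


def save_sparse_csr(filename, matrix):
    # store the three CSR component arrays plus the shape;
    # np.savez adds a ".npz" extension if the name does not already have one
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)


def load_sparse_csr(filename):
    # rebuild the CSR matrix from its stored components
    loader = np.load(filename)
    return sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                         shape=loader['shape'])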
def get_sparse_matrix(train=None, test=None, params=None, remove_numbers_function=True,
                      debug=True, save=False, load=True, data_dir="data"):
    """
    Get the sparse matrix form of the train and test set

    Parameters
    -------------------------
    train, test, params: See the documentation of the tf_idf function
    remove_numbers_function, debug: Passed through to tf_idf
    save: Save the train and test sparse matrices in .npz format
    load: Load the train and test sparse matrices from your local machine
    data_dir: The data directory where the matrices are saved

    Returns
    --------------------------
    train: train set in sparse matrix form
    test: test set in sparse matrix form

    Example
    -------
    >>> train = pd.read_csv("data/train.csv")
    >>> test = pd.read_csv("data/test.csv")
    >>> # to create and save the train and test set
    >>> train_sparse, test_sparse = get_sparse_matrix(train, test, params=None,
    ...                                               remove_numbers_function=True,
    ...                                               debug=True, save=True, load=False)
    >>> # to load the sparse matrices from your local machine
    >>> train, test = get_sparse_matrix(load=True)
    """
    base_dir = data_dir + '/output/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    name_train = base_dir + 'sparce_train.npz'
    name_test = base_dir + 'sparce_test.npz'
    if load:
        if os.path.exists(name_train) and os.path.exists(name_test):
            train, test = load_sparse_csr(name_train), load_sparse_csr(name_test)
        else:
            raise ValueError("You asked to load the features but they were not found "
                             "at the specified location:\n{}\n{}".format(name_train, name_test))
    else:
        print('Computing the sparse matrices, this will take a while...')
        train, test = tf_idf(train, test, params, remove_numbers_function, debug)
        if save:
            print('Saving train file as {}'.format(name_train))
            save_sparse_csr(name_train, train)
            print('Saving test file as {}'.format(name_test))
            save_sparse_csr(name_test, test)
    return train, test
def save(self):
    fn = self.cfg.get('context', 'context_file')
    updated_fn = fn + '.pickle' if not fn.endswith('pickle') else fn
    data = {
        "words": self.words,
        "vocabulary": self.vocabulary,
        "binary_words": self.binary_words,
        "binary_vocab": self.binary_vocab,
        "coocc": self.coocc,
        "cfg": self.cfg}
    logging.info('saving cfg, vocabulary, and edges...')
    cPickle.dump(data, open(updated_fn, 'wb'))  # pickle files should be opened in binary mode
    logging.info('saving arrays...')
    bin_fn = fn + '.bin'
    save_sparse_csr(fn, self.zero_sparse)
    save_sparse_csr(bin_fn, self.binary_sparse)
def build_subgraph(seed_words, w2v_filename, TOPN, A_name, indice2word_filename, dim):
    A, indice2word, model, word_indice_dic = graph_setup(dim, w2v_filename)
    # Obtain the k-NN of each seed word
    finished = 0
    for word in seed_words:
        if word not in word_indice_dic:
            print("%s is OOV" % word)
            continue
        indice = word_indice_dic[word]
        for sim_word, cos_sim in model.most_similar(positive=[word], topn=TOPN):
            print(sim_word, "%.2f" % cos_sim)
            target_indice = word_indice_dic[sim_word]
            if indice == target_indice:
                continue  # avoid adding self-loops
            A[indice, target_indice] = max(cos_sim, 0.0)
            A[target_indice, indice] = max(cos_sim, 0.0)
        finished += 1
    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_filename, "wb"))
# transform all the data we have
for dataset in ['train', 'test']:
    neg_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset + '_neg.pickle')['text']
    pos_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset + '_pos.pickle')['text']

    # loop over transformers
    for (transformer_name, transformer) in zip(['hashing', 'tfidf'], [hv, tfidf]):
        FOLDER = DATA_FOLDER + './' + transformer_name + '_' \
            + str(n_features) + '_' + str(n_gram[0]) + '-' + str(n_gram[1]) + 'grams'
        neg = transformer.transform(neg_corpus)
        pos = transformer.transform(pos_corpus)
        X = sparse.vstack([neg, pos])  # shuffle later
        Y = np.hstack([np.zeros(neg.shape[0]), np.ones(pos.shape[0])])
        assert Y.shape[0] == neg.shape[0] + pos.shape[0], 'Y did not have expected size'
        assert X.shape[0] == neg.shape[0] + pos.shape[0], 'X did not have expected size'
        create_folder(FOLDER)
        save_sparse_csr(FOLDER + '/X_' + dataset + '.csr', X)
        # np.save(FOLDER + '/X_' + dataset + '.npy', X.toarray())
        np.save(FOLDER + '/Y_' + dataset + '.npy', Y)
def buildIndex():
    # For boolean query
    term2tid = {}
    invertedIndex = []  # element form: {'docFreq': 0, 'docIDs': []}
    # For vector space
    tf = []
    docID2NameFile = open("docID2Name.json", "r")
    docID2Name = json.load(docID2NameFile)
    docID2NameFile.close()
    total_docs = len(docID2Name)
    cur_tid = 0
    for cur_docID in xrange(total_docs):
        name = docID2Name[str(cur_docID)]
        doc = open("tmp/doc/" + name, "r")
        contents = doc.readlines()
        tokens = myTokenize.tokenize(contents[0][7:-1])
        tokens.extend(tokens)  # add the title tokens twice; consider commenting this out?
        tokens.extend(myTokenize.tokenize(contents[1][9:-1]))
        for token in tokens:
            if token not in term2tid:
                term2tid[token] = cur_tid
                invertedIndex.append({
                    # 'term': token,
                    'docFreq': 0,
                    'docIDs': []})
                tf.append([])
                cur_tid = cur_tid + 1
            tid = term2tid[token]
            if (len(invertedIndex[tid]['docIDs']) == 0 or
                    invertedIndex[tid]['docIDs'][-1] != cur_docID):
                invertedIndex[tid]['docIDs'].append(cur_docID)
                invertedIndex[tid]['docFreq'] = invertedIndex[tid]['docFreq'] + 1
                tf[tid].append(1)
            else:
                tf[tid][-1] = tf[tid][-1] + 1
        doc.close()
    idf = np.zeros(cur_tid, dtype=np.float64)
    W = scipy.sparse.lil_matrix((cur_tid, total_docs))
    for tid in xrange(cur_tid):
        # log tf with cosine normalization over each term row
        logtf = 1 + np.log10(np.array(tf[tid]))
        cosNorm = np.sqrt(np.sum(logtf * logtf))
        logtf = logtf / cosNorm
        W[tid, invertedIndex[tid]['docIDs']] = logtf
        idf[tid] = np.log10(total_docs * 1.0 / invertedIndex[tid]['docFreq'])
    W = scipy.sparse.csr_matrix(W)
    # terms = sorted([key for key in term2tid])
    # termsFile = open("terms.json", "w")
    # json.dump(terms, termsFile)
    # termsFile.close()
    term2tidFile = open("term2tid.json", "w")
    json.dump(term2tid, term2tidFile)
    term2tidFile.close()
    indexFile = open("invertedIndex.json", "w")
    json.dump(invertedIndex, indexFile)
    indexFile.close()
    np.save('idf.npy', idf)
    utils.save_sparse_csr("weightMatrix", W)
logger.info('Processing file %i...' % i)
logger.info('Counting words...')
count_matrix, doc_dict = get_count_matrix(
    args, 'sqlite', {'db_path': f}
)

logger.info('Getting word-doc frequencies...')
freqs = get_doc_freqs(count_matrix)

basename = os.path.splitext(os.path.basename(f))[0]
basename += ('-ngram=%d-hash=%d' % (args.ngram, args.hash_size))

if not os.path.exists(args.out_dir):
    logger.info("Creating data directory")
    os.makedirs(args.out_dir)

filename = os.path.join(args.out_dir, basename)

logger.info('Saving to %s.npz' % filename)
metadata = {
    'doc_freqs': freqs,
    'hash_size': args.hash_size,
    'ngram': args.ngram,
    'doc_dict': doc_dict
}
utils.save_sparse_csr(filename, count_matrix, metadata)
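# Several snippets here pass a third metadata argument to utils.save_sparse_csr. The
# sketch below shows one plausible metadata-aware save/load pair (an assumption, not a
# verified implementation of that project's utils module): the metadata dict is stored
# alongside the CSR components inside the same .npz archive, pickled as an object array.
import numpy as np
import scipy.sparse as sp


def save_sparse_csr_with_meta(filename, matrix, metadata=None):
    # the metadata dict ends up as a 0-d object array, so loading needs allow_pickle
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape, metadata=metadata)


def load_sparse_csr_with_meta(filename):
    loader = np.load(filename, allow_pickle=True)
    matrix = sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                           shape=loader['shape'])
    metadata = loader['metadata'].item() if 'metadata' in loader else None
    return matrix, metadata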
metadata['doc_freqs'] += nxt_metadata['doc_freqs']
nxt_DOC2IDX, nxt_doc_ids = nxt_metadata['doc_dict']
if set(doc_ids).intersection(nxt_doc_ids):
    raise RuntimeError('overlapping doc ids in %ith file' % i)
# shift the new indices so they follow the documents already merged
for k in nxt_DOC2IDX.keys():
    nxt_DOC2IDX[k] += len(DOC2IDX)
DOC2IDX = {**DOC2IDX, **nxt_DOC2IDX}
doc_ids += nxt_doc_ids
metadata['doc_dict'] = (DOC2IDX, doc_ids)

basename = 'count' + ('-ngram=%d-hash=%d' %
                      (metadata['ngram'], metadata['hash_size']))

if not os.path.exists(args.out_dir):
    logger.info("Creating data directory")
    os.makedirs(args.out_dir)

filename = os.path.join(args.out_dir, basename)
logger.info('Saving to %s.npz' % filename)
# sp.save_npz(filename, mat)
# np.savez(filename + 'meta', **metadata)
mat = mat.tocsr()
utils.save_sparse_csr(filename, mat, metadata)
interactions_map = load_interactions()

# build the user-rating matrix
urm = interactions_to_urm(interactions_map)

# compute the user similarity matrix
usm = cosine_similarity(urm, dense_output=False)
usm = apply_shrinkage(urm, usm)
usm = keep_top_k(usm)

# compute the estimated user-rating matrix
print("computing estimated ratings...")
estimated_urm = urm.T.dot(usm).T
save_sparse_csr('urm_user_based_full', estimated_urm.tocsr())

# write recommendations
print("writing recommendations...")
estimated_urm = estimated_urm.tolil()
with open(OUTPUT, 'w') as out:
    out.write("user_id,recommended_items\n")
    for target_user in load_target_users()['user_id']:
        row_nonzeros = estimated_urm.rows[target_user]
        row_data = estimated_urm.data[target_user]
        indices_with_data = list(zip(row_nonzeros, row_data))
        indices_with_data.sort(key=lambda t: t[1], reverse=True)
        best_items = [t[0] for t in indices_with_data] + MOST_POPULAR
        recommendations = [int(item_id) for item_id in best_items if
parser.add_argument('--num-workers', type=int, default=None,
                    help='Number of CPU processes (for tokenizing, etc)')
args = parser.parse_args()

logging.info('Counting words...')
count_matrix, doc_dict = get_count_matrix(
    args, 'sqlite', {'db_path': args.db_path}
)

logger.info('Making tfidf vectors...')
tfidf = get_tfidf_matrix(count_matrix)

logger.info('Getting word-doc frequencies...')
freqs = get_doc_freqs(count_matrix)

basename = os.path.splitext(os.path.basename(args.db_path))[0]
basename += ('-tfidf-ngram=%d-hash=%d-tokenizer=%s' %
             (args.ngram, args.hash_size, args.tokenizer))
filename = os.path.join(args.out_dir, basename)

logger.info('Saving to %s.npz' % filename)
metadata = {
    'doc_freqs': freqs,
    'tokenizer': args.tokenizer,
    'hash_size': args.hash_size,
    'ngram': args.ngram,
    'doc_dict': doc_dict
}
utils.save_sparse_csr(filename, tfidf, metadata)
    for name in data_names:
        yield config.get_par_data(name)


if __name__ == '__main__':
    print('Loading the instances')
    classifier = load_pickle(config.classifier)
    vectorizer = load_pickle(config.vectorizer)
    binarizer = load_pickle(config.binarizer)

    for data in get_next_data(config.data.keys()):
        print('Processing ' + data['name'] + ' data')
        create_dir(data['dir'])
        paragraphs, topics, line_map = build_topics_paragraphs_index_map(data['text'])
        y_true = binarizer.transform(topics)

        print('Building the data matrix using the TfidfVectorizer')
        x = vectorizer.transform(paragraphs)

        print('Classifying...')
        y = classifier.predict_proba(x)

        print('Saving the data')
        save_sparse_csr(data['x'], x)
        np.save(data['y'], y)
        np.save(data['y_true'], y_true)
        save_pickle(data['line_map'], line_map)