def build_graph(filename,
                TOPN,
                A_name,
                indice2word_name,
                annoy=False,
                dim=100,
                tree_num=20):
    """
    """
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}
    A = dok_matrix((V, V), dtype=np.float32)
    if annoy:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic, annoy_index=annoy_index)
    else:
        add_neighbors(A, TOPN, model, word_indice_dic)

    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_name, "wb"))
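The examples on this page all call save_sparse_csr / load_sparse_csr without defining them. A minimal sketch of such helpers, assuming scipy.sparse CSR matrices stored in NumPy's .npz container (the optional metadata argument used in some later examples is handled here as a pickled entry; the original implementations may differ):

import numpy as np
import scipy.sparse as sp


def save_sparse_csr(filename, matrix, metadata=None):
    # Store the three CSR component arrays plus the shape in a single .npz file.
    data = {
        'data': matrix.data,
        'indices': matrix.indices,
        'indptr': matrix.indptr,
        'shape': matrix.shape,
    }
    if metadata is not None:
        data['metadata'] = metadata  # pickled by np.savez
    np.savez(filename, **data)


def load_sparse_csr(filename):
    # Rebuild the CSR matrix from the stored component arrays
    # (np.savez appends '.npz' when the name does not already end in it).
    loader = np.load(filename, allow_pickle=True)
    return sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                         shape=tuple(loader['shape']))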
Example #2
def get_sparse_matrix(train=None, test=None, params=None, remove_numbers_function=True, debug=True, save=False, load=True, data_dir="data"):
    """
    Get the sparse-matrix form of the train and test sets.

    Parameters
    -------------------------
    train, test, params: see the documentation of the tf_idf function
    remove_numbers_function, debug: see the documentation of the tf_idf function
    save: save the train and test sparse matrices in .npz format
    load: load the train and test sparse matrices from your local machine
    data_dir: the data directory where the matrices are saved

    Returns
    --------------------------
    train: train set in sparse matrix form
    test: test set in sparse matrix form

    Example
    -------
        >>> train = pd.read_csv("data/train.csv")
        >>> test = pd.read_csv("data/test.csv")
        >>> # to create and save the train and test set
        >>> train_sparse, test_sparse = get_sparse_matrix(train, test, params=None, remove_numbers_function=True, debug=True, save=True, load=False)
        >>> # to load the sparse matrices from your local machine
        >>> train, test = get_sparse_matrix(load=True)

    """
    base_dir = data_dir + '/output/'

    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    name_train = base_dir + 'sparce_train.npz'
    name_test = base_dir + 'sparce_test.npz'

    if load:
        if os.path.exists(name_train) and os.path.exists(name_test):
            train, test = load_sparse_csr(name_train), load_sparse_csr(name_test)
        else:
            raise ValueError("You asked to load the features but they were not found"
                             + "at the specified location: \n{}\n{}".format(name_train, name_test))

    else:
        print('Computing the sparse matrices, this will take a while...')
        train, test = tf_idf(train, test, params, remove_numbers_function, debug)

    if save:
        print('Saving train file as {}'.format(name_train))
        save_sparse_csr(name_train, train)
        print('Saving test file as {}'.format(name_test))
        save_sparse_csr(name_test, test)

    return train, test
Example #3
def save(self):
    fn = self.cfg.get('context', 'context_file')
    updated_fn = fn + '.pickle' if not fn.endswith('pickle') else fn
    data = {
        "words": self.words, "vocabulary": self.vocabulary,
        "binary_words": self.binary_words,
        "binary_vocab": self.binary_vocab, "coocc": self.coocc,
        "cfg": self.cfg}
    logging.info('saving cfg, vocabulary, and edges...')
    cPickle.dump(data, open(updated_fn, 'wb'))
    logging.info('saving arrays...')
    bin_fn = fn + '.bin'
    save_sparse_csr(fn, self.zero_sparse)
    save_sparse_csr(bin_fn, self.binary_sparse)
Example #4
def build_subgraph(seed_words, w2v_filename, TOPN, A_name,
                   indice2word_filename, dim):
    A, indice2word, model, word_indice_dic = graph_setup(dim, w2v_filename)

    #Obtain k-NN
    finished = 0
    for word in seed_words:
        if word not in word_indice_dic:
            print("%s is OOV" % word)
            continue
        indice = word_indice_dic[word]
        for sim_word, cos_sim in model.most_similar(positive=[word],
                                                    topn=TOPN):
            print(sim_word, "%.2f" % cos_sim)
            target_indice = word_indice_dic[sim_word]
            if indice == target_indice: continue  # avoid adding self-loops
            A[indice, target_indice] = max(cos_sim, 0.0)
            A[target_indice, indice] = max(cos_sim, 0.0)
        finished += 1

    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_filename, "wb"))
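build_subgraph relies on a graph_setup helper that is not shown on this page. A plausible sketch, assuming it simply mirrors the set-up code of build_graph in the first example above (the actual helper may differ):

def graph_setup(dim, w2v_filename):
    # Hypothetical reconstruction: load the embeddings and prepare an empty
    # adjacency matrix plus the word <-> index mappings, as build_graph does.
    model = read_w2v(w2v_filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}
    A = dok_matrix((V, V), dtype=np.float32)
    return A, indice2word, model, word_indice_dic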
Example #5
        # transform all the data we have
        for dataset in ['train', 'test']:
            neg_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset +
                                        '_neg.pickle')['text']
            pos_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset +
                                        '_pos.pickle')['text']

            # loop over transformers
            for (transformer_name, transformer) in zip(['hashing', 'tfidf'],
                                                       [hv, tfidf]):

                FOLDER = (DATA_FOLDER + './' + transformer_name + '_' +
                          str(n_features) + '_' + str(n_gram[0]) + '-' +
                          str(n_gram[1]) + 'grams')

                neg = transformer.transform(neg_corpus)
                pos = transformer.transform(pos_corpus)

                X = sparse.vstack([neg, pos])  # shuffle later
                Y = np.hstack([np.zeros(neg.shape[0]), np.ones(pos.shape[0])])

                assert Y.shape[0] == neg.shape[0] + pos.shape[0], \
                    'Y did not have expected size'
                assert X.shape[0] == neg.shape[0] + pos.shape[0], \
                    'X did not have expected size'

                create_folder(FOLDER)

                save_sparse_csr(FOLDER + '/X_' + dataset + '.csr', X)
                # np.save(FOLDER + '/X_' + dataset + '.npy', X.toarray())
                np.save(FOLDER + '/Y_' + dataset + '.npy', Y)
Example #6
def buildIndex():
	"""For boolean query"""
	term2tid = {}
	invertedIndex = [] # element form: {'docFreq':0, 'docIDs':[]}

	"""For vector space"""
	tf=[]
	
	docID2NameFile = open("docID2Name.json", "r")
	docID2Name = json.load(docID2NameFile)
	docID2NameFile.close()
	
	total_docs = len(docID2Name)
		
	cur_tid = 0
	
	for cur_docID in xrange(total_docs):
		name = docID2Name[str(cur_docID)]
		doc = open("tmp/doc/"+name, "r")
		
		contents = doc.readlines()
		tokens = myTokenize.tokenize(contents[0][7:-1])
		tokens.extend(tokens)  # add the title tokens twice to up-weight the title; consider removing this
	
		tokens.extend(myTokenize.tokenize(contents[1][9:-1]))
		
		for token in tokens:
			if token not in term2tid:
				term2tid[token] = cur_tid
				invertedIndex.append({
#					'term':token, 
					'docFreq':0, 
					'docIDs':[]})
				
				tf.append([])

				cur_tid = cur_tid + 1
							
			tid = term2tid[token]
			if len(invertedIndex[tid]['docIDs']) == 0 or invertedIndex[tid]['docIDs'][-1] != cur_docID:
				invertedIndex[tid]['docIDs'].append(cur_docID)
				invertedIndex[tid]['docFreq'] = invertedIndex[tid]['docFreq'] + 1
				tf[tid].append(1)
			else:
				tf[tid][-1] = tf[tid][-1] + 1
		doc.close()
	
	idf = np.zeros(cur_tid, dtype = np.float64)
	W = scipy.sparse.lil_matrix((cur_tid, total_docs))

	for tid in xrange(cur_tid):
		logtf = 1 + np.log10(np.array(tf[tid]))
		cosNorm = np.sqrt(np.sum(logtf * logtf))

		logtf = logtf / cosNorm
		W[tid, invertedIndex[tid]['docIDs']] = logtf
		
		idf[tid] = np.log10(total_docs * 1.0 / invertedIndex[tid]['docFreq'])
		
	
	W = scipy.sparse.csr_matrix(W)
		
#	terms = sorted([key for key in term2tid])
#	termsFile = open("terms.json", "w")
#	json.dump(terms, termsFile)
#	termsFile.close()
		
	term2tidFile = open("term2tid.json", "w")
	json.dump(term2tid, term2tidFile)
	term2tidFile.close()

	indexFile = open("invertedIndex.json", "w")
	json.dump(invertedIndex, indexFile)
	indexFile.close()
	
	np.save('idf.npy', idf)
	
	utils.save_sparse_csr("weightMatrix", W)
Example #7
        logger.info('Processing file %i...' % i)
        
        logger.info('Counting words...')

        count_matrix, doc_dict = get_count_matrix(
            args, 'sqlite', {'db_path': f}
        )

        logger.info('Getting word-doc frequencies...')
        freqs = get_doc_freqs(count_matrix)

        basename = os.path.splitext(os.path.basename(f))[0]
        basename += ('-ngram=%d-hash=%d' %
                     (args.ngram, args.hash_size))

        if not os.path.exists(args.out_dir):
            logger.info("Creating data directory")
            os.makedirs(args.out_dir)

        filename = os.path.join(args.out_dir, basename)

        logger.info('Saving to %s.npz' % filename)
        metadata = {
            'doc_freqs': freqs,
            'hash_size': args.hash_size,
            'ngram': args.ngram,
            'doc_dict': doc_dict
        }

        utils.save_sparse_csr(filename, count_matrix, metadata)
Example #8
        metadata['doc_freqs'] += nxt_metadata['doc_freqs']

        nxt_DOC2IDX, nxt_doc_ids = nxt_metadata['doc_dict']

        if set(doc_ids).intersection(nxt_doc_ids):
            raise RuntimeError('overlapping doc ids in %ith file' % i)

        for k in nxt_DOC2IDX.keys():
            nxt_DOC2IDX[k] += len(DOC2IDX)

        DOC2IDX = {**DOC2IDX, **nxt_DOC2IDX}
        doc_ids += nxt_doc_ids

    metadata['doc_dict'] = (DOC2IDX, doc_ids)

    basename = 'count' + ('-ngram=%d-hash=%d' %
                          (metadata['ngram'], metadata['hash_size']))

    if not os.path.exists(args.out_dir):
        logger.info("Creating data directory")
        os.makedirs(args.out_dir)

    filename = os.path.join(args.out_dir, basename)

    logger.info('Saving to %s.npz' % filename)
    # sp.save_npz(filename, mat)
    # np.savez(filename+'meta', **metadata)
    mat = mat.tocsr()
    utils.save_sparse_csr(filename, mat, metadata)
Example #9
interactions_map = load_interactions()

# build user-rating matrix
urm = interactions_to_urm(interactions_map)

# compute item similarity matrix
usm = cosine_similarity(urm, dense_output=False)
usm = apply_shrinkage(urm, usm)

usm = keep_top_k(usm)

# compute estimated user-rating matrix
print("computing estimated ratings...")
estimated_urm = urm.T.dot(usm).T

save_sparse_csr('urm_user_based_full', estimated_urm.tocsr())

# write recommendations
print("writing recommendations...")
estimated_urm = estimated_urm.tolil()
with open(OUTPUT, 'w') as out:
    out.write("user_id,recommended_items\n")
    for target_user in load_target_users()['user_id']:
        row_nonzeros = estimated_urm.rows[target_user]
        row_data = estimated_urm.data[target_user]

        indices_with_data = list(zip(row_nonzeros, row_data))
        indices_with_data.sort(key=lambda t: t[1], reverse=True)

        best_items = [t[0] for t in indices_with_data] + MOST_POPULAR
        recommendations = [int(item_id) for item_id in best_items if
Example #10
    parser.add_argument('--num-workers',
                        type=int,
                        default=None,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()

    logger.info('Counting words...')
    count_matrix, doc_dict = get_count_matrix(args, 'sqlite',
                                              {'db_path': args.db_path})

    logger.info('Making tfidf vectors...')
    tfidf = get_tfidf_matrix(count_matrix)

    logger.info('Getting word-doc frequencies...')
    freqs = get_doc_freqs(count_matrix)

    basename = os.path.splitext(os.path.basename(args.db_path))[0]
    basename += ('-tfidf-ngram=%d-hash=%d-tokenizer=%s' %
                 (args.ngram, args.hash_size, args.tokenizer))
    filename = os.path.join(args.out_dir, basename)

    logger.info('Saving to %s.npz' % filename)
    metadata = {
        'doc_freqs': freqs,
        'tokenizer': args.tokenizer,
        'hash_size': args.hash_size,
        'ngram': args.ngram,
        'doc_dict': doc_dict
    }
    utils.save_sparse_csr(filename, tfidf, metadata)
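get_count_matrix, get_doc_freqs and get_tfidf_matrix are defined elsewhere in this project. A hedged sketch of the usual construction of the last two, assuming count_matrix is a scipy.sparse terms x documents matrix (the exact original may use a different idf variant):

import numpy as np
import scipy.sparse as sp


def get_doc_freqs(cnts):
    # Number of documents that contain each (hashed) term.
    binary = (cnts > 0).astype(int)
    return np.array(binary.sum(1)).squeeze()


def get_tfidf_matrix(cnts):
    # Sketch: log tf times a smoothed, floored idf, returned as a sparse matrix.
    Ns = get_doc_freqs(cnts)
    idfs = np.log((cnts.shape[1] - Ns + 0.5) / (Ns + 0.5))
    idfs[idfs < 0] = 0
    tfs = cnts.log1p()
    return sp.diags(idfs, 0).dot(tfs)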
Example #11
def get_next_data(data_names):
    for name in data_names:
        yield config.get_par_data(name)


if __name__ == '__main__':
    print('Loading the instances')
    classifier = load_pickle(config.classifier)
    vectorizer = load_pickle(config.vectorizer)
    binarizer = load_pickle(config.binarizer)

    for data in get_next_data(config.data.keys()):
        print('Processing ' + data['name'] + ' data')

        create_dir(data['dir'])

        paragraphs, topics, line_map = build_topics_paragraphs_index_map(
            data['text'])
        y_true = binarizer.transform(topics)

        print('Building the data matrix using the TfidfVectorizer')
        x = vectorizer.transform(paragraphs)

        print('Classifying...')
        y = classifier.predict_proba(x)

        print('Saving the data')
        save_sparse_csr(data['x'], x)
        np.save(data['y'], y)
        np.save(data['y_true'], y_true)
        save_pickle(data['line_map'], line_map)