Example #1
def matrix_term_document(args):
    def calculate_tf_weights(lst_contents, words):

        rows = len(words)
        columns = len(lst_contents)

        TF_matrix = np.zeros((rows, columns), dtype=np.float32)

        for i, word in enumerate(words):
            for j, content in enumerate(lst_contents):
                TF_matrix[i, j] = content.count(word) / len(content)

        return TF_matrix

    def calculate_idf_weights(TF):

        IDF = 1 + np.log(TF.shape[1] / np.sum(TF != 0, axis=1))

        return np.array([IDF]).T

    # Step 1: Load data from the directory
    lst_contents, file_paths = load_data_from_directory(args['data_path'])

    # Step 2: Build the dictionary
    vocal = build_dictionary(lst_contents)

    # Step 3: Calculate the TF weights for each document
    TF_matrix = calculate_tf_weights(lst_contents, vocal)

    # Step 4: Calculate the IDF weights
    IDF = calculate_idf_weights(TF_matrix)

    # Step 5: Calculate the TF-IDF matrix
    TF_IDF = TF_matrix * IDF

    # Step 6: Build the query vector
    qwords = args['query'].split()
    qTF = calculate_tf_weights([qwords], vocal)
    qTF_IDF = qTF * IDF

    # Step 7: Calculate the distance between qTF_IDF and each document's TF-IDF vector
    dists = np.linalg.norm(TF_IDF - qTF_IDF, axis=0)

    # Step 8: Rank and display the results
    ranked_result = np.argsort(dists)

    print("The ranking result matching with query {}".format(args['query']))

    N = int(args['numbers'])
    for index in ranked_result[0:N]:
        print(file_paths[index])
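
A minimal usage sketch for Example #1 (not part of the original snippet): the directory name, the query, and the helpers load_data_from_directory / build_dictionary referenced above are assumed to exist elsewhere in the project.

import numpy as np

args = {
    'data_path': 'documents/',          # hypothetical directory of text files
    'query': 'information retrieval',   # free-text query
    'numbers': '5'                      # number of top-ranked file paths to print
}
matrix_term_document(args)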
Example #2
import senteval
from utils import build_dictionary
import torch
from ADNet import ADNet
import numpy as np

PATH_TO_DATA = "../../SentEVal-master/data"
file = "amazon_food_review/train.csv"
MODEL_PATH = "Modals/ADnets.dms"

TOKEN2ID = build_dictionary(file)
MODEL = ADNet(input_size=256,
              hidden_size=256,
              sentiment_size=256,
              max_len=35,
              vocab_size=len(TOKEN2ID) + 9,
              output_size=1)
MODEL.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))


def prepare(params, samples):
    params.word2id = TOKEN2ID


def batcher(params, batch):
    batch_size = len(batch)
    batch = [sent if sent != [] else ['<unk>'] for sent in batch]
    lengths = torch.LongTensor(
        [len(sent) if len(sent) <= 35 else 35 for sent in batch])
    padded_sentences = np.zeros((batch_size, 35))
    padded_t = np.zeros((batch_size, 35))
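
The batcher above is cut off in this listing. For context, a minimal sketch of the usual SentEval driver that these prepare/batcher hooks plug into; the transfer task name and evaluation parameters are illustrative assumptions, not taken from the original file.

params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
se = senteval.engine.SE(params_senteval, batcher, prepare)
results = se.eval(['SST2'])   # example transfer task
print(results)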
Example #3
    def reconstruction(self, input_sentence, max_len):
        sentence = clean_sentence(input_sentence)
        sentence = convert_sentence2id(sentence, self.token2id, max_len)
        target = np.zeros((1,max_len))
        l = len(word_tokenize(input_sentence))
        target[0,:l] = sentence[0,:l]
        target = torch.LongTensor(target)
        # target = convert_sentence2id(input_sentence[:-1], self.token2id, max_len)
        # print(" ".join([self.id2token[id.item()] for id in target[0]]))
        print(sentence)
        print(target)

        output = self.model(inputs=sentence, targets=target, lengths=torch.LongTensor([len(sentence)]))
        prediction = output["predictions"][0]
        return " ".join(self.id2token[id.item()] for id in prediction)


if __name__ == "__main__":
    #model_path = "Test_Models/checkpoint.dms"
    model_path = "Test_Models/ElmoSentenceEmbdedding_model-3.dms"
    training_data = "amazon_food_review/small_train.csv"
    token2id = build_dictionary(training_data)
    #model = ADNet(input_size=512, hidden_size=512, sentiment_size=512, max_len=35,
    #              vocab_size=len(token2id) + 8, output_size=1)
    model = ElmoSentenceEmbeddingNets(input_size=512,hidden_size=512,max_len=35,
                                      vocab_size=len(token2id)+2,sentiment_size=512)
    re = ReconstructionSent(model=model,model_path=model_path,token2id=token2id)
    print(re.reconstruction("This is good iced tea.  It is hard to find locally in the Fall and Winter.",35))
    #print(re.reconstruction("My children love these rice milk boxes and they are just the right size for their lunches.",35))
    #print(re.reconstruction("Some may say this buffet is pricey but I think you get what you pay for and this place you are getting quite a lot!",35))
Example #4
import utils          # project-local helpers (get_file_path, build_dictionary, build_corpus, ...)
import help_content   # project-local module providing HelpContent
import config as cfg

# query = "how do I schedule an event?"
query = "What is the purpose of 25Live Event Wizard?"

# ### Content ###
DATABASE = utils.get_file_path(cfg.DATABASE_FILE)
content = help_content.HelpContent(DATABASE)

# print( help( corpora.dictionary ) )
should_rebuild = False

# ### Dictionary ###
dict_file = utils.get_file_path(cfg.DICT_BACKUP)
# dictionary = corpora.dictionary.Dictionary.load(dict_file)
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
corpus_file = utils.get_file_path(cfg.CORPUS_BACKUP)
# utils.pickle_save(corpus_file, corpus)
# corpus = corpora.MmCorpus(corpus_file)
corpus = utils.build_corpus(dictionary, content, should_rebuild,
                            cfg.CORPUS_BACKUP)
# corpus = pickle.load( open( corpus_file, "rb" ) )

# print( cfg.MODEL_NAME )

# ### LDA Model ###
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
# bag_of_words = [word for word in bow]
model = utils.build_model(dictionary, corpus, should_rebuild)
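
The snippet ends after building the model. A hypothetical continuation, assuming utils.build_model returns a gensim LdaModel, could score the query against the corpus in topic space:

from gensim import similarities

index = similarities.MatrixSimilarity(model[corpus], num_features=model.num_topics)
sims = index[model[bow]]   # cosine similarity of the query's topic vector to each document
top_matches = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]
print(top_matches)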
Example #5
        sentiment.append(y.item())

    data = {
        "sentiment_hidden": sentiment_hidden,
        "other_hidden": other_hidden,
        "sentiment": sentiment
    }
    with open("Sentiment_other.pk", "wb") as f:
        pickle.dump(data, f)

    print("data_has been saved successfully!!")


if __name__ == "__main__":
    training_data = "amaozn_food_review/small_train.csv"
    token2id = build_dictionary("amazon_food_review/small_train.csv")
    # print(len(token2id))
    id2token = dict(zip(token2id.values(), token2id.keys()))
    adnet = ADNet(input_size=512,
                  hidden_size=512,
                  sentiment_size=512,
                  max_len=35,
                  vocab_size=len(token2id) + 2,
                  output_size=1)
    test(adnet, "./Test_Models/ADnetsS.dms", token2id, 1,
         "amazon_food_review/valid_data.csv")
    #inputs = convert_sentence2id("It'll be a regular stop on my trips to Phoenix!",token2id,35)
    #target = convert_sentence2id("It'll be a regular stop on my trips to Phoenix!",token2id,35)
    #o = test_model(adnet,"./Modals/ADnets.dms",inputs,target,torch.LongTensor([len(inputs)]))

    #print("sentiment_hidden",o["sentiment_hidden"].detach().numpy())
        print("*" * 40)

        running_loss = 0
        for i, data in enumerate(dataloader):
            x, x_len, y, t = data
            predict = model(x, x_len)
            loss = criterion(predict.squeeze(1), y)

            model_optimizer.zero_grad()
            loss.backward()
            model_optimizer.step()

            running_loss += loss.item()

            if i % 10 == 0 and i != 0:
                print("Average batch loss: {}".format(running_loss / 10))
                running_loss = 0

    torch.save(model.state_dict(), "./basic_model")
    print("Model has been saved successfully !!")


if __name__ == "__main__":
    training_data = "sentiment_data/training_data_shuffle.csv"
    dictionary = build_dictionary(training_data)
    model = SentimentModel(input_size=64,
                           hidden_size=64,
                           max_len=85,
                           vocab_size=len(dictionary),
                           output_size=1)
    train(dictionary, training_data, 0.01, 32, 20, model)
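
The objects criterion, model_optimizer, and dataloader used in the loop above are defined elsewhere in the original file. A plausible setup for this binary-sentiment model (output_size=1), shown only as an assumption:

import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()                          # assumed binary sentiment target
model_optimizer = optim.Adam(model.parameters(), lr=0.01)   # 0.01 mirrors the value passed to train(...)
# dataloader is assumed to yield (x, x_len, y, t) batches built from training_data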
Example #7
def inverted_index(args):
    def calculate_tf_weights(lst_contents, words):

        rows = len(words)
        columns = len(lst_contents)

        TF_word_dict = dict()

        for word in words:
            docs = list()
            for j, content in enumerate(lst_contents):
                count = content.count(word)
                if count != 0:
                    docs.append((j, count / len(content)))

            TF_word_dict[word] = docs

        return TF_word_dict

    def calculate_idf_weights(TF_word_dict, file_paths):

        number_of_docs = len(file_paths)

        IDF = np.zeros(len(TF_word_dict))

        # IDF = 1 + np.log(len(file_paths)/np.sum(TF_word_dict != 0, axis=1))
        i = 0
        for word, docs in TF_word_dict.items():
            idf = 1 + np.log(number_of_docs / len(docs))
            IDF[i] = idf
            i += 1

        return np.array([IDF]).T

    def calculate_TF_IDF(TF_word_dict, IDF, file_paths):

        TF_matrix = np.zeros((len(TF_word_dict), len(file_paths)),
                             dtype=np.float32)

        for i, word in enumerate(TF_word_dict):
            for doc_index, tf in TF_word_dict[word]:
                TF_matrix[i, doc_index] = tf

        return TF_matrix * IDF

    # Step 1: Load data from the directory
    lst_contents, file_paths = load_data_from_directory(args['data_path'])

    # Step 2: Build the dictionary
    vocal = build_dictionary(lst_contents)

    # Step 3: Calculate the TF weights for each document
    TF_word_dict = calculate_tf_weights(lst_contents, vocal)

    # Step 4: Calculate the IDF weights
    IDF = calculate_idf_weights(TF_word_dict, file_paths)

    # Step 5: Calculate the TF-IDF matrix
    TF_IDF = calculate_TF_IDF(TF_word_dict, IDF, file_paths)

    # Step 6: Build the query vector
    qwords = args['query'].split()
    qTF = calculate_tf_weights([qwords], vocal)
    qTF_IDF = calculate_TF_IDF(qTF, IDF, file_paths)

    # Step 7: Calculate the distance between qTF_IDF and each document's TF-IDF vector
    dists = np.linalg.norm(TF_IDF - qTF_IDF, axis=0)

    # Step 8: Rank and display the results
    ranked_result = np.argsort(dists)

    print("The ranking result matching with query {}".format(args['query']))

    N = int(args['numbers'])
    for index in ranked_result[0:N]:
        print(file_paths[index])
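
Examples #1 and #7 both rely on load_data_from_directory and build_dictionary, which are not shown. A minimal sketch consistent with how they are used (each document as a list of tokens, the vocabulary as an iterable of unique words); the file-handling details are assumptions:

import os

def load_data_from_directory(data_path):
    lst_contents, file_paths = [], []
    for name in sorted(os.listdir(data_path)):
        path = os.path.join(data_path, name)
        with open(path, encoding='utf-8') as f:
            lst_contents.append(f.read().lower().split())  # tokenize by whitespace
        file_paths.append(path)
    return lst_contents, file_paths

def build_dictionary(lst_contents):
    # unique words across all documents, in a stable order
    return sorted({word for content in lst_contents for word in content})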
Example #8
def main():
	## import data 
	train_raw = pd.read_csv('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/train.tsv',delimiter= '\t')
	normalized_price = np.log1p(train_raw['price'].values)
	mean_price_norm = np.mean(normalized_price)
	std_price_norm = np.std(normalized_price) 
	train_raw['price'] = (normalized_price - mean_price_norm)/std_price_norm 

	# split the categories into three new columns
	train_raw['cat1'],train_raw['cat2'],train_raw['cat3'] = zip(*train_raw['category_name'].apply(lambda x: utils.split_cat(x))) 

	# remove the column that isn't needed anymore
	train_raw.drop('category_name',axis = 1, inplace = True) 

	# replaces NaN with a string placeholder 'missing'
	# note: this is mildly hardcoded so it has to come after splitting categories into three
	handle_missing_inplace(train_raw) 

	# build one dictionary covering both name and item_description, since similar words appear in both

	all_name_desc = np.hstack((train_raw['name'],train_raw['item_description'])) # gather every word from name and item_description
	all_name_desc = utils.clean_and_tokenize(all_name_desc)
	all_name_desc = [item for sublist in all_name_desc for item in sublist]
	train_raw['name'] = utils.clean_and_tokenize(train_raw['name'])
	train_raw['item_description'] = utils.clean_and_tokenize(train_raw['item_description'])

	# Build dictionaries here
	vocabulary_size = 100000 # keep 100,000 words in the dictionary; only about 0.28% of word occurrences fall into "UNK"
	word2vec_dict, reverse_dict = utils.build_dictionary(all_name_desc,vocabulary_size) 

	dict_brand_len = 3000 # about 0.16% of brand names fall into "UNK"
	dict_cat1_len = 12 # there are fewer than 12 categories in cat1
	dict_cat2_len = 100 # about 0.114% of cat2 values fall into "UNK"
	dict_cat3_len = 700 # about 0.04% of cat3 values fall into "UNK"

	brand_name_dict, brand_name_dict_rev = utils.build_dictionary(train_raw['brand_name'], dict_brand_len)
	train_raw['brand_name_inds'], count_unk_brand = utils.convert_word_to_ind(train_raw['brand_name'].values.reshape((-1,1)), brand_name_dict)
	cat1_dict ,cat1_rev_dict= utils.build_dictionary(train_raw['cat1'],dict_cat1_len)
	train_raw['cat1_inds'], count_unk_cat1 = utils.convert_word_to_ind(train_raw['cat1'].values.reshape((-1,1)), cat1_dict)
	cat2_dict ,cat2_rev_dict= utils.build_dictionary(train_raw['cat2'],dict_cat2_len)
	train_raw['cat2_inds'], count_unk_cat2 = utils.convert_word_to_ind(train_raw['cat2'].values.reshape((-1,1)), cat2_dict)
	cat3_dict ,cat3_rev_dict= utils.build_dictionary(train_raw['cat3'],dict_cat3_len)
	train_raw['cat3_inds'], count_unk_cat3 = utils.convert_word_to_ind(train_raw['cat3'].values.reshape((-1,1)), cat3_dict)

	# build padded index vectors; keep them as np.array rather than storing them back in the DataFrame
	name_pad_size = 9 # max length of name
	itemdesc_pad_size = 75 # 95th percentile of item description lengths

	name_padded , _ = utils.convert_word_to_padded(train_raw.name,word2vec_dict,name_pad_size) # unpack the tuple; only the padded array is needed
	itemdesc_padded , _ = utils.convert_word_to_padded(train_raw.item_description,word2vec_dict,itemdesc_pad_size)

	# Define some embedding lengths here
	name_emb_size = 15
	itemdesc_emb_size = 15
	brand_emb_size = 10
	cat1_emb_size = 10
	cat2_emb_size = 10
	cat3_emb_size = 10
	itemcond_emb_size = 10
	shipping_emb_size = 10

	# lengths needed here and a bit later
	itemcond_len = np.max(train_raw.item_condition_id.values)

	name_itemdesc_emb = embed([i for i in range(vocabulary_size)],vocabulary_size,name_emb_size, name= 'name_itemdesc_emb')
	brand_emb = embed(train_raw.brand_name_inds,dict_brand_len, brand_emb_size, name= 'brand_emb')
	cat1_emb = embed(train_raw.cat1_inds,dict_cat1_len,cat1_emb_size, name= 'cat1_emb')
	cat2_emb = embed(train_raw.cat2_inds,dict_cat2_len,cat2_emb_size, name= 'cat2_emb')
	cat3_emb = embed(train_raw.cat3_inds,dict_cat3_len,cat3_emb_size, name= 'cat3_emb')
	itemcond_emb = embed(train_raw.item_condition_id,itemcond_len ,itemcond_emb_size, name= 'itemcond_emb')
	shipping_emb = embed(train_raw.shipping, 2, shipping_emb_size, name= 'shipping_emb')

	# Setup feeding stuff here

	# gather the input arrays used below
	# reshaped to column vectors so they concatenate cleanly
	input_name = name_padded
	input_itemdesc = itemdesc_padded
	input_price = train_raw['price'].values.reshape((-1,1))
	input_brand = train_raw.brand_name_inds.values.reshape((-1,1))
	input_cat1 = train_raw.cat1_inds.values.reshape((-1,1))
	input_cat2 = train_raw.cat2_inds.values.reshape((-1,1))
	input_cat3 = train_raw.cat3_inds.values.reshape((-1,1))
	input_itemcond = train_raw.item_condition_id.values.reshape((-1,1))
	input_ship = train_raw.shipping.values.reshape((-1,1))

	# define some lengths for partitioning data after feeding
	input_name_len = input_name.shape[1]
	input_itemdesc_len = input_itemdesc.shape[1]

	# concatenate data to make into tensor slices
	temp_set = np.concatenate((input_name, input_itemdesc,input_cat1,input_cat2,input_cat3,
	                           input_brand, input_itemcond, input_ship),axis = 1) #name_and_desc ,input_itemcond,input_shipping
	shape_set = temp_set.shape[1] 
	batch_len = 10000

	num_epoch = 25
	tot_iter = train_raw.shape[0]* num_epoch // batch_len + 1

	print('splitting labels and features...')
	features_input = temp_set.astype(np.int32)
	label_input = input_price.astype(np.float32)
	# make some placeholders to avoid GraphDef exceeding 2GB
	feat_placeholder = tf.placeholder(features_input.dtype, features_input.shape)
	label_placeholder = tf.placeholder(label_input.dtype, label_input.shape)
	print('making tensor slices...')
	dataset = tf.data.Dataset.from_tensor_slices((feat_placeholder, label_placeholder))
	print('shuffling...')
	#np.random.shuffle(temp_set) # shuffle the data
	dataset = dataset.shuffle(buffer_size =10000)
	print('making epochs...')
	dataset = dataset.repeat(num_epoch) # epoch
	print('making batches...')
	dataset = dataset.batch(batch_len) 
	iterator = dataset.make_initializable_iterator()
	next_batch = iterator.get_next()

	# Tensorflow model setup

	input_x = tf.placeholder(tf.int32,[None, shape_set], name = "input_x") # width matches the concatenated feature matrix built above
	input_y = tf.placeholder(tf.float32,[None,1], name = "input_y") # train against this


	input_x_name = input_x[:,:input_name_len]
	input_x_itemdesc = input_x[:,input_name_len:(input_name_len + input_itemdesc_len)]
	input_x_cat1 = input_x[:,(input_name_len + input_itemdesc_len)]
	input_x_cat2 = input_x[:,(input_name_len + input_itemdesc_len)+1]
	input_x_cat3 = input_x[:,(input_name_len + input_itemdesc_len)+2]
	input_x_brand = input_x[:,(input_name_len + input_itemdesc_len)+3]
	input_x_itemcond = input_x[:,(input_name_len + input_itemdesc_len)+4]
	input_x_shipping = input_x[:,(input_name_len + input_itemdesc_len)+5]


	name_emb_lookup = tf.nn.embedding_lookup(name_itemdesc_emb, input_x_name)
	itemdesc_emb_lookup = tf.nn.embedding_lookup(name_itemdesc_emb,input_x_itemdesc)
	brand_emb_lookup = tf.nn.embedding_lookup(brand_emb,input_x_brand)
	cat1_emb_lookup = tf.nn.embedding_lookup(cat1_emb,input_x_cat1)
	cat2_emb_lookup = tf.nn.embedding_lookup(cat2_emb,input_x_cat2)
	cat3_emb_lookup = tf.nn.embedding_lookup(cat3_emb,input_x_cat3)
	itemcond_emb_lookup = tf.nn.embedding_lookup(itemcond_emb, input_x_itemcond)
	shipping_emb_lookup = tf.nn.embedding_lookup(shipping_emb, input_x_shipping)

	# expand name and item_desc because conv2d wants it 4-d
	name_emb_lookup_expand = tf.expand_dims(name_emb_lookup,-1)
	itemdesc_emb_lookup_expand = tf.expand_dims(itemdesc_emb_lookup,-1)

	# set some lazy parameters here
	out_nodes = 15
	dropout_keep_prob = tf.placeholder(tf.float32)

	W_shape_name = [1,name_emb_size,1,out_nodes] # conv filter shape: [height, width, in_channels, out_channels]
	b_shape_name = out_nodes # same as the last dimension of W

	W_shape_itemdesc = [1,itemdesc_emb_size,1,out_nodes]
	b_shape_itemdesc = out_nodes

	#layers_namedesc = test_cnn(input_x_namedesc,W_shape_namedesc,b_shape_namedesc,dropout_keep_prob)
	layers_name = CNN(name_emb_lookup_expand,W_shape_name,b_shape_name,dropout_keep_prob,name_pad_size)
	layers_itemdesc = CNN(itemdesc_emb_lookup_expand,W_shape_itemdesc,b_shape_itemdesc,dropout_keep_prob,itemdesc_pad_size)
	layers_brand = RegNN(brand_emb_lookup, dropout_keep_prob, dict_brand_len, brand_emb_size, batch_len, out_nodes)
	layers_cat1 = RegNN(cat1_emb_lookup, dropout_keep_prob, dict_cat1_len, cat1_emb_size, batch_len, out_nodes)
	layers_cat2 = RegNN(cat2_emb_lookup, dropout_keep_prob, dict_cat2_len, cat2_emb_size, batch_len, out_nodes)
	layers_cat3 = RegNN(cat3_emb_lookup, dropout_keep_prob, dict_cat3_len, cat3_emb_size, batch_len, out_nodes)
	layers_itemcond = RegNN(itemcond_emb_lookup, dropout_keep_prob, itemcond_len, itemcond_emb_size, batch_len, out_nodes)
	layers_shipping = RegNN(shipping_emb_lookup, dropout_keep_prob, 2, shipping_emb_size, batch_len, out_nodes)
	comb_layers = tf.concat([layers_name,layers_itemdesc, layers_brand, layers_cat1, 
	                         layers_cat2, layers_cat3,layers_itemcond, layers_shipping],axis=1) #, input_x_name, input_x_shipping

	#dense 
	dense1 = dense_NN(comb_layers, 64, batch_len)
	dense2 = dense_NN(dense1, 128, batch_len)
	predictions = dense_NN(dense2, 1, batch_len) 

	loss = 2 # passed to train_the_NN and immediately replaced by its returned loss tensor
	loss,train_step = train_the_NN(predictions,input_y,loss)
	# de-normalizing can make expm1 non-positive, so the log would give NaN; add a small epsilon for safety
	unwind_true = tf.log(tf.expm1((input_y* std_price_norm) + mean_price_norm)+ .00001)
	unwind_pred = tf.log(tf.expm1((predictions* std_price_norm) + mean_price_norm)+ .00001)
	rmsle_ = tf.sqrt(tf.reduce_mean(tf.square(unwind_true - unwind_pred)))

	# Training model starts here
	with tf.Session() as sess:
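		# The original snippet ends at the session line above. What follows is a hypothetical
		# sketch of a standard TF1-style loop over the initializable iterator built earlier;
		# the keep probability of 0.5 is an assumption, not taken from the original code.
		sess.run(tf.global_variables_initializer())
		sess.run(iterator.initializer,
			feed_dict={feat_placeholder: features_input, label_placeholder: label_input})
		try:
			for step in range(tot_iter):
				batch_x, batch_y = sess.run(next_batch)
				_, batch_loss, batch_rmsle = sess.run(
					[train_step, loss, rmsle_],
					feed_dict={input_x: batch_x, input_y: batch_y, dropout_keep_prob: 0.5})
				if step % 10 == 0:
					print('step {}: loss {:.4f}, rmsle {:.4f}'.format(step, batch_loss, batch_rmsle))
		except tf.errors.OutOfRangeError:
			pass  # the iterator is exhausted after num_epoch passes over the data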
Example #9
    arguments['up_sample_input_dim'] = arguments['hidden_size'] + arguments[
        'noise_dim']
    arguments['num_channels'] = 3

    # Downsampling parameters
    arguments['image_feature_size'] = 512

    # Text decoder parameters
    arguments['hidden_size'] = 512
    arguments['num_layers'] = 1
    arguments["sentence_embedding_size"] = arguments['hidden_size']

    # Split into training and validation sets
    split_train_validation_set(arguments['sentence_path'],
                               arguments['train_sentence_path'],
                               arguments['val_sentence_path'], 200)
    # Build the dictionary
    sentences = read_sentences(arguments['sentence_path'])
    word2idx, idx2word, lengths = build_dictionary(sentences)
    arguments['word2idx'] = word2idx
    arguments['idx2word'] = idx2word
    arguments["lengths"] = lengths
    arguments['word_number'] = len(word2idx)
    arguments['max_seq_length'] = len(word2idx)
    arguments['sentence_max_length'] = np.max(lengths) + 1

    arguments['use_sentence_generator'] = False

    trainer = Trainer(arguments)
    trainer.train()
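
The build_dictionary assumed in Example #9 returns (word2idx, idx2word, lengths), unlike the variant used in Examples #1 and #7. A hypothetical sketch consistent with this call:

def build_dictionary(sentences):
    # index words by first appearance and record each sentence's token count (assumed behaviour)
    word2idx = {'<pad>': 0, '<unk>': 1}
    lengths = []
    for sentence in sentences:
        tokens = sentence.split()
        lengths.append(len(tokens))
        for token in tokens:
            if token not in word2idx:
                word2idx[token] = len(word2idx)
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word, lengths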