def load_most_common_word_ap_list(file, ap_list):
    # Load the most-common-word list of each aspect category into a dict
    # keyed by category (e.g. 'FOOD#QUALITY' -> sorted word list).
    temp = {}
    for ap in ap_list:
        vocab = set()
        path = file + '/' + ap + '.txt'
        vocab.update(helpers.load_doc(path).split())
        vocab = sorted(vocab)
        temp.update({ap: vocab})
    return temp
embedding_file = '../data/glove.6B.100d.txt'
res_embedding_file = '../data/restaurant_emb.vec'
negative_words = '../data/negative-words.txt'
positive_words = '../data/positive-words.txt'
model_file_name = 'model_invidual_sentiment_ap_classifier'
model_folder = '../data/model'

if not os.path.exists(model_folder):
    os.makedirs(model_folder)
if not os.path.exists(model_folder + '/' + model_file_name):
    os.makedirs(model_folder + '/' + model_file_name)

data_train = pd.read_csv(train_csv, sep='\t')
data_test = pd.read_csv(test_csv, sep='\t')
data_sample = pd.read_csv(sample_csv, sep='\t')

vocab = helpers.load_doc(vocab_file)
vocab = set(vocab.split())
vocab_positive = helpers.load_doc(positive_words)
vocab_positive = set(vocab_positive.split())
vocab_negative = helpers.load_doc(negative_words)
vocab_negative = set(vocab_negative.split())

# define default ap_list
ap_list = [
    'FOOD#QUALITY', 'FOOD#PRICES', 'FOOD#STYLE_OPTIONS',
    'RESTAURANT#GENERAL', 'RESTAURANT#PRICES', 'RESTAURANT#MISCELLANEOUS',
    'DRINKS#PRICES', 'DRINKS#QUALITY', 'DRINKS#STYLE_OPTIONS',
    'AMBIENCE#GENERAL', 'SERVICE#GENERAL', 'LOCATION#GENERAL'
]
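# Usage sketch (assumption, not in the original code): build the per-aspect
# most-common-word lookup with load_most_common_word_ap_list defined above.
# The folder '../data/most_common_word' (one '<ASPECT_CATEGORY>.txt' file per
# category) is an assumed example path, not one taken from this project.
most_common_word_folder = '../data/most_common_word'
most_common_by_ap = load_most_common_word_ap_list(most_common_word_folder, ap_list)
print('Loaded most-common words for %d aspect categories' % len(most_common_by_ap))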
        embedding_matrix[i] = embedding_vector
    return embedding_matrix


# define file
train_file = '../data/official_data/ABSA16_Restaurants_Train_SB1_v2.xml'
train_csv = '../data/official_data/data_train.csv'
test_file = '../data/official_data/EN_REST_SB1_TEST_gold.xml'
test_csv = '../data/official_data/data_test.csv'
vocab_file = '../data/vocab.txt'
embedding_file = '../data/glove.6B.100d.txt'

data_train = pd.read_csv(train_csv, sep='\t')
data_test = pd.read_csv(test_csv, sep='\t')

vocab = helpers.load_doc(vocab_file)
vocab = set(vocab.split())

# init
train_texts = process_texts(data_train.text, vocab)
test_texts = process_texts(data_test.text, vocab)
tokenizer = create_tokenizer(train_texts)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
max_length = max([len(s.split()) for s in train_texts])
print('Maximum length: %d' % max_length)
aspect_category_list = data_train.aspect_category.unique()

# X1, review w2v
X1_train = encode_X1(train_texts, tokenizer, max_length)
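# Sketch of a GloVe loader (assumption: the function that produces the word -> vector
# mapping consumed by the embedding-matrix builder, whose tail appears at the top of
# this block, is not shown here; the name load_embedding is assumed). Each line of the
# glove.6B.100d.txt text file is a word followed by its float vector components.
import numpy as np

def load_embedding(filename):
    embedding = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # first token is the word, the remainder is its vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding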
def load_most_common_word(file, ap_list):
    # Merge the most-common-word lists of all aspect categories into one set.
    vocab = set()
    for ap in ap_list:
        path = file + '/' + ap + '.txt'
        vocab.update(helpers.load_doc(path).split())
    return vocab
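# Usage sketch (assumption): unlike load_most_common_word_ap_list, this variant merges
# all per-aspect word lists into a single set. The folder path is the same assumed
# example location as above, not a path taken from the original code.
most_common_all = load_most_common_word('../data/most_common_word', ap_list)
print('Merged most-common-word vocabulary: %d words' % len(most_common_all))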