def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        tokenized_dataset = []
        all_sentences = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            tokenized_aspect = []
            tokenized_sentences = []
            if i == 0:
                print(review)
            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]
            for aspect_word in aspect_words:
                tokenized_aspect.append(aspect_word)
                all_sentences.append([aspect_word])
            for sent in sentences:
                tokenized_sentence = []
                # Remove duplicate spaces from the sentence; they cause problems for ELMo.
                s = re.sub(' +', ' ', sent[0])
                tokens = NLP.tokenizer(s)
                for token in tokens:
                    tokenized_sentence.append(token.orth_)
                tokenized_sentences.append(tokenized_sentence)
                # All these sentences are written to a separate txt file at the end of the process.
                all_sentences.append(tokenized_sentence)
            tokenized_review = [tokenized_aspect, tokenized_sentences, polarities]
            tokenized_dataset.append(tokenized_review)
            write_binary(tokenized_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))
        all_sentences = space_separated_token_string(all_sentences)
        save_sentences_to_text(all_sentences)
        # Hack for ELMo: its sentence file must not contain duplicate lines.
        remove_duplicate_sentences()
    except KeyboardInterrupt:
        pass
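# The helpers space_separated_token_string, save_sentences_to_text and
# remove_duplicate_sentences are called above but defined elsewhere in the
# repo. A minimal sketch of what they might look like; the file name below is
# an assumption, not part of the original code.
SENTENCES_TXT_FILE = 'all_sentences.txt'  # hypothetical path

def space_separated_token_string(all_sentences):
    # Join each token list into a single whitespace-separated line.
    return [' '.join(tokens) for tokens in all_sentences]

def save_sentences_to_text(sentences):
    with open(SENTENCES_TXT_FILE, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sentences))

def remove_duplicate_sentences():
    # The "hack for elmo": keep only the first occurrence of each sentence,
    # preserving order, so the cached-embedding pipeline sees no duplicates.
    with open(SENTENCES_TXT_FILE, encoding='utf-8') as f:
        lines = f.read().splitlines()
    seen = set()
    unique = [l for l in lines if not (l in seen or seen.add(l))]
    with open(SENTENCES_TXT_FILE, 'w', encoding='utf-8') as f:
        f.write('\n'.join(unique))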
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution
    2. Sort words by frequency
    3. Build a vocabulary from the most frequent words
    4. Store the vocabulary in a file in <word, identifier> format

    :param lower: Identifiers below this are reserved
    :param n: Number of unique expected words
    :return: A dict of vocabulary words and an assigned identifier
    """
    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
        freq = build_word_frequency_distribution()
        # Sort words by descending frequency and keep the top n.
        top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
        print('Vocab count : ' + str(len(top_words)))
        # Create the optimum vocab size: top words plus <PAD> and <UNK>.
        max_vocab_size = len(top_words) + 2
        unknown = max_vocab_size - 1
        vocab_to_code = {}
        code_to_vocab = {}
        vocab_to_code['<UNK>'] = unknown
        code_to_vocab[unknown] = '<UNK>'
        vocab_to_code['<PAD>'] = PAD
        code_to_vocab[PAD] = '<PAD>'
        # Lower vocab indexes are reserved for padding; the highest for unknown words.
        i = lower
        for w, freq in top_words:
            vocab_to_code[w] = i
            code_to_vocab[i] = w
            i += 1
        write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
        write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
        return vocab_to_code, code_to_vocab
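# Illustrative round trip with the two mappings: unseen words fall back to
# the <UNK> code, and every code decodes back through code_to_vocab. This
# demo function is not part of the original module.
def _vocab_roundtrip_demo():
    vocab_to_code, code_to_vocab = build_vocabulary()
    code = vocab_to_code.get('pizza', vocab_to_code['<UNK>'])
    assert code_to_vocab[code] in ('pizza', '<UNK>')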
def combine_processed_data():
    combined_dataset = []
    restaurant = read_binary(filename=PROCESSED_RESTAURANT_FILE_NAME)
    print('Restaurant-' + str(len(restaurant)))
    combined_dataset.extend(restaurant)
    print(len(combined_dataset))
    laptops = read_binary(filename=PROCESSED_LAPTOPS_FILE_NAME)
    print('Laptops-' + str(len(laptops)))
    combined_dataset.extend(laptops)
    print(len(combined_dataset))
    # organic = read_binary(filename=PROCESSED_ORGANIC_FILE_NAME)
    # print('Organic-' + str(len(organic)))
    # combined_dataset.extend(organic)
    # print(len(combined_dataset))
    write_binary(combined_dataset, OUTPUT_FILE_NAME)
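# read_binary and write_binary are used throughout but defined elsewhere in
# the repo. A minimal pickle-based sketch consistent with the .pickle file
# names used by the formatting functions below; the actual implementation is
# an assumption.
import pickle

def read_binary(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def write_binary(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)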
def build_word_frequency_distribution():
    """
    1. Extract tokens from the review text
    2. Calculate the frequency of each token
    3. Create a frequency dict and store it in a file

    :return: A dict of <token, freq>
    """
    try:
        freq_dist_f = read_binary(WORD_FREQ_FILE)
        print('frequency distribution loaded')
        return freq_dist_f
    except IOError:
        pass
    print('building frequency distribution')
    freq = defaultdict(int)
    if FILE_NAME == 'restaurant':
        for aspect_word in RESTAURANT_ASPECT_WORDS:
            freq[aspect_word] += 1
    elif FILE_NAME == 'laptops':
        for aspect_word in LAPTOPS_ASPECT_WORDS:
            freq[aspect_word] += 1
    files = [FORMATTED_FILE_NAME]
    if EMBEDDING_TYPE == 'fasttext':
        files.append(FORMATTED_FILE_NAME.replace('train', 'test'))
        files.append(FORMATTED_FILE_NAME.replace('train', 'val'))
    for file_path in files:
        print('building vocab from file - ' + file_path)
        for i, review in enumerate(read_binary(file_path)):
            sentences = review[1]
            for sent in sentences:
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    freq[token.orth_] += 1
            if i % 100 == 0:
                write_binary(freq, WORD_FREQ_FILE)
                print('dump at {}'.format(i))
    write_binary(freq, WORD_FREQ_FILE)
    return freq
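# Quick sanity check of the distribution: print the k most frequent tokens.
# This helper is an illustration, not part of the original module.
def _print_top_tokens(k=5):
    freq = build_word_frequency_distribution()
    for token, count in sorted(freq.items(), key=lambda kv: -kv[1])[:k]:
        print(token, count)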
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    unknown = max_vocab_size - 1
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        coded_dataset = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            coded_aspect = []
            coded_sentences = []
            if i == 0:
                print(review)
            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]
            for aspect_word in aspect_words:
                coded_aspect.append(vocab_to_code.get(aspect_word, unknown))
            for sent in sentences:
                coded_sentence = []
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    coded_sentence.append(vocab_to_code.get(token.orth_, unknown))
                coded_sentences.append(coded_sentence)
            coded_review = [coded_aspect, coded_sentences, polarities]
            coded_dataset.append(coded_review)
            write_binary(coded_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))
        datapoint = coded_dataset[0]
        print(datapoint)
        print(get_uncoded_data(code_to_vocab, datapoint))
    except KeyboardInterrupt:
        pass
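# get_uncoded_data is called above but not defined in this module. A
# plausible sketch that decodes a [aspect_codes, coded_sentences, polarities]
# datapoint back into words; the original helper's exact shape is an
# assumption.
def get_uncoded_data(code_to_vocab, datapoint):
    aspect_codes, coded_sentences, polarities = datapoint
    aspect = [code_to_vocab[c] for c in aspect_codes]
    sentences = [[code_to_vocab[c] for c in sent] for sent in coded_sentences]
    return [aspect, sentences, polarities]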
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    vocab_size = len(vocab_to_code)
    unknown = vocab_size - 1
    print('Final Vocab Size : ' + str(vocab_size))
    coded_dataset = []
    for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
        coded_aspect = []
        coded_text = []
        if i == 0:
            print(review)
        text = review[1]
        aspect_words = review[0]
        polarity = review[2]
        for aspect_word in aspect_words:
            a = vocab_to_code.get(aspect_word, unknown)
            if a == unknown:
                # Aspect words are seeded into the frequency distribution up
                # front, so an unknown aspect word signals a problem.
                print('STOP')
                print(aspect_word)
            coded_aspect.append(a)
        for word in text:
            word_code = vocab_to_code.get(word, unknown)
            coded_text.append(word_code)
        coded_review = [coded_aspect, [coded_text], [polarity]]
        coded_dataset.append(coded_review)
        write_binary(coded_dataset, PROCESSED_FILE_NAME)
        print('dump at {}'.format(i))
    datapoint = coded_dataset[0]
    print(datapoint)
    print(get_uncoded_data(code_to_vocab, datapoint))
def make_flat_data():
    """
    [
      [[aspect1], [review1], [polarity]],
      [[aspect2], [review1], [polarity]]
    ]

    [['food', 'quality'],
     [[['Judging from previous posts this used to be a good place, but not any longer.'], [0, 0, 0, 1]],
      [['We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.'], [0, 0, 0, 1]],
      [['They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.'], [0, 0, 0, 1]],
      [['The food was lousy - too sweet or too salty and the portions tiny.'], [0, 1, 0, 0]],
      [['After all that, they complained to me about the small tip.'], [0, 0, 0, 1]],
      [['Avoid this place!'], [0, 0, 0, 1]]]]

    This method reads data from the original xml file and formats it as shown
    above. If N is the number of possible aspects in this dataset, we repeat
    (augment) each review N times, once per aspect. A review can consist of
    any number of sentences, and each sentence in a review has a label. Labels
    represent the sentiment polarity of a sentence with respect to an aspect,
    or its non-applicability. In the example above, the labels for each
    sentence are generated for the aspect food#quality. Sentences that do not
    talk about this particular aspect, or about any of the possible aspects,
    are labeled N/A in this datapoint. For instance, the last sentence
    "Avoid this place!" is marked N/A here, although the same sentence is
    labeled NEGATIVE in another datapoint of the same review for the aspect
    restaurant#general.

    :return:
    """
    possible_categories = [
        'allgemein', 'atmosphäre', 'connectivity', 'design',
        'gastronomisches_angebot', 'informationen', 'db_app_und_website',
        'service_und_kundenbetreuung', 'komfort_und_ausstattung', 'gepäck',
        'auslastung_und_platzangebot', 'ticketkauf', 'toiletten', 'zugfahrt',
        'reisen_mit_kindern', 'image', 'qr-code', 'barrierefreiheit',
        'sicherheit', 'sonstige_unregelmässigkeiten'
    ]
    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global INCLUDE_NOT_APPLICABLE
    global INCLUDE_PERCENTAGE
    doc = read_xml(INPUT_FILE_PATH)
    dataset = []
    for i, review in enumerate(doc['Documents']['Document']):
        TOTAL_REVIEW_COUNT += 1
        print('document-' + str(i))
        tokenized_review_text = []
        category_polarity_map = {}
        text = review['text']
        tokens = NLP(text)
        if 'Opinions' in review.keys():
            opinions = review['Opinions']['Opinion']
            if isinstance(opinions, dict):
                opinions = [opinions]
            for opinion in opinions:
                category = opinion['@category'].lower().split('#')[0]
                update_aspect_to_text_frequency(category)
                polarity = get_categorical_sentiment(opinion['@polarity'])
                category_polarity_map[category] = polarity
        for token in tokens:
            tokenized_review_text.append(token.text)
        if INCLUDE_NOT_APPLICABLE:
            for possible_category in possible_categories:
                sentiment = category_polarity_map.get(possible_category, None)
                if sentiment is None:
                    # Randomly include only a fraction of the N/A datapoints.
                    ran = random.random()
                    if ran <= INCLUDE_PERCENTAGE:
                        sentiment = 3
                        TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                    else:
                        continue
                category_tokens = possible_category.split('_')
                if 'und' in category_tokens:
                    category_tokens.remove('und')
                datapoint = [category_tokens, tokenized_review_text, sentiment]
                # print(datapoint)
                dataset.append(datapoint)
                TOTAL_AUGMENTED_REVIEW_COUNT += 1
        else:
            for category, polarity in category_polarity_map.items():
                category_tokens = category.split('_')
                if 'und' in category_tokens:
                    category_tokens.remove('und')
                datapoint = [category_tokens, tokenized_review_text, polarity]
                print(datapoint)
                dataset.append(datapoint)
    print('---------')
    print(dataset[0])
    print(len(dataset))
    write_binary(dataset, filename=OUTPUT_FILE_NAME)
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ', TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = (TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT
                         + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ', (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_TEXT_FREQUENCY:')
    for k, v in ASPECT_TO_TEXT_FREQUENCY.items():
        print(k + ": " + str(v))
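# get_categorical_sentiment is used above but defined elsewhere. A sketch
# consistent with the class indices reported by the statistics (0=positive,
# 1=negative, 2=neutral, 3=not applicable) and with the label counters; the
# exact polarity strings in the XML are an assumption.
def get_categorical_sentiment(polarity):
    global TOTAL_POSITIVE_LABEL_COUNT, TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    polarity = polarity.lower()
    if polarity == 'positive':
        TOTAL_POSITIVE_LABEL_COUNT += 1
        return 0
    if polarity == 'negative':
        TOTAL_NEGATIVE_LABEL_COUNT += 1
        return 1
    # Everything else (e.g. 'neutral') is treated as the neutral class.
    TOTAL_NEUTRAL_LABEL_COUNT += 1
    return 2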
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution
    2. Sort words by frequency
    3. Build a vocabulary from the most frequent words
    4. Store the vocabulary in a file in <word, identifier> format

    :param lower: Identifiers below this are reserved
    :param n: Number of unique expected words
    :return: A dict of vocabulary words and an assigned identifier
    """
    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
        freq = build_word_frequency_distribution()
        # Load the pre-trained embeddings.
        print('loading embeddings')
        if EMBEDDING_TYPE == 'glove':
            word_to_embeddings = load_glove_embeddings()
        elif EMBEDDING_TYPE == 'fasttext':
            word_to_embeddings = load_oov_fastText_embeddings()
        else:
            word_to_embeddings = {}
        # Sort words by descending frequency and keep the top n.
        top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
        print('Vocab count : ' + str(len(top_words)))
        # Create the optimum vocab size: top words plus <PAD> and <UNK>.
        max_vocab_size = len(top_words) + 2
        unknown = max_vocab_size - 1
        vocab_to_code = {}
        code_to_vocab = {}
        # An array of embeddings indexed by vocab code. The first and last
        # indexes are reserved for padding and unknown words respectively.
        code_to_embed = np.zeros(shape=(max_vocab_size, EMBEDDING_DIMENSION),
                                 dtype=np.float32)
        code_to_embed[PAD] = PAD_EMBEDDING
        code_to_embed[unknown] = UNKNOWN_EMBEDDING
        vocab_to_code['<UNK>'] = unknown
        code_to_vocab[unknown] = '<UNK>'
        vocab_to_code['<PAD>'] = PAD
        code_to_vocab[PAD] = '<PAD>'
        # Lower vocab indexes are reserved for padding and unknown words.
        i = lower
        for w, freq in top_words:
            vocab_to_code[w] = i
            code_to_vocab[i] = w
            try:
                if EMBEDDING_TYPE == 'glove':
                    embedding = word_to_embeddings.word_vec(w)
                elif EMBEDDING_TYPE == 'fasttext':
                    embedding = word_to_embeddings.get_word_vector(w)
                else:
                    # No pre-trained source configured; fall back to the
                    # unknown embedding so 'embedding' is always bound.
                    embedding = UNKNOWN_EMBEDDING
            except KeyError:
                embedding = UNKNOWN_EMBEDDING
            code_to_embed[i] = embedding
            i += 1
        write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
        write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
        write_binary(code_to_embed, CODE_TO_EMBED_FILE)
        return vocab_to_code, code_to_vocab
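# Downstream, the embedding matrix is consumed by plain index lookup with the
# word codes produced by process_data; a minimal illustration, not part of
# the original module:
def _lookup_embeddings(coded_sentence):
    code_to_embed = read_binary(CODE_TO_EMBED_FILE)
    # Rows of code_to_embed line up with vocabulary codes, so fancy indexing
    # returns a (len(coded_sentence), EMBEDDING_DIMENSION) float32 array.
    return code_to_embed[np.asarray(coded_sentence)]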
def make_flatten_restaurant_data_sentence_level(reviews, mode='train'):
    """
    [
      [[aspect1], [review1], [polarity]],
      [[aspect2], [review1], [polarity]]
    ]

    [['food', 'quality'],
     [[['Judging from previous posts this used to be a good place, but not any longer.'], [0, 0, 0, 1]],
      [['We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.'], [0, 0, 0, 1]],
      [['They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.'], [0, 0, 0, 1]],
      [['The food was lousy - too sweet or too salty and the portions tiny.'], [0, 1, 0, 0]],
      [['After all that, they complained to me about the small tip.'], [0, 0, 0, 1]],
      [['Avoid this place!'], [0, 0, 0, 1]]]]

    This method reads data from the original xml file and formats it as shown
    above. If N is the number of possible aspects in this dataset, we repeat
    (augment) each review N times, once per aspect. A review can consist of
    any number of sentences, and each sentence in a review has a label. Labels
    represent the sentiment polarity of a sentence with respect to an aspect,
    or its non-applicability. In the example above, the labels for each
    sentence are generated for the aspect food#quality. Sentences that do not
    talk about this particular aspect, or about any of the possible aspects,
    are labeled N/A in this datapoint. For instance, the last sentence
    "Avoid this place!" is marked N/A here, although the same sentence is
    labeled NEGATIVE in another datapoint of the same review for the aspect
    restaurant#general.

    :return:
    """
    restaurant_possible_aspects = [
        'restaurant#general', 'restaurant#prices', 'restaurant#miscellaneous',
        'food#prices', 'food#quality', 'food#style_options', 'drinks#prices',
        'drinks#quality', 'drinks#style_options', 'ambience#general',
        'service#general', 'location#general'
    ]
    # The laptops data has 22 entities and 9 attributes, giving 198 possible
    # aspects, but only 81 of them appear in the training data. In total we
    # selected 116 aspects, based on our understanding of which
    # entity-attribute pairs make sense.
    laptops_possible_aspects = [
        'laptop#general', 'laptop#price', 'laptop#quality',
        'laptop#operation_performance', 'laptop#usability',
        'laptop#design_features', 'laptop#portability', 'laptop#connectivity',
        'laptop#miscellaneous', 'display#general', 'display#quality',
        'display#operation_performance', 'display#usability',
        'display#design_features', 'display#portability',
        'display#miscellaneous', 'cpu#general', 'cpu#price', 'cpu#quality',
        'cpu#operation_performance', 'cpu#design_features',
        'cpu#miscellaneous', 'motherboard#general', 'motherboard#price',
        'motherboard#quality', 'motherboard#design_features',
        'motherboard#miscellaneous', 'hard_disc#general', 'hard_disc#price',
        'hard_disc#quality', 'hard_disc#operation_performance',
        'hard_disc#design_features', 'hard_disc#miscellaneous',
        'memory#general', 'memory#price', 'memory#design_features',
        'memory#miscellaneous', 'battery#general', 'battery#quality',
        'battery#operation_performance', 'battery#design_features',
        'battery#miscellaneous', 'power_supply#general', 'power_supply#price',
        'power_supply#quality', 'power_supply#operation_performance',
        'power_supply#design_features', 'power_supply#miscellaneous',
        'keyboard#general', 'keyboard#quality',
        'keyboard#operation_performance', 'keyboard#usability',
        'keyboard#design_features', 'keyboard#miscellaneous', 'mouse#general',
        'mouse#quality', 'mouse#operation_performance', 'mouse#usability',
        'mouse#design_features', 'mouse#miscellaneous', 'fans_cooling#general',
        'fans_cooling#quality', 'fans_cooling#operation_performance',
        'fans_cooling#design_features', 'fans_cooling#miscellaneous',
        'optical_drives#general', 'optical_drives#quality',
        'optical_drives#operation_performance',
        'optical_drives#design_features', 'optical_drives#miscellaneous',
        'ports#general', 'ports#quality', 'ports#operation_performance',
        'ports#design_features', 'ports#miscellaneous', 'graphics#general',
        'graphics#quality', 'graphics#design_features',
        'graphics#miscellaneous', 'multimedia_devices#general',
        'multimedia_devices#quality',
        'multimedia_devices#operation_performance',
        'multimedia_devices#usability', 'multimedia_devices#design_features',
        'multimedia_devices#miscellaneous', 'hardware#general',
        'hardware#quality', 'hardware#operation_performance',
        'hardware#usability', 'hardware#design_features',
        'hardware#miscellaneous', 'os#general', 'os#quality',
        'os#operation_performance', 'os#usability', 'os#design_features',
        'os#miscellaneous', 'software#general', 'software#price',
        'software#quality', 'software#operation_performance',
        'software#usability', 'software#design_features',
        'software#miscellaneous', 'warranty#general', 'warranty#price',
        'warranty#miscellaneous', 'shipping#general', 'shipping#price',
        'shipping#quality', 'shipping#miscellaneous', 'support#general',
        'support#price', 'support#quality', 'support#miscellaneous',
        'company#general'
    ]
    global TOTAL_SENTENCE_COUNT
    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global ASPECT_TO_SENTENCE_FREQUENCY
    global DATA_TYPE
    if DATA_TYPE == 'restaurant':
        possible_aspects = restaurant_possible_aspects
    elif DATA_TYPE == 'laptops':
        possible_aspects = laptops_possible_aspects
    TOTAL_SENTENCE_COUNT = 0
    TOTAL_REVIEW_COUNT = 0
    TOTAL_AUGMENTED_REVIEW_COUNT = 0
    TOTAL_POSITIVE_LABEL_COUNT = 0
    TOTAL_NEGATIVE_LABEL_COUNT = 0
    TOTAL_NEUTRAL_LABEL_COUNT = 0
    TOTAL_NOT_APPLICABLE_LABEL_COUNT = 0
    ASPECT_TO_SENTENCE_FREQUENCY = {}
    dataset = []
    for i, review in enumerate(reviews):
        TOTAL_REVIEW_COUNT += 1
        print('review-' + str(i))
        review_text = []
        aspect_sentence_polarity_map = {}
        sentences = review['sentences']['sentence']
        if isinstance(sentences, dict):
            sentences = [sentences]
        for j, sentence in enumerate(sentences):
            TOTAL_SENTENCE_COUNT += 1
            sentence_text = []
            sentence_text.append(sentence['text'])
            if 'Opinions' in sentence.keys():
                opinions = sentence['Opinions']['Opinion']
                if isinstance(opinions, dict):
                    opinions = [opinions]
                for opinion in opinions:
                    aspect_category = opinion['@category'].lower()
                    update_aspect_to_sentence_frequency(aspect_category)
                    polarity = get_categorical_sentiment(opinion['@polarity'])
                    # Build a map from each aspect to the sentences of the
                    # current review that mention it, with their polarities.
                    sentence_polarity = aspect_sentence_polarity_map.get(
                        aspect_category, [])
                    sentence_polarity.append([j, polarity])
                    aspect_sentence_polarity_map[aspect_category] = sentence_polarity
            # else:
            #     # No aspect, contains no sentiment; either out of domain or just some fact.
            #     sentence_polarity = aspect_sentence_polarity_map.get('relevance', [])
            #     sentence_polarity.append([j, 3])
            #     aspect_sentence_polarity_map['relevance'] = sentence_polarity
            review_text.append(sentence_text)
        # A particular review may have no sentence for some aspects; add an
        # empty sentence list for those aspects.
        if not REDUCED:
            for aspect in possible_aspects:
                if aspect not in aspect_sentence_polarity_map.keys():
                    aspect_sentence_polarity_map[aspect] = []
        # Now create one datapoint from this review for every possible aspect.
        for a, sent_polarities in aspect_sentence_polarity_map.items():
            TOTAL_AUGMENTED_REVIEW_COUNT += 1
            aspect_words = []
            aspects = a.split('#')
            aspect_words.extend(aspects[0].split('_'))
            if len(aspects) > 1:
                aspect_words.extend(aspects[1].split('_'))
            augmented_review = []
            augmented_polarity = []
            # Check which sentences of the current review relate to aspect 'a'
            # and have a polarity: iterate over each sentence and look it up
            # in the aspect's map. If present, mark the sentence's sentiment
            # polarity accordingly; otherwise mark it N/A (3).
            for j, s in enumerate(review_text):
                updated_polarity = 3
                for sent_polarity in sent_polarities:
                    if j == sent_polarity[0]:
                        # Sentence j mentions the current aspect.
                        updated_polarity = sent_polarity[1]
                        break
                if updated_polarity == 3:
                    TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                augmented_polarity.append(updated_polarity)
                augmented_review.append(s)
            augmented_datapoint = [aspect_words, augmented_review, augmented_polarity]
            dataset.append(augmented_datapoint)
            if OVERSAMPLING:
                oversampled_datapoints = oversampling(augmented_datapoint)
                if oversampled_datapoints is not None:
                    for oversampled_datapoint in oversampled_datapoints:
                        TOTAL_NEUTRAL_LABEL_COUNT += 1
                        TOTAL_AUGMENTED_REVIEW_COUNT += 1
                        dataset.append(oversampled_datapoint)
    print('---------')
    for datapoint in dataset[:13]:
        print(datapoint)
    print(len(dataset))
    output_file_name = 'formatted_' + DATA_TYPE + '_' + mode + '.pickle'
    write_binary(dataset, filename=output_file_name)
    print('---', mode, '---')
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_SENTENCE_COUNT: ', TOTAL_SENTENCE_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ', TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = (TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT
                         + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ', (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_SENTENCE_FREQUENCY:')
    for k, v in ASPECT_TO_SENTENCE_FREQUENCY.items():
        print(k + ": " + str(v))
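# The oversampling helper is referenced above but not shown. The counter
# updates in the caller suggest it replicates datapoints that carry at least
# one neutral label (class 2) to soften class imbalance; the implementation
# below, including the number of copies, is an assumption.
import copy

def oversampling(datapoint, n_copies=1):
    _, _, polarities = datapoint
    if 2 not in polarities:
        return None  # nothing neutral to oversample
    return [copy.deepcopy(datapoint) for _ in range(n_copies)]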
def make_flatten_organic_data_sentence_level(reviews, mode='train'):
    # The organic data has 9 entities and 14 attributes, giving 126 possible
    # aspects, but only 111 of them appear in the training data. In total we
    # selected 111 aspects, based on our understanding of which
    # entity-attribute pairs make sense.
    organic_possible_aspects = [
        'organic_general#general', 'organic_general#price',
        'organic_general#taste',
        'organic_general#nutritional_quality_freshness_appearance',
        'organic_general#safety', 'organic_general#healthiness',
        'organic_general#chemicals_pesticides', 'organic_general#label',
        'organic_general#origin_source', 'organic_general#local',
        'organic_general#availability', 'organic_general#environment',
        'organic_general#animal_welfare', 'organic_general#productivity',
        'organic_products#general', 'organic_products#price',
        'organic_products#taste',
        'organic_products#nutritional_quality_freshness_appearance',
        'organic_products#safety', 'organic_products#healthiness',
        'organic_products#chemicals_pesticides', 'organic_products#label',
        'organic_products#origin_source', 'organic_products#local',
        'organic_products#availability', 'organic_products#environment',
        'organic_products#animal_welfare', 'organic_products#productivity',
        'organic_farming#general', 'organic_farming#price',
        'organic_farming#taste',
        'organic_farming#nutritional_quality_freshness_appearance',
        'organic_farming#safety', 'organic_farming#healthiness',
        'organic_farming#chemicals_pesticides', 'organic_farming#label',
        'organic_farming#origin_source', 'organic_farming#local',
        'organic_farming#availability', 'organic_farming#environment',
        'organic_farming#animal_welfare', 'organic_farming#productivity',
        'organic_companies#general', 'organic_companies#price',
        'organic_companies#taste',
        'organic_companies#nutritional_quality_freshness_appearance',
        'organic_companies#safety', 'organic_companies#healthiness',
        'organic_companies#chemicals_pesticides', 'organic_companies#label',
        'organic_companies#origin_source', 'organic_companies#local',
        'organic_companies#availability', 'organic_companies#environment',
        'organic_companies#animal_welfare', 'organic_companies#productivity',
        'conventional_general#general', 'conventional_general#price',
        'conventional_general#nutritional_quality_freshness_appearance',
        'conventional_general#safety', 'conventional_general#healthiness',
        'conventional_general#chemicals_pesticides',
        'conventional_general#label', 'conventional_general#origin_source',
        'conventional_general#productivity', 'conventional_products#general',
        'conventional_products#price', 'conventional_products#taste',
        'conventional_products#nutritional_quality_freshness_appearance',
        'conventional_products#safety', 'conventional_products#healthiness',
        'conventional_products#chemicals_pesticides',
        'conventional_products#label', 'conventional_products#origin_source',
        'conventional_products#local', 'conventional_products#availability',
        'conventional_products#environment',
        'conventional_products#animal_welfare',
        'conventional_products#productivity', 'conventional_farming#general',
        'conventional_farming#price', 'conventional_farming#taste',
        'conventional_farming#nutritional_quality_freshness_appearance',
        'conventional_farming#safety', 'conventional_farming#healthiness',
        'conventional_farming#chemicals_pesticides',
        'conventional_farming#label', 'conventional_farming#origin_source',
        'conventional_farming#environment',
        'conventional_farming#animal_welfare',
        'conventional_farming#productivity', 'conventional_companies#general',
        'conventional_companies#taste', 'conventional_companies#safety',
        'conventional_companies#chemicals_pesticides',
        'conventional_companies#label', 'conventional_companies#availability',
        'conventional_companies#environment',
        'conventional_companies#animal_welfare',
        'conventional_companies#productivity',
        'gmo_genetic_engineering#general', 'gmo_genetic_engineering#price',
        'gmo_genetic_engineering#taste',
        'gmo_genetic_engineering#nutritional_quality_freshness_appearance',
        'gmo_genetic_engineering#safety',
        'gmo_genetic_engineering#healthiness',
        'gmo_genetic_engineering#chemicals_pesticides',
        'gmo_genetic_engineering#label',
        'gmo_genetic_engineering#origin_source',
        'gmo_genetic_engineering#environment',
        'gmo_genetic_engineering#productivity'
    ]
    reduced_organic_possible_aspects = [
        'organic#general', 'organic#price', 'organic#quality',
        'organic#safety_healthiness', 'organic#trustworthy_sources',
        'organic#environment', 'conventional#general', 'conventional#price',
        'conventional#quality', 'conventional#safety_healthiness',
        'conventional#trustworthy_sources', 'conventional#environment',
        'gmo_genetic_engineering#general', 'gmo_genetic_engineering#price',
        'gmo_genetic_engineering#quality',
        'gmo_genetic_engineering#safety_healthiness',
        'gmo_genetic_engineering#trustworthy_sources',
        'gmo_genetic_engineering#environment'
    ]
    global TOTAL_SENTENCE_COUNT
    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global ASPECT_TO_SENTENCE_FREQUENCY
    global DATA_TYPE
    if DATA_TYPE == 'organic':
        possible_aspects = organic_possible_aspects
    elif DATA_TYPE == 'organic_reduced':
        possible_aspects = reduced_organic_possible_aspects
    TOTAL_SENTENCE_COUNT = 0
    TOTAL_REVIEW_COUNT = 0
    TOTAL_AUGMENTED_REVIEW_COUNT = 0
    TOTAL_POSITIVE_LABEL_COUNT = 0
    TOTAL_NEGATIVE_LABEL_COUNT = 0
    TOTAL_NEUTRAL_LABEL_COUNT = 0
    TOTAL_NOT_APPLICABLE_LABEL_COUNT = 0
    ASPECT_TO_SENTENCE_FREQUENCY = {}
    dataset = []
    for i, review in enumerate(reviews):
        TOTAL_REVIEW_COUNT += 1
        print('review-' + str(i))
        review_text = []
        aspect_sentence_polarity_map = {}
        sentences = review['sentences']['sentence']
        if isinstance(sentences, dict):
            sentences = [sentences]
        for j, sentence in enumerate(sentences):
            TOTAL_SENTENCE_COUNT += 1
            sentence_text = []
            sentence_text.append(sentence['text'])
            if 'Opinions' in sentence.keys():
                opinions = sentence['Opinions']['Opinion']
                if isinstance(opinions, dict):
                    opinions = [opinions]
                for opinion in opinions:
                    aspect_category = opinion['@category'].lower()
                    update_aspect_to_sentence_frequency(aspect_category)
                    polarity = get_categorical_sentiment(opinion['@polarity'])
                    # Build a map from each aspect to the sentences of the
                    # current review that mention it, with their polarities.
                    sentence_polarity = aspect_sentence_polarity_map.get(
                        aspect_category, [])
                    sentence_polarity.append([j, polarity])
                    aspect_sentence_polarity_map[aspect_category] = sentence_polarity
            # else:
            #     # No aspect, contains no sentiment; either out of domain or just some fact.
            #     sentence_polarity = aspect_sentence_polarity_map.get('relevance', [])
            #     sentence_polarity.append([j, 3])
            #     aspect_sentence_polarity_map['relevance'] = sentence_polarity
            review_text.append(sentence_text)
        # A particular review may have no sentence for some aspects; add an
        # empty sentence list for those aspects.
        if not REDUCED:
            for aspect in possible_aspects:
                if aspect not in aspect_sentence_polarity_map.keys():
                    aspect_sentence_polarity_map[aspect] = []
        # Now create one datapoint from this review for every possible aspect.
        for a, sent_polarities in aspect_sentence_polarity_map.items():
            TOTAL_AUGMENTED_REVIEW_COUNT += 1
            aspect_words = []
            aspects = a.split('#')
            aspect_words.extend(aspects[0].split('_'))
            if len(aspects) > 1:
                aspect_words.extend(aspects[1].split('_'))
            augmented_review = []
            augmented_polarity = []
            # Check which sentences of the current review relate to aspect 'a'
            # and have a polarity: iterate over each sentence and look it up
            # in the aspect's map. If present, mark the sentence's sentiment
            # polarity accordingly; otherwise mark it N/A (3).
            for j, s in enumerate(review_text):
                updated_polarity = 3
                for sent_polarity in sent_polarities:
                    if j == sent_polarity[0]:
                        # Sentence j mentions the current aspect.
                        updated_polarity = sent_polarity[1]
                        break
                if updated_polarity == 3:
                    TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                augmented_polarity.append(updated_polarity)
                augmented_review.append(s)
            augmented_datapoint = [aspect_words, augmented_review, augmented_polarity]
            dataset.append(augmented_datapoint)
            if OVERSAMPLING:
                oversampled_datapoints = oversampling(augmented_datapoint)
                if oversampled_datapoints is not None:
                    for oversampled_datapoint in oversampled_datapoints:
                        TOTAL_NEUTRAL_LABEL_COUNT += 1
                        TOTAL_AUGMENTED_REVIEW_COUNT += 1
                        dataset.append(oversampled_datapoint)
    print('---------')
    for datapoint in dataset[:13]:
        print(datapoint)
    print(len(dataset))
    output_file_name = 'formatted_' + DATA_TYPE + '_' + mode + '.pickle'
    write_binary(dataset, filename=output_file_name)
    print('---', mode, '---')
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_SENTENCE_COUNT: ', TOTAL_SENTENCE_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ', TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = (TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT
                         + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ', (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_SENTENCE_FREQUENCY:')
    for k, v in ASPECT_TO_SENTENCE_FREQUENCY.items():
        print(k + ": " + str(v))
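# Quick inspection of one formatted datapoint, assuming a prior 'train' run
# with DATA_TYPE = 'organic'; the file name follows the pattern used above.
# This helper is an illustration, not part of the original module.
def _inspect_formatted_output():
    dataset = read_binary('formatted_organic_train.pickle')
    aspect_words, sentences, polarities = dataset[0]
    print(aspect_words)                     # e.g. ['organic', 'general']
    print(len(sentences), len(polarities))  # one polarity label per sentence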