# Standard imports needed by the functions below; project-specific helpers
# (read_binary, write_binary, NLP, the *_FILE constants, etc.) are assumed
# to be imported elsewhere in the repo.
import re
from collections import defaultdict

import numpy as np
import tensorflow as tf


def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution.
    2. Sort words by frequency.
    3. Build a vocabulary from the most frequent words.
    4. Store the vocabulary in a file in the format <word, identifier>.

    :param lower: Identifiers below this value are reserved
    :param n: Number of unique words expected
    :return: Two dicts, mapping words to codes and codes back to words
    """
    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
        freq = build_word_frequency_distribution()
        # sort words by descending frequency, then keep the top n
        top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
        print('Vocab count : ' + str(len(top_words)))
        # two extra slots: one for padding, one for unknown words
        max_vocab_size = len(top_words) + 2
        unknown = max_vocab_size - 1
        vocab_to_code = {}
        code_to_vocab = {}
        vocab_to_code['<UNK>'] = unknown
        code_to_vocab[unknown] = '<UNK>'
        vocab_to_code['<PAD>'] = PAD
        code_to_vocab[PAD] = '<PAD>'
        # vocab indexes below `lower` are reserved for padding
        i = lower
        for w, _ in top_words:
            vocab_to_code[w] = i
            code_to_vocab[i] = w
            i += 1
        write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
        write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
        return vocab_to_code, code_to_vocab
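# A minimal, self-contained sketch of the id-assignment scheme above, using
# toy frequency data. PAD = 0 mirrors the repo constant; everything else here
# is hypothetical and only illustrates how the code ranges are laid out.
def _demo_vocab_layout():
    PAD = 0
    freq = {'food': 5, 'good': 3, 'the': 9}
    top_words = sorted(freq.items(), key=lambda x: -x[1])
    unknown = len(top_words) + 1                     # last slot is <UNK>
    vocab_to_code = {'<PAD>': PAD, '<UNK>': unknown}
    for i, (w, _) in enumerate(top_words, start=1):  # most frequent word gets
        vocab_to_code[w] = i                         # the lowest free id
    # -> {'<PAD>': 0, '<UNK>': 4, 'the': 1, 'food': 2, 'good': 3}
    return vocab_to_code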
def combine_processed_data():
    combined_dataset = []
    restaurant = read_binary(filename=PROCESSED_RESTAURANT_FILE_NAME)
    print('Restaurant-' + str(len(restaurant)))
    combined_dataset.extend(restaurant)
    print(len(combined_dataset))
    laptops = read_binary(filename=PROCESSED_LAPTOPS_FILE_NAME)
    print('Laptops-' + str(len(laptops)))
    combined_dataset.extend(laptops)
    print(len(combined_dataset))
    # organic = read_binary(filename=PROCESSED_ORGANIC_FILE_NAME)
    # print('Organic-' + str(len(organic)))
    # combined_dataset.extend(organic)
    # print(len(combined_dataset))
    write_binary(combined_dataset, OUTPUT_FILE_NAME)
def build_word_frequency_distribution():
    """
    1. Extract tokens from the review text.
    2. Count the frequency of each token.
    3. Build a frequency dict and store it in a file.

    :return: A dict of <token, freq>
    """
    try:
        freq_dist_f = read_binary(WORD_FREQ_FILE)
        print('frequency distribution loaded')
        return freq_dist_f
    except IOError:
        pass
    print('building frequency distribution')
    freq = defaultdict(int)
    # seed the distribution with the aspect words so they are always in-vocabulary
    if FILE_NAME == 'restaurant':
        for aspect_word in RESTAURANT_ASPECT_WORDS:
            freq[aspect_word] += 1
    elif FILE_NAME == 'laptops':
        for aspect_word in LAPTOPS_ASPECT_WORDS:
            freq[aspect_word] += 1
    files = [FORMATTED_FILE_NAME]
    if EMBEDDING_TYPE == 'fasttext':
        files.append(FORMATTED_FILE_NAME.replace('train', 'test'))
        files.append(FORMATTED_FILE_NAME.replace('train', 'val'))
    for file_path in files:
        print('building vocab from file - ' + file_path)
        for i, review in enumerate(read_binary(file_path)):
            sentences = review[1]
            for sent in sentences:
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    freq[token.orth_] += 1
            # checkpoint the partial counts every 100 reviews
            if i % 100 == 0:
                write_binary(freq, WORD_FREQ_FILE)
                print('dump at {}'.format(i))
    write_binary(freq, WORD_FREQ_FILE)
    return freq
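# A self-contained sketch of the counting step above, assuming a blank spaCy
# English pipeline (the repo's NLP object and where it is loaded are not
# shown in this section):
def _demo_token_counts():
    import spacy
    from collections import defaultdict
    nlp = spacy.blank('en')
    freq = defaultdict(int)
    for token in nlp.tokenizer('The food was good, the service was not.'):
        freq[token.orth_] += 1
    # counts are case-sensitive: 'The' and 'the' are separate entries
    return dict(freq)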
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        tokenized_dataset = []
        all_sentences = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            tokenized_aspect = []
            tokenized_sentences = []
            if i == 0:
                print(review)
            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]
            for aspect_word in aspect_words:
                tokenized_aspect.append(aspect_word)
                all_sentences.append([aspect_word])
            for sent in sentences:
                tokenized_sentence = []
                # collapse duplicate spaces; they cause problems for ELMo
                s = re.sub(' +', ' ', sent[0])
                tokens = NLP.tokenizer(s)
                for token in tokens:
                    tokenized_sentence.append(token.orth_)
                tokenized_sentences.append(tokenized_sentence)
                # all sentences are written to a separate text file at the end
                all_sentences.append(tokenized_sentence)
            tokenized_review = [tokenized_aspect, tokenized_sentences, polarities]
            tokenized_dataset.append(tokenized_review)
        write_binary(tokenized_dataset, PROCESSED_FILE_NAME)
        print('dump at {}'.format(i))
        all_sentences = space_separated_token_string(all_sentences)
        save_sentences_to_text(all_sentences)
        # workaround for ELMo: drop duplicate sentences from the text file
        remove_duplicate_sentences()
    except KeyboardInterrupt:
        pass
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    unknown = max_vocab_size - 1
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        coded_dataset = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            coded_aspect = []
            coded_sentences = []
            if i == 0:
                print(review)
            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]
            for aspect_word in aspect_words:
                coded_aspect.append(vocab_to_code.get(aspect_word, unknown))
            for sent in sentences:
                coded_sentence = []
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    coded_sentence.append(vocab_to_code.get(token.orth_, unknown))
                coded_sentences.append(coded_sentence)
            coded_review = [coded_aspect, coded_sentences, polarities]
            coded_dataset.append(coded_review)
        write_binary(coded_dataset, PROCESSED_FILE_NAME)
        print('dump at {}'.format(i))
        # sanity check: decode the first datapoint back to words
        datapoint = coded_dataset[0]
        print(datapoint)
        print(get_uncoded_data(code_to_vocab, datapoint))
    except KeyboardInterrupt:
        pass
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    vocab_size = len(vocab_to_code)
    unknown = vocab_size - 1
    print('Final Vocab Size : ' + str(vocab_size))
    coded_dataset = []
    for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
        coded_aspect = []
        coded_text = []
        if i == 0:
            print(review)
        text = review[1]
        aspect_words = review[0]
        polarity = review[2]
        for aspect_word in aspect_words:
            a = vocab_to_code.get(aspect_word, unknown)
            # aspect words are seeded into the frequency distribution, so an
            # unknown code here signals a preprocessing problem
            if a == unknown:
                print('STOP')
                print(aspect_word)
            coded_aspect.append(a)
        for word in text:
            coded_text.append(vocab_to_code.get(word, unknown))
        coded_review = [coded_aspect, [coded_text], [polarity]]
        coded_dataset.append(coded_review)
    write_binary(coded_dataset, PROCESSED_FILE_NAME)
    print('dump at {}'.format(i))
    # sanity check: decode the first datapoint back to words
    datapoint = coded_dataset[0]
    print(datapoint)
    print(get_uncoded_data(code_to_vocab, datapoint))
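# A plausible sketch of the decoding used by the sanity checks above; the
# repo's actual get_uncoded_data may differ in signature and structure. It
# simply inverts the word -> code mapping over one datapoint.
def _demo_uncode(code_to_vocab, datapoint):
    coded_aspect, coded_sentences, polarities = datapoint
    aspect = [code_to_vocab.get(c) for c in coded_aspect]
    sentences = [[code_to_vocab.get(c) for c in sent] for sent in coded_sentences]
    return [aspect, sentences, polarities]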
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution.
    2. Sort words by frequency.
    3. Build a vocabulary from the most frequent words.
    4. Store the vocabulary in a file in the format <word, identifier>.

    :param lower: Identifiers below this value are reserved
    :param n: Number of unique words expected
    :return: Two dicts, mapping words to codes and codes back to words
    """
    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
        freq = build_word_frequency_distribution()
        print('loading embeddings')
        if EMBEDDING_TYPE == 'glove':
            word_to_embeddings = load_glove_embeddings()
        elif EMBEDDING_TYPE == 'fasttext':
            word_to_embeddings = load_oov_fastText_embeddings()
        else:
            word_to_embeddings = {}
        # sort words by descending frequency, then keep the top n
        top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
        print('Vocab count : ' + str(len(top_words)))
        # two extra slots: one for padding, one for unknown words
        max_vocab_size = len(top_words) + 2
        unknown = max_vocab_size - 1
        vocab_to_code = {}
        code_to_vocab = {}
        # embedding matrix indexed by vocab code; the first and last rows are
        # reserved for padding and unknown words respectively
        code_to_embed = np.zeros(shape=(max_vocab_size, EMBEDDING_DIMENSION), dtype=np.float32)
        code_to_embed[PAD] = PAD_EMBEDDING
        code_to_embed[unknown] = UNKNOWN_EMBEDDING
        vocab_to_code['<UNK>'] = unknown
        code_to_vocab[unknown] = '<UNK>'
        vocab_to_code['<PAD>'] = PAD
        code_to_vocab[PAD] = '<PAD>'
        # vocab indexes below `lower` are reserved for padding
        i = lower
        for w, _ in top_words:
            vocab_to_code[w] = i
            code_to_vocab[i] = w
            embedding = UNKNOWN_EMBEDDING  # fallback when no embedding is found
            try:
                if EMBEDDING_TYPE == 'glove':
                    embedding = word_to_embeddings.word_vec(w)
                elif EMBEDDING_TYPE == 'fasttext':
                    # fastText composes OOV vectors from subwords, so this
                    # lookup rarely fails
                    embedding = word_to_embeddings.get_word_vector(w)
            except KeyError:
                embedding = UNKNOWN_EMBEDDING
            code_to_embed[i] = embedding
            i += 1
        write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
        write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
        write_binary(code_to_embed, CODE_TO_EMBED_FILE)
        return vocab_to_code, code_to_vocab
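# A sketch of the GloVe lookup assumed above. load_glove_embeddings is defined
# elsewhere in the repo; this stands in for it using gensim >= 4, where
# no_header=True reads the raw GloVe text format. The file path is hypothetical.
def _demo_load_glove():
    from gensim.models import KeyedVectors
    kv = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)
    return kv['food']  # raises KeyError for out-of-vocabulary words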
def fasttext_embeddings(shape):
    print('using fasttext..')
    fasttext = read_binary(CODE_TO_EMBED_FILE)
    # trim the stored matrix to the requested (vocab_size, embed_dim) shape
    fasttext = fasttext[0:shape[0], 0:shape[1]]
    return tf.convert_to_tensor(value=fasttext)
def glove_embeddings(shape):
    print('using glove...')
    glove = read_binary(CODE_TO_EMBED_FILE)
    # trim the stored matrix to the requested (vocab_size, embed_dim) shape
    glove = glove[0:shape[0], 0:shape[1]]
    return tf.convert_to_tensor(value=glove)
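# A minimal sketch of how such an embedding tensor is typically consumed:
# rows are looked up by the integer codes produced in process_data(). The
# toy shapes and eager TF2 usage here are assumptions, not the repo's code.
def _demo_embedding_lookup():
    import numpy as np
    import tensorflow as tf
    embed_matrix = tf.constant(np.random.rand(10, 4).astype(np.float32))
    coded_sentence = tf.constant([1, 5, 9])  # word codes for one sentence
    # returns a (3, 4) tensor: one embedding row per code
    return tf.nn.embedding_lookup(params=embed_matrix, ids=coded_sentence)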
def vocab_to_code(words):
    vocab_to_code_map = read_binary(VOCAB_TO_CODE_FILE)
    codes = []
    for word in words:
        codes.append(vocab_to_code_map.get(word))
    return codes
def code_to_vocab(codes):
    code_to_vocab_map = read_binary(CODE_TO_VOCAB_FILE)
    words = []
    for code in codes:
        words.append(code_to_vocab_map.get(code))
    return words
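# Round-trip usage sketch for the two helpers above. The word list is
# illustrative; actual codes depend on the stored vocabulary files:
#
#   codes = vocab_to_code(['food', 'was', 'good'])   # e.g. [12, 7, 31]
#   words = code_to_vocab(codes)                     # ['food', 'was', 'good']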
def read_words():
    data = read_binary(WORD_FREQ_FILE)
    for i, (w, f) in enumerate(data.items()):
        print(str(i) + '-' + w + ' : ' + str(f))