def createVocab(self, dirs):
    print("Creating vocab mapping (max size: %d, min frequency: %d)..." % (self.max_vocab_size, self.min_count))
    # count token frequencies over every review file in the given directories
    dic = {}
    for d in dirs:
        for f in os.listdir(d):
            with open(os.path.join(d, f), 'r') as review:
                tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
                for t in tokens:
                    if t not in dic:
                        dic[t] = 1
                    else:
                        dic[t] += 1
    # write the vocabulary to disk, most frequent words first
    d = {}
    counter = 0
    with open(self.dataDir + 'vocab.txt', 'w') as v:
        for w in sorted(dic, key=dic.get, reverse=True):
            # keep only words occurring at least min_count times
            if dic[w] < self.min_count:
                break
            v.write(w + " " + str(dic[w]) + "\n")
            d[w] = counter
            counter += 1
            # keep at most the max_vocab_size most frequent tokens
            if self.max_vocab_size > -1 and counter >= self.max_vocab_size:
                break
def createVocab_old(self, dirs):
    print("Creating vocab mapping (max size: %d, min frequency: %d)..." % (self.max_vocab_size, self.min_count))
    # count token frequencies over every review file in the given directories
    dic = {}
    for d in dirs:
        for f in os.listdir(d):
            with open(os.path.join(d, f), 'r') as review:
                tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
                for t in tokens:
                    if t not in dic:
                        dic[t] = 1
                    else:
                        dic[t] += 1
    # build the word-to-index mapping, most frequent words first
    d = {}
    counter = 0
    for w in sorted(dic, key=dic.get, reverse=True):
        # keep only words occurring at least min_count times
        if dic[w] < self.min_count:
            break
        d[w] = counter
        counter += 1
        # keep at most the max_vocab_size most frequent tokens
        if self.max_vocab_size > -1 and counter >= self.max_vocab_size:
            break
    # add out-of-vocabulary token and pad token
    d["<UNK>"] = counter
    counter += 1
    d["<PAD>"] = counter
    print("vocab mapping created: size: %d discarded: %d" % (len(d), len(dic) - len(d) + 2))
    with open(self.dataDir + 'vocab.txt', 'wb') as handle:
        pickle.dump(d, handle)
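# createVocab_old pickles a plain word-to-index dict, while createProcessedDataFile
# below calls vocab_mapping.getIndex(...). The wrapper behind that call is not shown
# here, so the following is only a minimal sketch of how the pickled mapping might be
# loaded and queried with an <UNK> fallback; the class name VocabMappingSketch and
# the example path are illustrative assumptions, not taken from the source.
import pickle

class VocabMappingSketch:
    def __init__(self, path):
        # load the pickled word-to-index dict written by createVocab_old
        with open(path, 'rb') as handle:
            self.word2idx = pickle.load(handle)

    def getIndex(self, token):
        # unseen tokens map to the <UNK> index
        return self.word2idx.get(token, self.word2idx["<UNK>"])

# e.g. mapping = VocabMappingSketch("data/vocab.txt"); mapping.getIndex("movie")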
def createProcessedDataFile(self, vocab_mapping, directory, pid, max_seq_length, lock):
    count = 0
    # placeholder first row so np.vstack has something to stack onto
    data = np.array([i for i in range(max_seq_length + 2)])
    for f in os.listdir(directory):
        count += 1
        if count % 100 == 0:
            lock.acquire()
            print("Processing: " + f + ", file " + str(count) + ", on process: " + str(pid))
            lock.release()
        with open(os.path.join(directory, f), 'r') as review:
            tokens = tokenizer.tokenize(review.read().lower(), self.remove_punct, self.remove_stopwords)
            numTokens = len(tokens)
            indices = [vocab_mapping.getIndex(j) for j in tokens]
            # pad or truncate the sequence to max_seq_length
            if len(indices) < max_seq_length:
                indices = indices + [vocab_mapping.getIndex("<PAD>") for i in range(max_seq_length - len(indices))]
            else:
                indices = indices[0:max_seq_length]
            # append the label (1 = positive, 0 = negative) and the true sequence length
            if "pos" in directory:
                indices.append(1)
            else:
                indices.append(0)
            indices.append(min(numTokens, max_seq_length))
            assert len(indices) == max_seq_length + 2, str(len(indices))
            data = np.vstack((data, indices))
    # remove the placeholder first row
    data = data[1::]
    lock.acquire()
    print("Saving data file {0} to disk...".format(str(pid)))
    lock.release()
    self.saveData(data, pid, directory)
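# The pid and lock arguments suggest createProcessedDataFile is meant to run in
# several worker processes, one per data directory. Below is a minimal driver sketch
# under that assumption; `processor`, `vocab_mapping`, the directory names, and the
# max_seq_length value are illustrative, not taken from the source.
from multiprocessing import Lock, Process

def run_processing_workers(processor, vocab_mapping, directories, max_seq_length):
    # one worker process per directory; the shared lock serializes progress prints
    lock = Lock()
    workers = []
    for pid, directory in enumerate(directories):
        p = Process(target=processor.createProcessedDataFile,
                    args=(vocab_mapping, directory, pid, max_seq_length, lock))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()

# e.g. run_processing_workers(processor, vocab_mapping,
#                             ["train/pos", "train/neg"], max_seq_length=200)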
def get_tokenized_sentences(self):
    instances = []
    for idx, sent in enumerate(self.sentences):
        tokens = tk.tokenize(sent.text)
        cleaned_tokens = [clean_str(token) for token in tokens]
        instances.append(TokenizedInstance(sent.text, self.labels[idx], cleaned_tokens))
    return instances
def createCorpus(self):
    corpus = ""
    for directory in self.vocabDirs:
        print("\tNow processing folder: " + directory)
        for f in os.listdir(directory):
            with open(os.path.join(directory, f), 'r') as review:
                review_tkn = tokenizer.tokenize(review.read(), self.remove_punct, self.remove_stopwords)
                # one tokenized review per line
                corpus += " ".join(review_tkn) + "\n"
    # name_corpus = "corpus{p}{s}".format(p="_nopunct" if args.punct else "", s="_nostop" if args.stop else "")
    with open(self.dataDir + "corpus.txt", "w") as text_file:
        text_file.write(corpus)
def extract_features(self):
    instances = []
    for sent_idx, text in enumerate(self.sentences):
        tokens = tk.tokenize(text)
        # list to hold features for this text
        feature_list = []
        # features per token: SUFFIX3, PREFIX2, W, a +/-2 word window, ALL_CAPS,
        # EXCLAMATION, and whether the token matches one of the labels (e.g. joy, anger)
        for idx, token in enumerate(tokens):
            # suffix3
            feature_list.append("SUFFIX3=" + token[-3:])
            # prefix2
            feature_list.append("PREFIX2=" + token[:2])
            # the word itself
            feature_list.append("W=" + token)
            # window = 2 (indices wrap around at the sentence boundaries)
            feature_list.append("WORD+1=" + tokens[(idx + 1) % len(tokens)])
            feature_list.append("WORD-1=" + tokens[(idx - 1) % len(tokens)])
            feature_list.append("WORD+2=" + tokens[(idx + 2) % len(tokens)])
            feature_list.append("WORD-2=" + tokens[(idx - 2) % len(tokens)])
            if token.isupper():
                feature_list.append('ALL_CAPS=' + token)
            if "!" in token:
                feature_list.append('EXCLAMATION=1')
            if token in self.labels:
                feature_list.append('LABEL=' + token)
        # create the DataInstance; using enumerate instead of list.index avoids picking
        # up the wrong label when the same sentence text occurs more than once
        data_instance = DataInstance(text=text, label=self.labels[sent_idx], features=feature_list)
        instances.append(data_instance)
    return instances
def count_words_in_labels(self):
    wordcount_per_labels = {label: {} for label in self.distinct_labels}
    for idx, sent in enumerate(self.sentences):
        label = self.labels[idx]
        cleared_tokens = [clean_str(token) for token in tk.tokenize(sent.text)]
        for cleared_token in cleared_tokens:
            wordcount_per_labels[label][cleared_token] = wordcount_per_labels[label].get(cleared_token, 0) + 1
    wordsum_per_labels = {label: sum(counts.values()) for label, counts in wordcount_per_labels.items()}
    return wordcount_per_labels, wordsum_per_labels
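# Usage sketch for count_words_in_labels; `dataset` stands in for whatever object
# exposes the method above and is only an illustrative name, not from the source.
wordcount_per_labels, wordsum_per_labels = dataset.count_words_in_labels()
for label, counts in wordcount_per_labels.items():
    # five most frequent tokens for this label, with their relative frequencies
    top_words = sorted(counts, key=counts.get, reverse=True)[:5]
    total = wordsum_per_labels[label]
    print(label, [(w, round(counts[w] / total, 4)) for w in top_words])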