def from_dataframe(cls, review_df, cutoff=25):
    """Build a ReviewVectorizer from the dataset dataframe.

    Args:
        review_df (pandas.DataFrame): dataframe with ``review`` and
            ``rating`` columns.
        cutoff (int): frequency threshold; only words occurring strictly
            more than ``cutoff`` times enter the review vocabulary.

    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Register every distinct rating; sorted for a deterministic mapping.
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Tally word frequencies over all reviews, skipping bare punctuation
    # tokens (reviews are split on single spaces, as in the original).
    word_counts = Counter(
        word
        for review in review_df.review
        for word in review.split(" ")
        if word not in string.punctuation
    )

    # Keep only the words that clear the frequency cutoff.
    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)
def __init__(self, vocabulary: Vocabulary, tokenizer=split_tokenizer,
             init_token=None, eos_token=None, pad_token=None,
             reverse=False):
    """Wrap a vocabulary with tokenization and optional special tokens.

    Each special token that is supplied is registered in the vocabulary
    and its index is stored on the instance.

    Args:
        vocabulary (Vocabulary): vocabulary the special tokens are added to.
        tokenizer: callable mapping a string to a sequence of tokens.
        init_token (str, optional): start-of-sequence token.
        eos_token (str, optional): end-of-sequence token.
        pad_token (str, optional): padding token.
        reverse (bool): presumably signals that token sequences should be
            reversed downstream — confirm against the consumer of this flag.
    """
    self.vocab = vocabulary
    if init_token:
        self.init_idx = vocabulary.add_token(init_token)
        self.init_token = init_token
        self.init_present = 1
    else:
        self.init_present = 0
    if eos_token:
        self.eos_idx = vocabulary.add_token(eos_token)
        self.eos_token = eos_token
        self.eos_present = 1
    else:
        self.eos_present = 0
    if pad_token:
        self.pad_idx = vocabulary.add_token(pad_token)
        # FIX: keep the token itself as well, mirroring init_token and
        # eos_token above; the original stored only the index.
        self.pad_token = pad_token
    self.tokenizer = tokenizer
    self.reverse = reverse
# DATA FILES # train_loc = locations['train_loc'] dev_loc = locations['test_loc'] fasttext_loc = locations['embeddings_loc'] w2vec_loc = locations['w2vec_loc'] model_loc = locations['model_loc'] stopwordsfile = locations['stopwordsfile'] # VOCABULARY # special_tokens = [INIT_TOKEN, UNK_TOKEN, END_TOKEN, PAD_TOKEN] with open(train_loc) as f: raw_text = f.read() voc = Vocabulary(raw_text, bigram=bigram) voc.prune(threshold=1) for token in special_tokens: voc.add_token(token) w2idx = voc.w2idx idx2w = voc.idx2w voc_size = voc.get_length() pad_idx = w2idx[PAD_TOKEN] init_idx = w2idx[INIT_TOKEN] # STOP WORDS # with open(stopwordsfile) as f: stop_words = f.read().split() stop_words.extend(special_tokens) stop_idx = [w2idx[w] for w in stop_words if w in w2idx.keys()] # PRE-TRAINED EMBEDDINGS # if os.path.exists(w2vec_loc): with open(w2vec_loc, 'rb') as f: