def from_dataframe(cls, surname_df):
    """Instantiate the vectorizer from the surname dataframe.

    :param surname_df (pandas.DataFrame): the surname dataset with
        `surname` and `nationality` columns
    :return: an instance of the vectorizer
    """
    character_vocab = SequenceVocabulary(add_unk=False)
    nationality_vocab = Vocabulary(add_unk=False)

    # One pass over the rows: register the nationality label and
    # every individual character of the surname.
    for _, row in surname_df.iterrows():
        nationality_vocab.add_token(row.nationality)
        for ch in row.surname:
            character_vocab.add_token(ch)

    return cls(character_vocab, nationality_vocab)
def from_dataframe(cls, surname_df):
    """Instantiate the vectorizer from the surname dataframe.

    :param surname_df (pandas.DataFrame): the surname dataset with
        `surname` and `nationality` columns
    :return: an instance of the vectorizer
    """
    # "@" serves as the unknown-character placeholder at inference time.
    surname_vocab = Vocabulary(unk_token="@")
    nationality_vocab = Vocabulary(add_unk=False)

    for _, row in surname_df.iterrows():
        nationality_vocab.add_token(row.nationality)
        for ch in row.surname:
            surname_vocab.add_token(ch)

    return cls(surname_vocab, nationality_vocab)
def from_dataframe(cls, surname_df):
    """Instantiate the vectorizer from the surname dataframe.

    Besides the two vocabularies, also records the length of the
    longest surname so fixed-width encodings can be sized.

    :param surname_df (pandas.DataFrame): the surname dataset with
        `surname` and `nationality` columns
    :return: an instance of the vectorizer
    """
    # "@" serves as the unknown-character placeholder at inference time.
    character_vocab = Vocabulary(unk_token="@")
    nationality_vocab = Vocabulary(add_unk=False)

    longest = 0
    for _, row in surname_df.iterrows():
        if len(row.surname) > longest:
            longest = len(row.surname)
        nationality_vocab.add_token(row.nationality)
        for ch in row.surname:
            character_vocab.add_token(ch)

    return cls(character_vocab, nationality_vocab, longest)
def from_dataframe(cls, news_df, cutoff=25):
    """Instantiate the vectorizer from the news dataframe.

    :param news_df (pandas.DataFrame): the news dataset with `title`
        and `category` columns
    :param cutoff (int): minimum frequency for a word to enter the
        title vocabulary
    :return: an instance of the vectorizer
    """
    title_vocab = SequenceVocabulary()
    category_vocab = Vocabulary(add_unk=False)

    # Tally word frequencies across all titles, skipping bare
    # punctuation tokens. Counter preserves first-seen order, so the
    # resulting vocabulary indices match a manual counting loop.
    word_counts = Counter(
        token
        for title in news_df.title
        for token in title.split(" ")
        if token not in string.punctuation
    )
    for word, count in word_counts.items():
        if count >= cutoff:
            title_vocab.add_token(word)

    for category in sorted(set(news_df.category)):
        category_vocab.add_token(category)

    return cls(title_vocab, category_vocab)
def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe.

    :param review_df (pandas.DataFrame): the review dataset
    :param cutoff (int): the parameter for frequency-based filtering
    :return an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Tally word frequencies across all reviews, skipping bare
    # punctuation tokens. Counter preserves first-seen order, so the
    # resulting vocabulary indices match a manual counting loop.
    word_counts = Counter(
        word
        for review in review_df.review
        for word in review.split(" ")
        if word not in string.punctuation
    )
    # NOTE(review): strictly-greater comparison excludes words seen
    # exactly `cutoff` times, unlike the `>=` used by the news
    # vectorizer elsewhere in this file — confirm which is intended.
    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)