import nltk
import numpy as np
import torch

from torch.utils.data import Subset
# Third-party torchnlp imports below follow the pytorch-nlp 0.4.x layout used by this code.
from torchnlp.datasets import imdb_dataset
from torchnlp.encoders.text import SpacyEncoder
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_SOS_TOKEN
from torchnlp.utils import datasets_iterator

# The remaining names are assumed to come from the surrounding project's own modules
# (exact paths not shown in this excerpt): TorchnlpDataset, MyBertTokenizer, clean_text,
# compute_tfidf_weights, reuters_dataset, newsgroups20_dataset.


class Reuters_Dataset(TorchnlpDataset):

    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
                 append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier

        classes = ['earn', 'acq', 'crude', 'trade', 'money-fx', 'interest', 'ship']

        # classes_full_list = [
        #     'acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee',
        #     'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk',
        #     'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing',
        #     'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei',
        #     'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel',
        #     'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum',
        #     'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye',
        #     'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal',
        #     'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc'
        # ]

        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

        # Load the Reuters dataset
        self.train_set, self.test_set = reuters_dataset(directory=root, train=True, test=True, clean_txt=clean_txt)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to the normal class
        for i, row in enumerate(self.train_set):
            if any(label in self.normal_classes for label in row['label']) and (len(row['label']) == 1):
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        test_idx = []    # for subsetting test_set to the selected normal and anomalous classes
        test_n_idx = []  # for subsetting test_set to the selected normal classes
        test_a_idx = []  # for subsetting test_set to the selected anomalous classes
        for i, row in enumerate(self.test_set):
            if any(label in self.normal_classes for label in row['label']) and (len(row['label']) == 1):
                test_idx.append(i)
                test_n_idx.append(i)
                row['label'] = torch.tensor(0)
            elif any(label in self.outlier_classes for label in row['label']) and (len(row['label']) == 1):
                test_idx.append(i)
                test_a_idx.append(i)
                row['label'] = torch.tensor(1)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        # Subset train_set to the normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to the selected normal / anomalous classes first, while the
        # indices still refer to the original test_set, and only then to their union
        self.test_n_set = Subset(self.test_set, test_n_idx)
        self.test_a_set = Subset(self.test_set, test_a_idx)
        self.test_set = Subset(self.test_set, test_idx)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # Length prior over training sequence lengths (restated in the standalone sketch below)
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
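# The "length prior" block above turns the empirical distribution of training sequence
# lengths into add-one-smoothed log-probabilities over the bins 0..max_seq_len_prior.
# A minimal standalone restatement of that computation (illustrative only; the helper
# name build_length_prior is not part of the original code):

def build_length_prior(sent_lengths, max_seq_len_prior):
    """Return log P(length) with add-one smoothing over length bins 0..max_seq_len_prior."""
    freq = np.bincount(np.array(sent_lengths))            # counts for lengths 0..max(sent_lengths)
    pad = max(max_seq_len_prior - max(sent_lengths), 0)   # extend with empty bins up to the prior's range
    freq = np.concatenate((freq, np.zeros(pad, dtype=freq.dtype)))
    freq = freq + 1                                        # add-one smoothing so no length has zero probability
    return np.log(freq) - np.log(freq.sum())               # normalized log-probabilities

# e.g. build_length_prior([2, 2, 3, 5], max_seq_len_prior=8) returns a length-9 vector
# whose exponentials sum to 1, with most of the mass on lengths 2, 3 and 5.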
class IMDB_Dataset(TorchnlpDataset):

    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
                 append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        if root not in nltk.data.path:
            nltk.data.path.append(root)

        # Load the IMDB dataset
        self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to the normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        test_n_idx = []  # for subsetting test_set to the selected normal classes
        test_a_idx = []  # for subsetting test_set to the selected anomalous classes
        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                test_n_idx.append(i)
            else:
                test_a_idx.append(i)
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to the normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to the selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to the selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # Length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
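# Usage sketch for the class above (illustrative; assumes the IMDB data can be
# downloaded into ./data and that the project-local helpers imported at the top
# of this file are available):
#
#   dataset = IMDB_Dataset(root='./data', normal_class=0, tokenizer='spacy',
#                          append_eos=True, clean_txt=True, max_seq_len_prior=500)
#   # dataset.train_set then holds only 'pos' reviews (label tensor 0), while
#   # dataset.test_n_set / dataset.test_a_set split the test reviews into the
#   # normal ('pos') and anomalous ('neg') subsets.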
class Newsgroups20_Dataset(TorchnlpDataset):

    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
                 append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier

        classes = list(range(6))

        groups = [
            ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
             'comp.windows.x'],
            ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'],
            ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
            ['misc.forsale'],
            ['talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast'],
            ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
        ]
        short_group_names = ['comp', 'rec', 'sci', 'misc', 'pol', 'rel']
        self.subset = short_group_names[normal_class]

        self.normal_classes = groups[normal_class]
        self.outlier_classes = []
        del classes[normal_class]
        for i in classes:
            self.outlier_classes += groups[i]

        # Load the 20 Newsgroups dataset
        self.train_set, self.test_set = newsgroups20_dataset(directory=root, train=True, test=True,
                                                             clean_txt=clean_txt, groups=groups,
                                                             short_group_names=short_group_names)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to the normal class
        for i, row in enumerate(self.train_set):
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        test_n_idx = []  # for subsetting test_set to the selected normal classes
        test_a_idx = []  # for subsetting test_set to the selected anomalous classes
        for i, row in enumerate(self.test_set):
            if row['label'] in self.normal_classes:
                test_n_idx.append(i)
            else:
                test_a_idx.append(i)
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            row['text'] = row['text'].lower()

        # Subset train_set to the normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to the selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to the selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # Length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
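# Usage sketch for the class above (illustrative; argument values are examples only):
#
#   dataset = Newsgroups20_Dataset(root='./data', normal_class=2, max_seq_len_prior=500)
#   # normal_class=2 selects the 'sci' group as the normal data, i.e.
#   # dataset.normal_classes == ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
#   # and every remaining newsgroup is treated as an outlier class at test time.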
class IMDB_Dataset(TorchnlpDataset):
    # Earlier, simpler variant of the IMDB wrapper: no sentence-length prior and no
    # separate normal/anomalous test subsets.

    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
                 append_sos=False, append_eos=False, clean_txt=False):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        # Load the IMDB dataset
        self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to the normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to the normal class
        self.train_set = Subset(self.train_set, train_idx_normal)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i
drop_indices = np.random.choice(idx_positive, nbr_to_drop, replace=False)
df = df.drop(drop_indices)
# fraction of negative reviews after dropping part of the positives
_logger.info((df['labels'] == 'negative').mean())

text_as_list = df['text'].tolist()
labels_as_list = df['labels'].tolist()

from torchnlp.encoders.text import SpacyEncoder, pad_tensor
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Encode the texts with the pytorch-nlp tools
encoder = SpacyEncoder(text_as_list)
_logger.info("{} encoder is {}".format(text_as_list[0], encoder.encode(text_as_list[0])))

encoded_texts = []
for i in tqdm(range(len(text_as_list))):
    encoded_texts.append(encoder.encode(text_as_list[i]))

lengths = [len(i) for i in tqdm(encoded_texts)]

import seaborn as sns
import matplotlib.pyplot as plt

length_as_series = pd.Series(lengths)
plt.title("Probability Density Function for text lengths")
sns.distplot(length_as_series)

# use the 90th-percentile length as the padding / truncation length
max_pad_length = length_as_series.quantile(0.9)
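# pad_tensor is imported above but not used in this excerpt. A likely next step
# (an assumption, not part of the original snippet) is to truncate or pad every
# encoded review to max_pad_length so the batch can be stacked into one tensor:

import torch

pad_len = int(max_pad_length)                  # quantile() returns a float
padded_texts = torch.stack([
    pad_tensor(t[:pad_len], pad_len)           # truncate long texts, zero-pad short ones
    for t in encoded_texts
])
_logger.info("padded batch shape: {}".format(tuple(padded_texts.shape)))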