import csv
import logging
import os
import pickle
import re
from collections import Counter

import numpy as np
import torch
from nltk.corpus import reuters
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize
from torchtext.vocab import Vectors
from xlwt import Workbook

logger = logging.getLogger(__name__)


class One_hot_vector(Vectors):
    """One-hot 'embedding' lookup compatible with the Vectors interface."""

    def __init__(self):
        self.itos = []
        self.stoi = {}
        self.vectors = None
        self.dim = 30000
        self.init_vec = torch.zeros(self.dim)
        self.fill_value = 5
        temp = torch.zeros(self.dim)
        temp[-1] = self.fill_value  # the last slot is reserved for unknown tokens
        self.unk_init_vector = temp
        self.cache = {}  # shape -> unk vector, used by unk_init
        # self.vector_init()
        self.pst = PunktSentenceTokenizer()
        # self.pre_train()
        self.st = sent_tokenize
        self.method = 'origin'
        self.threshold = 25

    def clean_string(self, string):
        """
        Performs tokenization and string cleaning for the Reuters dataset.
        """
        # string = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.lower().strip().split()

    def clean_string_np(self, string):
        """
        Like clean_string, but maps each token to its vocabulary index and
        returns the result as a numpy array.
        """
        # string = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", string)
        string = re.sub(r"\s{2,}", " ", string)
        string = string.lower().strip().split()
        string = list(filter(None, string))
        res = np.array(list(map(self.stoi_take, string)))
        return res

    def stoi_take(self, word):
        # Fall back to index 0 (the <UNK> slot) for out-of-vocabulary words.
        if word in self.stoi:
            return self.stoi[word]
        else:
            return 0

    def unk_init(self, tensor):
        # Cache one unk vector per tensor shape; the last element marks <unk>.
        size_tup = tuple(tensor.size())
        if size_tup not in self.cache:
            unk = torch.zeros(tensor.size())
            unk[-1] = 1
            self.cache[size_tup] = unk
        return self.cache[size_tup]

    def __getitem__(self, token):
        if token in self.stoi:
            # Scatter into a fresh zero vector; scattering into the shared
            # self.init_vec in place would accumulate values across lookups.
            return torch.zeros(self.dim).scatter_(0, torch.LongTensor([self.stoi[token]]), self.fill_value)
        else:
            return self.unk_init_vector

    def vector_init(self):
        data_path = '/home/zhangxin/sentiment_analysis/hedwig-data/datasets/aclImdb/'
        output_data_path = '/home/zhangxin/sentiment_analysis/hedwig-data/datasets/IMDB_stanford/'
        '''
        Actually, we don't need to process any data here, since this class is
        designed for the one-hot case, where no pre-trained word embedding
        exists in our model.
        '''
        # path_pt = output_data_path + "Vocabulary_matrix.pt"
        # if os.path.isfile(path_pt):
        #     logger.info('Loading vectors from {}'.format(path_pt))
        #     self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
        # else:
        #     text = []
        #     for name in ['train', ]:
        #         subset = 'test' if name == 'dev' else name
        #         data = []
        #         for rate in ['pos', 'neg']:
        #             input_data_dir = os.path.join(data_path, subset, rate)
        #             file_list = os.listdir(input_data_dir)
        #             # import pdb; pdb.set_trace()
        #             for file in file_list:
        #                 if file.split('.')[-1] != 'txt':
        #                     continue
        #                 with open(os.path.join(input_data_dir, file), 'r') as review:
        #                     text.extend(self.clean_string(review.readline()))
        #     word_frequence = Counter(text)
        #     top_3w = word_frequence.most_common(30000 - 1)  # the 30000th slot holds the unk vector
        #     for i, (word, count) in enumerate(top_3w):
        #         self.itos.append(word)
        #         self.stoi[word] = i
        #     logger.info('Saving vectors to {}'.format(path_pt))
        #     torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)

    def split_sentence(self, string):
        # Split on the HTML paragraph breaks first, then on sentence punctuation.
        string_list = re.split(r"<br /><br />", string)
        sentence_list = []
        for string in string_list:
            string = re.sub(r"[^A-Za-z0-9():;.,!?\'`]", " ", string)
            string = re.sub(r"([.?!](\s*)){2,}", ".", string)
            sentence_list_tmp = re.split(r'[;.!?]', string.strip())
            sentence_list.extend(list(filter(None, sentence_list_tmp)))
        # string = re.sub(r"<br />", " ", string)  # get rid of line breaks
        # string = re.sub(r"[^A-Za-z0-9():.,!?\'`]", " ", string)
        # string = re.sub(r"([.?!](\s*)){2,}", ".", string)
        # sentence_list = re.split(r'[.!?]', string.strip())
        # sentence_list = list(filter(None, sentence_list))
        return sentence_list

    def count_word(self, string):
        string = re.sub(r"[^A-Za-z0-9():.,!?\'`]", " ", string)
        return len(string.strip().split())

    def split_doc(self):
        data_path = '/home/zhangxin/sentiment_analysis/hedwig-data/datasets/aclImdb/'
        output_data_path = '/home/zhangxin/sentiment_analysis/hedwig-data/datasets/IMDB_stanford/'
        text = []
        max_sentence = 0
        sentence_len = []
        max_word_len = []
        for name in ['train', 'test']:
            subset = 'test' if name == 'dev' else name
            data = []
            for rate in ['pos', 'neg']:
                input_data_dir = os.path.join(data_path, subset, rate)
                file_list = os.listdir(input_data_dir)
                # import pdb; pdb.set_trace()
                for file in file_list:
                    if file.split('.')[-1] != 'txt':
                        continue
                    with open(os.path.join(input_data_dir, file), 'r') as review:
                        sen_list = self.split_sentence(review.readline())
                        max_sentence = max(len(sen_list), max_sentence)
                        sentence_len.append(len(sen_list))
                        max_word_len.append(max([self.count_word(sen) for sen in sen_list]))
                        # if len(sen_list) > 117:
                        #     import pdb; pdb.set_trace()
        sentence_len.sort()
        max_word_len.sort()
        print(max_sentence)
        self.write_xls([sentence_len, max_word_len])
        # import pdb; pdb.set_trace()
        logger.info('Success!')

    def pre_train(self):
        # Train the Punkt sentence tokenizer on the raw Reuters corpus.
        self.pst.train(reuters.raw())
        # trainer = PunktTrainer()
        # trainer.INCLUDE_ALL_COLLOCS = True
        # trainer.train(text)
        # tokenizer = PunktSentenceTokenizer(trainer.get_params())
        logger.info('pretrain success!')

    def __len__(self):
        return self.dim

    def write_xls(self, data):
        file = Workbook(encoding='utf-8')  # open the workbook with utf-8 encoding
        table = file.add_sheet('data')  # name of the sheet to write into
        method = 'origin'
        num_freq = Counter(data[0])
        word_num_freq = Counter(data[1])
        for i, num in enumerate(data[0]):
            table.write(i, 1, num)
        for i, name in enumerate(num_freq):
            table.write(i, 2, name)
            table.write(i, 3, num_freq[name])
        for i, name in enumerate(word_num_freq):
            table.write(i, 4, name)
            table.write(i, 5, word_num_freq[name])
        file.save('/home/zhangxin/sentiment_analysis/hedwig/models/oh_cnn_HAN/data_{}.xls'.format(method))

    def onehot2int(self, label):
        '''str -> list of int: indices of the '1' bits in a one-hot/multi-hot label string'''
        label = list(label)
        return [i for i, l in enumerate(label) if l == '1']

    def tsv2np(self):
        database = 'Reuters'
        dataset = ['train', 'test']
        output_file = './{}_data.pkl'.format(database)
        data_path = '/home/zhangxin/sentiment_analysis/hedwig-data/datasets/{}/'.format(database)
        data = {}
        labels = {}
        for name in dataset:
            doc = []
            doc_count = []
            labels[name] = []
            with open(data_path + '{}.tsv'.format(name), 'r') as input_tsv:
                tsv_data_raw = csv.reader(input_tsv, delimiter='\t')
                for label, text in tsv_data_raw:
                    labels[name].append(self.onehot2int(label))
                    # if name == 'train':
                    doc_count.extend(self.clean_string(text))
            # if name == 'train':
            word_frequence = Counter(doc_count)
            # import pdb; pdb.set_trace()
            # top_3w = word_frequence.most_common(30000 - 2)  # the last slot holds the unk vector
            # self.itos.extend(['<UNK>'])
            # self.stoi['<UNK>'] = 0
            # # self.stoi['<PAD>'] = 1
            # for i, (word, count) in enumerate(top_3w):
            #     self.itos.append(word)
            #     self.stoi[word] = i + 1
            # with open(data_path + '{}.tsv'.format(name), 'r') as input_tsv:
            #     tsv_data_raw = csv.reader(input_tsv, delimiter='\t')
            #     for label, text in tsv_data_raw:
            #         doc_tmp = list(map(self.clean_string_np, self.split_sentence(text)))
            #         # doc_tmp = list(filter(None, doc_tmp))
            #         doc.append(doc_tmp)
            # data[name] = doc
        # IMDB_data_struct is assumed to be defined elsewhere in the project.
        data_store = IMDB_data_struct(self.itos, self.stoi, labels, data)
        with open(output_file, 'wb') as file:
            pickle.dump(data_store, file)

    def tsv_count(self):
        dataset = ['train', 'test']
        output_file = './imdb_flaw.pkl'
        data_path = '/home/s/CNN-BiLSTM2/hedwig-data/datasets/IMDB_stanford/'
        data = {}
        labels = {}
        broke_text = {}
        broke_text_list = {}
        broke_text_data = {}
        max_lensen = 0
        max_doc_len = 0
        mean_sen = []
        mean_doc = []
        for name in dataset:
            doc = []
            doc_count = []
            labels[name] = []
            with open(data_path + '{}.tsv'.format(name), 'r') as input_tsv:
                broke_text[name] = 0
                broke_text_list[name] = []
                broke_text_data[name] = []
                tsv_data_raw = csv.reader(input_tsv, delimiter='\t')
                for idx, (label, text) in enumerate(tsv_data_raw):
                    doc_tmp = list(map(self.clean_string, self.split_sentence(text)))
                    doc_tmp = list(filter(None, doc_tmp))
                    max_lensen = max(max([len(sen) for sen in doc_tmp]), max_lensen)
                    mean_sen.append(sum([len(sen) for sen in doc_tmp]) / len(doc_tmp))
                    mean_doc.append(len(doc_tmp))
                    # if max_lensen == 275:
                    #     import pdb; pdb.set_trace()
                    max_doc_len = max(max_doc_len, len(doc_tmp))
                    if len(doc_tmp) > 50 or max([len(sen) for sen in doc_tmp]) > 100:
                        broke_text[name] += 1
                        broke_text_list[name].append(idx)
                        # broke_text_data[name].append(text)
                    # for i, sen in enumerate(doc_tmp):
                    #     if len(sen) == 0:
                    #         import pdb; pdb.set_trace()
                    # if min([len(i) for i in doc_tmp]) == 0:
                    #     broke_text[name] += 1
                    #     broke_text_list[name].append(idx)
                    #     broke_text_data[name].append(text)
        mean_sen = sum(mean_sen) / 50000  # the IMDB set has 50,000 reviews in total
        mean_doc = sum(mean_doc) / 50000
        with open(output_file, 'wb') as file:
            pickle.dump([broke_text, broke_text_list], file)
        # import pdb; pdb.set_trace()
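# Usage sketch (illustrative; the toy vocabulary below is hypothetical, not
# part of the original pipeline). With `stoi` populated, __getitem__ returns a
# 30000-dim vector with `fill_value` at the token's index; unknown tokens fall
# back to the reserved unk vector whose last slot is set.
#
#     vec = One_hot_vector()
#     vec.stoi = {'movie': 0, 'great': 1}              # hypothetical vocabulary
#     assert vec['movie'][0] == vec.fill_value         # one-hot slot scaled by fill_value
#     assert vec['unseen_word'][-1] == vec.fill_value  # unk fallback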
# Standalone demo: train a custom Punkt sentence tokenizer on one State of the
# Union address and apply it to another.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
# PunktSentenceTokenizer trains on the text passed to its constructor;
# PunktSentenceTokenizer.train is an instance method, not a class method.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
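# Quick check (a minimal sketch; assumes the state_union corpus was fetched via
# nltk.download('state_union')): print the first few detected sentences to
# eyeball the trained tokenizer's boundaries.
for sentence in tokenized[:3]:
    print(sentence)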