import re
import string

def heuristics(self, data, minwords=6, maxcomma=2, maxpunc=2, maxdigits=6):
    """Return True if `data` looks like a clean natural-language sentence."""
    punc = "#$%&'()*+-/:;<=>?@[\\]^_`{|}~"
    if '\n' in data:                    # reject multi-line fragments
        return False
    if "<" in data or ">" in data:      # reject leftover markup
        return False
    if data[0] in punc:                 # reject sentences starting with punctuation
        return False
    if minwords - 1 > data.count(' '):  # require at least `minwords` words
        return False
    if maxcomma < data.count(','):      # limit the number of commas
        return False
    for p in punc:                      # limit each punctuation character
        if maxpunc < data.count(p):
            return False
    if "Wikipedia" in data:             # skip self-referential sentences
        return False
    if re.search(r'\$[0-9]', data):     # skip monetary amounts like "$5"
        return False
    count = 0
    for n in string.digits:             # limit the total digit count
        count += data.count(n)
    if count > maxdigits:
        return False
    return True
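# A minimal usage sketch, not from the original: since heuristics() never
# touches `self`, None can stand in for the instance. The sample sentences
# below are invented for illustration.
samples = [
    "The quick brown fox jumps over the lazy dog every single day.",
    "Broken <b>markup</b> should be rejected",
    "It cost $5, which triggers the currency filter.",
]
kept = [s for s in samples if heuristics(None, s)]
print(kept)  # only the first sentence survives the filters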
def charData(self, data):
    # append the character data to parsedText, stripping leading
    # whitespace only at the very start of the document
    if not self.parsedText:
        self.parsedText += data.lstrip()
    else:
        self.parsedText += data
    # self.parsedText = re.sub(r'[\n\r\t\s]+', ' ', self.parsedText)
    self.parsedText = self.parsedText.replace('  ', ' ')  # collapse double spaces
    punktCompensation = 0
    for sequence in self.punktCompensate:
        punktCompensation += data.count(sequence)
    if punktCompensation:
        # compensate for chars omitted by punkt-sentence-tokenization decisions
        print '|' + data + '|'
        self.parsedText += 'x' * punktCompensation
    if self.inTag:
        self.cData = unicode(data)
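# For context, charData reads like an expat character-data callback. The wiring
# below is a hypothetical sketch: the TextHandler class and its attribute
# defaults are assumptions; only the attribute names come from the method above.
import xml.parsers.expat

class TextHandler(object):
    """Hypothetical owner of charData; attribute names mirror the method above."""
    def __init__(self):
        self.parsedText = ''
        self.punktCompensate = []  # sequences punkt omits (assumed empty here)
        self.inTag = False
        self.cData = u''

TextHandler.charData = charData  # attach the method defined above

handler = TextHandler()
parser = xml.parsers.expat.ParserCreate()
parser.CharacterDataHandler = handler.charData  # expat invokes this per text run
parser.Parse('<page>Example character data.</page>', True)
print(handler.parsedText)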
import json

# maximum words contained in a sentence
max_length = 100
# truncate content from the end of the sequence
trunc_type = 'post'
# pad with zeros at the end of the sequence
padding_type = 'post'
oov_tok = "<OOV>"
# This dataset has about 27,000 records; 20,000 are used for training and the rest for testing
training_size = 20000

# ==================================================================================
# Open the JSON file and pre-process it: the file holds one JSON object per line,
# so commas are inserted between the objects and the result is wrapped in brackets
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = f.read()
data = "[" + data.replace("}", "},", data.count("}") - 1) + "]"
data_store = json.loads(data)
# print(data_store)
# data_store is now a list of dictionaries

# ==================================================================================
sentences = []
labels = []
# each item is a dictionary with 'headline' and 'is_sarcastic' keys
for item in data_store:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
# ==================================================================================
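# The parameters above (max_length, trunc_type, padding_type, oov_tok,
# training_size) suggest a Keras tokenize-and-pad step as the natural next
# stage. This is a sketch of that step, not the original code; vocab_size is
# an assumed value not defined above.
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000  # assumed vocabulary cap

# split the data using training_size from above
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)  # build the word index on training data only

training_padded = pad_sequences(tokenizer.texts_to_sequences(training_sentences),
                                maxlen=max_length, padding=padding_type,
                                truncating=trunc_type)
testing_padded = pad_sequences(tokenizer.texts_to_sequences(testing_sentences),
                               maxlen=max_length, padding=padding_type,
                               truncating=trunc_type)
training_labels = np.array(labels[:training_size])
testing_labels = np.array(labels[training_size:])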