Example #1
def heuristics(self, data, minwords=6, maxcomma=2, maxpunc=2, maxdigits=6):
    # Requires `import re` and `import string` at module level.
    punc = "#$%&'()*+-/:;<=>?@[\\]^_`{|}~"
    if not data or '\n' in data:        # reject empty or multi-line strings
        return False
    if "<" in data or ">" in data:      # reject leftover markup
        return False
    if data[0] in punc:                 # reject strings starting with punctuation
        return False
    if minwords - 1 > data.count(' '):  # require at least `minwords` words
        return False
    if maxcomma < data.count(','):      # limit the number of commas
        return False
    for p in punc:                      # limit each punctuation character
        if maxpunc < data.count(p):
            return False
    if "Wikipedia" in data:             # drop sentences mentioning Wikipedia
        return False
    if re.search(r'\$[0-9]', data):     # drop sentences quoting dollar amounts
        return False
    count = 0                           # limit the total number of digits
    for n in string.digits:
        count += data.count(n)
    if count > maxdigits:
        return False
    return True
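
A minimal sketch of the filter in use, restated as a standalone function (the unused `self` dropped) so it runs on its own; the sample sentences are illustrative, not from the original source:

import re
import string

def heuristics(data, minwords=6, maxcomma=2, maxpunc=2, maxdigits=6):
    """Standalone restatement of the method above, for demonstration."""
    punc = "#$%&'()*+-/:;<=>?@[\\]^_`{|}~"
    if not data or '\n' in data:
        return False
    if "<" in data or ">" in data:
        return False
    if data[0] in punc:
        return False
    if minwords - 1 > data.count(' '):
        return False
    if maxcomma < data.count(','):
        return False
    if any(data.count(p) > maxpunc for p in punc):
        return False
    if "Wikipedia" in data or re.search(r'\$[0-9]', data):
        return False
    if sum(data.count(d) for d in string.digits) > maxdigits:
        return False
    return True

assert heuristics("The quick brown fox jumps over the lazy dog today")
assert not heuristics("<b>angle brackets are rejected</b> no matter what")
assert not heuristics("Too short to pass")
assert not heuristics("It cost $5 but the other nine items cost $123456 total")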
Example #2
def charData(self, data):
    # Append character data to the running text, stripping leading
    # whitespace only from the very first chunk.
    if not self.parsedText:
        self.parsedText += data.lstrip()
    else:
        self.parsedText += data
    # self.parsedText = re.sub(r'[\n\r\t\s]+', ' ', self.parsedText)
    self.parsedText = self.parsedText.replace('  ', ' ')  # collapse double spaces

    # Compensate for characters omitted by punkt sentence-tokenization
    # decisions by padding with placeholder 'x' characters.
    punktCompensation = 0
    for sequence in self.punktCompensate:
        punktCompensation += data.count(sequence)
    if punktCompensation:
        print('|' + data + '|')
        self.parsedText += 'x' * punktCompensation
    if self.inTag:
        self.cData = str(data)  # `unicode(data)` in the original Python 2 code
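
A sketch of how such a handler can be wired into an expat parser; the `TextExtractor` class below is a hypothetical, trimmed-down host for the method (it omits the punkt compensation and tag tracking):

import xml.parsers.expat

class TextExtractor(object):
    # Hypothetical host class; only the text-appending part of charData()
    # is reproduced here.
    def __init__(self):
        self.parsedText = ''

    def charData(self, data):
        if not self.parsedText:
            self.parsedText += data.lstrip()
        else:
            self.parsedText += data
        self.parsedText = self.parsedText.replace('  ', ' ')

extractor = TextExtractor()
parser = xml.parsers.expat.ParserCreate()
parser.CharacterDataHandler = extractor.charData
parser.Parse('<doc>  Hello, <b>world</b>!</doc>', True)
print(extractor.parsedText)  # -> 'Hello, world!'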
Example #3
import json

# maximum number of words in a sentence
max_length    = 100
# truncate from the end, so content is lost from the back of a long sequence
trunc_type    = 'post'
# pad with 0 at the end of the sequence
padding_type  = 'post'
oov_tok       = "<OOV>"
# The data has 27000 records; 20000 are used for training and the rest for testing
training_size = 20000

# ==================================================================================================================================================================
# Open the JSON file and pre-process it
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = f.read()

# The file contains one JSON object per line; add a comma after every "}"
# except the last and wrap the result in brackets so it parses as one JSON
# array. (This is fragile: it breaks if "}" ever appears inside a string.)
data = "[" + data.replace("}", "},", data.count("}") - 1) + "]"
data_store = json.loads(data)
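# An alternative (not in the original) that avoids the brittle replace() hack
# by parsing each line as its own JSON object:
#
#     data_store = [json.loads(line) for line in data.splitlines() if line.strip()]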

# print(data_store)
# data_store is now a list of dictionaries

# ==================================================================================================================================================================
sentences = []
labels    = []

# Each item is a dictionary with 'headline' and 'is_sarcastic' keys
for item in data_store:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

# ==================================================================================================================================================================
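The configuration values defined at the top are the usual inputs to Keras text preprocessing. A minimal sketch of the step that would typically follow, assuming TensorFlow's Keras tokenizer and padding utilities; `vocab_size` is an assumed hyperparameter not present in the original snippet:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000  # assumed; not defined in the original snippet

# Split the data at training_size, as announced in the comment above
training_sentences = sentences[:training_size]
testing_sentences  = sentences[training_size:]

# Fit the tokenizer on the training sentences only, mapping unseen words to oov_tok
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Convert sentences to integer sequences, then pad/truncate at the end
# of each sequence as configured by padding_type and trunc_type
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)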