def textToText(buffer):
    """Contract the whitespace of ``buffer`` and derive a short title.

    Every character whose ordinal is below 32 is treated as a blank.
    Runs of blanks (as decided by ``texttools.isSpace``) are collapsed
    into a single space and blanks at the very beginning are dropped.
    A single trailing space is kept when the input ends with blanks.

    Args:
        buffer: the text to contract, iterated one character at a time.

    Returns:
        A ``(title, text)`` tuple where ``text`` is the contracted
        content and ``title`` is its first 60 characters.
    """
    # Collect the pieces in a list and join once at the end.  Unlike a
    # cStringIO buffer this also accepts non-ASCII unicode characters.
    pieces = []
    # Behave as if a blank had just been emitted so that leading blanks
    # produce no output.
    lastWasSpace = True
    for c in buffer:
        if ord(c) < 32:
            # Control characters (tab, newline, ...) count as blanks.
            c = ' '
        if texttools.isSpace(c):
            if not lastWasSpace:
                pieces.append(' ')
                lastWasSpace = True
        else:
            pieces.append(c)
            lastWasSpace = False

    text = ''.join(pieces)
    return (text[0:60], text)
def feed(self, data=None, input=None):
    """Push the characters of ``data`` through the markup state machine.

    Each character is examined according to ``self.__state``.  Text
    outside tags is accumulated in ``self.__text`` and flushed through
    ``handle_data`` when a tag or a ``<!--`` comment begins.  Comment
    bodies are passed to ``handle_comment``.  When a tag is closed by
    ``>`` the collected ``(name, value)`` pairs in ``self.__atts`` are
    dispatched to ``handle_starttag``, ``handle_endtag`` or
    ``handle_startendtag``; the first pair carries the tag name.  After a
    ``script`` or ``style`` start tag, everything up to the matching end
    tag is accumulated as text.

    Parser state lives on the instance, and ``self.__offset`` is advanced
    by the number of characters consumed once the loop ends.

    Args:
        data: indexable sequence of characters (``len()`` and integer
            indexing are used).  A falsy value results in no processing.
        input: object with a ``read(1)`` method.  NOTE(review): never
            read as written -- see the note inside the loop.
    """
    # NOTE(review): the block nesting of this method was reconstructed
    # from a copy whose indentation had been lost.  The statement order is
    # untouched, but the spots marked "nesting" below should be confirmed
    # against an intact copy of the file.
    if data:
        data_length = len(data)
    else:
        data_length = 0

    self.__pos = -1
    while 1:
        self.__pos += 1
        if self.__pos >= data_length:
            break
        if data:
            c = data[self.__pos]
        else:
            # NOTE(review): unreachable.  When ``data`` is falsy,
            # data_length is 0 and the ``break`` above fires on the first
            # iteration, so ``input`` is never consulted.
            c = input.read(1)
        # print self.__state, c

        # waiting for '<'
        if self.__state == WAITING_INF:
            if c == '<':
                self.__state = WAITING_FIRSTCHAR_ATTNAME
            else:
                self.__text.write(c)
        # waiting for tagname (first character after '<')
        elif self.__state == WAITING_FIRSTCHAR_ATTNAME:
            if c == '!':
                # "<!" : comment or other declaration
                self.__state = WAITING_NEXTCHAR_REM1
            elif c == '/':
                # "</" : closing tag; flush the pending text first
                self.handle_data(self.__text.getvalue())
                self.__text = cStringIO.StringIO()
                self.__endingTag = 1
                self.__state = WAITING_NEXTCHAR_ATTNAME
            elif texttools.isAlpha(c):
                # opening tag; flush the pending text and start a new
                # attribute list whose first name will be the tag name
                self.__endingTag = 0
                self.handle_data(self.__text.getvalue())
                self.__text = cStringIO.StringIO()
                self.__atts = []
                self.__attName = c
                self.__state = WAITING_NEXTCHAR_ATTNAME
            else:
                # Not a tag after all: keep the '<' as literal text.
                # NOTE(review): the state is not changed here, so the
                # parser keeps waiting for a tag name after this character.
                self.__text.write("<" + c)
        # after "<!"
        elif self.__state == WAITING_NEXTCHAR_REM1:
            if c == '-':
                self.__state = WAITING_NEXTCHAR_REM2
            elif c == '>':
                self.__state = WAITING_INF
            else:
                self.__state = WAITING_NEXTCHAR_LIGHTREM
        # inside "<!...": skip everything up to the next '>'
        elif self.__state == WAITING_NEXTCHAR_LIGHTREM:
            if c == '>':
                self.__state = WAITING_INF
        # after "<!-"
        elif self.__state == WAITING_NEXTCHAR_REM2:
            if c == '-':
                # "<!--" : flush the pending text, then collect the comment
                self.handle_data(self.__text.getvalue())
                self.__text = cStringIO.StringIO()
                self.__state = WAITING_IN_REM
            elif c == '>':
                self.__state = WAITING_INF
        # inside a comment body
        elif self.__state == WAITING_IN_REM:
            if c == '-':
                self.__state = WAITING_END_REM1
            else:
                self.__text.write(c)
        # one '-' seen inside a comment
        elif self.__state == WAITING_END_REM1:
            if c == '-':
                self.__state = WAITING_END_REM2
            else:
                # the '-' was part of the comment text
                self.__text.write("-" + c)
                self.__state = WAITING_IN_REM
        # "--" seen inside a comment
        elif self.__state == WAITING_END_REM2:
            if c == '>':
                # "-->" : the comment is complete
                self.handle_comment(self.__text.getvalue())
                self.__text = cStringIO.StringIO()
                self.__state = WAITING_INF
            else:
                # the "--" was part of the comment text
                self.__text.write("--" + c)
                self.__state = WAITING_IN_REM
        # reading a tag name or an attribute name; characters matching
        # none of the tests below are ignored
        elif self.__state == WAITING_NEXTCHAR_ATTNAME:
            if texttools.isAlphaNum(c):
                self.__attName += c
            elif texttools.isSpace(c):
                self.__state = WAITING_NEXT_AFF
            elif c == '=':
                self.__state = WAITING_FIRSTCHAR_VALUE
            elif c == '<':
                self.__state = READ_INF_END_TAG
            elif c == '>':
                self.__state = READ_SUP_END_TAG
        # blanks after a name, before the next attribute or '>'
        elif self.__state == WAITING_NEXT_AFF:
            if texttools.isSpace(c):
                pass
            elif c == '>':
                self.__state = READ_SUP_END_TAG
            else:
                # Store the finished name (with no value) and start a new
                # name with the current character.
                # NOTE(review): nesting -- only the append is assumed to be
                # guarded by ``if self.__attName``.
                if self.__attName:
                    self.__atts.append((self.__attName, self.__attValue))
                self.__attName = c
                self.__attValue = None
                self.__attCount += 1
                self.__state = WAITING_NEXTCHAR_ATTNAME
        # after '=': waiting for the first character of the value
        elif self.__state == WAITING_FIRSTCHAR_VALUE:
            if texttools.isSpace(c):
                pass
            elif c in ("\"", "'"):
                # quoted value: remember which quote closes it
                self.__quote = c
                self.__attValue = ""
                self.__state = WAITING_CHAR_IN_QUOTE_VALUE
            elif c == '<':
                # self.__state = READ_END_TAG
                # to verify...
                self.__state = READ_INF_END_TAG
            else:
                # unquoted value
                # NOTE(review): nesting -- the state change is assumed to
                # apply to both branches of the inner if/else.
                if not self.__attValue:
                    self.__attValue = c
                else:
                    self.__attValue += c
                self.__state = WAITING_CHAR_VALUE
        # inside an unquoted value
        elif self.__state == WAITING_CHAR_VALUE:
            if texttools.isSpace(c):
                # the value is finished: store the pair and clear the name
                self.__atts.append((self.__attName, self.__attValue))
                self.__attName = ""
                self.__attValue = None
                self.__state = WAITING_NEXTCHAR_ATTNAME
            elif c == '>':
                self.__state = READ_SUP_END_TAG
            else:
                if not self.__attValue:
                    self.__attValue = c
                else:
                    self.__attValue += c
        # inside a quoted value
        elif self.__state == WAITING_CHAR_IN_QUOTE_VALUE:
            if c == self.__quote:
                self.__state = WAITING_AFTER_QUOTE
            else:
                if not self.__attValue:
                    self.__attValue = c
                else:
                    self.__attValue += c
        # just after the closing quote; other characters are ignored
        elif self.__state == WAITING_AFTER_QUOTE:
            if texttools.isSpace(c):
                self.__state = WAITING_SPACE_AFTER_AFF
            elif c == '<':
                self.__state = READ_INF_END_TAG
            elif c == '>':
                self.__state = READ_SUP_END_TAG
        # blanks after a quoted value
        elif self.__state == WAITING_SPACE_AFTER_AFF:
            if texttools.isSpace(c):
                pass
            elif c == '<':
                self.__state = READ_INF_END_TAG
            elif c == '>':
                self.__state = READ_SUP_END_TAG
            else:
                # store the finished pair and start the next name
                self.__atts.append((self.__attName, self.__attValue))
                self.__attName = c
                self.__attValue = None
                self.__state = WAITING_NEXTCHAR_ATTNAME
        # inside the content of a script/style element
        elif self.__state == WAITING_IN_SCRIPT:
            if c == '<':
                self.__state = WAITING_END_SCRIPT1
            else:
                self.__text.write(c)
        # '<' seen inside script/style content
        elif self.__state == WAITING_END_SCRIPT1:
            if c == '/':
                # possible closing tag: collect its name in attName
                self.__attName = ""
                self.__endingTag = 1
                self.__state = WAITING_END_SCRIPT2
            else:
                # the '<' was part of the content
                self.__state = WAITING_IN_SCRIPT
                self.__text.write("<" + c)
        # "</" seen inside script/style content
        elif self.__state == WAITING_END_SCRIPT2:
            if texttools.isAlpha(c):
                self.__attName += c
            elif c == '>':
                # print self.__attName
                if self.__attName.lower() == self.__jumpedTag:
                    # matching end tag: flush the content and let the
                    # READ_SUP_END_TAG handling below dispatch it
                    self.handle_data(self.__text.getvalue())
                    self.__text = cStringIO.StringIO()
                    self.__state = READ_SUP_END_TAG
                else:
                    # some other end tag: keep it as content
                    self.__state = WAITING_IN_SCRIPT
                    self.__text.write("</" + self.__attName + c)
            else:
                self.__text.write("</" + self.__attName + c)
                self.__state = WAITING_IN_SCRIPT

        # if self.__state == READ_AFF:
        #     if self.__attName:
        #         self.__atts.append((self.__attName, self.__attValue))
        #     self.__attName = ""
        #     self.__state = WAITING_INF

        # A '>' was read: dispatch the tag.  This is a plain ``if`` so it
        # runs in the same iteration that set the state above.
        if self.__state == READ_SUP_END_TAG:
            # NOTE(review): nesting -- this whole dispatch is ambiguous.
            # As laid out, nothing is dispatched and the state is left at
            # READ_SUP_END_TAG when attName is empty (e.g. after an
            # unquoted value followed by a blank).  The other plausible
            # reading places the endingTag if/else outside
            # ``if self.__attName``.
            if self.__attName:
                if self.__lastC == '/' and self.__lastLastC == ' ':
                    # the two previous characters were " /"
                    self.handle_startendtag(self.__atts[0][0], self.__atts[1:])
                    self.__state = WAITING_INF
                else:
                    self.__atts.append((self.__attName, self.__attValue))
                    self.__state = WAITING_INF
                    if self.__endingTag:
                        self.handle_endtag(self.__atts[0][0], self.__atts[1:])
                        self.__state = WAITING_INF
                    else:
                        self.handle_starttag(self.__atts[0][0], self.__atts[1:])
                        if self.__atts[0][0].lower() in ('script', 'style'):
                            # remember which end tag closes the raw content
                            self.__state = WAITING_IN_SCRIPT
                            self.__jumpedTag = self.__atts[0][0].lower()
                        else:
                            self.__state = WAITING_INF
            # reset the per-tag accumulators
            self.__attName = ""
            self.__attValue = None
            self.__atts = []

        # A '<' was read inside a tag: store the pending name and treat the
        # '<' as the start of a new tag.  No handler is called and
        # self.__atts is not cleared here.
        if self.__state == READ_INF_END_TAG:
            if self.__attName:
                self.__atts.append((self.__attName, self.__attValue))
            self.__attName = ""
            self.__state = WAITING_FIRSTCHAR_ATTNAME

        # keep the two previous characters for the " /" test above
        self.__lastLastC = self.__lastC
        self.__lastC = c

    # self.__pos is the number of characters consumed from data
    self.__offset += self.__pos