def getOtherTaggedText(info):
    """Wrap every token that lies outside a 5W1H span in TextMarker.othertags.

    TextMarker's begin/end tags are first rewritten to unambiguous
    placeholders (B_WHAT..B_HOW / E_WHAT..E_HOW) so per-token membership
    tests cannot collide; the original spellings are restored at the end.

    Fix: the original tested tags with a bare ``reduce`` (a Python 2
    builtin — NameError on Python 3 without a functools import) and
    counted matches via ``len([... if item])``.  ``any``-equivalent
    ``sum`` counting is used instead; adding 0 when nothing matches is
    identical to the original's conditional increment.
    """
    taggedtext = TextMarker.getTaggedText(info)
    btags2 = ['B_WHAT', 'B_WHO', 'B_WHEN', 'B_WHERE', 'B_WHY', 'B_HOW']
    etags2 = ['E_WHAT', 'E_WHO', 'E_WHEN', 'E_WHERE', 'E_WHY', 'E_HOW']
    # Swap in placeholder tags for unambiguous substring tests.
    for i, tag in enumerate(btags2):
        taggedtext = taggedtext.replace(TextMarker.btags[i], tag)
    for i, tag in enumerate(etags2):
        taggedtext = taggedtext.replace(TextMarker.etags[i], tag)
    text = ""
    state = 0  # nesting depth: number of currently open 5W1H spans
    for token in Tokenizer.getTokens(taggedtext):
        # Count begin-tags embedded in this token (0 when none — no-op add).
        state += sum(1 for tag in btags2 if tag in token)
        if state == 0:
            # Outside every span: wrap the token in the "other" tags.
            # NOTE: the original adds no trailing space on this branch.
            text += TextMarker.othertags[0] + token + TextMarker.othertags[1]
        else:
            text += token + " "
        # Count end-tags after emitting, matching the original order.
        state -= sum(1 for tag in etags2 if tag in token)
    # Restore the original begin/end tag spellings.
    for i, tag in enumerate(TextMarker.btags):
        text = text.replace(btags2[i], tag)
    for i, tag in enumerate(TextMarker.etags):
        text = text.replace(etags2[i], tag)
    return text
def __init__(self, _what, _who, _when, _where, _why, _how, _text):
    """Store cleaned 5W1H fields plus the full text and its tokenization.

    Each raw field is stripped of non-ASCII characters and '."' is padded
    to '. "' so the sentence splitter recognizes the sentence boundary.

    Fix: the identical clean-and-pad expression was repeated seven times;
    it is extracted into a local helper (DRY) with behavior unchanged.
    """
    def _clean(raw):
        # Normalize one raw field: drop non-ASCII, pad '."' for splitting.
        return Tokenizer.removeNonAscii(raw).replace(".\"", ". \"")

    self.what = _clean(_what)
    self.who = _clean(_who)
    self.when = _clean(_when)
    self.where = _clean(_where)
    self.why = _clean(_why)
    self.how = _clean(_how)
    self.text = _clean(_text)
    self.sentences = Tokenizer.getSentences(self.text)
    self.tokenized_sentences = [Tokenizer.getTokens(sentence)
                                for sentence in self.sentences]
def getMarkedText(info):
    """Expand span-level [bX]...[eX] markup into per-token beg_/in_ tags.

    "other" spans pass through unchanged; every other span is tokenized
    and each token wrapped in [beg_LABEL] (first token) or [in_LABEL].
    """
    omtext = TextMarker.getOtherTaggedText(info)
    pieces = []
    for label, span in re.findall(r'\[b(.+?)\](.+?)\[e.+?\]', omtext):
        if label == "other":
            pieces.append("[%s]%s[%s]" % (label, span, label))
        else:
            for idx, token in enumerate(Tokenizer.getTokens(span)):
                marker = "beg" if idx == 0 else "in"
                pieces.append("[%s_%s]%s[%s_%s]"
                              % (marker, label, token, marker, label))
    return "".join(pieces)
def getMarkedText(info):
    """Rewrite [bX]...[eX] span markup as token-level beg_/in_ tags.

    Spans labeled "other" are re-emitted verbatim; all other spans are
    split into tokens, the first tagged beg_LABEL and the rest in_LABEL.
    """
    omtext = TextMarker.getOtherTaggedText(info)
    result = ""
    for label, body in re.findall(r'\[b(.+?)\](.+?)\[e.+?\]', omtext):
        if label == "other":
            result += "[%s]%s[%s]" % (label, body, label)
            continue
        for position, word in enumerate(Tokenizer.getTokens(body)):
            tag = "beg" if position == 0 else "in"
            result += "[%s_%s]%s[%s_%s]" % (tag, label, word, tag, label)
    return result
def main():
    """Read the input file named on argv, tokenize, parse, and write HTML.

    Output goes to argv[2] when given, otherwise to "output.html".
    """
    inputBuffer = ""
    outputBuffer = ""
    if len(sys.argv) >= 2:
        inputBuffer = readFile(sys.argv[1])
    tokenizer = Tokenizer(inputBuffer)
    tokenizer.tokenize()
    parser = Parser(tokenizer.getTokens())
    parser.parse()
    outputBuffer = parser.getOutput()
    destination = sys.argv[2] if len(sys.argv) == 3 else "output.html"
    writeFile(destination, outputBuffer)
def getOtherTaggedText(info):
    """Tag every token outside a 5W1H span with TextMarker.othertags.

    The marker's begin/end tags are temporarily replaced by placeholder
    spellings (B_WHAT..B_HOW / E_WHAT..E_HOW) so substring tests per
    token are unambiguous, then restored before returning.

    Fix: replaces the bare ``reduce``-based or-folding (Python 2 builtin;
    NameError on Python 3 with no functools import visible) and the
    ``len([... if item])`` counting with a direct ``sum`` of matches.
    Adding 0 when no tag matches reproduces the original conditional
    increment exactly.
    """
    taggedtext = TextMarker.getTaggedText(info)
    btags2 = ['B_WHAT', 'B_WHO', 'B_WHEN', 'B_WHERE', 'B_WHY', 'B_HOW']
    etags2 = ['E_WHAT', 'E_WHO', 'E_WHEN', 'E_WHERE', 'E_WHY', 'E_HOW']
    # Substitute placeholders for the marker's own tag spellings.
    for i, tag in enumerate(btags2):
        taggedtext = taggedtext.replace(TextMarker.btags[i], tag)
    for i, tag in enumerate(etags2):
        taggedtext = taggedtext.replace(TextMarker.etags[i], tag)
    text = ""
    state = 0  # how many 5W1H spans are currently open
    for token in Tokenizer.getTokens(taggedtext):
        state += sum(1 for tag in btags2 if tag in token)
        if state == 0:
            # Token is outside all spans — wrap it (no trailing space,
            # matching the original).
            text += TextMarker.othertags[0] + token + TextMarker.othertags[1]
        else:
            text += token + " "
        state -= sum(1 for tag in etags2 if tag in token)
    # Put the original begin/end tag spellings back.
    for i, tag in enumerate(TextMarker.btags):
        text = text.replace(btags2[i], tag)
    for i, tag in enumerate(TextMarker.etags):
        text = text.replace(etags2[i], tag)
    return text