import random


def make_esdc_candidate(esdc_structure, esdc_field_to_texts):
    """Builds a candidate ESDC by sampling a phrase for each nonempty field
    of esdc_structure, concatenating the phrases, and mapping each token
    back to its character offsets in the concatenated text."""
    field_map = {}
    entire_text = ""
    for field in ExtendedSdc.fieldNames:
        if not esdc_structure.childIsEmpty(field):
            text = random.choice(esdc_field_to_texts[field])
            entire_text += text + " "
            # Character range of `text` inside entire_text, excluding the
            # trailing space just appended.
            start = len(entire_text) - len(text) - 1
            end = start + len(text)
            field_map[field] = (start, end)

    field_standoffs = dict((f, [[entire_text[r[0]:r[1]], r]])
                           for f, r in field_map.iteritems())

    # Split each field's phrase into word-level standoffs, offsetting each
    # token's indices by the phrase's start within entire_text.  tokenize()
    # returns standoff tokens exposing .text and .start (see the tests
    # below).
    tokenizer = IndexedTokenizer()
    for field, standoffs in field_standoffs.iteritems():
        results = []
        for text, range_tuple in standoffs:
            start = range_tuple[0]
            for token in tokenizer.tokenize(text):
                word_range = [start + token.start,
                              start + token.start + len(token.text)]
                results.append([token.text, word_range])
        field_standoffs[field] = results

    esdc_candidate = esdcIo.fromYaml(entire_text,
                                     {esdc_structure.type: field_standoffs})
    return esdc_candidate
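# Hedged usage sketch for make_esdc_candidate: the field names and phrase
# lists below are hypothetical placeholders (real ones come from an
# annotated corpus), and `annotated_esdc` is assumed to be an ExtendedSdc
# whose nonempty fields are a subset of the mapping's keys.
esdc_field_to_texts = {
    "figure": ["the doors", "the elevators"],
    "relation": ["go through", "walk past"],
    "landmark": ["the hallway", "the door near the elevators"],
}
candidate = make_esdc_candidate(annotated_esdc, esdc_field_to_texts)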
def testPunktTokenizer(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of',
                      'you', ',', 'your', 'destination', 'will', 'be', 'the',
                      'first', 'door', 'to', 'your', 'left', '(', '36-880',
                      ')', '.'])
    # Each token's standoff must recover its text from the original string.
    for token in tokens:
        self.assertEqual(string[token.start:token.end], token.text)
def testPunktTokenizerContraction(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    string = " You'll see a large white question mark."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['You', "'ll", 'see', 'a', 'large', 'white',
                      'question', 'mark', '.'])
    for token in tokens:
        self.assertEqual(string[token.start:token.end], token.text)
def testPunktTokenizerNiceView(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    # The irregular spacing is deliberate: the expected start offsets below
    # only hold for this exact string.
    string = "you should have  a    nice   view ."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['you', 'should', 'have', 'a', 'nice', 'view', '.'])
    self.assertEqual([t.start for t in tokens], [0, 4, 11, 17, 22, 29, 34])
    for token in tokens:
        self.assertEqual(string[token.start:token.end], token.text)
def testMultipleSentences(self):
    tokenizer = IndexedTokenizer()
    sentences = """With your back to the windows, walk straight through the door near the elevators. Continue to walk straight, going through one door until you come to an intersection just past a whiteboard. Turn left, turn right, and enter the second door on your right (sign says "Administrative Assistant"). """
    tokens = tokenizer.tokenize(sentences)
    for token in tokens:
        print str(token)
    # "elevators" is the 15th token; the sentence-final period is its own
    # token immediately after it.
    self.assertEqual(tokens[14].text, "elevators")
    self.assertEqual(tokens[15].text, ".")
import collections


def pos_histograms(discourses, pos_tagger,
                   tag_groups={"Verbs": ["VBZ", "VB"],
                               "Nouns": ["NN", "NNS"],
                               "Prepositions": ["IN", "TO", "UNTIL", "OF"],
                               "Adjectives": ["JJ"]}):
    """Collects words from the discourses into groups by part-of-speech
    tag, prints per-group counts and frequent words, and returns the
    group-to-words mapping (including an "all" group with every token)."""
    stokenizer = SentenceTokenizer()
    tokenizer = IndexedTokenizer()
    tag_groups_to_words = collections.defaultdict(lambda: list())
    for discourse in discourses:
        for sentence_standoff in stokenizer.tokenize(discourse):
            tokens = tokenizer.tokenize(sentence_standoff.text)
            # Tag each sentence once, then file every token under "all" and
            # under each group whose tag set contains its tag.
            tags = pos_tagger.tag([t.text.lower() for t in tokens])
            for token, tag in tags:
                tag_groups_to_words["all"].append(token)
                for key_tag, tag_group in tag_groups.iteritems():
                    if tag in tag_group:
                        tag_groups_to_words[key_tag].append(token)

    print "dumping counts"
    for pos, words in tag_groups_to_words.iteritems():
        print len(words), pos
        print len(set(words)), "unique", pos
        w_to_counts = collections.defaultdict(lambda: 0)
        for w in words:
            w_to_counts[w] += 1
        cnt_target = 10
        frequent_words = [(w, cnt) for w, cnt in w_to_counts.iteritems()
                          if cnt > cnt_target]
        print len(frequent_words), "words appeared more than %d times" % cnt_target
        print frequent_words
    print "done"
    return tag_groups_to_words
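# Hedged usage sketch for pos_histograms: assumes an NLTK-style tagger
# exposing .tag(words) -> [(word, tag), ...].  The unigram tagger and toy
# discourse below are illustrative (the Brown corpus must be downloaded),
# not the tagger this package actually uses.
import nltk

toy_discourses = ["Walk straight through the door near the elevators."]
unigram_tagger = nltk.UnigramTagger(nltk.corpus.brown.tagged_sents())
histograms = pos_histograms(toy_discourses, unigram_tagger)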
import re


def handleEsdcContents(argMap, entireText):
    """Converts a YAML-derived argument map into a dictionary mapping ESDC
    field names to lists of TextStandoffs (or to nested ESDCs)."""
    outputDict = {}
    for argName, argValue in argMap.iteritems():
        assert argName in ExtendedSdc.fieldNames, \
            ("Arg " + repr(argName) + " not in names." +
             " Value: " + repr(argValue))
        if argName == "id":
            outputDict["esdc_id"] = argValue
        elif isinstance(argValue, str):
            matches = list(re.finditer(re.escape(argValue), entireText))
            if len(matches) == 1:
                # Unique match: derive one standoff per whitespace token.
                match = matches[0]
                tokens = []
                matchText = match.group()
                currentIndex = 0
                for token in matchText.split():
                    # We've ensured the index both exists and is unique.
                    tokenIdx = (matchText[currentIndex:].index(token) +
                                currentIndex)
                    standoff = TextStandoff(
                        entireText,
                        (match.start() + tokenIdx,
                         match.start() + tokenIdx + len(token)))
                    currentIndex = tokenIdx + len(token)
                    tokens.append(standoff)
                outputDict[argName] = tokens
            else:
                # Zero or multiple matches: print candidate standoffs in a
                # YAML-like form so the annotator can pick indices, then
                # fail.
                candidates = [[match.start(), match.end()]
                              for match in matches]
                tokenizer = IndexedTokenizer()
                for candidate in candidates:
                    print "candidate", candidate
                    for standoff in tokenizer.tokenize(argValue):
                        print "- -", standoff.text
                        start_idx = standoff.start
                        print "  - [%d, %d]" % (
                            candidate[0] + start_idx,
                            candidate[0] + start_idx + len(standoff.text))
                raise ValueError("Must provide indices for token: '" +
                                 argValue + "' in text '" + entireText +
                                 "'. matches: " + repr(candidates))
        elif isEsdc(argValue):
            outputDict[argName] = list(fromYaml(entireText, argValue))
        elif isWordList(argValue):
            tokens = []
            try:
                for token, (start, end) in argValue:
                    substring = entireText[start:end]
                    if substring != token:
                        print "candidates"
                        for match in re.finditer(re.escape(token),
                                                 entireText):
                            print [match.start(), match.end()]
                        raise ValueError("Token '" + token +
                                         "' must correspond to index " +
                                         repr((start, end)) +
                                         " and not '" + substring + "'.")
                    tokens.append(TextStandoff(entireText, (start, end)))
            except:
                print "Problem with", argValue
                raise
            outputDict[argName] = tokens
        else:
            raise ValueError("Must be strings or ESDCs: " + repr(argValue))
    return outputDict
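# For reference, the two leaf-field encodings handleEsdcContents accepts,
# shown on a toy text (the field name and text are illustrative):
entire_text = "go through the door"
# Form 1: a plain string; it must occur exactly once in entire_text, or a
# ValueError is raised after printing candidate indices for the annotator.
arg_map_unique = {"landmark": "the door"}
# Form 2: an explicit word list of [token, [start, end]] pairs, required
# whenever the string is missing or ambiguous in entire_text.
arg_map_indexed = {"landmark": [["the", [11, 14]], ["door", [15, 19]]]}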
import pickle

import nltk
from nltk.tag import brill


def makeBrillTagger():
    # ... (construction of train_sents and the backoff raubt_tagger, plus a
    # commented-out template list, elided) ...
    # ]
    templates = brill.fntbl37()
    trainer = nltk.BrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
    pickle.dump(braubt_tagger, open("data/braubt_tagger.dat", "w"))
    return braubt_tagger


def makeTagger():
    # return makeBrillTagger()
    tagger = pickle.load(open(TKLIB_HOME + "/nlp/data/braubt_tagger.dat"))
    tagger = SpatialTagger(tagger)
    return tagger


wordTokenizer = IndexedTokenizer()


def tokenize(string):
    return wordTokenizer.tokenize(string)


def simpleTokenize(string):
    indexedTokens = splitIdx(string)
    tokens = []
    for token, index in indexedTokens:
        # Strip surrounding punctuation; keep the raw token if it was
        # entirely punctuation.
        newToken = trimPunctuation(token)
        if len(newToken) == 0:
            tokens.append(token)
        else:
            tokens.append(newToken)
    return tokens
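# Hedged example of the module-level helpers above; the token objects
# returned by tokenize() are assumed to follow the standoff API used in
# the tests (each exposes .text, .start, and .end).
for token in tokenize("Turn left at the whiteboard."):
    print token.text, (token.start, token.end)

print simpleTokenize("Turn left at the whiteboard.")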
def testEmpty(self):
    tokenizer = IndexedTokenizer()
    tokens = tokenizer.tokenize(" ")
    self.assertEqual(len(tokens), 0)