Example #1
def make_esdc_candidate(esdc_structure, esdc_field_to_texts):
    field_map = {}
    entire_text = ""
    for field in ExtendedSdc.fieldNames:
        if not esdc_structure.childIsEmpty(field):
            text = random.choice(esdc_field_to_texts[field])
            entire_text += text + " "
            start = len(entire_text) - len(text) - 1
            end = start + len(text)
            field_map[field] = (start, end)
    field_standoffs = dict((f, [[entire_text[r[0]:r[1]], r]])
                           for f, r in field_map.iteritems())
    tokenizer = IndexedTokenizer()
    for field, standoffs in field_standoffs.iteritems():
        results = []
        for text, range_tuple in standoffs:
            start = range_tuple[0]
            indexes, tokens = tokenizer.tokenize(text)

            for idx, word in zip(indexes, tokens):
                range_tuple = [start + idx, start + idx + len(word)]
                results.append([word, range_tuple])
        field_standoffs[field] = results

    esdc_candidate = esdcIo.fromYaml(entire_text,
                                     {esdc_structure.type: field_standoffs})
    return esdc_candidate
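The bookkeeping above relies on the invariant that each field's (start, end) range indexes back into the concatenated entire_text. A minimal standalone sketch of that offset arithmetic, using plain Python and illustrative field names (not the actual ExtendedSdc.fieldNames):

fields = [("figure", "the truck"), ("relation", "near"), ("landmark", "the pallet")]
entire_text = ""
field_map = {}
for field, text in fields:
    entire_text += text + " "
    start = len(entire_text) - len(text) - 1  # where this field's text begins
    field_map[field] = (start, start + len(text))

assert entire_text == "the truck near the pallet "
for field, text in fields:
    start, end = field_map[field]
    assert entire_text[start:end] == text  # slicing recovers the field text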
Example #2
    def testPunktTokenizer(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of', 'you', ',', 'your', 'destination', 'will', 'be', 'the', 'first', 'door', 'to', 'your', 'left', '(', '36-880', ')', '.'])

        for token in tokens:
            self.assertEqual(string[token.start:token.end], token.text)
Example #3
    def testPunktTokenizerContraction(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = " You'll see a large white question mark."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['You', "'ll", 'see', 'a', 'large', 'white', 'question', 'mark', '.'])

        for token in tokens:
            self.assertEqual(string[token.start:token.end], token.text)
Example #4
    def testPunktTokenizerNiceView(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = "you should have  a    nice   view ."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['you', "should", 'have', 'a', 'nice', 'view', '.'])
        self.assertEqual([t.start for t in tokens],
                         [0,      4,       11,     17,   22,     29,     34])

        for token in tokens:
            self.assertEqual(string[token.start:token.end], token.text)
Example #5
    def testMultipleSentences(self):
        tokenizer = IndexedTokenizer()
        sentences = """With your back to the windows, walk straight through the door near the elevators.  Continue
    to walk straight, going through one door until you come to an intersection just
    past a whiteboard.  Turn left, turn right, and enter the second door on your right
    (sign says "Administrative Assistant").  """
        tokens = tokenizer.tokenize(sentences)
        for token in tokens:
            print str(token)

        self.assertEqual(tokens[14].text, "elevators")
        self.assertEqual(tokens[15].text, ".")
Example #6
def pos_histograms(discourses, 
                   pos_tagger,
                   tag_groups={"Verbs":["VBZ", "VB"],
                               "Nouns":["NN", "NNS"],
                               "Prepositions":["IN", "TO", "UNTIL", "OF"],
                               "Adjectives":["JJ"],
                               }):

    stokenizer = SentenceTokenizer()
    tokenizer = IndexedTokenizer()
    tag_groups_to_words = collections.defaultdict(list)
    

    for discourse in discourses:
        for sentence_standoff in stokenizer.tokenize(discourse):
            tokens = tokenizer.tokenize(sentence_standoff.text)
            # Tag each sentence once, then bucket every token by tag group;
            # "all" receives each token exactly once.
            tags = pos_tagger.tag([t.text.lower() for t in tokens])
            for token, tag in tags:
                tag_groups_to_words["all"].append(token)
                for key_tag, tag_group in tag_groups.iteritems():
                    if tag in tag_group:
                        tag_groups_to_words[key_tag].append(token)

    print "dumping counts"
    for pos, words in tag_groups_to_words.iteritems():
        print len(words), pos
        print len(set(words)), "unique", pos
        w_to_counts = collections.defaultdict(int)
        for w in words:
            w_to_counts[w] += 1

        cnt_target = 10
        frequent_words = [(w, cnt) for w, cnt in w_to_counts.iteritems() 
                          if cnt > cnt_target]
        print len(frequent_words)
        print "if appeared more than %d times" % cnt_target
        print frequent_words

    print "done"
    return tag_groups_to_words
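The word-frequency loop at the end of pos_histograms can also be expressed with collections.Counter; a small equivalent sketch using made-up words (not project data):

import collections

words = ["turn", "left", "turn", "right", "door", "turn"]
w_to_counts = collections.Counter(words)
cnt_target = 2
frequent_words = [(w, cnt) for w, cnt in w_to_counts.items() if cnt > cnt_target]
assert frequent_words == [("turn", 3)]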

        
Example #7
File: yamlReader.py Project: h2r/slu_core
def handleEsdcContents(argMap, entireText):
    outputDict = {}
    for argName, argValue in argMap.iteritems():
        assert argName in ExtendedSdc.fieldNames, ("Arg " + repr(argName) +
                                                   " not in names." +
                                                   " Value: " + repr(argValue))
        if argName == "id":
            outputDict["esdc_id"] = argValue
        elif isinstance(argValue, str):
            matches = list(re.finditer(re.escape(argValue), entireText))
            if len(matches) == 1:
                match = matches[0]
                tokens = []
                matchText = match.group()
                currentIndex = 0
                for token in matchText.split():
                    # we've ensured the index both exists and is unique.
                    tokenIdx = matchText[currentIndex:].index(
                        token) + currentIndex
                    standoff = TextStandoff(
                        entireText, (match.start() + tokenIdx,
                                     match.start() + tokenIdx + len(token)))
                    currentIndex = tokenIdx + len(token)
                    tokens.append(standoff)
                outputDict[argName] = tokens
            else:
                candidates = [[match.start(), match.end()]
                              for match in matches]
                token = argValue
                tokenizer = IndexedTokenizer()
                for candidate in candidates:
                    print "candidate", candidate
                    for standoff in tokenizer.tokenize(argValue):
                        print "- -", standoff.text
                        start_idx = standoff.start
                        print "  - [%d, %d]" % (candidate[0] + start_idx,
                                                candidate[0] + start_idx +
                                                len(token))

                raise ValueError("Must provide indices for token: '" +
                                 argValue + "' in text '" + entireText + "'."
                                 " matches: " + ` candidates `)

        elif isEsdc(argValue):
            outputDict[argName] = list(fromYaml(entireText, argValue))

        elif isWordList(argValue):
            tokens = []
            try:
                for token, (start, end) in argValue:
                    substring = entireText[start:end]
                    if substring != token:
                        print "candidates"
                        for match in re.finditer(token, entireText):
                            print [match.start(), match.end()]

                        raise ValueError("Token '" + token +
                                         "' must correspond" + " to index " +
                                         ` (start, end) ` + "and not '" +
                                         substring + "'.")
                    tokens.append(TextStandoff(entireText, (start, end)))
            except:
                print "Problem with", argValue
                raise
            outputDict[argName] = tokens

        else:
            raise ValueError("Must be strings or ESDCs: " + ` argValue `)

    return outputDict
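The token-alignment step above converts each whitespace token of a matched phrase into an absolute (start, end) range over entireText. A minimal standalone sketch of that arithmetic (plain Python, hypothetical sentence):

entireText = "go through the door near the elevators"
matchText = "the door"
matchStart = entireText.index(matchText)

currentIndex = 0
spans = []
for token in matchText.split():
    tokenIdx = matchText[currentIndex:].index(token) + currentIndex
    spans.append((matchStart + tokenIdx, matchStart + tokenIdx + len(token)))
    currentIndex = tokenIdx + len(token)

for (start, end), token in zip(spans, matchText.split()):
    assert entireText[start:end] == token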
Example #8
    templates = brill.fntbl37()
    trainer = nltk.BrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
    pickle.dump(braubt_tagger, open("data/braubt_tagger.dat", "w"))
    return braubt_tagger


def makeTagger():
    #    return makeBrillTagger()
    tagger = pickle.load(open(TKLIB_HOME + "/nlp/data/braubt_tagger.dat"))
    tagger = SpatialTagger(tagger)
    return tagger


wordTokenizer = IndexedTokenizer()


def tokenize(string):
    return wordTokenizer.tokenize(string)


def simpleTokenize(string):
    indexedTokens = splitIdx(string)
    tokens = []
    for token, index in indexedTokens:
        newToken = trimPunctuation(token)
        if len(newToken) == 0:
            # Keep the original token if trimming punctuation empties it
            # (e.g. a token that is entirely punctuation).
            tokens.append(token)
        else:
            tokens.append(newToken)
    return tokens
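trimPunctuation is not shown in this snippet. The sketch below is one plausible reading (an assumption, not the project's implementation): stripping leading and trailing punctuation, which also explains the empty-string fallback above, since an all-punctuation token such as "..." would otherwise vanish.

import string

def trim_punctuation_sketch(token):
    # Assumed behavior: remove punctuation only from the token's ends.
    return token.strip(string.punctuation)

assert trim_punctuation_sketch("left,") == "left"
assert trim_punctuation_sketch("(36-880).") == "36-880"
assert trim_punctuation_sketch("...") == ""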
Example #9
    def testEmpty(self):
        tokenizer = IndexedTokenizer()
        tokens = tokenizer.tokenize("  ")
        self.assertEqual(len(tokens), 0)