def check_phrase_for_occupations_nobreaks(s):
    import nlp
    from occ import set2code

    found = []

    words = nlp.word_tokenize(s)
    words = [nlp.lemmatize(x) for x in words]

    # collect every contiguous 1- to 4-word combination as an unordered set
    sets = set()
    for n in range(1, 5):
        sets.update(nlp.getCloseUnorderedSets(words, minTuple=n, maxTuple=n, maxBuffer=0))

    for fs in sets:
        if fs in set2code:
            c = set2code[fs]["code"]
            found.append({"word": " ".join(fs), "occ": [c], "fs": fs})

    # keep only maximal matches: drop any match whose word set is strictly
    # contained in another match's word set
    def is_subset_anyone(x):
        for y in found:
            if x['fs'] != y['fs'] and x['fs'].issubset(y['fs']):
                return True
        return False

    found = [x for x in found if not is_subset_anyone(x)]
    return found
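
# A minimal, dependency-free sketch of the maximal-match filtering above: the
# frozensets stand in for matched term sets, and only matches not strictly
# contained in another match survive (the terms here are hypothetical).
def _demo_subset_suppression():
    found = [
        {"word": "teacher", "fs": frozenset({"teacher"})},
        {"word": "school teacher", "fs": frozenset({"school", "teacher"})},
    ]
    keep = [x for x in found
            if not any(x["fs"] != y["fs"] and x["fs"].issubset(y["fs"])
                       for y in found)]
    return keep  # only the two-word match survives; "teacher" is dropped
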
def choose_function(self, r_content):
    start_function_point = 0
    shutdown_function_point = 0

    # score the request against each function's keyword list
    for _key in self.start_function_keywords:
        if _key.strip() in r_content:
            start_function_point += 1
    for _key in self.shutdown_function_keywords:
        if _key.strip() in r_content:
            shutdown_function_point += 1

    max_point = max(start_function_point, shutdown_function_point)

    # debug
    if self.properties['debugmode']:
        print('[start_function: ' + str(start_function_point) + ']')
        print('[shutdown_function: ' + str(shutdown_function_point) + ']')

    # on a tie, ask the user, then fold the request's tokens into the chosen
    # function's keyword list so the next decision is sharper
    if [start_function_point, shutdown_function_point].count(max_point) > 1:
        a = input("Not very clear. Which function should I choose?\n"
                  "1. start_function\n"
                  "2. shutdown_function\n"
                  "Your choice: ")
        if a == '2':
            self.shutdown_function_keywords += nlp.word_tokenize(r_content)
            self.shutdown_function_keywords = nlp.remove_duplicate_in_list(self.shutdown_function_keywords)
            self.save_keywords_data_xml()
            return 'shutdown_function'
        else:
            self.start_function_keywords += nlp.word_tokenize(r_content)
            self.start_function_keywords = nlp.remove_duplicate_in_list(self.start_function_keywords)
            self.save_keywords_data_xml()
            return 'start_function'

    if max_point == start_function_point:
        return 'start_function'
    return 'shutdown_function'
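
# A standalone sketch of the keyword-vote idea in choose_function (the keyword
# lists here are illustrative; the real ones live on self and grow over time):
def _demo_function_scores(r_content,
                          start_kw=("start", "open", "launch"),
                          shutdown_kw=("shutdown", "stop", "close")):
    start = sum(k in r_content for k in start_kw)
    stop = sum(k in r_content for k in shutdown_kw)
    # a tie would trigger the interactive prompt above; ties are broken here
    # in favor of start_function, as in choose_function
    return "shutdown_function" if stop > start else "start_function"
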
def check(s):
    import nlp
    from occ import term2code

    found = []

    words = nlp.word_tokenize(s.lower())
    # scan two copies of the token list, separated by a "-" sentinel
    words += ["-"] + nlp.word_tokenize(s.lower())

    # proceed from the largest tuples down to single words; a term matched at
    # size n blocks its direct (n-1)-word sub-tuples on the next pass, so
    # codes are not counted inside longer codes
    max_tuples = 4
    current_tuples = max_tuples
    process_now = set(nlp.getTuples(words, minTuple=max_tuples, maxTuple=max_tuples))
    while current_tuples > 0:
        dont_process_next = set()
        for tup in process_now:
            tocheck = " ".join(tup)
            if tocheck in term2code:
                c = term2code[tocheck]["code"]
                found.append({"word": tocheck, "occ": [c]})
                dont_process_next.update(
                    nlp.getTuples(list(tup), minTuple=current_tuples - 1, maxTuple=current_tuples - 1))
        process_now = set(
            nlp.getTuples(words, minTuple=current_tuples - 1, maxTuple=current_tuples - 1))
        process_now = process_now.difference(dont_process_next)
        current_tuples -= 1

    return found
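
# A pure-Python sketch of the longest-match-first scan in check(): a term
# matched at size n blocks its two direct (n-1)-gram sub-tuples on the next
# pass (the term table here is illustrative; real lookups use occ.term2code).
def _demo_longest_first(words, table, max_n=4):
    found = []
    process = {tuple(words[i:i + max_n]) for i in range(len(words) - max_n + 1)}
    for n in range(max_n, 0, -1):
        skip = set()
        for g in process:
            if " ".join(g) in table:
                found.append(" ".join(g))
                skip.update((g[:-1], g[1:]))
        process = {tuple(words[i:i + n - 1])
                   for i in range(len(words) - n + 2)} - skip
    return found

# e.g. _demo_longest_first("high school teacher".split(),
#                          {"high school teacher", "school teacher"})
# returns ['high school teacher'] only; like check(), the suppression is one
# level deep per pass.
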
def run(self):
    from nlp import sent_tokenize, word_tokenize, lemmatize

    fb = self['fullBody']
    possibilities = sent_tokenize(fb)
    # return the first sentence whose lemmas contain "die"
    for p in possibilities:
        words = word_tokenize(p)
        words = [lemmatize(x) for x in words]
        if 'die' in words:
            return p
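
# A standalone sketch of the scan in run(), with a toy lemmatizer standing in
# for nlp.lemmatize (the assumption, implied by the check above, is that the
# project lemmatizer maps "died"/"dies"/"dying" to "die"):
def _demo_find_death_sentence(sentences):
    toy_lemmas = {"died": "die", "dies": "die", "dying": "die"}
    for s in sentences:
        words = [toy_lemmas.get(w.lower(), w.lower()) for w in s.split()]
        if "die" in words:
            return s

# _demo_find_death_sentence(["He was born in 1902.", "He died on Tuesday."])
# -> "He died on Tuesday."
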
def run(self):
    import nlp

    my_props = set()

    toSearch = self.ofWhat['firstSentence']
    # don't want a lexicon word (or a name) to be spotted in
    # "died on the 3rd with his family"
    toSearch = toSearch.split("died")[0]
    toSearch = toSearch.split("dead")[0]
    toSearch = toSearch.split("killed")[0]
    toSearch = toSearch.split("drowned")[0]
    # their own name might get confusing for this analysis...
    toSearch = toSearch.replace(self.ofWhat["name"], "")

    # intelligent tokenization
    toSearchWords = nlp.word_tokenize(toSearch)

    kinMatch = 0
    kinMatchStronger = 0
    lexicon = nlp.inquirer_lexicon["KIN"]
    for x in toSearchWords:
        if x.upper() in lexicon:
            kinMatch += 1
    # "son of", "mother of", ... count as stronger evidence
    for x in nlp.getTuples(toSearchWords, 2, 2):
        if x[0].upper() in lexicon and x[1].upper() == "OF":
            kinMatchStronger += 1

    if kinMatch > 0:
        my_props.add("lex_match")
    if kinMatchStronger > 0:
        my_props.add("lex_match_strong")

    # I also need a full name that matches in the last name...
    for names in nlp.getTuples(toSearchWords, 2, 3):
        # every word must be capitalized...
        if any(x[0].lower() == x[0] for x in names):
            continue
        # the last word must be the same last name!
        if names[-1].lower() != self.ofWhat["last_name"]:
            continue
        my_props.add("name_match")
        break

    return list(my_props)
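
# A standalone sketch of the two kinship checks in run() (the KIN entries here
# are illustrative; the real list is nlp.inquirer_lexicon["KIN"]):
def _demo_kin_match(words, kin=frozenset({"SON", "DAUGHTER", "FATHER", "MOTHER", "WIFE"})):
    weak = sum(w.upper() in kin for w in words)
    # a kin word directly followed by "of" is the stronger pattern
    strong = sum(a.upper() in kin and b.upper() == "OF"
                 for a, b in zip(words, words[1:]))
    return weak, strong

# _demo_kin_match("the son of a blacksmith".split())  # -> (1, 1)
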
def loadAssociations():
    global codes
    global term2code
    global set2code

    CSV_fn = path.join(path.dirname(__file__), "..", "w2c_source", "compiledCodes.csv")

    print("Loading term-code associations into variable 'codes' from %s..." % CSV_fn)
    print("Loading term dictionary into variable 'term2code' from %s..." % CSV_fn)

    with open(CSV_fn, 'r') as codesF:
        CSV_r = DictReader(codesF)
        codes = list(CSV_r)

    for code in codes:
        # exact-string lookup
        term2code[code["term"]] = code
        # order-insensitive lookup, keyed by the frozenset of lemmas
        words = nlp.word_tokenize(code["term"])
        words = [nlp.lemmatize(x) for x in words]
        set2code[frozenset(words)] = code
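
# Illustrative shape of compiledCodes.csv, inferred from the DictReader fields
# used above (at minimum a "term" and a "code" column; the values here are
# hypothetical placeholders):
#
#     term,code
#     school teacher,<code>
#     teacher,<code>
#
# After loading, term2code["school teacher"] returns the full CSV row, and,
# assuming the lemmatizer leaves these words unchanged,
# set2code[frozenset({"school", "teacher"})] returns the same row regardless
# of word order.
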
def run_old(self):
    import nlp

    ret = None

    # most consistently, the name is the first noun chunk
    def isName(x):
        if len(x.split()) < 2:
            return False
        if not nlp.isTitleCase(x):
            return False
        return True

    # start with NER from spacy:
    guesses = self.ofWhat['spacyFirstSentence'].ents
    guesses = [x for x in guesses if x.label_ == 'PERSON' and isName(x.text)]
    if len(guesses) > 0:
        # just use the first one; we'll probably need expansion, since NER
        # many times doesn't get parens, or Dr., Rev., etc.
        ret = guesses[0].text

    # then just try some noun chunking; the name is almost always the first
    # title-cased noun chunk
    if ret is None:
        nc = list(self.ofWhat['spacyFirstSentence'].noun_chunks)
        nc = list(filter(isName, map(str, nc)))
        if len(nc) > 0:
            ret = nc[0]

    # earlier experiments, kept for reference: checking the guessed name's
    # words against the title, and recovering the name by splitting the title
    # on "is dead", ",", "dies", "is slain", "of", "dead"
    return ret
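
# The isName gate above, shown on plain strings (str.istitle stands in for
# nlp.isTitleCase, which may be more permissive about particles like "de"):
def _demo_is_name(x):
    return len(x.split()) >= 2 and x.istitle()

# _demo_is_name("John Smith")  # -> True
# _demo_is_name("Smith")       # -> False (single token)
# _demo_is_name("the victim")  # -> False (not title case)
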
def run(self):
    import g
    from occ import set2code
    import nlp
    import wiki
    import re
    import coding

    """
    if len(self.ofWhat['spacyFirstSentence']) == 0:
        if self.debug:
            g.p("Skipping. No content after trim.")
        coding.stateCounter.update(["zeroLengthSkip"])
        return

    if self.debug:
        g.p.depth = 0
        g.p()
        g.p(self.ofWhat['spacyFirstSentence'])
        g.p.depth += 1
    """

    # WikiData lookup: occupational titles attached to the deceased's name
    dead_guys_occs = set()
    if len(self.ofWhat["name"]) > 0:
        words = wiki.lookupOccupationalTitles(self.ofWhat["name"])
        for x in words:
            # set2code is keyed by frozensets of lemmas (see loadAssociations)
            key = frozenset(nlp.lemmatize(w) for w in nlp.word_tokenize(x))
            if key in set2code:
                dead_guys_occs.add(set2code[key]["code"])
        if len(dead_guys_occs) > 0 and self.debug:
            g.p("WikiData returns %s which gives OCC %s" % (words, dead_guys_occs))

    if self.debug:
        g.p("Extracted name: %s" % self.ofWhat["name"])

    # extract information from the title
    dieWords = ['dies', 'die', 'dead']
    t = self.ofWhat['title']
    ts = [x.strip() for x in re.split(r'[;,]|--', t)]
    ts = ts[1:]  # the name is always the first one
    for tp in ts:
        tpW = [x.lower() for x in nlp.word_tokenize(tp)]
        # skip parts that announce the death itself
        if any(dW in tpW for dW in dieWords):
            continue
        # if it's a number (usually the age), skip it too
        try:
            int(tp)
            continue
        except ValueError:
            pass
        if self.debug:
            g.p("Extracted from title:", tp)

    didSomething = False
    guesses = []

    # Alec McGail, scientist and genius, died today.
    nameChildren = list(self.ofWhat["spacyName"].root.children)
    apposHooks = [c for c in nameChildren if c.dep_ == 'appos']
    if len(apposHooks) > 0:
        didSomething = True
        # painter, scientist, and architect
        baseNouns = nlp.followRecursive(apposHooks, 'conj')
        # one of the first **novelists**
        for i, x in enumerate(baseNouns):
            if nlp.isPrepPhrase(x) and str(x) == 'one':
                baseNouns[i] = nlp.enterPrepPhrase(x)[0]
        # now that the important "what they were" nouns are identified,
        # look up what OCC they are
        for n in baseNouns:
            key = frozenset([nlp.lemmatize(n.text.lower())])
            if key in set2code:
                guesses.append({"word": n.text, "occ": [set2code[key]["code"]]})
        return guesses

    # Alec McGail, who ..., died today.
    relcls = [c for c in nameChildren if c.dep_ == 'relcl']
    if len(relcls) > 0:
        g.p.depth += 1
        for relcl in relcls:
            # need to follow advcl and conj
            goDeep = nlp.followRecursive(relcl, ['advcl', 'conj'])
            be = ['was', 'became']
            for v in goDeep:
                # "as _"
                followPreps = nlp.followRecursive(v, ['relcl', 'prep', 'pobj'])
                asWhat = [x for x in followPreps
                          if next(x.ancestors).text == 'as' and x.pos_ == 'pobj']
                if self.debug and len(asWhat):
                    g.p('whoAs', asWhat)
                if len(asWhat):
                    didSomething = True
                # who was a scientist and inventor
                if v.pos_ == 'VERB' and v.text in be:
                    for vc in v.children:
                        if vc.dep_ != 'attr':
                            continue
                        if self.debug:
                            g.p('Expanded be verb', vc, vc.dep_)
                        # guesses.append(result)
                        didSomething = True

    finalGuess = []
    for guess in guesses:
        if len(guess['occ']) != 1:
            continue
        finalGuess.append(guess['occ'][0])
    if self.debug:
        g.p("finalGuess", finalGuess)

    if False:
        # more stupid guesses... literally expand every noun
        moreGuesses = []
        for w in self.ofWhat['spacyFirstSentence']:
            if w.pos_ != 'NOUN':
                continue
            moreGuesses.append(coding.nounOCC(w))
        stupidFinalGuess = []
        for guess in moreGuesses:
            stupidFinalGuess += guess['occ']
        if self.debug:
            g.p("stupidFinalGuess", stupidFinalGuess)
        if set(stupidFinalGuess) != set(finalGuess):
            g.p("And they're different!", extrad=1)

    if not didSomething:
        if len(dead_guys_occs) > 0:
            coding.stateCounter.update(["justWikidata"])
        else:
            if self.debug:
                g.p("Skipping. Strange grammatical construction.")
            coding.stateCounter.update(["strangeGrammar"])
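
# The dependency pattern the appos branch walks, shown on a plain spaCy doc.
# A hedged sketch: it needs a model such as en_core_web_sm, and the parse
# shown is the typical one for the example sentence from the comments above:
#
#     import spacy
#     nlp_model = spacy.load("en_core_web_sm")
#     doc = nlp_model("Alec McGail, scientist and genius, died today.")
#     name = next(doc.noun_chunks).root                         # McGail
#     appos = [t for t in name.children if t.dep_ == "appos"]   # [scientist]
#     conjs = [t for a in appos for t in a.conjuncts]           # [genius]
#
# followRecursive(apposHooks, 'conj') above presumably collects the appos
# heads plus their conj descendants, i.e. every "what they were" noun hanging
# off the name.
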
def choose_bot(self, r_content):
    chatbot_point = 0
    infobot_point = 0
    cmdbot_point = 0
    learningbot_point = 0

    # score the request against each bot's keyword list
    r_content_list = nlp.word_tokenize(r_content)
    for _key in self.chatbot_keywords:
        if _key.strip() in r_content_list:
            chatbot_point += 1
    for _key in self.infobot_keywords:
        if _key.strip() in r_content_list:
            infobot_point += 1
    for _key in self.cmdbot_keywords:
        if _key.strip() in r_content_list:
            cmdbot_point += 1
    for _key in self.learningbot_keywords:
        if _key.strip() in r_content_list:
            learningbot_point += 1

    max_point = max(chatbot_point, infobot_point, cmdbot_point, learningbot_point)

    # debug
    if self.properties['debugmode']:
        print('[chatBot: ' + str(chatbot_point) + ']')
        print('[infoBot: ' + str(infobot_point) + ']')
        print('[cmdBot: ' + str(cmdbot_point) + ']')
        print('[learningBot: ' + str(learningbot_point) + ']')
        print('[max_point: ' + str(max_point) + ']')

    # ask the user when two or more bots have the same score
    if [chatbot_point, infobot_point, cmdbot_point, learningbot_point].count(max_point) > 1:
        a = input("Not very clear. Which bot should I choose?\n"
                  "1. ChatBot\n"
                  "2. InfoBot\n"
                  "3. CmdBot\n"
                  "4. LearningBot\n"
                  "Your choice: ")
        if a == '2':
            self.infobot_keywords += nlp.word_tokenize(r_content)
            self.infobot_keywords = nlp.remove_duplicate_in_list(self.infobot_keywords)
            self.save_keywords_data_xml()
            return 'infobot'
        elif a == '3':
            self.cmdbot_keywords += nlp.word_tokenize(r_content)
            self.cmdbot_keywords = nlp.remove_duplicate_in_list(self.cmdbot_keywords)
            self.save_keywords_data_xml()
            return 'cmdbot'
        elif a == '4':
            self.learningbot_keywords += nlp.word_tokenize(r_content)
            self.learningbot_keywords = nlp.remove_duplicate_in_list(self.learningbot_keywords)
            self.save_keywords_data_xml()
            return 'learningbot'
        else:
            self.chatbot_keywords += nlp.word_tokenize(r_content)
            self.chatbot_keywords = nlp.remove_duplicate_in_list(self.chatbot_keywords)
            self.save_keywords_data_xml()
            return 'chatbot'

    if max_point == chatbot_point:
        return 'chatbot'
    elif max_point == infobot_point:
        return 'infobot'
    elif max_point == cmdbot_point:
        return 'cmdbot'
    else:
        return 'learningbot'
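
# A sketch of the self-training step on ambiguous input: the user's answer
# folds the request's tokens into that bot's keyword list. The dedupe below is
# a plain-Python stand-in for nlp.remove_duplicate_in_list, assuming it keeps
# first occurrences:
def _demo_learn_keywords(keywords, r_content):
    return list(dict.fromkeys(keywords + r_content.split()))

# _demo_learn_keywords(["weather", "news"], "news about the weather today")
# -> ['weather', 'news', 'about', 'the', 'today']
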
def _tokenizer(text):
    return nlp.word_tokenize(text, remove_punct=False, remove_num=True)
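
# Usage sketch (remove_punct/remove_num are flags of the project-specific
# word_tokenize signature above; the exact token split is illustrative):
#
#     _tokenizer("He wrote 3 novels!")   # -> ['He', 'wrote', 'novels', '!']
#
# i.e. numeral tokens are dropped while punctuation tokens are kept.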