示例#1
0
    def add_words(self, sentence, parsed_sentence, raw_text):
        """Attach every parsed word of a sentence to its Sentence object.

        For each entry in ``parsed_sentence["words"]`` this determines the
        surface form, part of speech, lemma and the whitespace that precedes
        the word in ``raw_text``, looks up (or creates) the matching ``Word``
        record, and links it to ``sentence`` through ``sentence.add_word``.
        The database session is committed once, after all words are handled.

        Arguments:
            sentence: object to which the words are added via ``add_word``.
            parsed_sentence: mapping whose ``"words"`` entry is a sequence of
                ``(surface, attributes)`` pairs; ``attributes`` must provide
                ``"PartOfSpeech"``, ``"Lemma"``, ``"CharacterOffsetBegin"``
                and ``"CharacterOffsetEnd"``.
            raw_text: the text that the character offsets index into.
        """
        # Cache of Word objects already resolved during this call, keyed by
        # (lowercased surface, part of speech, lemma).
        words = dict()
        space = re.compile(r'\s')
        cr = re.compile(r'[\n\r]')

        for position, word_data in enumerate(parsed_sentence["words"]):
            surface = word_data[0]
            attributes = word_data[1]
            part_of_speech = attributes["PartOfSpeech"]
            offset_begin = int(attributes["CharacterOffsetBegin"])

            try:
                lemma = attributes["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; replace both
                # lemma and surface with asterisks of the same length
                placeholder = "*" * (int(attributes["CharacterOffsetEnd"]) -
                                     offset_begin)
                lemma = placeholder
                surface = placeholder

            # Look at the character right before the word. The lower bound
            # matters: at offset 0 an index of -1 would wrap around to the
            # last character of raw_text instead of meaning "nothing before".
            space_before = ""
            if 0 < offset_begin <= len(raw_text):
                prev_char = raw_text[offset_begin - 1]
                if cr.match(prev_char):
                    space_before = "\n"
                elif space.match(prev_char):
                    space_before = " "

            key = (surface.lower(), part_of_speech, lemma)

            if key in words:
                word = words[key]
            else:
                query = Word.query.filter_by(lemma=lemma,
                                             surface=surface.lower(),
                                             part_of_speech=part_of_speech)
                try:
                    word = query.one()
                except MultipleResultsFound:
                    project_logger.warning("Duplicate records found for: %s",
                                           str(key))
                    # Still need a Word to attach: without this assignment
                    # `word` would be unbound (or stale from the previous
                    # iteration). Use the first of the duplicate records.
                    word = query.first()
                except NoResultFound:
                    word = Word(lemma=lemma, surface=surface.lower(),
                                part_of_speech=part_of_speech)
                    # NOTE(review): presumably False defers the commit to the
                    # single commit below -- confirm against Word.save.
                    word.save(False)

                words[key] = word

            sentence.add_word(
                word=word,
                position=position,
                space_before=space_before,
                surface=surface,
                project=self.project,
                force=False
            )

        db.session.commit()
示例#2
0
    def add_words(self, sentence, parsed_sentence, raw_text):
        """Attach every parsed word of a sentence to its Sentence object.

        For each entry in ``parsed_sentence["words"]`` this determines the
        surface form, part of speech, lemma and the whitespace that precedes
        the word in ``raw_text``, looks up (or creates) the matching ``Word``
        record, and links it to ``sentence`` through ``sentence.add_word``.
        The database session is committed once, after all words are handled.

        Arguments:
            sentence: object to which the words are added via ``add_word``.
            parsed_sentence: mapping whose ``"words"`` entry is a sequence of
                ``(surface, attributes)`` pairs; ``attributes`` must provide
                ``"PartOfSpeech"``, ``"Lemma"``, ``"CharacterOffsetBegin"``
                and ``"CharacterOffsetEnd"``.
            raw_text: the text that the character offsets index into.
        """
        # Cache of Word objects already resolved during this call, keyed by
        # (lowercased surface, part of speech, lemma).
        words = dict()
        space = re.compile(r'\s')
        cr = re.compile(r'[\n\r]')

        for position, word_data in enumerate(parsed_sentence["words"]):
            surface = word_data[0]
            attributes = word_data[1]
            part_of_speech = attributes["PartOfSpeech"]
            offset_begin = int(attributes["CharacterOffsetBegin"])

            try:
                lemma = attributes["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; replace both
                # lemma and surface with asterisks of the same length
                placeholder = "*" * (int(attributes["CharacterOffsetEnd"]) -
                                     offset_begin)
                lemma = placeholder
                surface = placeholder

            # Look at the character right before the word. The lower bound
            # matters: at offset 0 an index of -1 would wrap around to the
            # last character of raw_text instead of meaning "nothing before".
            space_before = ""
            if 0 < offset_begin <= len(raw_text):
                prev_char = raw_text[offset_begin - 1]
                if cr.match(prev_char):
                    space_before = "\n"
                elif space.match(prev_char):
                    space_before = " "

            key = (surface.lower(), part_of_speech, lemma)

            if key in words:
                word = words[key]
            else:
                query = Word.query.filter_by(
                    lemma=lemma,
                    surface=surface.lower(),
                    part_of_speech=part_of_speech)
                try:
                    word = query.one()
                except MultipleResultsFound:
                    project_logger.warning("Duplicate records found for: %s",
                                           str(key))
                    # Still need a Word to attach: without this assignment
                    # `word` would be unbound (or stale from the previous
                    # iteration). Use the first of the duplicate records.
                    word = query.first()
                except NoResultFound:
                    word = Word(lemma=lemma,
                                surface=surface.lower(),
                                part_of_speech=part_of_speech)
                    # NOTE(review): presumably False defers the commit to the
                    # single commit below -- confirm against Word.save.
                    word.save(False)

                words[key] = word

            sentence.add_word(word=word,
                              position=position,
                              space_before=space_before,
                              surface=surface,
                              project=self.project,
                              force=False)

        db.session.commit()