def add_words(self, sentence, parsed_sentence, raw_text):
    """Attach every parsed word of a sentence to its Sentence object.

    For each entry in ``parsed_sentence["words"]`` this works out the
    surface form, part of speech, lemma and the whitespace preceding the
    word, looks up (or creates) the matching ``Word`` record, and links it
    to ``sentence`` through ``sentence.add_word``. The database session is
    committed once, after all words have been added.

    Args:
        sentence: The Sentence object that receives the words.
        parsed_sentence: Mapping whose ``"words"`` entry is a sequence of
            items where index 0 is the surface string and index 1 is a
            mapping providing ``"PartOfSpeech"``, ``"Lemma"``,
            ``"CharacterOffsetBegin"`` and ``"CharacterOffsetEnd"``.
        raw_text: The text that the character offsets index into.
    """
    # Per-call cache so each distinct (surface, PoS, lemma) is queried once.
    words = {}
    space = re.compile(r'\s')
    cr = re.compile(r'[\n\r]')

    for position, word_data in enumerate(parsed_sentence["words"]):
        surface = word_data[0]
        attributes = word_data[1]
        part_of_speech = attributes["PartOfSpeech"]
        offset_begin = int(attributes["CharacterOffsetBegin"])

        try:
            lemma = attributes["Lemma"].lower()
        except AttributeError:
            # The lemma is not a string, i.e. the parser did not recognise
            # this token as a word (probably an unusual character). Replace
            # both lemma and surface with asterisks of the token's length.
            placeholder = "*" * (
                int(attributes["CharacterOffsetEnd"]) - offset_begin
            )
            lemma = placeholder
            surface = placeholder

        space_before = ""
        # A token at offset 0 has no preceding character; indexing with -1
        # would wrap around to the *last* character of raw_text.
        if offset_begin > 0:
            try:
                prev_char = raw_text[offset_begin - 1]
            except IndexError:
                # Offset lies beyond raw_text: treat as "no whitespace".
                prev_char = ""
            if cr.match(prev_char):
                space_before = "\n"
            elif space.match(prev_char):
                space_before = " "

        key = (surface.lower(), part_of_speech, lemma)
        if key in words:
            word = words[key]
        else:
            query = Word.query.filter_by(
                lemma=lemma,
                surface=surface.lower(),
                part_of_speech=part_of_speech,
            )
            try:
                word = query.one()
            except MultipleResultsFound:
                project_logger.warning("Duplicate records found for: %s",
                                       str(key))
                # Duplicates exist: reuse the first match rather than
                # leaving `word` unbound or stale from a prior iteration.
                word = query.first()
            except NoResultFound:
                word = Word(
                    lemma=lemma,
                    surface=surface.lower(),
                    part_of_speech=part_of_speech,
                )
                # NOTE(review): the False argument presumably defers the
                # commit to the end of this method -- confirm in Word.save.
                word.save(False)
            words[key] = word

        sentence.add_word(
            word=word,
            position=position,
            space_before=space_before,
            surface=surface,
            project=self.project,
            force=False,
        )

    db.session.commit()
def add_words(self, sentence, parsed_sentence, raw_text):
    """Attach every parsed word of a sentence to its Sentence object.

    For each entry in ``parsed_sentence["words"]`` this works out the
    surface form, part of speech, lemma and the whitespace preceding the
    word, looks up (or creates) the matching ``Word`` record, and links it
    to ``sentence`` through ``sentence.add_word``. The database session is
    committed once, after all words have been added.

    Args:
        sentence: The Sentence object that receives the words.
        parsed_sentence: Mapping whose ``"words"`` entry is a sequence of
            items where index 0 is the surface string and index 1 is a
            mapping providing ``"PartOfSpeech"``, ``"Lemma"``,
            ``"CharacterOffsetBegin"`` and ``"CharacterOffsetEnd"``.
        raw_text: The text that the character offsets index into.
    """
    # Per-call cache so each distinct (surface, PoS, lemma) is queried once.
    words = {}
    space = re.compile(r'\s')
    cr = re.compile(r'[\n\r]')

    for position, word_data in enumerate(parsed_sentence["words"]):
        surface = word_data[0]
        attributes = word_data[1]
        part_of_speech = attributes["PartOfSpeech"]
        offset_begin = int(attributes["CharacterOffsetBegin"])

        try:
            lemma = attributes["Lemma"].lower()
        except AttributeError:
            # The lemma is not a string, i.e. the parser did not recognise
            # this token as a word (probably an unusual character). Replace
            # both lemma and surface with asterisks of the token's length.
            placeholder = "*" * (
                int(attributes["CharacterOffsetEnd"]) - offset_begin
            )
            lemma = placeholder
            surface = placeholder

        space_before = ""
        # A token at offset 0 has no preceding character; indexing with -1
        # would wrap around to the *last* character of raw_text.
        if offset_begin > 0:
            try:
                prev_char = raw_text[offset_begin - 1]
            except IndexError:
                # Offset lies beyond raw_text: treat as "no whitespace".
                prev_char = ""
            if cr.match(prev_char):
                space_before = "\n"
            elif space.match(prev_char):
                space_before = " "

        key = (surface.lower(), part_of_speech, lemma)
        if key in words:
            word = words[key]
        else:
            query = Word.query.filter_by(
                lemma=lemma,
                surface=surface.lower(),
                part_of_speech=part_of_speech,
            )
            try:
                word = query.one()
            except MultipleResultsFound:
                project_logger.warning("Duplicate records found for: %s",
                                       str(key))
                # Duplicates exist: reuse the first match rather than
                # leaving `word` unbound or stale from a prior iteration.
                word = query.first()
            except NoResultFound:
                word = Word(
                    lemma=lemma,
                    surface=surface.lower(),
                    part_of_speech=part_of_speech,
                )
                # NOTE(review): the False argument presumably defers the
                # commit to the end of this method -- confirm in Word.save.
                word.save(False)
            words[key] = word

        sentence.add_word(
            word=word,
            position=position,
            space_before=space_before,
            surface=surface,
            project=self.project,
            force=False,
        )

    db.session.commit()