def test_get_sentences(self):
    """Check sentence splitting on empty, abbreviated and long inputs."""
    # An empty paragraph yields no sentences at all.
    self.assertEqual(nlp_utils.get_sentences(empty_paragraph), [])

    # The abbreviation 'Dr.' must not be treated as a sentence boundary.
    expected_sentences = [
        'Good morning Dr. Adams.',
        'The patient is waiting for you in room number 3.'
    ]
    self.assertEqual(
        nlp_utils.get_sentences(paragraph1), expected_sentences)

    # A longer review text is split into the expected sentence count.
    self.assertEqual(len(nlp_utils.get_sentences(review_text6)), 8)
def lemmatize_sentences(records):
    """Explode each record into one record per sentence, POS-tagged.

    For every input record the text is split into sentences; each sentence
    yields a copy of the record whose text field holds just that sentence,
    plus its 'sentence_index' and its tagged/lemmatized words. When
    Constants.DOCUMENT_LEVEL is numeric it caps how many sentences per
    record are kept.

    :param records: a list of dictionaries with the reviews
    :return: a list of per-sentence record dictionaries
    """
    print('%s: lemmatize sentences' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    document_level = Constants.DOCUMENT_LEVEL
    sentence_records = []

    for record in records:
        sentences = nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
        for sentence_index, sentence in enumerate(sentences):
            # A numeric DOCUMENT_LEVEL limits the sentences per record.
            if isinstance(document_level, (int, float)) and \
                    sentence_index >= document_level:
                break
            sentence_record = dict(record)
            sentence_record[Constants.TEXT_FIELD] = sentence
            sentence_record['sentence_index'] = sentence_index
            sentence_record[Constants.POS_TAGS_FIELD] = \
                nlp_utils.lemmatize_sentence(sentence)
            sentence_records.append(sentence_record)

    return sentence_records
def lemmatize_reviews(records):
    """
    Performs a POS tagging on the text contained in the reviews and
    additionally finds the lemma of each word in the review, storing the
    tagged words in each record under Constants.POS_TAGS_FIELD (in place).

    When Constants.MAX_SENTENCES is not None, only the first MAX_SENTENCES
    sentences of each review are tagged.

    :type records: list[dict]
    :param records: a list of dictionaries with the reviews
    """
    print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    max_sentences = Constants.MAX_SENTENCES
    for record in records:
        if max_sentences is None:
            # No cap: tag the whole review text in a single pass.
            tagged_words = \
                nlp_utils.lemmatize_text(record[Constants.TEXT_FIELD])
        else:
            # Capped: tag at most max_sentences sentences.
            # (The original re-checked `max_sentences is not None` inside
            # this branch, which is always true here — check removed.)
            sentences = \
                nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
            tagged_words = []
            for sentence_index, sentence in enumerate(sentences):
                if sentence_index >= max_sentences:
                    break
                tagged_words.extend(nlp_utils.lemmatize_sentence(sentence))
        record[Constants.POS_TAGS_FIELD] = tagged_words
def build_text_automatic(self, record):
    """Render a review as LaTeX, highlighting automatic-context topic words.

    Each sentence of the review becomes one item of an Itemize
    environment. A word that appears in the word list of any topic in
    self.automatic_context_topic_ids is wrapped in a \\colorbox painted
    with that topic's colour (and reported via self.tag_word); all other
    words are emitted unchanged.

    Dead per-word locals from the original (`tag`, `singular`) — used only
    by commented-out alternatives — have been removed, along with that
    commented-out code.

    :param record: a dictionary with the review; its text field is read
    :return: a list containing the built Itemize document part
    """
    text = record[Constants.TEXT_FIELD]
    sentences = nlp_utils.get_sentences(text)
    lemmatized_words = []
    for sentence in sentences:
        lemmatized_words.append(nlp_utils.lemmatize_sentence(
            sentence, nltk.re.compile(''), min_length=1, max_length=100))

    doc_parts = []
    itemize = Itemize()

    for sentence in lemmatized_words:
        new_words = []
        itemize.add_item('')
        for tagged_word in sentence:
            word = tagged_word[0]
            word_found = False
            for topic_id in self.automatic_context_topic_ids:
                if word in self.topic_words_map[topic_id]:
                    self.tag_word(word)
                    color_id = self.automatic_context_topic_colors[topic_id]
                    color = self.rgb_tuples[color_id]
                    # %s applies str() to each channel, matching the
                    # original string concatenation byte-for-byte.
                    new_words.append(
                        '\\colorbox[rgb]{%s,%s,%s}{%s}' % (
                            color[0], color[1], color[2], word))
                    word_found = True
                    break
            if not word_found:
                new_words.append(word)
        itemize.append(NoEscape(' '.join(new_words)))
    doc_parts.append(itemize)

    return doc_parts
def build_text_automatic(self, record):
    """Build the LaTeX document parts for one review.

    The review text is split into sentences, each sentence becomes an
    item of a single Itemize environment, and every word belonging to
    one of the automatic context topics is wrapped in a \\colorbox of
    that topic's colour (and registered through self.tag_word).
    """
    text = record[Constants.TEXT_FIELD]
    tagged_sentences = [
        nlp_utils.lemmatize_sentence(
            sentence, nltk.re.compile(''), min_length=1, max_length=100)
        for sentence in nlp_utils.get_sentences(text)
    ]

    itemize = Itemize()
    for tagged_sentence in tagged_sentences:
        itemize.add_item('')
        rendered_words = []
        for tagged_word in tagged_sentence:
            word = tagged_word[0]
            # Kept from the original implementation; only commented-out
            # lookups used the singular form.
            singular = pattern.text.en.singularize(word)
            matched_topic = False
            for topic_id in self.automatic_context_topic_ids:
                if word not in self.topic_words_map[topic_id]:
                    continue
                self.tag_word(word)
                color = self.rgb_tuples[
                    self.automatic_context_topic_colors[topic_id]]
                # %s stringifies each channel exactly like the original
                # str() concatenation did.
                rendered_words.append(
                    '\\colorbox[rgb]{%s,%s,%s}{%s}' % (
                        color[0], color[1], color[2], word))
                matched_topic = True
                break
            if not matched_topic:
                rendered_words.append(word)
        itemize.append(NoEscape(' '.join(rendered_words)))

    return [itemize]
def get_review_metrics(record):
    """
    Returns an array with the metrics of a review. It is composed in the
    following way:
    [log(num_words + 1), log(num_past_verbs + 1), log(num_verbs + 1),
     past_verbs_ratio, verbs_ratio, log(num_personal_pronouns + 1),
     personal_pronouns_ratio, past_verbs_ratio2]

    (The previous docstring advertised log(num_sentences + 1) as the first
    element, but it was never part of the result; that dead — and costly —
    sentence-splitting computation has been removed.)

    :type record: dict
    :param record: the review that wants to be analyzed, it should contain
    the text of the review and its POS-tagged words
    :rtype: numpy.ndarray
    :return: an array with numeric metrics
    """
    review_text = record[Constants.TEXT_FIELD]
    log_words = math.log(len(nlp_utils.get_words(review_text)) + 1)

    tagged_words = record[Constants.POS_TAGS_FIELD]
    counts = Counter(tag for word, tag, lemma in tagged_words)
    log_past_verbs = math.log(counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
    log_personal_pronouns = math.log(counts['PRP'] + 1)

    # This ensures that when log_verbs = 0 the program won't crash
    if log_verbs == 0:
        past_verbs_ratio = 0
    else:
        past_verbs_ratio = log_past_verbs / log_verbs

    # This ensures that when log_words = 0 the program won't crash
    # (the original comment wrongly said log_verbs here)
    if log_words == 0:
        verbs_ratio = 0
        past_verbs_ratio2 = 0
        personal_pronouns_ratio = 0
    else:
        verbs_ratio = log_verbs / log_words
        past_verbs_ratio2 = log_past_verbs / log_words
        personal_pronouns_ratio = log_personal_pronouns / log_words

    result = [
        log_words, log_past_verbs, log_verbs, past_verbs_ratio,
        verbs_ratio, log_personal_pronouns, personal_pronouns_ratio,
        past_verbs_ratio2,
    ]
    return numpy.array(result)
def get_review_metrics(record):
    """
    Returns an array with the metrics of a review. It is composed in the
    following way:
    [log(num_words + 1), log(num_past_verbs + 1), log(num_verbs + 1),
     past_verbs_ratio, verbs_ratio, log(num_personal_pronouns + 1),
     personal_pronouns_ratio, past_verbs_ratio2]

    Here num_words is the number of POS-tagged words in the record. (The
    previous docstring advertised log(num_sentences + 1) as the first
    element, but it was never part of the result; that dead — and costly —
    sentence-splitting computation has been removed.)

    :type record: dict
    :param record: the review that wants to be analyzed, it should contain
    the POS-tagged words of the review
    :rtype: numpy.ndarray
    :return: an array with numeric metrics
    """
    tagged_words = record[Constants.POS_TAGS_FIELD]
    log_words = math.log(len(tagged_words) + 1)
    counts = Counter(tag for word, tag, lemma in tagged_words)
    log_past_verbs = math.log(counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
    log_personal_pronouns = math.log(counts['PRP'] + 1)

    # This ensures that when log_verbs = 0 the program won't crash
    if log_verbs == 0:
        past_verbs_ratio = 0
    else:
        past_verbs_ratio = log_past_verbs / log_verbs

    # This ensures that when log_words = 0 the program won't crash
    # (the original comment wrongly said log_verbs here)
    if log_words == 0:
        verbs_ratio = 0
        past_verbs_ratio2 = 0
        personal_pronouns_ratio = 0
    else:
        verbs_ratio = log_verbs / log_words
        past_verbs_ratio2 = log_past_verbs / log_words
        personal_pronouns_ratio = log_personal_pronouns / log_words

    result = [
        log_words, log_past_verbs, log_verbs, past_verbs_ratio,
        verbs_ratio, log_personal_pronouns, personal_pronouns_ratio,
        past_verbs_ratio2,
    ]
    return numpy.array(result)