Example #1
    def test_get_sentences(self):
        """Check sentence splitting on empty, abbreviated and long inputs."""
        # An empty paragraph yields no sentences.
        self.assertEqual(nlp_utils.get_sentences(empty_paragraph), [])

        # The abbreviation 'Dr.' must not be treated as a sentence boundary.
        expected_sentences = [
            'Good morning Dr. Adams.',
            'The patient is waiting for you in room number 3.'
        ]
        self.assertEqual(
            nlp_utils.get_sentences(paragraph1), expected_sentences)

        # A longer review splits into the expected number of sentences.
        self.assertEqual(len(nlp_utils.get_sentences(review_text6)), 8)
Example #2
    def test_get_sentences(self):
        """Verify nlp_utils.get_sentences on three fixture inputs.

        Covers the empty string, abbreviation handling ('Dr.' must not end
        a sentence), and the sentence count of a longer review text.
        """
        # Empty input produces an empty list of sentences.
        actual_value = nlp_utils.get_sentences(empty_paragraph)
        expected_value = []
        self.assertEqual(actual_value, expected_value)
        # 'Dr.' is an abbreviation, not a sentence boundary.
        actual_value = nlp_utils.get_sentences(paragraph1)
        expected_value = [
            'Good morning Dr. Adams.',
            'The patient is waiting for you in room number 3.'
        ]
        self.assertEqual(actual_value, expected_value)
        # Longer review fixture is expected to split into 8 sentences.
        actual_value = len(nlp_utils.get_sentences(review_text6))
        expected_value = 8
        self.assertEqual(actual_value, expected_value)
    def lemmatize_sentences(records):
        """Expand each record into one record per sentence, POS-tagged.

        Every input record's text is split into sentences; each sentence
        yields a copy of the record with the sentence as its text, its
        position under 'sentence_index', and its lemmatized POS tags under
        Constants.POS_TAGS_FIELD. When Constants.DOCUMENT_LEVEL is numeric,
        only the first DOCUMENT_LEVEL sentences of each record are kept.

        :type records: list[dict]
        :param records: dictionaries containing at least Constants.TEXT_FIELD
        :rtype: list[dict]
        :return: one record per kept sentence
        """
        print('%s: lemmatize sentences' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        sentence_records = []
        document_level = Constants.DOCUMENT_LEVEL
        # DOCUMENT_LEVEL may be a non-numeric marker; only a numeric value
        # imposes a per-record sentence limit. Hoisted out of the loops.
        has_limit = isinstance(document_level, (int, float))
        for record in records:
            sentences = \
                nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
            for sentence_index, sentence in enumerate(sentences):
                if has_limit and sentence_index >= document_level:
                    break
                # Copy the record so the original stays untouched, then
                # overwrite the text with the single sentence.
                sentence_record = dict(record)
                sentence_record[Constants.TEXT_FIELD] = sentence
                sentence_record['sentence_index'] = sentence_index
                sentence_record[Constants.POS_TAGS_FIELD] = \
                    nlp_utils.lemmatize_sentence(sentence)
                sentence_records.append(sentence_record)
        return sentence_records
Example #4
    def lemmatize_reviews(records):
        """
        Performs a POS tagging on the text contained in the reviews and
        additionally finds the lemma of each word in the review

        Each record is mutated in place: the tagged words are stored under
        Constants.POS_TAGS_FIELD. When Constants.MAX_SENTENCES is set, only
        the first MAX_SENTENCES sentences of each review are tagged;
        otherwise the whole text is tagged at once.

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews
        """
        print('%s: lemmatize reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        max_sentences = Constants.MAX_SENTENCES
        for record in records:
            text = record[Constants.TEXT_FIELD]
            if max_sentences is None:
                tagged_words = nlp_utils.lemmatize_text(text)
            else:
                # Tag only the first max_sentences sentences. (The original
                # re-checked `max_sentences is not None` inside the loop,
                # which is always true on this branch.)
                sentences = nlp_utils.get_sentences(text)
                tagged_words = []
                for sentence in sentences[:max_sentences]:
                    tagged_words.extend(
                        nlp_utils.lemmatize_sentence(sentence))

            record[Constants.POS_TAGS_FIELD] = tagged_words
Example #5
    def lemmatize_sentences(records):
        """Expand each record into one record per sentence, POS-tagged.

        Each sentence of a record's text yields a copy of the record with
        the sentence as its text, its position under 'sentence_index' and
        its lemmatized POS tags under Constants.POS_TAGS_FIELD. A numeric
        Constants.DOCUMENT_LEVEL caps the sentences kept per record.

        :type records: list[dict]
        :param records: dictionaries containing at least Constants.TEXT_FIELD
        :rtype: list[dict]
        :return: one record per kept sentence
        """
        print('%s: lemmatize sentences' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        sentence_records = []
        record_index = 0
        document_level = Constants.DOCUMENT_LEVEL
        for record in records:
            sentences = \
                nlp_utils.get_sentences(record[Constants.TEXT_FIELD])
            sentence_index = 0
            for sentence in sentences:
                # A non-numeric DOCUMENT_LEVEL (e.g. a marker string) means
                # "no limit"; a numeric one caps sentences per record.
                if isinstance(document_level, (int, float)) and\
                        sentence_index >= document_level:
                    break
                tagged_words = nlp_utils.lemmatize_sentence(sentence)
                # Shallow-copy the record so the input stays untouched.
                sentence_record = {}
                sentence_record.update(record)
                sentence_record[Constants.TEXT_FIELD] = sentence
                sentence_record['sentence_index'] = sentence_index
                sentence_record[Constants.POS_TAGS_FIELD] = tagged_words
                sentence_records.append(sentence_record)
                sentence_index += 1
                # print(sentence_record)
            record_index += 1
            # print('\rrecord index: %d/%d' % (record_index, len(records))),
        return sentence_records
Example #6
    def build_text_automatic(self, record):
        """Render a record's text as a LaTeX itemize, one item per sentence,
        highlighting words that belong to the automatic context topics.

        Each sentence is lemmatized/POS-tagged; any word present in
        self.topic_words_map for one of self.automatic_context_topic_ids is
        reported via self.tag_word and wrapped in a \\colorbox with that
        topic's RGB color. All other words are emitted unchanged.

        Note: the previous version computed an unused POS tag and an unused
        singular form per word (leftovers of commented-out highlighting
        rules); both were removed.

        :param record: dict with the raw text under Constants.TEXT_FIELD
        :return: a list containing a single Itemize document part
        """
        text = record[Constants.TEXT_FIELD]
        sentences = nlp_utils.get_sentences(text)
        lemmatized_words = []
        for sentence in sentences:
            lemmatized_words.append(
                nlp_utils.lemmatize_sentence(sentence,
                                             nltk.re.compile(''),
                                             min_length=1,
                                             max_length=100))

        doc_parts = []
        itemize = Itemize()

        for sentence in lemmatized_words:
            new_words = []
            itemize.add_item('')
            for tagged_word in sentence:
                word = tagged_word[0]
                word_found = False

                # First matching topic wins; later topics are not checked.
                for topic_id in self.automatic_context_topic_ids:
                    if word in self.topic_words_map[topic_id]:
                        self.tag_word(word)
                        color_id = self.automatic_context_topic_colors[
                            topic_id]
                        color = self.rgb_tuples[color_id]
                        new_words.append('\\colorbox[rgb]{' + str(color[0]) +
                                         ',' + str(color[1]) + ',' +
                                         str(color[2]) + '}{' + word + '}')
                        word_found = True
                        break
                if not word_found:
                    new_words.append(word)
            itemize.append(NoEscape(' '.join(new_words)))
        doc_parts.append(itemize)

        return doc_parts
Example #7
    def build_text_automatic(self, record):
        """Render a record's text as a LaTeX itemize, one item per sentence,
        highlighting words that belong to the automatic context topics.

        Each sentence is lemmatized/POS-tagged; any word present in
        self.topic_words_map for one of self.automatic_context_topic_ids is
        reported via self.tag_word and wrapped in a \\colorbox with that
        topic's RGB color. All other words are emitted unchanged.

        :param record: dict with the raw text under Constants.TEXT_FIELD
        :return: a list containing a single Itemize document part
        """
        text = record[Constants.TEXT_FIELD]
        sentences = nlp_utils.get_sentences(text)
        lemmatized_words = []
        for sentence in sentences:
            lemmatized_words.append(nlp_utils.lemmatize_sentence(
                sentence, nltk.re.compile(''),
                min_length=1, max_length=100))

        doc_parts = []
        itemize = Itemize()

        for sentence in lemmatized_words:
            new_words = []
            itemize.add_item('')
            for tagged_word in sentence:
                # tag and singular are only used by the commented-out
                # highlighting rules below — currently dead weight.
                tag = tagged_word[1]
                word = tagged_word[0]
                singular = pattern.text.en.singularize(word)
                word_found = False

                # if tag == 'VBD':
                #     new_words.append(
                #         '\\colorbox[rgb]{0.5,0.5,0.5}{' + word + '}')
                #     word_found = True
                #
                # if tag.startswith('PRP'):
                #     new_words.append(
                #         '\\colorbox[rgb]{0.85,0.85,0.85}{' + word + '}')
                #     word_found = True

                # First matching topic wins; later topics are not checked.
                for topic_id in self.automatic_context_topic_ids:
                    if word in self.topic_words_map[topic_id]:
                        # if singular in context_words[Constants.ITEM_TYPE][topic]:
                        self.tag_word(word)
                        color_id = self.automatic_context_topic_colors[topic_id]
                        color = self.rgb_tuples[color_id]
                        new_words.append(
                            '\\colorbox[rgb]{' +
                            str(color[0]) + ',' + str(color[1]) + ',' +
                            str(color[2]) + '}{' + word + '}')
                        word_found = True
                        break
                if not word_found:
                    new_words.append(word)
            itemize.append(NoEscape(' '.join(new_words)))
        doc_parts.append(itemize)

        return doc_parts
Example #8
def get_review_metrics(record):
    """Compute numeric style metrics for a single review.

    The returned vector contains, in order: log(num_words + 1),
    log(num_past_verbs + 1), log(num_verbs + 1), the past-verbs/verbs
    ratio, the verbs/words ratio, log(num_personal_pronouns + 1), the
    personal-pronouns/words ratio and the past-verbs/words ratio. All
    logarithms are natural and taken over count + 1.

    :type record: dict
    :param record: a review holding its raw text (Constants.TEXT_FIELD)
     and its (word, tag, lemma) triples (Constants.POS_TAGS_FIELD)
    :rtype: numpy.ndarray
    :return: the eight metrics described above
    """
    review_text = record[Constants.TEXT_FIELD]
    # Sentence count is computed but not part of the returned vector
    # (kept from an earlier metric set — presumably retained on purpose).
    log_sentences = math.log(len(nlp_utils.get_sentences(review_text)) + 1)
    log_words = math.log(len(nlp_utils.get_words(review_text)) + 1)
    tag_counts = Counter(
        tag for _word, tag, _lemma in record[Constants.POS_TAGS_FIELD])
    log_past_verbs = math.log(tag_counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(tag_counts) + 1)
    log_personal_pronouns = math.log(tag_counts['PRP'] + 1)

    # A zero denominator (empty review) maps the affected ratios to 0
    # instead of raising ZeroDivisionError.
    past_verbs_ratio = log_past_verbs / log_verbs if log_verbs else 0
    if log_words:
        verbs_ratio = log_verbs / log_words
        past_verbs_ratio2 = log_past_verbs / log_words
        personal_pronouns_ratio = log_personal_pronouns / log_words
    else:
        verbs_ratio = 0
        past_verbs_ratio2 = 0
        personal_pronouns_ratio = 0

    return numpy.array([
        log_words, log_past_verbs, log_verbs, past_verbs_ratio, verbs_ratio,
        log_personal_pronouns, personal_pronouns_ratio, past_verbs_ratio2,
    ])
Example #9
def get_review_metrics(record):
    """Compute numeric style metrics for a single review.

    The returned vector contains, in order: log(num_words + 1),
    log(num_past_verbs + 1), log(num_verbs + 1), the past-verbs/verbs
    ratio, the verbs/words ratio, log(num_personal_pronouns + 1), the
    personal-pronouns/words ratio and the past-verbs/words ratio. All
    logarithms are natural and taken over count + 1. (The previous
    docstring described an outdated five-element layout.)

    The word count is taken from the pre-computed POS tags, so the record
    must already be tagged; the raw text is no longer read (an unused
    sentence-count computation was removed).

    :type record: dict
    :param record: the review to analyze; must contain its
     (word, tag, lemma) triples under Constants.POS_TAGS_FIELD
    :rtype: numpy.ndarray
    :return: the eight metrics described above
    """
    tagged_words = record[Constants.POS_TAGS_FIELD]
    log_words = math.log(len(tagged_words) + 1)
    counts = Counter(tag for _word, tag, _lemma in tagged_words)
    log_past_verbs = math.log(counts['VBD'] + 1)
    log_verbs = math.log(nlp_utils.count_verbs(counts) + 1)
    log_personal_pronouns = math.log(counts['PRP'] + 1)

    # A zero denominator (empty review) maps the affected ratios to 0
    # instead of raising ZeroDivisionError.
    if log_verbs == 0:
        past_verbs_ratio = 0
    else:
        past_verbs_ratio = log_past_verbs / log_verbs
    if log_words == 0:
        verbs_ratio = 0
        past_verbs_ratio2 = 0
        personal_pronouns_ratio = 0
    else:
        verbs_ratio = log_verbs / log_words
        past_verbs_ratio2 = log_past_verbs / log_words
        personal_pronouns_ratio = log_personal_pronouns / log_words

    result = [
        log_words, log_past_verbs, log_verbs, past_verbs_ratio, verbs_ratio,
        log_personal_pronouns, personal_pronouns_ratio, past_verbs_ratio2
    ]
    return numpy.array(result)