Example #1
def test_text_tokenized_tagged_both():
    file = new_file(text_type=('tokenized', 'tagged_both'))

    text_flat_tokens = wl_text.Wl_Text(main, file, flat_tokens=True)
    text = wl_text.Wl_Text(main, file, flat_tokens=False)

    assert text_flat_tokens.tokens_flat != []
    assert text_flat_tokens.tags_pos != [[]] * len(
        text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_non_pos != [[]] * len(
        text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_all != [[]] * len(
        text_flat_tokens.tokens_flat)

    assert text_flat_tokens.offsets_paras == [0]
    assert text_flat_tokens.offsets_sentences == [0]
    assert text_flat_tokens.offsets_clauses == [0]

    assert (
        len(text_flat_tokens.tokens_flat)
        == len(text_flat_tokens.tags_pos)
        == len(text_flat_tokens.tags_non_pos)
        == len(text_flat_tokens.tags_all)
    )

    assert len(text_flat_tokens.offsets_paras) == len(
        text_flat_tokens.tokens_hierarchical)
    assert len(text_flat_tokens.offsets_sentences) == sum(
        [len(para) for para in text_flat_tokens.tokens_hierarchical])
    assert len(text_flat_tokens.offsets_clauses) == sum([
        len(sentence) for para in text_flat_tokens.tokens_hierarchical
        for sentence in para
    ])

    assert text.tokens_flat != []
    assert text.tags_pos != [[]] * len(text.tokens_flat)
    assert text.tags_non_pos != [[]] * len(text.tokens_flat)
    assert text.tags_all != [[]] * len(text.tokens_flat)

    assert text.offsets_paras != [0]
    assert text.offsets_sentences != [0]
    assert text.offsets_clauses != [0]

    assert (
        len(text.tokens_flat)
        == len(text.tags_pos)
        == len(text.tags_non_pos)
        == len(text.tags_all)
    )

    assert len(text.offsets_paras) == len(text.tokens_hierarchical)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_hierarchical])
    assert len(text.offsets_clauses) == sum([
        len(sentence) for para in text.tokens_hierarchical for sentence in para
    ])
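
The assertions above relate a Wl_Text object's flat token list, its per-token tag lists, and its paragraph/sentence/clause offsets. Below is a minimal, dependency-free sketch of the same offset invariants on plain nested lists; the names tokens_hierarchical, tokens_flat, and offsets_* only mimic the attributes used above, and the clause level is omitted for brevity.

# Paragraphs -> sentences -> tokens, mimicking Wl_Text.tokens_hierarchical
# (clause level omitted for brevity).
tokens_hierarchical = [
    [['This', 'is', 'a', 'sentence', '.'], ['Another', 'one', '.']],
    [['Second', 'paragraph', '.']],
]

tokens_flat = [
    token
    for para in tokens_hierarchical
    for sentence in para
    for token in sentence
]

# One offset per paragraph/sentence, pointing into tokens_flat.
offsets_paras = []
offsets_sentences = []
i = 0

for para in tokens_hierarchical:
    offsets_paras.append(i)

    for sentence in para:
        offsets_sentences.append(i)

        i += len(sentence)

# The same length relations that the test asserts.
assert len(offsets_paras) == len(tokens_hierarchical)
assert len(offsets_sentences) == sum(len(para) for para in tokens_hierarchical)
assert offsets_paras == [0, 8]
assert offsets_sentences == [0, 5, 8]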
Example #2
    def run(self):
        texts = []

        settings = self.main.settings_custom['overview']
        files = self.main.wl_files.get_selected_files()

        for i, file in enumerate(files):
            text = wl_text.Wl_Text(self.main, file, flat_tokens=False)
            text = wl_token_processing.wl_process_tokens_overview(
                text, token_settings=settings['token_settings'])

            texts.append(text)

        if len(files) > 1:
            text_total = wl_text.Wl_Text_Blank()
            text_total.offsets_paras = [
                offset for text in texts for offset in text.offsets_paras
            ]
            text_total.offsets_sentences = [
                offset for text in texts for offset in text.offsets_sentences
            ]
            text_total.offsets_clauses = [
                offset for text in texts for offset in text.offsets_clauses
            ]
            text_total.tokens_multilevel = [
                para for text in texts for para in text.tokens_multilevel
            ]
            text_total.tokens_flat = [
                token for text in texts for token in text.tokens_flat
            ]

            texts.append(text_total)
        else:
            texts.append(texts[0])

        self.progress_updated.emit(self.tr('Processing data ...'))

        base_sttr = settings['generation_settings']['base_sttr']

        for text in texts:
            texts_stats_file = []

            # Paragraph length
            len_paras_in_sentence = [
                len(para) for para in text.tokens_multilevel
            ]
            len_paras_in_clause = [
                sum([len(sentence) for sentence in para])
                for para in text.tokens_multilevel
            ]
            len_paras_in_token = [
                sum([len(clause) for sentence in para for clause in sentence])
                for para in text.tokens_multilevel
            ]

            # Sentence length
            len_sentences = [
                sum([len(clause) for clause in sentence])
                for para in text.tokens_multilevel for sentence in para
            ]

            # Clause length
            len_clauses = [
                len(clause) for para in text.tokens_multilevel
                for sentence in para for clause in sentence
            ]

            # Token length
            len_tokens = [len(token) for token in text.tokens_flat]
            # Type length
            len_types = [
                len(token_type) for token_type in set(text.tokens_flat)
            ]

            count_tokens = len(len_tokens)
            count_types = len(len_types)

            # TTR
            if count_tokens:
                ttr = count_types / count_tokens
            else:
                ttr = 0

            # STTR
            if count_tokens < base_sttr:
                sttr = ttr
            else:
                token_sections = wl_text_utils.to_sections_unequal(
                    text.tokens_flat, base_sttr)

                # Discard the last section if it contains fewer tokens than the STTR base
                if len(token_sections[-1]) < base_sttr:
                    ttrs = [
                        len(set(token_section)) / len(token_section)
                        for token_section in token_sections[:-1]
                    ]
                else:
                    ttrs = [
                        len(set(token_section)) / len(token_section)
                        for token_section in token_sections
                    ]

                sttr = sum(ttrs) / len(ttrs)

            texts_stats_file.append(len_paras_in_sentence)
            texts_stats_file.append(len_paras_in_clause)
            texts_stats_file.append(len_paras_in_token)
            texts_stats_file.append(len_sentences)
            texts_stats_file.append(len_clauses)
            texts_stats_file.append(len_tokens)
            texts_stats_file.append(len_types)
            texts_stats_file.append(ttr)
            texts_stats_file.append(sttr)

            self.texts_stats_files.append(texts_stats_file)
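
The loop above reduces each text to length distributions plus a type-token ratio (TTR) and a standardized type-token ratio (STTR). Below is a minimal stand-alone sketch of the TTR/STTR part, assuming only a flat token list and a section size; plain slicing stands in for wl_text_utils.to_sections_unequal.

def sttr(tokens, base=1000):
    # Standardized type-token ratio: mean TTR over consecutive sections of
    # `base` tokens. Texts shorter than the base fall back to the plain TTR,
    # and a trailing short section is discarded, mirroring the logic above.
    if not tokens:
        return 0

    if len(tokens) < base:
        return len(set(tokens)) / len(tokens)

    sections = [tokens[i:i + base] for i in range(0, len(tokens), base)]

    if len(sections[-1]) < base:
        sections = sections[:-1]

    ttrs = [len(set(section)) / len(section) for section in sections]

    return sum(ttrs) / len(ttrs)

print(sttr('a b c a b c a b'.split(), base=4))  # mean of 3/4 and 3/4 -> 0.75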
Example #3
    def run(self):
        texts = []

        settings = self.main.settings_custom['wordlist']
        files = self.main.wl_files.get_selected_files()

        # Frequency
        for file in files:
            text = wl_text.Wl_Text(self.main, file)
            text = wl_token_processing.wl_process_tokens_wordlist(
                text, token_settings=settings['token_settings'])

            # Remove empty tokens
            tokens = [token for token in text.tokens_flat if token]

            self.tokens_freq_files.append(collections.Counter(tokens))
            texts.append(text)

        # Total
        if len(files) > 1:
            text_total = wl_text.Wl_Text_Blank()
            text_total.tokens_flat = [
                token for text in texts for token in text.tokens_flat
            ]

            self.tokens_freq_files.append(
                sum(self.tokens_freq_files, collections.Counter()))
            texts.append(text_total)

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Dispersion & Adjusted Frequency
        text_measure_dispersion = settings['generation_settings'][
            'measure_dispersion']
        text_measure_adjusted_freq = settings['generation_settings'][
            'measure_adjusted_freq']

        measure_dispersion = self.main.settings_global['measures_dispersion'][
            text_measure_dispersion]['func']
        measure_adjusted_freq = self.main.settings_global[
            'measures_adjusted_freq'][text_measure_adjusted_freq]['func']

        tokens_total = self.tokens_freq_files[-1].keys()

        for text in texts:
            tokens_stats_file = {}

            # Dispersion
            number_sections = self.main.settings_custom['measures'][
                'dispersion']['general']['number_sections']

            sections_freq = [
                collections.Counter(section)
                for section in wl_text_utils.to_sections(
                    text.tokens_flat, number_sections)
            ]

            for token in tokens_total:
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token] = [measure_dispersion(counts)]

            # Adjusted Frequency
            if not self.main.settings_custom['measures']['adjusted_freq'][
                    'general']['use_same_settings_dispersion']:
                number_sections = self.main.settings_custom['measures'][
                    'adjusted_freq']['general']['number_sections']

                sections_freq = [
                    collections.Counter(section)
                    for section in wl_text_utils.to_sections(
                        text.tokens_flat, number_sections)
                ]

            for token in tokens_total:
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token].append(measure_adjusted_freq(counts))

            self.tokens_stats_files.append(tokens_stats_file)

        if len(files) == 1:
            self.tokens_freq_files *= 2
            self.tokens_stats_files *= 2
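
Both the dispersion and the adjusted-frequency measures above consume, per token, a list of its frequencies across equal-sized sections of the text. Below is a minimal sketch of that per-section counting; even slicing stands in for wl_text_utils.to_sections, and a toy range-based score stands in for the configurable measures looked up in settings_global.

import collections

def sections_freq(tokens, num_sections):
    # Split tokens into consecutive, roughly equal sections and count each
    # section separately (stand-in for to_sections + Counter above).
    size = max(1, -(-len(tokens) // num_sections))  # ceiling division

    return [
        collections.Counter(tokens[i:i + size])
        for i in range(0, len(tokens), size)
    ]

tokens = 'the cat sat on the mat the end'.split()
freqs = sections_freq(tokens, num_sections=4)

for token in set(tokens):
    # Per-token counts across sections, i.e. the `counts` list passed to
    # measure_dispersion / measure_adjusted_freq above.
    counts = [freq[token] for freq in freqs]

    # Toy dispersion score: fraction of sections that contain the token.
    dispersion = sum(1 for count in counts if count) / len(counts)

    print(token, counts, dispersion)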
Example #4
    def run(self):
        texts = []

        settings = self.main.settings_custom['keyword']
        ref_file = self.main.wl_files.find_file_by_name(
            settings['generation_settings']['ref_file'], selected_only=True)

        files = [
            file for file in self.main.wl_files.get_selected_files()
            if file != ref_file
        ]

        # Frequency
        for i, file in enumerate([ref_file] + files):
            text = wl_text.Wl_Text(self.main, file)
            text = wl_token_processing.wl_process_tokens_keyword(
                text, token_settings=settings['token_settings'])

            # Remove empty tokens
            tokens = [token for token in text.tokens_flat if token]

            self.keywords_freq_files.append(collections.Counter(tokens))

            if i > 0:
                texts.append(text)
            else:
                tokens_ref = text.tokens_flat.copy()
                len_tokens_ref = len(tokens_ref)

        # Total
        if len(files) > 1:
            text_total = wl_text.Wl_Text_Blank()
            text_total.tokens_flat = [
                token for text in texts for token in text.tokens_flat
            ]

            self.keywords_freq_files.append(
                sum(self.keywords_freq_files, collections.Counter()))

            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in text_total.tokens_flat
            }

            texts.append(text_total)
        else:
            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in self.keywords_freq_files[1]
            }

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Keyness
        text_test_significance = settings['generation_settings'][
            'test_significance']
        text_measure_effect_size = settings['generation_settings'][
            'measure_effect_size']

        test_significance = self.main.settings_global['tests_significance'][
            'keyword'][text_test_significance]['func']
        measure_effect_size = self.main.settings_global[
            'measures_effect_size']['keyword'][text_measure_effect_size][
                'func']

        keywords_freq_file_observed = self.keywords_freq_files[-1]
        keywords_freq_file_ref = self.keywords_freq_files[0]

        for text in texts:
            keywords_stats_file = {}

            tokens_observed = text.tokens_flat
            len_tokens_observed = len(tokens_observed)

            if text_test_significance in [
                    self.tr('Student\'s t-test (Two-sample)'),
                    self.tr('Mann-Whitney U Test')
            ]:
                # Test Statistic, p-value & Bayes Factor
                if text_test_significance == self.tr(
                        'Student\'s t-test (Two-sample)'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['use_data']
                elif text_test_significance == self.tr('Mann-Whitney U Test'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'use_data']

                sections_observed = wl_text_utils.to_sections(
                    tokens_observed, number_sections)
                sections_ref = wl_text_utils.to_sections(
                    tokens_ref, number_sections)

                sections_freq_observed = [
                    collections.Counter(section)
                    for section in sections_observed
                ]
                sections_freq_ref = [
                    collections.Counter(section)
                    for section in sections_ref
                ]

                len_sections_observed = [
                    len(section) for section in sections_observed
                ]
                len_sections_ref = [len(section) for section in sections_ref]

                if use_data == self.tr('Absolute Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_observed
                        ]
                        counts_ref = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_ref
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)
                elif use_data == self.tr('Relative Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0) /
                            len_sections_observed[i]
                            for i, section_freq in enumerate(
                                sections_freq_observed)
                        ]
                        counts_ref = [
                            section_freq.get(token, 0) / len_sections_ref[i]
                            for i, section_freq in enumerate(sections_freq_ref)
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)

                # Effect Size
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))
            else:
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    # Test Statistic, p-value & Bayes Factor
                    keywords_stats_file[token] = test_significance(
                        self.main, c11, c12, c21, c22)

                    # Effect Size
                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))

            self.keywords_stats_files.append(keywords_stats_file)

        if len(files) == 1:
            self.keywords_freq_files.append(self.keywords_freq_files[1])
            self.keywords_stats_files *= 2
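
For significance tests other than the two-sample t-test and the Mann-Whitney U test, the loop above builds a 2x2 contingency table per token (c11/c12/c21/c22) and hands it to the configured test and effect-size functions. Below is a minimal sketch of that table with a toy keyness score (a smoothed ratio of relative frequencies), assuming two small token lists in place of the observed and reference corpora; the real test_significance and measure_effect_size functions are looked up in settings_global.

import collections

tokens_observed = 'apple banana apple cherry apple'.split()
tokens_ref = 'banana cherry cherry date banana'.split()

freq_observed = collections.Counter(tokens_observed)
freq_ref = collections.Counter(tokens_ref)

len_observed = len(tokens_observed)
len_ref = len(tokens_ref)

for token in freq_observed:
    # 2x2 contingency table, matching c11/c12/c21/c22 above:
    #                 observed corpus    reference corpus
    #   token         c11                c12
    #   other tokens  c21                c22
    c11 = freq_observed.get(token, 0)
    c12 = freq_ref.get(token, 0)
    c21 = len_observed - c11
    c22 = len_ref - c12

    # Toy keyness score: add-one smoothed ratio of relative frequencies.
    keyness = ((c11 + 1) / (len_observed + 1)) / ((c12 + 1) / (len_ref + 1))

    print(token, (c11, c12, c21, c22), round(keyness, 2))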