def test_text_tokenized_tagged_both():
    file = new_file(text_type=('tokenized', 'tagged_both'))

    text_flat_tokens = wordless_text.Wordless_Text(main, file, flat_tokens=True)
    text = wordless_text.Wordless_Text(main, file, flat_tokens=False)

    assert text_flat_tokens.tokens_flat != []
    assert text_flat_tokens.tags_pos != [[]] * len(text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_non_pos != [[]] * len(text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_all != [[]] * len(text_flat_tokens.tokens_flat)
    assert text_flat_tokens.offsets_paras == [0]
    assert text_flat_tokens.offsets_sentences == [0]
    assert text_flat_tokens.offsets_clauses == [0]

    assert (len(text_flat_tokens.tokens_flat)
            == len(text_flat_tokens.tags_pos)
            == len(text_flat_tokens.tags_non_pos)
            == len(text_flat_tokens.tags_all))
    assert len(text_flat_tokens.offsets_paras) == len(text_flat_tokens.tokens_hierarchical)
    assert len(text_flat_tokens.offsets_sentences) == sum(
        [len(para) for para in text_flat_tokens.tokens_hierarchical])
    assert len(text_flat_tokens.offsets_clauses) == sum(
        [len(sentence)
         for para in text_flat_tokens.tokens_hierarchical
         for sentence in para])

    assert text.tokens_flat != []
    assert text.tags_pos != [[]] * len(text.tokens_flat)
    assert text.tags_non_pos != [[]] * len(text.tokens_flat)
    assert text.tags_all != [[]] * len(text.tokens_flat)
    assert text.offsets_paras != [0]
    assert text.offsets_sentences != [0]
    assert text.offsets_clauses != [0]

    assert (len(text.tokens_flat)
            == len(text.tags_pos)
            == len(text.tags_non_pos)
            == len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_hierarchical)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_hierarchical])
    assert len(text.offsets_clauses) == sum(
        [len(sentence)
         for para in text.tokens_hierarchical
         for sentence in para])
def testing_text(title, file, tokens_only=True):
    text = wordless_text.Wordless_Text(main, file, tokens_only=tokens_only)

    if tokens_only:
        print(f'---------- {title} [Tokens Only] ----------')
    else:
        print(f'---------- {title} ----------')

    print(f'Tokens ({len(text.tokens)}):')
    print(f'\t{text.tokens}')
    print(f'POS Tags ({len(text.tags_pos)}):')
    print(f'\t{text.tags_pos}')
    print(f'Non-POS Tags ({len(text.tags_non_pos)}):')
    print(f'\t{text.tags_non_pos}')
    print(f'All Tags ({len(text.tags_all)}):')
    print(f'\t{text.tags_all}')
    print(f'Paragraph Offsets ({len(text.para_offsets)}):')
    print(f'\t{text.para_offsets}')
    print(f'Sentence Offsets ({len(text.sentence_offsets)}):')
    print(f'\t{text.sentence_offsets}')
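# Usage sketch only (not from the original source): how testing_text might be
# called, assuming the module-level `main` and the `new_file` helper used by the
# tests below. The file_name/text_type values and the title are illustrative.
if __name__ == '__main__':
    file = new_file(file_name='tokenized_tagged_both',
                    text_type=('tokenized', 'tagged_both'))

    # Print both views of the same file for manual inspection
    testing_text('Tokenized / Tagged (Both)', file, tokens_only=True)
    testing_text('Tokenized / Tagged (Both)', file, tokens_only=False)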
def process_data(self):
    texts = []

    settings = self.main.settings_custom['wordlist']
    files = self.main.wordless_files.get_selected_files()

    # Frequency
    for file in files:
        text = wordless_text.Wordless_Text(self.main, file)
        tokens = wordless_token_processing.wordless_process_tokens_wordlist(
            text, token_settings=settings['token_settings'])

        texts.append(text)
        self.tokens_freq_files.append(collections.Counter(tokens))

    # Total
    if len(files) > 1:
        text_total = wordless_text.Wordless_Text_Blank()
        text_total.tokens = [token for text in texts for token in text.tokens]

        texts.append(text_total)
        self.tokens_freq_files.append(sum(self.tokens_freq_files, collections.Counter()))

    self.progress_updated.emit(self.tr('Processing data ...'))

    # Dispersion & Adjusted Frequency
    text_measure_dispersion = settings['generation_settings']['measure_dispersion']
    text_measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq']

    measure_dispersion = self.main.settings_global['measures_dispersion'][text_measure_dispersion]['func']
    measure_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][text_measure_adjusted_freq]['func']

    tokens_total = self.tokens_freq_files[-1].keys()

    for text in texts:
        tokens_stats_file = {}

        # Dispersion
        number_sections = self.main.settings_custom['measures']['dispersion']['general']['number_sections']

        sections_freq = [
            collections.Counter(section)
            for section in wordless_text_utils.to_sections(text.tokens, number_sections)
        ]

        for token in tokens_total:
            counts = [section_freq[token] for section_freq in sections_freq]

            tokens_stats_file[token] = [measure_dispersion(counts)]

        # Adjusted Frequency
        if not self.main.settings_custom['measures']['adjusted_freq']['general']['use_same_settings_dispersion']:
            number_sections = self.main.settings_custom['measures']['adjusted_freq']['general']['number_sections']

            sections_freq = [
                collections.Counter(section)
                for section in wordless_text_utils.to_sections(text.tokens, number_sections)
            ]

        for token in tokens_total:
            counts = [section_freq[token] for section_freq in sections_freq]

            tokens_stats_file[token].append(measure_adjusted_freq(counts))

        self.tokens_stats_files.append(tokens_stats_file)

    if len(files) == 1:
        self.tokens_freq_files *= 2
        self.tokens_stats_files *= 2
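# Illustrative sketch only (not part of the module above): a stand-in for the
# `measure_dispersion` / `measure_adjusted_freq` callables looked up from
# `settings_global`, assuming they receive a list of per-section counts as in
# the loop above. Juilland's D and U are used purely as an example of such a
# pair of measures.
import statistics


def juillands_d(counts):
    """Juilland's D over per-section frequencies: 1 - CV / sqrt(n - 1)."""
    if sum(counts) == 0:
        return 0

    # Coefficient of variation of the counts across sections
    cv = statistics.pstdev(counts) / statistics.mean(counts)

    return 1 - cv / (len(counts) - 1) ** 0.5


def juillands_u(counts):
    """Juilland's U: dispersion-adjusted frequency, D * total frequency."""
    return juillands_d(counts) * sum(counts)


# e.g. a token occurring [5, 3, 4, 4, 4] times across 5 sections is spread
# fairly evenly, so D is close to 1 and U stays close to the raw frequency 20.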
def test_text_tokenized_tagged_both():
    file = new_file(file_name='tokenized_tagged_both',
                    text_type=('tokenized', 'tagged_both'))

    text_tokens_only = wordless_text.Wordless_Text(main, file, tokens_only=True)
    text = wordless_text.Wordless_Text(main, file, tokens_only=False)

    assert text_tokens_only.tokens == [
        '', 'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first',
        'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global',
        'lingua', 'franca', '.', 'Named', 'after', 'the', 'Angles', ',', 'one', 'of',
        'the', 'Germanic', 'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',', 'England', ',',
        'both', 'names', 'ultimately', 'deriving', 'from', 'the', 'Anglia', 'peninsula', 'in', 'the',
        'Baltic', 'Sea', '.', 'It', 'is', 'closely', 'related', 'to', 'the', 'Frisian',
        'languages', ',', 'but', 'its', 'vocabulary', 'has', 'been', 'significantly', 'influenced', 'by',
        'other', 'Germanic', 'languages', ',', 'particularly', 'Norse', '(', 'a', 'North', 'Germanic',
        'language', ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text_tokens_only.tags_pos == [
        [], ['_JJ'], ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'], ['_NN'], ['_IN'], ['_VBD'], ['_RB'],
        ['_VBN'], ['_IN'], ['_RB'], ['_JJ'], ['_NN'], ['_CC'], ['_RB'], ['_VBD'], ['_DT'], ['_JJ'],
        ['_FW'], ['_FW'], ['_.'], ['_VBN'], ['_IN'], ['_DT'], ['_NNS'], ['_,'], ['_CD'], ['_IN'],
        ['_DT'], ['_JJ'], ['_NNS'], ['_IN'], ['_VBN'], ['_IN'], ['_DT'], ['_NN'], ['_IN'], ['_JJ'],
        ['_NNP'], ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'], ['_NN'], ['_,'], ['_NN'], ['_,'],
        ['_DT'], ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'], ['_NNP'], ['_NN'], ['_IN'], ['_DT'],
        ['_JJ'], ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'], ['_RB'], ['_VBN'], ['_IN'], ['_DT'], ['_NNP'],
        ['_NNS'], ['_,'], ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'], ['_IN'],
        ['_JJ'], ['_JJ'], ['_NNS'], ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'], ['_DT'], ['_JJ'], ['_JJ'],
        ['_NN'], ['_-RRB-'], ['_,'], ['_CC'], ['_IN'], ['_DT'], ['_JJR'], ['_NN'], ['_JJ'], ['_CC'],
        ['_JJ'], ['_.']
    ]
    assert text_tokens_only.tags_non_pos == [
        ['<TAG1>', '<TAG2>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        ['[4]', '[5]'],
        [], [], [], [], [], [], [],
        [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [],
        ['[6]', '<TAG4>', '<TAG5>']
    ]
    assert text_tokens_only.tags_all == [
        ['<TAG1>', '<TAG2>'], ['_JJ'], ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'], ['_NN'], ['_IN'], ['_VBD'], ['_RB'],
        ['_VBN'], ['_IN'], ['_RB'], ['_JJ'], ['_NN'], ['_CC'], ['_RB'], ['_VBD'], ['_DT'], ['_JJ'],
        ['_FW'], ['_FW'], ['_.', '[4]', '[5]'], ['_VBN'], ['_IN'], ['_DT'], ['_NNS'], ['_,'], ['_CD'], ['_IN'],
        ['_DT'], ['_JJ'], ['_NNS', '<TAG3>'], ['_IN'], ['_VBN'], ['_IN'], ['_DT'], ['_NN'], ['_IN'], ['_JJ'],
        ['_NNP'], ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'], ['_NN'], ['_,'], ['_NN'], ['_,'],
        ['_DT'], ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'], ['_NNP'], ['_NN'], ['_IN'], ['_DT'],
        ['_JJ'], ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'], ['_RB', '<TAG3>'], ['_VBN'], ['_IN'], ['_DT'], ['_NNP'],
        ['_NNS'], ['_,'], ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'], ['_IN'],
        ['_JJ', '<TAG3>'], ['_JJ'], ['_NNS'], ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'], ['_DT'], ['_JJ'], ['_JJ'],
        ['_NN'], ['_-RRB-'], ['_,'], ['_CC'], ['_IN'], ['_DT'], ['_JJR'], ['_NN'], ['_JJ'], ['_CC'],
        ['_JJ'], ['_.', '[6]', '<TAG4>', '<TAG5>']
    ]
    assert text_tokens_only.offsets_paras == [0]
    assert text_tokens_only.offsets_sentences == [0]
    assert text_tokens_only.offsets_clauses == [0, 28, 48, 50, 72, 84, 93]

    assert (len(text_tokens_only.tokens)
            == len(text_tokens_only.tags_pos)
            == len(text_tokens_only.tags_non_pos)
            == len(text_tokens_only.tags_all))
    assert len(text_tokens_only.offsets_paras) == len(text_tokens_only.tokens_sentences_paras)
    assert len(text_tokens_only.offsets_sentences) == sum(
        [len(para) for para in text_tokens_only.tokens_sentences_paras])

    assert text.tokens == [
        '', 'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first',
        'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global',
        'lingua', 'franca', '.', 'Named', 'after', 'the', 'Angles', ',', 'one', 'of',
        'the', 'Germanic', 'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',', 'England', ',',
        'both', 'names', 'ultimately', 'deriving', 'from', 'the', 'Anglia', 'peninsula', 'in', 'the',
        'Baltic', 'Sea', '.', 'It', 'is', 'closely', 'related', 'to', 'the', 'Frisian',
        'languages', ',', 'but', 'its', 'vocabulary', 'has', 'been', 'significantly', 'influenced', 'by',
        'other', 'Germanic', 'languages', ',', 'particularly', 'Norse', '(', 'a', 'North', 'Germanic',
        'language', ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text.tags_pos == [
        [], ['_JJ'], ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'], ['_NN'], ['_IN'], ['_VBD'], ['_RB'],
        ['_VBN'], ['_IN'], ['_RB'], ['_JJ'], ['_NN'], ['_CC'], ['_RB'], ['_VBD'], ['_DT'], ['_JJ'],
        ['_FW'], ['_FW'], ['_.'], ['_VBN'], ['_IN'], ['_DT'], ['_NNS'], ['_,'], ['_CD'], ['_IN'],
        ['_DT'], ['_JJ'], ['_NNS'], ['_IN'], ['_VBN'], ['_IN'], ['_DT'], ['_NN'], ['_IN'], ['_JJ'],
        ['_NNP'], ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'], ['_NN'], ['_,'], ['_NN'], ['_,'],
        ['_DT'], ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'], ['_NNP'], ['_NN'], ['_IN'], ['_DT'],
        ['_JJ'], ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'], ['_RB'], ['_VBN'], ['_IN'], ['_DT'], ['_NNP'],
        ['_NNS'], ['_,'], ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'], ['_IN'],
        ['_JJ'], ['_JJ'], ['_NNS'], ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'], ['_DT'], ['_JJ'], ['_JJ'],
        ['_NN'], ['_-RRB-'], ['_,'], ['_CC'], ['_IN'], ['_DT'], ['_JJR'], ['_NN'], ['_JJ'], ['_CC'],
        ['_JJ'], ['_.']
    ]
    assert text.tags_non_pos == [
        ['<TAG1>', '<TAG2>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        ['[4]', '[5]'],
        [], [], [], [], [], [], [],
        [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        ['<TAG3>'],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [], [],
        [], [], [], [], [], [],
        ['[6]', '<TAG4>', '<TAG5>']
    ]
    assert text.tags_all == [
        ['<TAG1>', '<TAG2>'], ['_JJ'], ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'], ['_NN'], ['_IN'], ['_VBD'], ['_RB'],
        ['_VBN'], ['_IN'], ['_RB'], ['_JJ'], ['_NN'], ['_CC'], ['_RB'], ['_VBD'], ['_DT'], ['_JJ'],
        ['_FW'], ['_FW'], ['_.', '[4]', '[5]'], ['_VBN'], ['_IN'], ['_DT'], ['_NNS'], ['_,'], ['_CD'], ['_IN'],
        ['_DT'], ['_JJ'], ['_NNS', '<TAG3>'], ['_IN'], ['_VBN'], ['_IN'], ['_DT'], ['_NN'], ['_IN'], ['_JJ'],
        ['_NNP'], ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'], ['_NN'], ['_,'], ['_NN'], ['_,'],
        ['_DT'], ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'], ['_NNP'], ['_NN'], ['_IN'], ['_DT'],
        ['_JJ'], ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'], ['_RB', '<TAG3>'], ['_VBN'], ['_IN'], ['_DT'], ['_NNP'],
        ['_NNS'], ['_,'], ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'], ['_IN'],
        ['_JJ', '<TAG3>'], ['_JJ'], ['_NNS'], ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'], ['_DT'], ['_JJ'], ['_JJ'],
        ['_NN'], ['_-RRB-'], ['_,'], ['_CC'], ['_IN'], ['_DT'], ['_JJR'], ['_NN'], ['_JJ'], ['_CC'],
        ['_JJ'], ['_.', '[6]', '<TAG4>', '<TAG5>']
    ]
    assert text.offsets_paras == [0]
    assert text.offsets_sentences == [0, 23, 63]
    assert text.offsets_clauses == [0, 23, 28, 48, 50, 63, 72, 84, 93]

    assert (len(text.tokens)
            == len(text.tags_pos)
            == len(text.tags_non_pos)
            == len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_sentences_paras)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_sentences_paras])
def test_text_untokenized_untagged():
    file = new_file(file_name='untokenized_untagged',
                    text_type=('untokenized', 'untagged'))

    text_tokens_only = wordless_text.Wordless_Text(main, file, tokens_only=True)
    text = wordless_text.Wordless_Text(main, file, tokens_only=False)

    assert text_tokens_only.tokens == [
        'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken',
        'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua',
        'franca', '.', 'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the',
        'Germanic', 'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great', 'Britain',
        'that', 'would', 'later', 'take', 'their', 'name', ',', 'England', ',', 'both',
        'names', 'ultimately', 'deriving', 'from', 'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic',
        'Sea', '.', 'It', 'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages',
        ',', 'but', 'its', 'vocabulary', 'has', 'been', 'significantly', 'influenced', 'by', 'other',
        'Germanic', 'languages', ',', 'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and', 'French',
        '.'
    ]
    assert text_tokens_only.tags_pos == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.tags_non_pos == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.tags_all == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.offsets_paras == [0]
    assert text_tokens_only.offsets_sentences == [0]
    assert text_tokens_only.offsets_clauses == [0, 27, 47, 49, 71, 83, 92]

    assert (len(text_tokens_only.tokens)
            == len(text_tokens_only.tags_pos)
            == len(text_tokens_only.tags_non_pos)
            == len(text_tokens_only.tags_all))
    assert len(text_tokens_only.offsets_paras) == len(text_tokens_only.tokens_sentences_paras)
    assert len(text_tokens_only.offsets_sentences) == sum(
        [len(para) for para in text_tokens_only.tokens_sentences_paras])

    assert text.tokens == [
        'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken',
        'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua',
        'franca', '.', 'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the',
        'Germanic', 'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great', 'Britain',
        'that', 'would', 'later', 'take', 'their', 'name', ',', 'England', ',', 'both',
        'names', 'ultimately', 'deriving', 'from', 'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic',
        'Sea', '.', 'It', 'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages',
        ',', 'but', 'its', 'vocabulary', 'has', 'been', 'significantly', 'influenced', 'by', 'other',
        'Germanic', 'languages', ',', 'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and', 'French',
        '.'
    ]
    assert text.tags_pos == [[]] * len(text.tokens)
    assert text.tags_non_pos == [[]] * len(text.tokens)
    assert text.tags_all == [[]] * len(text.tokens)
    assert text.offsets_paras == [0]
    assert text.offsets_sentences == [0, 22, 62]
    assert text.offsets_clauses == [0, 22, 27, 47, 49, 62, 71, 83, 92]

    assert (len(text.tokens)
            == len(text.tags_pos)
            == len(text.tags_non_pos)
            == len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_sentences_paras)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_sentences_paras])
def process_data(self):
    texts = []

    settings = self.main.settings_custom['keywords']

    ref_file = self.main.wordless_files.find_file_by_name(
        settings['generation_settings']['ref_file'], selected_only=True)
    files = [
        file
        for file in self.main.wordless_files.get_selected_files()
        if file != ref_file
    ]

    # Frequency
    for i, file in enumerate([ref_file] + files):
        text = wordless_text.Wordless_Text(self.main, file)
        tokens = wordless_token_processing.wordless_process_tokens_wordlist(
            text, token_settings=settings['token_settings'])

        self.keywords_freq_files.append(collections.Counter(tokens))

        if i > 0:
            texts.append(text)
        else:
            tokens_ref = text.tokens
            len_tokens_ref = len(tokens_ref)

    # Total
    if len(files) > 1:
        text_total = wordless_text.Wordless_Text_Blank()
        text_total.tokens = [token for text in texts for token in text.tokens]

        texts.append(text_total)
        self.keywords_freq_files.append(sum(self.keywords_freq_files, collections.Counter()))
        self.keywords_freq_files[0] = {
            token: freq
            for token, freq in self.keywords_freq_files[0].items()
            if token in text_total.tokens
        }
    else:
        self.keywords_freq_files[0] = {
            token: freq
            for token, freq in self.keywords_freq_files[0].items()
            if token in self.keywords_freq_files[1]
        }

    self.progress_updated.emit(self.tr('Processing data ...'))

    # Keyness
    text_test_significance = settings['generation_settings']['test_significance']
    text_measure_effect_size = settings['generation_settings']['measure_effect_size']

    test_significance = self.main.settings_global['tests_significance']['keywords'][text_test_significance]['func']
    measure_effect_size = self.main.settings_global['measures_effect_size']['keywords'][text_measure_effect_size]['func']

    keywords_freq_file_observed = self.keywords_freq_files[-1]
    keywords_freq_file_ref = self.keywords_freq_files[0]

    for text in texts:
        keywords_stats_file = {}

        tokens_observed = text.tokens
        len_tokens_observed = len(tokens_observed)

        if text_test_significance in [self.tr('Student\'s t-test (Two-sample)'),
                                      self.tr('Mann-Whitney U Test')]:
            # Test Statistic, p-value & Bayes Factor
            if text_test_significance == self.tr('Student\'s t-test (Two-sample)'):
                number_sections = self.main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['number_sections']
                use_data = self.main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['use_data']
            elif text_test_significance == self.tr('Mann-Whitney U Test'):
                number_sections = self.main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['number_sections']
                use_data = self.main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['use_data']

            sections_observed = wordless_text_utils.to_sections(tokens_observed, number_sections)
            sections_ref = wordless_text_utils.to_sections(tokens_ref, number_sections)

            sections_freq_observed = [collections.Counter(section) for section in sections_observed]
            sections_freq_ref = [collections.Counter(section) for section in sections_ref]

            len_sections_observed = [len(section) for section in sections_observed]
            len_sections_ref = [len(section) for section in sections_ref]

            if use_data == self.tr('Absolute Frequency'):
                for token in keywords_freq_file_observed:
                    counts_observed = [section_freq.get(token, 0)
                                       for section_freq in sections_freq_observed]
                    counts_ref = [section_freq.get(token, 0)
                                  for section_freq in sections_freq_ref]

                    keywords_stats_file[token] = test_significance(self.main, counts_observed, counts_ref)
            elif use_data == self.tr('Relative Frequency'):
                for token in keywords_freq_file_observed:
                    counts_observed = [
                        section_freq.get(token, 0) / len_sections_observed[i]
                        for i, section_freq in enumerate(sections_freq_observed)
                    ]
                    counts_ref = [
                        section_freq.get(token, 0) / len_sections_ref[i]
                        for i, section_freq in enumerate(sections_freq_ref)
                    ]

                    keywords_stats_file[token] = test_significance(self.main, counts_observed, counts_ref)

            # Effect Size
            for token in keywords_freq_file_observed:
                c11 = keywords_freq_file_observed.get(token, 0)
                c12 = keywords_freq_file_ref.get(token, 0)
                c21 = len_tokens_observed - c11
                c22 = len_tokens_ref - c12

                keywords_stats_file[token].append(measure_effect_size(self.main, c11, c12, c21, c22))
        else:
            for token in keywords_freq_file_observed:
                c11 = keywords_freq_file_observed.get(token, 0)
                c12 = keywords_freq_file_ref.get(token, 0)
                c21 = len_tokens_observed - c11
                c22 = len_tokens_ref - c12

                # Test Statistic, p-value & Bayes Factor
                keywords_stats_file[token] = test_significance(self.main, c11, c12, c21, c22)

                # Effect Size
                keywords_stats_file[token].append(measure_effect_size(self.main, c11, c12, c21, c22))

        self.keywords_stats_files.append(keywords_stats_file)

    if len(files) == 1:
        self.keywords_freq_files.append(self.keywords_freq_files[1])
        self.keywords_stats_files *= 2
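# Illustrative sketch only (not from the original source): a stand-in for one
# of the `test_significance` callables invoked above on the 2x2 contingency
# counts (c11/c12 = token frequency in the observed and reference corpora,
# c21/c22 = all remaining tokens in each corpus). The log-likelihood ratio is
# used here purely as an example of such a keyness test; `main` is accepted
# only to mirror the call signature above, and the return value is a list so
# that an effect size can be appended afterwards.
import math

from scipy import stats


def log_likelihood_ratio_test(main, c11, c12, c21, c22):
    n1 = c11 + c21  # size of the observed corpus
    n2 = c12 + c22  # size of the reference corpus

    # Expected frequencies under the null hypothesis of equal relative frequency
    e1 = n1 * (c11 + c12) / (n1 + n2)
    e2 = n2 * (c11 + c12) / (n1 + n2)

    # G2 = 2 * sum(O * ln(O / E)), skipping zero observed frequencies
    g2 = 2 * sum(o * math.log(o / e)
                 for o, e in [(c11, e1), (c12, e2)]
                 if o > 0)

    # p-value from the chi-squared distribution with 1 degree of freedom
    p_value = 1 - stats.chi2.cdf(g2, df=1)

    return [g2, p_value]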
def process_data(self):
    texts = []

    settings = self.main.settings_custom['overview']
    files = self.main.wordless_files.get_selected_files()

    for i, file in enumerate(files):
        text = wordless_text.Wordless_Text(self.main, file, flat_tokens=False)
        wordless_token_processing.wordless_process_tokens_overview(
            text, token_settings=settings['token_settings'])

        texts.append(text)

    if len(files) > 1:
        text_total = wordless_text.Wordless_Text_Blank()
        text_total.offsets_paras = [offset for text in texts for offset in text.offsets_paras]
        text_total.offsets_sentences = [offset for text in texts for offset in text.offsets_sentences]
        text_total.offsets_clauses = [offset for text in texts for offset in text.offsets_clauses]
        text_total.tokens_hierarchical = [para for text in texts for para in text.tokens_hierarchical]
        text_total.tokens_flat = [token for text in texts for token in text.tokens_flat]

        texts.append(text_total)
    else:
        texts.append(texts[0])

    self.progress_updated.emit(self.tr('Processing data ...'))

    base_sttr = settings['generation_settings']['base_sttr']

    for text in texts:
        texts_stats_file = []

        # Paragraph length
        len_paras_in_sentence = [len(para) for para in text.tokens_hierarchical]
        len_paras_in_clause = [
            sum([len(sentence) for sentence in para])
            for para in text.tokens_hierarchical
        ]
        len_paras_in_token = [
            sum([len(clause) for sentence in para for clause in sentence])
            for para in text.tokens_hierarchical
        ]

        # Sentence length
        len_sentences = [
            sum([len(clause) for clause in sentence])
            for para in text.tokens_hierarchical
            for sentence in para
        ]

        # Clause length
        len_clauses = [
            len(clause)
            for para in text.tokens_hierarchical
            for sentence in para
            for clause in sentence
        ]

        # Token length
        len_tokens = [len(token) for token in text.tokens_flat]
        # Type length
        len_types = [len(token_type) for token_type in set(text.tokens_flat)]

        count_tokens = len(len_tokens)
        count_types = len(len_types)

        # TTR
        if count_tokens == 0:
            ttr = 0
        else:
            ttr = count_types / count_tokens

        # STTR
        if count_tokens < base_sttr:
            sttr = ttr
        else:
            token_sections = wordless_text_utils.to_sections_unequal(text.tokens_flat, base_sttr)

            # Discard the last section if number of tokens in it is smaller than the base of sttr
            if len(token_sections[-1]) < base_sttr:
                ttrs = [
                    len(set(token_section)) / len(token_section)
                    for token_section in token_sections[:-1]
                ]
            else:
                ttrs = [
                    len(set(token_section)) / len(token_section)
                    for token_section in token_sections
                ]

            sttr = sum(ttrs) / len(ttrs)

        texts_stats_file.append(len_paras_in_sentence)
        texts_stats_file.append(len_paras_in_clause)
        texts_stats_file.append(len_paras_in_token)
        texts_stats_file.append(len_sentences)
        texts_stats_file.append(len_clauses)
        texts_stats_file.append(len_tokens)
        texts_stats_file.append(len_types)
        texts_stats_file.append(ttr)
        texts_stats_file.append(sttr)

        self.texts_stats_files.append(texts_stats_file)
def process_data(self):
    texts = []

    settings = self.main.settings_custom['overview']
    files = self.main.wordless_files.get_selected_files()

    for i, file in enumerate(files):
        text = wordless_text.Wordless_Text(self.main, file, tokens_only=False)
        text.tokens = wordless_token_processing.wordless_process_tokens_overview(
            text, token_settings=settings['token_settings'])

        texts.append(text)

    if len(files) > 1:
        text_total = wordless_text.Wordless_Text_Blank()
        text_total.para_offsets = [offset for text in texts for offset in text.para_offsets]
        text_total.sentence_offsets = [offset for text in texts for offset in text.sentence_offsets]
        text_total.tokens = [token for text in texts for token in text.tokens]

        texts.append(text_total)
    else:
        texts.append(texts[0])

    self.progress_updated.emit(self.tr('Processing data ...'))

    base_sttr = settings['generation_settings']['base_sttr']

    for text in texts:
        texts_stats_file = []

        count_paras = len(text.para_offsets)
        count_sentences = len(text.sentence_offsets)
        count_tokens = len(text.tokens)
        count_types = len(set(text.tokens))

        len_tokens = [len(token) for token in text.tokens]
        self.texts_len_tokens_files.append(collections.Counter(len_tokens))

        count_chars = sum(len_tokens)

        if count_tokens == 0:
            ttr = 0
        else:
            ttr = count_types / count_tokens

        if count_tokens < base_sttr:
            sttr = ttr
        else:
            token_sections = wordless_text_utils.to_sections_unequal(text.tokens, base_sttr)

            # Discard the last section if number of tokens in it is smaller than the base of sttr
            if len(token_sections[-1]) < base_sttr:
                ttrs = [len(set(token_section)) / len(token_section)
                        for token_section in token_sections[:-1]]
            else:
                ttrs = [len(set(token_section)) / len(token_section)
                        for token_section in token_sections]

            sttr = sum(ttrs) / len(ttrs)

        texts_stats_file.append(count_paras)
        texts_stats_file.append(count_sentences)
        texts_stats_file.append(count_tokens)
        texts_stats_file.append(count_types)
        texts_stats_file.append(count_chars)
        texts_stats_file.append(ttr)
        texts_stats_file.append(sttr)

        self.texts_stats_files.append(texts_stats_file)
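# Illustrative sketch only (not from the original source): standalone versions
# of the TTR/STTR computation performed inline in the two process_data methods
# above, assuming wordless_text_utils.to_sections_unequal splits the token list
# into consecutive fixed-size chunks with a possibly shorter final chunk.
def ttr(tokens):
    """Type-token ratio: number of distinct tokens / number of tokens."""
    return len(set(tokens)) / len(tokens) if tokens else 0


def sttr(tokens, base=1000):
    """Standardized TTR: mean TTR over consecutive sections of `base` tokens."""
    if len(tokens) < base:
        return ttr(tokens)

    sections = [tokens[i:i + base] for i in range(0, len(tokens), base)]

    # Discard the last section if it is shorter than the base
    if len(sections[-1]) < base:
        sections = sections[:-1]

    ttrs = [ttr(section) for section in sections]

    return sum(ttrs) / len(ttrs)


# e.g. sttr(text.tokens_flat, base_sttr) should match the value appended to
# texts_stats_file above, given the chunking assumption stated in the comment.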