def index(self):
    '''
    Render the index page for a randomly chosen board.

    Two random rows are read from pgt_board. The first row supplies the
    word and centre letter handed to pk.get_koru; the first column of the
    second row is passed to the template as the word to display.

    Returns the rendered 'index.html' template.

    Raises IndexError if the query returns fewer than two rows.
    '''
    random_boards_query = ' '.join((
        "SELECT * FROM pgt_board",
        "ORDER BY RANDOM() LIMIT 2",
    ))

    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly even
        # when the query fails.
        with connection:
            with connection.cursor() as cursor:
                cursor.execute(random_boards_query)
                word_centre_letter = cursor.fetchall()  # list of up to 2 tuples
    finally:
        connection.close()

    # NOTE(review): the columns are presumably (word, centre letter) as
    # strings, in which case ''.join leaves them unchanged - confirm
    # against the pgt_board schema.
    word_for_board = ''.join(word_centre_letter[0][0])
    centre_letter_for_board = ''.join(word_centre_letter[0][1])
    koru = pk.get_koru(word_for_board, centre_letter_for_board)

    # the displayed word comes from the second random row
    word_for_display = ''.join(word_centre_letter[1][0])

    template_id = 'index'
    template = self.env.get_template(template_id + '.html')
    return template.render({"template_id": template_id,
                            "word": word_for_display,
                            "koru": koru})
def distribute_children():
    '''
    For each word, middle letter combination (board) distribute
    the child words into groups according to frequency and then
    remove those boards that don't have 'enough' in any of the 3 groups

    The frequency of each child word is looked up in pgt_word_frequency;
    a child word that is not in that table is given a frequency of 0.

    Returns a list of the keys of boards_and_children.boards_and_children
    for which every one of the three frequency groups contains at least
    MINIMUM_GROUP_SIZE child words.
    '''
    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly.
        with connection:
            with connection.cursor() as cursor:
                cursor.execute("SELECT * FROM pgt_word_frequency")
                word_frequency_pairs = cursor.fetchall()  # list of tuples
    finally:
        connection.close()

    # word -> frequency lookup; setdefault keeps the first row seen for a
    # word, which is the row a linear search of the list would have found
    frequency_by_word = {}
    for row in word_frequency_pairs:
        frequency_by_word.setdefault(row[0], row[1])

    # the groups are the same for every board, so build them only once
    intervals = (
        frozenset(range(PAI_MINIMUM_FREQUENCY, VERY_LARGE_NUMBER)),
        frozenset(range(TINO_PAI_MINIMUM_FREQUENCY, PAI_MINIMUM_FREQUENCY)),
        frozenset(range(TINO_PAI_RAWA_ATU_MINIMUM_FREQUENCY,
                        TINO_PAI_MINIMUM_FREQUENCY)),
    )

    boards = []
    for board, children in boards_and_children.boards_and_children.items():
        counts = [0] * len(intervals)
        for child_word in children:
            # a headword in the dictionary that doesn't appear anywhere
            # in the example text scores 0
            child_word_score = frequency_by_word.get(child_word, 0)
            for i, interval in enumerate(intervals):
                if child_word_score in interval:
                    counts[i] += 1

        # keep the board only if no group is too small
        if all(count >= MINIMUM_GROUP_SIZE for count in counts):
            boards.append(board)

    return boards
def get_word_frequency_distribution():
    '''
    For each word, middle letter combination split the data into groups

    The child words of every board in
    boards_and_children.boards_and_children are counted into three
    frequency groups (24 and above, 4 to 23, 0 to 3) using the data in
    pgt_word_frequency; a child word that is not in that table is given
    a frequency of 0.

    For every board with at least 4 child words in each group a line is
    printed with the board, the running number of such boards and the
    three group counts. Nothing is returned.
    '''
    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly.
        with connection:
            with connection.cursor() as cursor:
                cursor.execute("SELECT * FROM pgt_word_frequency")
                word_frequency_pairs = cursor.fetchall()  # list of tuples
    finally:
        connection.close()

    # word -> frequency lookup; setdefault keeps the first row seen for a
    # word, which is the row a linear search of the list would have found
    frequency_by_word = {}
    for row in word_frequency_pairs:
        frequency_by_word.setdefault(row[0], row[1])

    # the groups are the same for every board, so build them only once
    intervals = (
        frozenset(range(24, 100000)),
        frozenset(range(4, 24)),
        frozenset(range(4)),
    )

    qualifying_boards = 0
    for board, children in boards_and_children.boards_and_children.items():
        counts = [0] * len(intervals)
        for child_word in children:
            # a headword in the dictionary that doesn't appear anywhere
            # in the example text scores 0
            child_word_score = frequency_by_word.get(child_word, 0)
            for i, interval in enumerate(intervals):
                if child_word_score in interval:
                    counts[i] += 1

        if all(count >= 4 for count in counts):
            qualifying_boards += 1
            print(board, qualifying_boards, counts[0], counts[1], counts[2])
def group_children(children):
    '''
    This function takes a list of children and sorts them into
    three lists of tuples (word, frequency)
    - pai
    - tino pai
    - tino pai rawa atu

    The frequency of each child is the kount stored for it in
    pgt_word_frequency, or 0 when the child is not in that table.

    Returns the tuple (pai, tino_pai, tino_pai_rawa_atu); within each
    list the children keep the order in which they were passed in.

    Raises RuntimeError if a frequency falls outside all three groups.
    '''
    frequency_data_query = ' '.join((
        "SELECT kount FROM pgt_word_frequency",
        "WHERE word = (%s)",
    ))

    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    word_frequency_pairs = []
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly.
        with connection:
            with connection.cursor() as cursor:
                for child in children:
                    cursor.execute(frequency_data_query, (child,))
                    row = cursor.fetchone()
                    # no row means the child word is not in the word
                    # frequency list
                    frequency_to_use = 0 if row is None else int(row[0])
                    word_frequency_pairs.append((child, frequency_to_use))
    finally:
        connection.close()

    pai = []
    tino_pai = []
    tino_pai_rawa_atu = []

    # frequencies are plain ints here, so range membership is an exact
    # test that needs no large set to be built
    groups = (
        (range(PAI_MINIMUM_FREQUENCY, VERY_LARGE_NUMBER), pai),
        (range(TINO_PAI_MINIMUM_FREQUENCY, PAI_MINIMUM_FREQUENCY), tino_pai),
        (range(TINO_PAI_RAWA_ATU_MINIMUM_FREQUENCY,
               TINO_PAI_MINIMUM_FREQUENCY), tino_pai_rawa_atu),
    )

    for word, frequency in word_frequency_pairs:
        for interval, group in groups:
            if frequency in interval:
                group.append((word, frequency))
                break
        else:
            raise RuntimeError(
                "frequency {!r} of word {!r} is outside every group".format(
                    frequency, word))

    return pai, tino_pai, tino_pai_rawa_atu
def get_children(input_string, compulsory_letter, minimum_length=3):
    '''
    Returns a list containing all the word forms (children)
    that can be made from the input_string.

    The input string can be of one of two forms
    a) A Māori word
    b) A Koru
    If the latter then any digraphs on the last row need to be reversed.

    A word form from pgt_word is a child when it has at least
    minimum_length characters, contains compulsory_letter, and uses no
    letter more often than it occurs in the input string (letters as
    split by mw._aslist).

    Raises ValueError if minimum_length cannot be converted to an integer.
    '''
    # if minimum length is passed as a string *try* and convert to integer
    minimum_length = int(minimum_length)

    # if the input string contains any reversed digraphs, reverse them
    # note that this can only happen with a koru on the last line
    # these 2 are mutually exclusive and will not interfere with eachother
    # the length checks let input shorter than a koru through untouched
    # instead of failing with an IndexError
    if len(input_string) > 6 and \
            input_string[6] + input_string[5] in pū.digraphs:
        swap_at = 5
    elif len(input_string) > 5 and \
            input_string[5] + input_string[4] in pū.digraphs:
        swap_at = 4
    else:
        swap_at = None  # no action required, there are no reversed digraphs

    if swap_at is not None:
        letters = list(input_string)
        letters[swap_at], letters[swap_at + 1] = \
            letters[swap_at + 1], letters[swap_at]
        input_string = ''.join(letters)

    # the letters available are the same for every candidate word
    available_letters = Counter(mw._aslist(input_string))

    # get the word list
    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly even
        # when the query fails.
        with connection:
            with connection.cursor() as cursor:
                cursor.execute("SELECT * FROM pgt_word")
                unique_word_forms = cursor.fetchall()  # list of tuples
    finally:
        connection.close()

    # list of strings
    unique_word_forms = [''.join(x) for x in unique_word_forms]

    children = []
    for word in unique_word_forms:
        if len(word) < minimum_length:
            continue
        word_as_list = mw._aslist(word)
        if compulsory_letter not in word_as_list:
            continue
        # Counter subtraction keeps only positive counts, so an empty
        # result means the word needs no letter the input can't supply
        if not (Counter(word_as_list) - available_letters):
            children.append(word)

    return children
def test_pangakupu_words():
    '''
    Check the contents of the pgt_word table against known totals.

    Covers the number of word forms, their length distribution,
    uniqueness, lower case, legal letters, the round trip through
    mw.MaoriWord, and the letter distribution (digraphs counted as
    one letter).
    '''
    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection, so close it explicitly even
        # when the query fails.
        with connection:
            with connection.cursor() as cursor:
                cursor.execute("SELECT * FROM pgt_word")
                unique_word_forms = cursor.fetchall()  # list of tuples
    finally:
        connection.close()

    all_words_for_iwa = [''.join(x) for x in unique_word_forms]  # list of strings

    # word counts
    assert len(all_words_for_iwa) == 11601
    c = Counter(len(x) for x in all_words_for_iwa)
    assert dict(c) == {1: 9, 2: 57, 3: 255, 4: 1099, 5: 1169, 6: 2691,
                       7: 1568, 8: 1949, 9: 830, 10: 971, 11: 451,
                       12: 279, 13: 164, 14: 54, 15: 35, 16: 10,
                       17: 6, 18: 3, 19: 1}
    assert sum(c.values()) == 11601  # recheck the count
    assert sum(k * v for k, v in c.items()) == 83080  # letter counts

    assert len(set(all_words_for_iwa)) == 11601  # test for uniqueness

    # check every entry is lower case
    not_lower_case = [x for x in all_words_for_iwa if x.lower() != x]
    assert not not_lower_case, not_lower_case

    # check every entry is free of punctuation
    illegal_letters = [x for x in all_words_for_iwa
                       if not mw._isalllegalletters(x)]
    assert not illegal_letters, illegal_letters

    # check that the basics for all maori words hold
    for x in all_words_for_iwa:
        assert x == mw.MaoriWord(x).word, x

    # letter counts
    all_letters_for_iwa = []
    for x in all_words_for_iwa:
        all_letters_for_iwa.extend(mw._aslist(x))

    c = dict(Counter(all_letters_for_iwa))
    assert c == {'a': 14894, 'ā': 2252, 'e': 5125, 'ē': 281, 'h': 3970,
                 'i': 6765, 'ī': 627, 'k': 6882, 'm': 2406, 'n': 2002,
                 'ng': 1834, 'o': 5521, 'ō': 1216, 'p': 3733, 'r': 6270,
                 't': 5880, 'u': 5736, 'ū': 993, 'w': 1245, 'wh': 1807}
    assert sum(c.values()) == 79439  # digraphs count as 1 letter

    # cross check letter counts from words vs direct letter counts
    assert 83080 == 79439 + c['ng'] + c['wh']  # digraphs count as 2 letters