import json
import os

import numpy as np
from gensim.models import KeyedVectors
from greek_normalisation.normalise import Normaliser

# Load the character list, annotator list, and vector dictionary once at
# module level so repeated calls to tag() can reuse them.
print('Loading character list, annotator list, and vector dictionary...')
with open(os.path.join('data', 'jsons', 'all_norm_characters.json'), encoding='utf-8') as json_file:
    all_norm_characters = json.load(json_file)
with open(os.path.join('data', 'jsons', 'annotators.json'), encoding='utf-8') as json_file:
    all_annotators = json.load(json_file)
with open(os.path.join('data', 'jsons', 'short_annotators.json'), encoding='utf-8') as json_file:
    short_annotators = json.load(json_file)
wv = KeyedVectors.load('models/fasttext.wordvectors')
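
# The helper functions used below are not defined in this file. What follows
# are minimal sketches of their presumed behaviour, not the originals.
# `create_morph_classes` is likewise assumed to be defined elsewhere: it
# returns nine objects, one per morphological aspect, each carrying trained
# `lstm1`, `dnn`, and `lstm2` Keras models, a `tags` list, and empty
# `predicted_tags1/2/3` and `confidence1/2/3` lists; its internals cannot be
# inferred from this file, so no sketch is given for it.

def isolate_greek_punctuation(text):
    # Presumed behaviour: pad punctuation with spaces so that .split()
    # yields punctuation marks as separate tokens.
    for punc in ',.;·!':
        text = text.replace(punc, f' {punc} ')
    return text


def elision_normalize(token):
    # Presumed behaviour: standardize apostrophe-like characters that mark
    # elision to a single form.
    for apostrophe in ('\u2019', '\u02bc', '\u1fbd'):  # ’ ʼ ᾽
        token = token.replace(apostrophe, "'")
    return token


def vector_lookup(token):
    # Presumed behaviour: return the fastText vector for a normalised token,
    # falling back to a zero vector when the model cannot produce one.
    try:
        return wv[token]
    except KeyError:
        return np.zeros(wv.vector_size, dtype=np.float32)
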
def tag(greek_text, annotator='Vanessa Gorman'):
    """Take in a string of Greek text and return that text morphologically tagged."""
    print('Loading models...')
    pos, person, number, tense, mood, voice, gender, case, degree = create_morph_classes()
    morphs = (pos, person, number, tense, mood, voice, gender, case, degree)

    # Create the normalizer
    normalise = Normaliser().normalise

    # Create the annotator tensor. Unrecognized annotators fall back to the
    # first slot, which makes Vanessa Gorman the default annotator.
    annotator_tensor = [0] * 37
    try:
        annotator_tensor[all_annotators.index(annotator)] = 1
    except ValueError:
        annotator_tensor[0] = 1

    print('Pre-processing text...')
    blank_character_tensor = np.array([0] * 174, dtype=np.float32)
    punc_separated_text = isolate_greek_punctuation(greek_text)
    split_text = punc_separated_text.split()
    print(f'Text and punctuation split into {len(split_text)} individual tokens.')

    one_hotted_tokens = []
    dnn_input = []
    # LSTM2 reads a sliding window of 15 token vectors, so pad 7 blank tokens
    # onto each end of the text.
    blank_lstm2_token = np.array([0] * 192)
    lstm2_padding = np.tile(blank_lstm2_token, (7, 1))
    lstm2_input = []
    return_list = []

    # Create character tensors and word tensors composed of those character tensors
    for word in split_text:
        # The whole token tensor starts out blank because that is easier than
        # filling in the empty character slots afterwards.
        token_tensor = np.array([blank_character_tensor] * 21, dtype=np.float32)

        # Normalize each token before tensorizing its characters.
        normalized_form = normalise(elision_normalize(word))[0]
        token_length = len(normalized_form)

        # Create token tensors for tokens longer than 21 characters: the first
        # ten characters, a placeholder for the elided middle, then the last ten.
        if token_length > 21:
            token_tensor = []
            for character in normalized_form[:10]:
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    # Index 136 is the fallback slot for unknown characters.
                    character_tensor[136] = 1
                # Append the annotator tensor at the end of every character tensor
                character_tensor = np.array(character_tensor + annotator_tensor, dtype=np.float32)
                token_tensor.append(character_tensor)
            # Index 135 is the placeholder that stands in for the elided middle.
            character_tensor = [0] * 137
            character_tensor[135] = 1
            # Append the annotator tensor at the end of every character tensor
            character_tensor = np.array(character_tensor + annotator_tensor, dtype=np.float32)
            token_tensor.append(character_tensor)
            for character in normalized_form[-10:]:
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    character_tensor[136] = 1
                # Append the annotator tensor at the end of every character tensor
                character_tensor = np.array(character_tensor + annotator_tensor, dtype=np.float32)
                token_tensor.append(character_tensor)
            token_tensor = np.array(token_tensor, dtype=np.float32)

        # Create token tensors for tokens no longer than 21 characters,
        # right-aligned so the final character always fills the last slot.
        else:
            for i, character in enumerate(normalized_form):
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    character_tensor[136] = 1
                # Append the annotator tensor at the end of every character tensor
                character_tensor = np.array(character_tensor + annotator_tensor, dtype=np.float32)
                token_tensor[21 - token_length + i] = character_tensor

        # Add each tensorized token to the samples
        one_hotted_tokens.append(token_tensor)

    one_hots_np = np.array(one_hotted_tokens, dtype=np.float32)
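
    # Sanity check (an assumption based on the constants above): each token is
    # a 21-slot matrix of 137 character dims + 37 annotator dims = 174.
    assert one_hots_np.shape == (len(split_text), 21, 174)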
print("Angel's looking at each word by itself...") for aspect in morphs: aspect.output1 = aspect.lstm1.predict(one_hots_np) for aspect in morphs: for tensor in aspect.output1: try: aspect.predicted_tags1.append(aspect.tags[int( np.argmax(tensor))]) except IndexError: aspect.predicted_tags1.append('-') aspect.confidence1.append(np.amax(tensor)) for i, token in enumerate(punc_separated_text.split()): dnn_input.append( np.concatenate( (pos.output1[i], person.output1[i], number.output1[i], tense.output1[i], mood.output1[i], voice.output1[i], gender.output1[i], case.output1[i], degree.output1[i], annotator_tensor), axis=0)) np_dnn_input = np.array(dnn_input) # Run outputs through DNN print('Reconsidering tags...') for aspect in morphs: aspect.output2 = aspect.dnn.predict(np_dnn_input) for aspect in morphs: for tensor in aspect.output2: try: aspect.predicted_tags2.append(aspect.tags[int( np.argmax(tensor))]) except IndexError: aspect.predicted_tags2.append('-') aspect.confidence2.append(np.amax(tensor)) # Prepare inputs for LSTM2 for i, token in enumerate(punc_separated_text.split()): lstm2_input.append( np.concatenate( (pos.output2[i], person.output2[i], number.output2[i], tense.output2[i], mood.output2[i], voice.output2[i], gender.output2[i], case.output2[i], degree.output2[i], annotator_tensor, vector_lookup(normalise(elision_normalize(token))[0])), axis=0)) padded_lstm2_input = np.concatenate( (lstm2_padding, lstm2_input, lstm2_padding)) time_series = [] for i in range(0, len(padded_lstm2_input) - 14): time_series.append(padded_lstm2_input[i:i + 15]) lstm2_ts = np.array(time_series) # In the future, convert to a tf.data format: # dataset = tf.data.Dataset.from_tensor_slices(padded_lstm2_input).window(15, 1, 1) # Run outputs through LSTM2 print("Studying each word in light of its context...") for aspect in morphs: aspect.output3 = aspect.lstm2.predict(lstm2_ts) for aspect in morphs: for tensor in aspect.output3: try: aspect.predicted_tags3.append(aspect.tags[int( np.argmax(tensor))]) except IndexError: aspect.predicted_tags3.append('-') aspect.confidence3.append(np.amax(tensor)) for i, token in enumerate(punc_separated_text.split()): return_list.append( (token, pos.predicted_tags3[i] + person.predicted_tags3[i] + number.predicted_tags3[i] + tense.predicted_tags3[i] + mood.predicted_tags3[i] + voice.predicted_tags3[i] + gender.predicted_tags3[i] + case.predicted_tags3[i] + degree.predicted_tags3[i])) return tuple(return_list)
if flags & Norm.GRAVE: s += "g" if flags & Norm.EXTRA: s += "x" if flags & Norm.ELISION: s += "l" if flags & Norm.MOVABLE: s += "m" if s == "": s = "." return s normalise = Normaliser(config).normalise for chapter_num in range(1, 20): input_filename = f"text/lgpsi.sent.{chapter_num:03d}.txt" output_filename = f"analysis/lgpsi.sent.{chapter_num:03d}.norm.txt" with open(input_filename) as f, open(output_filename, "w") as g: for line in f: line = line.strip() ref, *text = line.split() text_list = [f"{ref}.text", *text] norm = [f"{ref}.norm"] flags = [f"{ref}.flags"] for token in text: norm_token, norm_flags = normalise(token.strip(",.;·«»()!")) norm.append(norm_token)