print('Loading models...')
pos, person, number, tense, mood, voice, gender, case, degree = create_morph_classes()
morphs = (pos, person, number, tense, mood, voice, gender, case, degree)

# Load character list, annotator list, and vector dictionary
print('Loading character list, annotator list, and vector dictionary...')
with open(os.path.join('data', 'jsons', 'all_norm_characters.json'), encoding='utf-8') as json_file:
    all_norm_characters = json.load(json_file)
with open(os.path.join('data', 'jsons', 'annotators.json'), encoding='utf-8') as json_file:
    all_annotators = json.load(json_file)
with open(os.path.join('data', 'jsons', 'short_annotators.json'), encoding='utf-8') as json_file:
    short_annotators = json.load(json_file)
wv = KeyedVectors.load('models/fasttext.wordvectors')

# Create the normalizer
normalise = Normaliser().normalise
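# normalise() returns a (normalised_form, flags) pair; only the form is used
# in these examples, hence the [0] indexing at each call site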

# Create annotator tensor
annotator_tensor = [0] * 37
try:
    annotator_tensor[all_annotators.index(annotator)] = 1

# Make Vanessa Gorman the default annotator
# (list.index raises ValueError, not IndexError, for a missing annotator)
except ValueError:
    annotator_tensor[0] = 1

print('Pre-processing text...')
blank_character_tensor = np.array([0]*174, dtype=np.float32)
punc_separated_text = isolate_greek_punctuation(greek_text)
split_text = punc_separated_text.split()
print(f'Text and punctuation split into {len(split_text)} individual tokens.')
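
# The helper isolate_greek_punctuation() used above is not defined in this
# excerpt. A minimal sketch of what such a helper might do, assuming it only
# pads Greek punctuation with spaces so that split() yields each mark as its
# own token (hypothetical body; only the name and call site come from the
# example -- the punctuation set mirrors the strip() call in Example #3):
#
# def isolate_greek_punctuation(text):
#     """Surround Greek punctuation with spaces so split() isolates it."""
#     for mark in ',.;·«»()!':
#         text = text.replace(mark, f' {mark} ')
#     return text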
Example #2
def tag(greek_text, annotator='Vanessa Gorman'):
    """Take in a string of Greek text and return that text morphologically tagged."""
    print('Loading models...')
    pos, person, number, tense, mood, voice, gender, case, degree = create_morph_classes()
    morphs = (pos, person, number, tense, mood, voice, gender, case, degree)
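
    # Tagging proceeds in three stages per morphological aspect: a
    # character-level LSTM (lstm1), a DNN over the concatenated per-aspect
    # LSTM outputs (dnn), and a second LSTM over 15-token context windows
    # (lstm2); see the three predict() passes below.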

    # Create the normalizer
    normalise = Normaliser().normalise

    # Create annotator tensor
    annotator_tensor = [0] * 37
    try:
        annotator_tensor[all_annotators.index(annotator)] = 1

    # Make Vanessa Gorman the default annotator
    # (list.index raises ValueError, not IndexError, for a missing annotator)
    except ValueError:
        annotator_tensor[0] = 1

    print('Pre-processing text...')
    blank_character_tensor = np.array([0] * 174, dtype=np.float32)
    punc_separated_text = isolate_greek_punctuation(greek_text)
    split_text = punc_separated_text.split()
    print(
        f'Text and punctuation split into {len(split_text)} individual tokens.'
    )
    one_hotted_tokens = []
    dnn_input = []
    blank_lstm2_token = np.array([0] * 192)
    lstm2_padding = np.tile(blank_lstm2_token, (7, 1))
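    # Seven rows of padding on each side later yield 15-step windows centered
    # on each real token (7 + 1 + 7 = 15); see the time_series loop below.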
    lstm2_input = []
    return_list = []

    # Create character tensors and word tensors composed of those character tensors
    for word in split_text:

        # The token tensor starts out as 21 blank characters; real characters
        # are filled in right-aligned below (index 21 - token_length + i).
        token_tensor = np.array([blank_character_tensor] * 21,
                                dtype=np.float32)

        # Normalize each token before tensorizing its characters.
        normalized_form = normalise(elision_normalize(word))[0]
        token_length = len(normalized_form)

        # Create token tensors for tokens longer than 21 characters:
        # keep the first 10 and last 10 characters, with one truncation
        # marker in between (10 + 1 + 10 = 21 positions)
        if token_length > 21:
            token_tensor = []
            for character in normalized_form[:10]:
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    # Unknown characters map to the reserved final slot (136)
                    character_tensor[136] = 1

                # Append the annotator tensor at the end of every character tensor
                character_tensor = character_tensor + annotator_tensor
                character_tensor = np.array(character_tensor, dtype=np.float32)
                token_tensor.append(character_tensor)
            # A single marker character (reserved index 135) stands in for
            # the elided middle of the over-long token
            character_tensor = [0] * 137
            character_tensor[135] = 1

            # Append the annotator tensor at the end of every character tensor
            character_tensor = character_tensor + annotator_tensor
            character_tensor = np.array(character_tensor, dtype=np.float32)
            token_tensor.append(character_tensor)
            for character in normalized_form[-10:]:
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    character_tensor[136] = 1

                # Append the annotator tensor at the end of every character tensor
                character_tensor = character_tensor + annotator_tensor
                character_tensor = np.array(character_tensor, dtype=np.float32)
                token_tensor.append(character_tensor)
            token_tensor = np.array(token_tensor, dtype=np.float32)

        # Create token tensors for tokens of 21 characters or fewer
        else:
            for i, character in enumerate(normalized_form):
                character_tensor = [0] * 137
                try:
                    character_tensor[all_norm_characters.index(character)] = 1
                except ValueError:
                    character_tensor[136] = 1

                # Append the annotator tensor at the end of every character tensor
                character_tensor = character_tensor + annotator_tensor
                character_tensor = np.array(character_tensor, dtype=np.float32)
                token_tensor[21 - token_length + i] = character_tensor

        # Add each tensor token to the samples
        one_hotted_tokens.append(token_tensor)
    one_hots_np = np.array(one_hotted_tokens, dtype=np.float32)
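    # Shape note (inferred from the constants above): one_hots_np has shape
    # (num_tokens, 21, 174), i.e. 21 character positions per token, each a
    # 137-dim character one-hot concatenated with the 37-dim annotator one-hot.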

    # Process through the first LSTM...
    print("Angel's looking at each word by itself...")
    for aspect in morphs:
        aspect.output1 = aspect.lstm1.predict(one_hots_np)

    for aspect in morphs:
        for tensor in aspect.output1:
            try:
                aspect.predicted_tags1.append(aspect.tags[int(
                    np.argmax(tensor))])
            except IndexError:
                aspect.predicted_tags1.append('-')
            aspect.confidence1.append(np.amax(tensor))

    for i in range(len(split_text)):
        dnn_input.append(
            np.concatenate(
                (pos.output1[i], person.output1[i], number.output1[i],
                 tense.output1[i], mood.output1[i], voice.output1[i],
                 gender.output1[i], case.output1[i], degree.output1[i],
                 annotator_tensor),
                axis=0))
    np_dnn_input = np.array(dnn_input)

    # Run outputs through DNN
    print('Reconsidering tags...')
    for aspect in morphs:
        aspect.output2 = aspect.dnn.predict(np_dnn_input)

    for aspect in morphs:
        for tensor in aspect.output2:
            try:
                aspect.predicted_tags2.append(aspect.tags[int(
                    np.argmax(tensor))])
            except IndexError:
                aspect.predicted_tags2.append('-')
            aspect.confidence2.append(np.amax(tensor))

    # Prepare inputs for LSTM2
    for i, token in enumerate(split_text):
        lstm2_input.append(
            np.concatenate(
                (pos.output2[i], person.output2[i], number.output2[i],
                 tense.output2[i], mood.output2[i], voice.output2[i],
                 gender.output2[i], case.output2[i], degree.output2[i],
                 annotator_tensor,
                 vector_lookup(normalise(elision_normalize(token))[0])),
                axis=0))

    padded_lstm2_input = np.concatenate(
        (lstm2_padding, lstm2_input, lstm2_padding))

    time_series = []
    for i in range(0, len(padded_lstm2_input) - 14):
        time_series.append(padded_lstm2_input[i:i + 15])

    lstm2_ts = np.array(time_series)

    # In the future, convert to a tf.data format:
    # dataset = tf.data.Dataset.from_tensor_slices(padded_lstm2_input).window(15, 1, 1)
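    # A fuller sketch of that conversion (an untested assumption; each window
    # must be re-batched into a dense tensor before it can be fed to predict):
    # dataset = tf.data.Dataset.from_tensor_slices(padded_lstm2_input)
    # dataset = dataset.window(15, shift=1, stride=1, drop_remainder=True)
    # dataset = dataset.flat_map(lambda w: w.batch(15))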

    # Run outputs through LSTM2
    print("Studying each word in light of its context...")
    for aspect in morphs:
        aspect.output3 = aspect.lstm2.predict(lstm2_ts)

    for aspect in morphs:
        for tensor in aspect.output3:
            try:
                aspect.predicted_tags3.append(aspect.tags[int(
                    np.argmax(tensor))])
            except IndexError:
                aspect.predicted_tags3.append('-')
            aspect.confidence3.append(np.amax(tensor))

    for i, token in enumerate(split_text):
        return_list.append(
            (token, pos.predicted_tags3[i] + person.predicted_tags3[i] +
             number.predicted_tags3[i] + tense.predicted_tags3[i] +
             mood.predicted_tags3[i] + voice.predicted_tags3[i] +
             gender.predicted_tags3[i] + case.predicted_tags3[i] +
             degree.predicted_tags3[i]))
    return tuple(return_list)
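
# Hypothetical usage sketch (the sample line is illustrative): tag() returns
# a tuple of (token, 9-character morphological tag) pairs, one position per
# aspect in the order pos/person/number/tense/mood/voice/gender/case/degree.
if __name__ == '__main__':
    for token, morph_tag in tag('μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος'):
        print(f'{token}\t{morph_tag}')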
Example #3
    if flags & Norm.GRAVE:
        s += "g"
    if flags & Norm.EXTRA:
        s += "x"
    if flags & Norm.ELISION:
        s += "l"
    if flags & Norm.MOVABLE:
        s += "m"

    if s == "":
        s = "."

    return s


normalise = Normaliser(config).normalise

for chapter_num in range(1, 20):
    input_filename = f"text/lgpsi.sent.{chapter_num:03d}.txt"
    output_filename = f"analysis/lgpsi.sent.{chapter_num:03d}.norm.txt"

    with open(input_filename, encoding="utf-8") as f, open(output_filename, "w", encoding="utf-8") as g:
        for line in f:
            line = line.strip()
            ref, *text = line.split()
            text_list = [f"{ref}.text", *text]
            norm = [f"{ref}.norm"]
            flags = [f"{ref}.flags"]
            for token in text:
                norm_token, norm_flags = normalise(token.strip(",.;·«»()!"))
                norm.append(norm_token)
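
# Aside: a minimal sketch of the (normalised_token, flags) convention used in
# the loop above, assuming the import path of the greek_normalisation package
# (flag members and values are version-dependent):
# from greek_normalisation.normalise import Normaliser, Norm
# normalise = Normaliser().normalise
# token, flags = normalise("δ'")              # an elided form
# print(token, bool(flags & Norm.ELISION))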