Example #1
import csv
import os
import re

import nltk
import numpy as np
from big_phoney import BigPhoney
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# `readability` is assumed to be a local helper module exposing
# FleschReadingEase, SMOGIndex and LIX with the signatures used below.
import readability
def findSyllables(jsonListTurns):
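    # Each turn is assumed to be laid out as [_, start, end, text, ...]: the
    # transcript text at index 3 and the start/end times at indices 1 and 2
    # (this layout is inferred from the code below). For every turn, count
    # its syllables and compute the syllable rate over the turn's duration.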
    dictionaryList = []
    phoney = BigPhoney()
    for elem in jsonListTurns:
        syllableNum = sum(
            [phoney.count_syllables(word) for word in elem[3].split()])
        dictionaryList.append({
            "elem": elem,
            "syllableNum": syllableNum,
            "syllRate": round(syllableNum / abs(elem[2] - elem[1]), 2),
        })
    return dictionaryList


def main():
    path = os.path.join(os.getcwd(), "train")
    trainSummary = []
    trainFluency = []
    trainNRedundancy = []
    stops = set(stopwords.words("english"))
    ##############################################################
    # read and preprocess the training data
    ##############################################################
    with open(os.path.join(path, 'Train_Data.csv')) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in readCSV:
            # skip the header row
            if count == 0:
                count += 1
                continue
            # skip rows with no data
            if row[0] == "":
                continue
            ###########################################################
            # replace \n and \t with spaces
            # remove extra spaces
            # convert to lowercase
            ###########################################################
            a = row[0]
            a = a.replace("\n", " ")
            a = a.replace("\t", " ")
            a = a.lower()
            a = re.sub(" +", " ", a)
            trainSummary.append(a)
            # redundancy and fluency values are strings; convert to int or float
            if "." in row[1]:
                trainNRedundancy.append(float(row[1]))
            else:
                trainNRedundancy.append(int(row[1]))

            if "." in row[2]:
                trainFluency.append(float(row[2]))
            else:
                trainFluency.append(int(row[2]))
    ##############################################################
    # read and preprocess the test data
    ##############################################################

    testSummary = []
    testFluency = []
    testNRedundancy = []
    with open(os.path.join(path, 'Test_Data.csv')) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in readCSV:
            if count == 0:
                count += 1
                continue
            if row[0] == "":
                continue
            a = row[0]
            a = a.replace("\n", " ")
            a = a.replace("\t", " ")
            a = a.lower()
            a = re.sub(" +", " ", a)
            testSummary.append(a)
            ######################################################################
            # changing fluency and redundancy values from string to float or int
            ######################################################################
            if "." in row[1]:
                testNRedundancy.append(float(row[1]))
            else:
                testNRedundancy.append(int(row[1]))

            if "." in row[2]:
                testFluency.append(float(row[2]))
            else:
                testFluency.append(int(row[2]))

    #####################################################################
    # Total number of repetitive unigrams
    #####################################################################
    unigram_feature = []
    for sen in trainSummary:
        sen = word_tokenize(sen)
        unigram = {}
        words = []

        for j in sen:
            if j not in stops:
                words.append(j)
        for t in words:
            unigram.setdefault(t, 0)
            unigram[t] += 1
        count = sum(1 for k in unigram.values() if k > 1)
        unigram_feature.append(count)

    testunigram_feature = []
    for sen in testSummary:
        sen = word_tokenize(sen)
        unigram1 = {}
        words = []
        for j in sen:
            if j not in stops:
                words.append(j)
        for t in words:
            unigram1.setdefault(t, 0)
            unigram1[t] += 1
        count1 = sum(1 for k in unigram1.values() if k > 1)
        testunigram_feature.append(count1)

    ##################################################################
    # Total number of repetitive bigrams
    ###################################################################
    bigram_feature = []
    for sen in trainSummary:
        sen = word_tokenize(sen)
        bigram = {}
        words = []
        for j in sen:
            if j not in stops:
                words.append(j)
        for j in range(len(words) - 1):
            t = "_".join(words[j:j + 2])
            bigram.setdefault(t, 0)
            bigram[t] += 1
        count = sum(1 for k in bigram.values() if k > 1)
        bigram_feature.append(count)

    testbigram_feature = []
    for sen in testSummary:
        sen = word_tokenize(sen)
        bigram1 = {}
        words1 = []
        for j in sen:
            if j not in stops:
                words1.append(j)
        for j in range(len(words1) - 1):
            t = "_".join(words1[j:j + 2])
            bigram1.setdefault(t, 0)
            bigram1[t] += 1
        count1 = sum(1 for k in bigram1.values() if k > 1)
        testbigram_feature.append(count1)

    ########################################################################
    # Minimum Flesch reading-ease score:
    ########################################################################
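    # The FleschReadingEase helper is not shown here; the standard Flesch
    # formula it is assumed to implement (called with one sentence at a time) is:
    #   206.835 - 1.015 * (total words / total sentences)
    #           - 84.6 * (total syllables / total words)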

    flesch = []
    phoney = BigPhoney()
    for j in trainSummary:
        min_score = float('inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                count += phoney.count_syllables(z)
            score = readability.FleschReadingEase(count, len(words), 1)
            if score < min_score:
                min_score = score
        flesch.append(min_score)

    fleschT = []

    for j in testSummary:
        min_score = float('inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                count += phoney.count_syllables(z)
            score = readability.FleschReadingEase(count, len(words), 1)
            if score < min_score:
                min_score = score
        fleschT.append(min_score)
    ##############################################################################################
    # Classifier for the above three features (Q4.3)
    # Linear regression model
    ##############################################################################################
    unigramTrain = np.array(unigram_feature).reshape(-1, 1)
    unigramTest = np.array(testunigram_feature).reshape(-1, 1)
    bigramTrain = np.array(bigram_feature).reshape(-1, 1)
    bigramTest = np.array(testbigram_feature).reshape(-1, 1)
    fleschTrain = np.array(flesch).reshape(-1, 1)
    fleschTest = np.array(fleschT).reshape(-1, 1)
    clf = LinearRegression()
    #######################################################################################
    # Reported values
    # scipy.stats.pearsonr(x, y) returns two values. The first is the correlation
    # coefficient, between -1 and 1, with positive values indicating positive correlation.
    # The second is the p-value, which roughly indicates the probability of an
    # uncorrelated system producing datasets with a Pearson correlation at least as
    # extreme as the one computed here. The p-values are not entirely reliable, but are
    # probably reasonable for datasets larger than about 500.
    # MSE: 0.22993706600411773
    # Pearson correlation coefficient: (0.3521331207011803, 3.163607855250048e-07)
    #######################################################################################
    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(np.hstack((unigramTest, bigramTest, fleschTest)))
    MSE = mean_squared_error(np.asarray(testFluency, dtype=np.float64),
                             np.asarray(y_pred, dtype=np.float64))
    print(MSE)
    pearSon = pearsonr(np.asarray(testFluency, dtype=np.float64),
                       np.asarray(y_pred, dtype=np.float64))
    print(pearSon)

    ###############################################################################################
    # Question 4.4, feature 1:
    # Maximum value of the SMOG index, a Simple Measure of Gobbledygook.
    # The SMOG index ranges from 1 to 240, with higher values for less readable (less
    # fluent) text. It uses the number of words with three or more syllables to estimate
    # sentence complexity. This gives a measure of the fluency, readability, and
    # understandability of the summaries, and hence reduces the MSE and increases the
    # Pearson correlation coefficient.
    ###############################################################################################
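    # The SMOGIndex helper itself is not shown here; the standard SMOG formula
    # it is assumed to approximate (called with one sentence at a time) is:
    #   grade = 1.0430 * sqrt(polysyllable_count * (30 / sentence_count)) + 3.1291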

    grade = []
    for j in trainSummary:
        max_score = float('-inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                if phoney.count_syllables(z) >= 3:
                    count += 1
            score = readability.SMOGIndex(count, 1)
            if score > max_score:
                max_score = score
        grade.append(max_score)

    gradeT = []
    for j in testSummary:
        max_score = float('-inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                if phoney.count_syllables(z) >= 3:
                    count += 1
            score = readability.SMOGIndex(count, 1)
            if score > max_score:
                max_score = score
        gradeT.append(max_score)


    ###############################################################################################
    # Question 4.4, feature 2:
    # LIX readability formula:
    #   LIX = A/B + (C * 100)/A, where
    #     A = number of words
    #     B = number of periods (defined by period, colon, or capital first letter)
    #     C = number of long words (more than 6 letters)
    # LIX uses words with more than six letters to estimate sentence complexity: the
    # higher the LIX score, the higher the complexity and the lower the fluency. The LIX
    # score therefore gives a fair measure of readability, and decreases the MSE and
    # increases the Pearson correlation coefficient.
    ###############################################################################################
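    # (Assumed mapping onto the LIX helper calls below: A = len(words), C = the
    # long-word count, B = len(sen); the readability module itself is not shown.)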
    lix = []
    for j in trainSummary:
        sen = nltk.tokenize.sent_tokenize(j)
        words = word_tokenize(j)
        count = 0
        for z in words:
            if len(z) > 6:
                count += 1
        score = readability.LIX(len(words), count, len(sen))
        lix.append(score)

    lixT = []

    for j in testSummary:
        sen = nltk.tokenize.sent_tokenize(j)
        words = word_tokenize(j)
        count = 0
        for z in words:
            if len(z) > 6:
                count += 1
        score = readability.LIX(len(words), count, len(sen))
        lixT.append(score)

    ####################################################################################

    gradeTrain = np.array(grade).reshape(-1, 1)
    gradeTest = np.array(gradeT).reshape(-1, 1)
    lixTrain = np.array(lix).reshape(-1, 1)
    lixTest = np.array(lixT).reshape(-1, 1)
    ##############################################################################################
    # Classifier 2: the same linear regression model as above (Q4.4), with the
    # maximum SMOG index as one additional feature.
    # Reported values:
    # MSE: 0.22948742171238876
    # Pearson correlation coefficient: (0.3547109953705656, 2.5568576898239954e-07)
    ##############################################################################################

    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain, gradeTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(
        np.hstack((unigramTest, bigramTest, fleschTest, gradeTest)))
    MSE = mean_squared_error(np.asarray(testFluency, dtype=np.float64),
                             np.asarray(y_pred, dtype=np.float64))
    print(MSE)
    pearSon = pearsonr(np.asarray(testFluency, dtype=np.float64),
                       np.asarray(y_pred, dtype=np.float64))
    print(pearSon)

    #######################################################################################################
    # Classifier 3: the same linear regression model as above (Q4.4), with the
    # LIX readability score as one additional feature.
    # Reported values:
    # MSE: 0.22856596250744246
    # Pearson correlation coefficient: (0.3545475618365523, 2.591754092042154e-07)
    #######################################################################################################
    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain, lixTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(
        np.hstack((unigramTest, bigramTest, fleschTest, lixTest)))
    MSE = mean_squared_error(np.asarray(testFluency, dtype=np.float64),
                             np.asarray(y_pred, dtype=np.float64))
    print(MSE)
    pearSon = pearsonr(np.asarray(testFluency, dtype=np.float64),
                       np.asarray(y_pred, dtype=np.float64))
    print(pearSon)
Example #3
from big_phoney import BigPhoney
import os

def write_iambic(input_file, output_file):
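    # Write out only the lines whose total syllable count is exactly ten,
    # i.e. candidate iambic pentameter lines.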
    for line in input_file.readlines():
        syllables = phoney.count_syllables(line)
        if syllables == 10:
            output_file.write(line)

# Initialization
phoney = BigPhoney()
output_file = open("iambic.txt", "a")

for filename in os.listdir(os.getcwd() + "/outputs"):
    print(filename)
    with open("outputs/" + filename, "r") as input_file:
        write_iambic(input_file, output_file)

output_file.close()
Example #4
import math

import numpy as np
from big_phoney import BigPhoney

# is_number, word_split_rule and convert_to_ints are helper functions
# defined elsewhere in the original project and are not shown here.
def seq2seq_preprocess(transcript_path: str,
                       motion_path: str) -> (np.ndarray, np.ndarray, np.ndarray):
    transcripts = []
    intervals = []
    whole_sentence = []
    phoney = BigPhoney()

    prev_sentense = []  # word list; a string here would break prev_sentense += text below
    prev_interval = [0, 0]

    motions = np.loadtxt(motion_path,
                         usecols=range(4),
                         skiprows=17,
                         dtype='float')

    phon_split = []
    with open(transcript_path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split()
            if not is_number(line[1]) or not is_number(line[2]):
                continue
            start_time = int(float(line[1]) * 100)
            end_time = int(float(line[2]) * 100)
            text = line[3:]
            if start_time > len(motions):
                continue

            if (float(line[1]) - prev_interval[1] / 100.0 <=
                    0.5) and (len(prev_sentense) + len(text) <= 50):
                prev_sentense += text
                prev_interval[1] = end_time

            elif len(text) <= 5:
                continue

            else:
                temp_phon = []
                temp_split = []
                for i in prev_sentense:
                    temp_phon.append(phoney.phonize(i))
                splited_phon = word_split_rule(temp_phon)
                the_sum = math.fsum(splited_phon)
                time_distance = prev_interval[1] - prev_interval[0]

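                # Distribute the interval's duration across the words in
                # proportion to their phoneme counts, rounding half up.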
                for j in range(len(splited_phon)):
                    time_float = splited_phon[j] / the_sum * time_distance
                    if (time_float - int(time_float) >= 0.5):
                        temp_split.append(int(time_float) + 1)
                    else:
                        temp_split.append(int(time_float))

                if (len(temp_split)) != 0:
                    temp_split[-1] += int(
                        abs(time_distance - math.fsum(temp_split)))

                phon_split.append(temp_split)

                transcripts.append(convert_to_ints(prev_sentense))
                intervals.append(prev_interval)
                prev_sentense = text
                prev_interval = [start_time, end_time]

    num_dof = 4
    targets = []
    for period in intervals[1:]:
        start_time = period[0]
        end_time = period[1]

        temp_motion = np.array(motions[start_time:end_time])

        # skip intervals that contain no recorded motion
        if not temp_motion.any():
            print(motion_path)
            continue
        targets.append(temp_motion)

    inputs = np.array(transcripts[1:])
    word_time_distribution = np.array(phon_split[1:])
    if len(inputs) != len(targets):
        print('wow')  # warn: inputs and targets are out of sync
    return inputs, targets, word_time_distribution
Example #5
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 11:37:56 2020

@author: chenfish
"""

# count the syllables per word, normalized by token count
from big_phoney import BigPhoney
import string
import pandas as pd
import os
import re

#initialize the syllable counter
phoney = BigPhoney()

#data_path = '/Users/chenfish/Desktop/Thesis/Project/data/mt_pe/dev/'

data_path = '/Users/yuwen/Desktop/Thesis/Project/data/ht_pe/all_no_split/mtht/'

for i in os.listdir(data_path):

    if i.endswith('en'):

        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)

    else:
        print('Skip the file.', i)
        continue  # not an English ('en') file; move on to the next one
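
    # The original example ends here. A minimal sketch of the normalized
    # count described at the top, assuming (hypothetically) that each pickle
    # holds a DataFrame with the English text in a column named 'text':
    tokens = sum(len(s.split()) for s in data['text'])
    syllables = sum(phoney.count_syllables(s) for s in data['text'])
    print(i, 'syllables per token:', syllables / tokens)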
Example #6
import random
import re

import nlpaug.augmenter.word as naw
from big_phoney import BigPhoney
def fit_lyrics(gen_lyrics, target_lyrics):

    print(gen_lyrics)

    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                    action="insert")
    print('aug')

    phoney = BigPhoney()
    print('initialized phoney')

    # Counts the number of syllables in each line
    def count_syls(text):

        schema = []

        for line in text:
            syls = phoney.count_syllables(line)
            schema.append(syls)
            #print(syls,line)

        return schema

    # Remove special characters and contractions from the line. This will make it easier to
    # augment lines that need augmentation.
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        phrase = re.sub('[!@#$?]', '', phrase)
        return phrase

    # Count the number of syllables in the generated and target texts.
    gen_schema = count_syls(gen_lyrics)
    target_schema = count_syls(target_lyrics)
    #    print(target_schema)

    # make generated lyrics same length as target lyrics
    target_len = len(target_schema)
    del gen_schema[target_len:]
    del gen_lyrics[target_len:]

    # initialize array for the new fitted lyrics
    new_lyrics = []

    # loop through each line and either find existing line to place into the current position,
    # or augment the current line.

    for num, line in enumerate(gen_lyrics):
        print("line in gen_lyrics:")
        print(line)

        # if the line is already the right length, add it to the new lyrics.
        if gen_schema[num] == target_schema[num]:
            new_lyrics.append(line)
            print("this line is good:")
            print(line)
        # if the line is not the right length, augment or delete
        else:
            line = decontracted(line)
            print("target syls:")
            print(target_schema[num])
            print("same line decontracted:")
            print(line)
            syls = gen_schema[num]
            # If we start with fewer syllables than we want, we augment.
            while syls < target_schema[num]:
                print("not enough syls")
                original_line = line
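                # Note: recent nlpaug releases return a list from augment();
                # this snippet assumes an older version that returns a string.
                # The revert-and-retry below relies on the augmentation being
                # stochastic, so repeated attempts can hit the exact count.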
                line = aug.augment(line)
                line = re.sub(r'[^\w\s]', '', line)
                print(line)
                syls = phoney.count_syllables(line)
                #In case we overshoot (add too many syllables)
                if syls > target_schema[num]:
                    print("Oops we overshot")
                    print(line)
                    line = original_line
                    syls = phoney.count_syllables(line)
            new_line = line
            syls = gen_schema[num]
            words = line.split(" ")
            while syls > target_schema[num]:
                print("too many syls")
                original_words = words
                #instead of deleting the last word, try deleting a word randomly
                #del words[-1]
                words.pop(random.randrange(len(words)))
                new_line = ' '.join(words)
                syls = phoney.count_syllables(new_line)
                print("after removing one:")
                print(new_line)
                #In case too many syllables are deleted
                if syls < target_schema[num]:
                    print("Oops, deleted too many")
                    print(line)
                    words = original_words
                    new_line = ' '.join(words)
                    syls = phoney.count_syllables(new_line)
                    print(syls)
                    print("the target was:")
                    print(target_schema[num])

            new_lyrics.append(new_line)

    return new_lyrics
Example #7
import tensorflow as tf
import numpy as np
import os
import sys
import time
import re
import random
from big_phoney import BigPhoney

phoney = BigPhoney()

with open("lyrics.txt", 'r') as pre_text:
    text = pre_text.read().lower()

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])

vocab = sorted(set(text))

print('total chars:', len(vocab))
char_index = dict((c, i) for i, c in enumerate(vocab))
index_char = dict((i, c) for i, c in enumerate(vocab))

# load model
model = tf.keras.models.load_model('model.h5')
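
# The snippet stops after loading the model. A minimal sketch of the usual
# next step, sampling one character at a time (an assumption about how this
# model is used: it takes a one-hot (1, maxlen, len(vocab)) window and
# returns a probability distribution over the next character):
def sample_next_char(seed, temperature=1.0):
    # one-hot encode the last maxlen characters of the seed
    x = np.zeros((1, maxlen, len(vocab)))
    for t, char in enumerate(seed[-maxlen:]):
        x[0, t, char_index[char]] = 1.0
    # rescale the predicted distribution by the temperature and re-normalize
    preds = model.predict(x, verbose=0)[0]
    preds = np.log(preds + 1e-8) / temperature
    probs = np.exp(preds) / np.sum(np.exp(preds))
    return index_char[np.random.choice(len(vocab), p=probs)]

# Example: extend a seed string by one character.
# print(sample_next_char(text[:maxlen], temperature=0.5))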
Example #8
from big_phoney import BigPhoney
import random
import pickle
bp = BigPhoney()

#word_file = open("saved_objects/words/fict_words.txt", "r")
#word_file = open("saved_objects/words/lost_words_phrontristery.txt", "r")
word_file = open("saved_objects/words/rare_words_phrontristery.txt", "r")
#word_file = open("saved_objects/words/one_syll_wonders.txt", "r")
lines = word_file.readlines()
word_file.close()

syll_file = open("saved_objects/ob_syll_dict.txt", "a")

one_syll_file = open("saved_objects/words/two_syll_wonders.txt", "a")

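# Map the word list's part-of-speech type tags to Penn Treebank POS tags.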
type_to_pos = {
    'v': "VB",
    'adj': "JJ",
    'n': "NN",
    'npl': "NNS",
    'vz': "VBZ",
    'vd': "VBD",
    'ving': "VBG",
    'vbp': "VBP",
    "adv": "RB"
}

postag_dict = pickle.load(open("saved_objects/ob_postag_dict.p", "rb"))

ob_pos_to_words = postag_dict[0]