from big_phoney import BigPhoney


def findSyllables(jsonListTurns):
    dictionaryList = []
    phoney = BigPhoney()
    for elem in jsonListTurns:
        # Total syllables in the turn's text (elem[3]).
        syllableNum = sum(
            [phoney.count_syllables(word) for word in elem[3].split()])
        dictionaryList.append({
            "elem": elem,
            "syllableNum": syllableNum,
            # Syllables per unit time over the turn's duration (elem[1]..elem[2]).
            "syllRate": round(syllableNum / (abs(elem[2] - elem[1])), 2)
        })
    return dictionaryList
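# --- Hedged usage sketch (added, not from the original source). It assumes each turn is a
# tuple like (speaker, start_seconds, end_seconds, text); the sample turn below is made up
# purely for illustration, and the exact counts depend on BigPhoney's pronunciation lookup.
# example_turns = [("spk1", 0.0, 2.5, "hello there everyone")]
# for entry in findSyllables(example_turns):
#     print(entry["syllableNum"], entry["syllRate"])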
def main():
    path = os.getcwd() + "\\train\\"
    trainSummary = []
    trainFluency = []
    trainNRedundancy = []
    stops = set(stopwords.words("english"))

    ##############################################################
    # reading and pre-processing training data
    ##############################################################
    with open(path + 'Train_Data.csv') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in readCSV:
            #### first row holds headings and is not required ####
            if count == 0:
                count += 1
                continue
            #### skip rows with no data ####
            if row[0] == "":
                continue
            ###########################################################
            # replace \n and \t with spaces
            # remove extra spaces
            # convert to lowercase
            ###########################################################
            a = row[0]
            a = a.replace("\n", " ")
            a = a.replace("\t", " ")
            a = a.lower()
            a = re.sub(" +", " ", a)
            trainSummary.append(a)
            #### redundancy and fluency values are strings - convert to int or float ####
            if "." in str(row[1]):
                trainNRedundancy.append(float(row[1]))
            else:
                trainNRedundancy.append(int(row[1]))
            if "." in str(row[2]):
                trainFluency.append(float(row[2]))
            else:
                trainFluency.append(int(row[2]))

    ##############################################################
    # reading and pre-processing test data
    ##############################################################
    testSummary = []
    testFluency = []
    testNRedundancy = []
    with open(path + 'Test_Data.csv') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in readCSV:
            if count == 0:
                count += 1
                continue
            if row[0] == "":
                continue
            a = row[0]
            a = a.replace("\n", " ")
            a = a.replace("\t", " ")
            a = a.lower()
            a = re.sub(" +", " ", a)
            testSummary.append(a)
            ######################################################################
            # changing fluency and redundancy values from string to float or int
            ######################################################################
            if "." in str(row[1]):
                testNRedundancy.append(float(row[1]))
            else:
                testNRedundancy.append(int(row[1]))
            if "." in str(row[2]):
                testFluency.append(float(row[2]))
            else:
                testFluency.append(int(row[2]))

    #####################################################################
    # Total number of repetitive unigrams
    #####################################################################
    unigram_feature = []
    for sen in trainSummary:
        sen = word_tokenize(sen)
        unigram = {}
        words = []
        for j in sen:
            if j not in stops:
                words.append(j)
        for j in range(len(words)):
            t = "_".join(words[j:j + 1])
            unigram.setdefault(t, 0)
            unigram[t] += 1
        count = 0
        for k in unigram.values():
            if k > 1:
                count += 1
        unigram_feature.append(count)

    testunigram_feature = []
    for sen in testSummary:
        sen = word_tokenize(sen)
        unigram1 = {}
        words = []
        for j in sen:
            if j not in stops:
                words.append(j)
        for j in range(len(words)):
            t = "_".join(words[j:j + 1])
            unigram1.setdefault(t, 0)
            unigram1[t] += 1
        count1 = 0
        for k in unigram1.values():
            if k > 1:
                count1 += 1
        testunigram_feature.append(count1)

    ##################################################################
    # Total number of repetitive bigrams
    ##################################################################
    bigram_feature = []
    for sen in trainSummary:
        sen = word_tokenize(sen)
        bigram = {}
        words = []
        for j in sen:
            if j not in stops:
                words.append(j)
        for j in range(len(words) - 1):
            t = "_".join(words[j:j + 2])
            bigram.setdefault(t, 0)
            bigram[t] += 1
        count = 0
        for k in bigram.values():
            if k > 1:
                count += 1
        bigram_feature.append(count)

    testbigram_feature = []
    for sen in testSummary:
        sen = word_tokenize(sen)
        bigram1 = {}
        words1 = []
        for j in sen:
            if j not in stops:
                words1.append(j)
        for j in range(len(words1) - 1):
            t = "_".join(words1[j:j + 2])
            bigram1.setdefault(t, 0)
            bigram1[t] += 1
        count1 = 0
        for k in bigram1.values():
            if k > 1:
                count1 += 1
        testbigram_feature.append(count1)

    ########################################################################
    # Minimum Flesch reading-ease score
    ########################################################################
    flesch = []
    phoney = BigPhoney()
    for j in trainSummary:
        min_score = float('inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                count += phoney.count_syllables(z)
            score = readability.FleschReadingEase(count, len(words), 1)
            if score < min_score:
                min_score = score
        flesch.append(min_score)

    fleschT = []
    for j in testSummary:
        min_score = float('inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                count += phoney.count_syllables(z)
            score = readability.FleschReadingEase(count, len(words), 1)
            if score < min_score:
                min_score = score
        fleschT.append(min_score)

    ##############################################################################################
    # Classifier for the above three features - Q4.3
    # Linear Regression model
    ##############################################################################################
    unigramTrain = np.array(unigram_feature).reshape(len(unigram_feature), 1)
    unigramTest = np.array(testunigram_feature).reshape(len(testunigram_feature), 1)
    bigramTrain = np.array(bigram_feature).reshape(len(bigram_feature), 1)
    bigramTest = np.array(testbigram_feature).reshape(len(testbigram_feature), 1)
    fleschTrain = np.array(flesch).reshape(len(flesch), 1)
    fleschTest = np.array(fleschT).reshape(len(fleschT), 1)
    clf = LinearRegression()

    #######################################################################################
    # Reported values
    # scipy.stats.pearsonr(x, y) returns two values: the first is the correlation
    # coefficient in [-1, 1], where larger positive values indicate stronger positive
    # correlation. The second is the p-value, which roughly indicates the probability of
    # an uncorrelated system producing datasets that have a Pearson correlation at least
    # as extreme as the one computed from these datasets. The p-values are not entirely
    # reliable but are probably reasonable for datasets larger than 500 or so.
    # MSE - 0.22993706600411773
    # Pearson Correlation Coefficient - (0.3521331207011803, 3.163607855250048e-07)
    #######################################################################################
    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(np.hstack((unigramTest, bigramTest, fleschTest)))
    MSE = mean_squared_error(np.float64(np.array(testFluency)),
                             np.float64(np.array(y_pred)))
    print(MSE)
    pearSon = pearsonr(np.float64(np.array(testFluency)),
                       np.float64(np.array(y_pred)))
    print(pearSon)

    ###############################################################################################
    # Question 4.4 - feature 1
    # Maximum value of the SMOG index, a Simple Measure of Gobbledygook.
    # The SMOG index ranges from 1 to 240, with higher values for less readable or less
    # fluent text. It uses words with three or more syllables to determine the complexity
    # of a sentence, so it gives a measure of the fluency, readability, and
    # understandability of the summaries, and hence reduces the MSE and increases the
    # Pearson correlation coefficient.
    ###############################################################################################
    grade = []
    for j in trainSummary:
        max_score = float('-inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                c = phoney.count_syllables(z)
                if c >= 3:
                    count += 1
            score = readability.SMOGIndex(count, 1)
            if score > max_score:
                max_score = score
        grade.append(max_score)

    gradeT = []
    for j in testSummary:
        max_score = float('-inf')
        sen = nltk.tokenize.sent_tokenize(j)
        for k in sen:
            words = word_tokenize(k)
            count = 0
            for z in words:
                c = phoney.count_syllables(z)
                if c >= 3:
                    count += 1
            score = readability.SMOGIndex(count, 1)
            if score > max_score:
                max_score = score
        gradeT.append(max_score)

    ###############################################################################################
    # Question 4.4 - feature 2
    # LIX readability formula:
    #   LIX = A / B + (C * 100) / A, where
    #   A = number of words
    #   B = number of periods (defined by period, colon, or capital first letter)
    #   C = number of long words (more than 6 letters)
    # LIX uses words with more than six letters to determine the complexity of a sentence.
    # The higher the LIX score, the higher the complexity and the lower the fluency, so the
    # LIX score gives a fair measure of readability and decreases the MSE while increasing
    # the Pearson correlation coefficient.
    ###############################################################################################
    lix = []
    for j in trainSummary:
        sen = nltk.tokenize.sent_tokenize(j)
        words = word_tokenize(j)
        count = 0
        for z in words:
            if len(z) > 6:
                count += 1
        score = readability.LIX(len(words), count, len(sen))
        lix.append(score)

    lixT = []
    for j in testSummary:
        sen = nltk.tokenize.sent_tokenize(j)
        words = word_tokenize(j)
        count = 0
        for z in words:
            if len(z) > 6:
                count += 1
        score = readability.LIX(len(words), count, len(sen))
        lixT.append(score)

    ####################################################################################
    gradeTrain = np.array(grade).reshape(len(grade), 1)
    gradeTest = np.array(gradeT).reshape(len(gradeT), 1)
    lixTrain = np.array(lix).reshape(len(lix), 1)
    lixTest = np.array(lixT).reshape(len(lixT), 1)

    ##############################################################################################
    # Classifier 2 - same as above, using a Linear Regression model - Q4.4
    # Uses one additional feature: the maximum value of the SMOG index.
    # Reported values:
    # MSE - 0.22948742171238876
    # Pearson Correlation Coefficient - (0.3547109953705656, 2.5568576898239954e-07)
    ##############################################################################################
    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain, gradeTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(np.hstack((unigramTest, bigramTest, fleschTest, gradeTest)))
    MSE = mean_squared_error(np.float64(np.array(testFluency)),
                             np.float64(np.array(y_pred)))
    print(MSE)
    pearSon = pearsonr(np.float64(np.array(testFluency)),
                       np.float64(np.array(y_pred)))
    print(pearSon)

    #######################################################################################################
    # Classifier 3 - same as above, using a Linear Regression model - Q4.4
    # Uses one additional feature: the LIX readability formula.
    # Reported values:
    # MSE - 0.22856596250744246
    # Pearson Correlation Coefficient - (0.3545475618365523, 2.591754092042154e-07)
    #######################################################################################################
    clf.fit(np.hstack((unigramTrain, bigramTrain, fleschTrain, lixTrain)),
            np.array(trainFluency))
    y_pred = clf.predict(np.hstack((unigramTest, bigramTest, fleschTest, lixTest)))
    MSE = mean_squared_error(np.float64(np.array(testFluency)),
                             np.float64(np.array(y_pred)))
    print(MSE)
    pearSon = pearsonr(np.float64(np.array(testFluency)),
                       np.float64(np.array(y_pred)))
    print(pearSon)
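# --- Hedged sketch (added): the `readability` helper module called above is not shown in this
# snippet. Assuming it exposes FleschReadingEase(syllables, words, sentences),
# SMOGIndex(polysyllables, sentences) and LIX(words, long_words, periods) with the standard
# formulas, the functions would look roughly like this; the exact scaling used by the original
# module may differ. The commented-out entry point is also an assumption.
import math


def FleschReadingEase(num_syllables, num_words, num_sentences):
    # Classic Flesch reading-ease score: higher values mean easier text.
    return 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)


def SMOGIndex(num_polysyllables, num_sentences):
    # SMOG grade estimated from the count of words with three or more syllables.
    return 1.0430 * math.sqrt(num_polysyllables * (30 / num_sentences)) + 3.1291


def LIX(num_words, num_long_words, num_periods):
    # LIX = A / B + (C * 100) / A with A = words, B = periods, C = words longer than 6 letters.
    return num_words / num_periods + (num_long_words * 100) / num_words


# if __name__ == '__main__':
#     main()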
from big_phoney import BigPhoney
import os


def write_iambic(input_file, output_file):
    # Keep only the lines that have exactly ten syllables (iambic-pentameter length).
    for line in input_file.readlines():
        syllables = phoney.count_syllables(line)
        if syllables == 10:
            output_file.write(line)


# Initialization
phoney = BigPhoney()
output_file = open("iambic.txt", "a")

for filename in os.listdir(os.getcwd() + "/outputs"):
    print(filename)
    input_file = open("outputs/" + filename, "r+")
    write_iambic(input_file, output_file)
    input_file.close()

output_file.close()
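# --- Hedged note (added): the ten-syllable filter keeps pentameter-length lines. For example,
# "shall i compare thee to a summers day" should count as ten syllables, though the number
# reported for any particular line depends on BigPhoney's pronunciation lookup.
# print(phoney.count_syllables("shall i compare thee to a summers day"))  # expected: 10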
def seq2seq_preprocess(transcript_path: str, motion_path: str):
    # Returns (inputs, targets, word_time_distribution).
    transcripts = []
    intervals = []
    whole_sentence = []
    phoney = BigPhoney()
    prev_sentense = []   # words of the sentence currently being accumulated
    prev_interval = [0, 0]
    motions = np.loadtxt(motion_path, usecols=range(4), skiprows=17, dtype='float')
    phon_split = []

    with open(transcript_path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split()
            if not is_number(line[1]) or not is_number(line[2]):
                continue
            start_time = int(float(line[1]) * 100)
            end_time = int(float(line[2]) * 100)
            text = line[3:]
            if start_time > len(motions):
                continue
            # Merge this segment into the previous sentence if the gap is short
            # (<= 0.5 s) and the merged sentence stays at or under 50 tokens.
            if (float(line[1]) - prev_interval[1] / 100.0 <= 0.5) and (len(prev_sentense) + len(text) <= 50):
                prev_sentense += text
                prev_interval[1] = end_time
            elif len(text) <= 5:
                continue
            else:
                # Distribute the previous interval's duration across its words,
                # proportionally to the per-word weights from word_split_rule.
                temp_phon = []
                temp_split = []
                for i in prev_sentense:
                    temp_phon.append(phoney.phonize(i))
                splited_phon = word_split_rule(temp_phon)
                the_sum = math.fsum(splited_phon)
                time_distance = prev_interval[1] - prev_interval[0]
                for j in range(len(splited_phon)):
                    time_float = splited_phon[j] / the_sum * time_distance
                    if time_float - int(time_float) >= 0.5:
                        temp_split.append(int(time_float) + 1)
                    else:
                        temp_split.append(int(time_float))
                if len(temp_split) != 0:
                    # Put the rounding remainder on the last word so the per-word
                    # durations sum back to the interval length.
                    temp_split[-1] += int(abs(time_distance - math.fsum(temp_split)))
                phon_split.append(temp_split)
                transcripts.append(convert_to_ints(prev_sentense))
                intervals.append(prev_interval)
                prev_sentense = text
                prev_interval = [start_time, end_time]

    num_dof = 4
    targets = []
    for period in intervals[1:]:
        start_time = period[0]
        end_time = period[1]
        temp_motion = np.array(motions[start_time:end_time])
        if not temp_motion.any():
            print(motion_path)
            continue
        else:
            targets.append(temp_motion)

    # Drop the first (placeholder) entry so inputs line up with targets.
    inputs = np.array(transcripts[1:])
    word_time_distribution = np.array(phon_split[1:])
    if len(inputs) != len(targets):
        print('wow')
    return inputs, targets, word_time_distribution
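# --- Hedged sketch (added): is_number, word_split_rule and convert_to_ints are called above
# but not defined in this snippet. The stand-ins below are assumptions, not the original
# implementations; word_split_rule is assumed to weight each word by the number of phonemes
# BigPhoney returns for it.
def is_number(s):
    # True if the string parses as a float (used for the start/end time columns).
    try:
        float(s)
        return True
    except ValueError:
        return False


def word_split_rule(phonized_words):
    # One weight per word: simply the phoneme count of the BigPhoney spelling,
    # e.g. "HH AH0 L OW1" -> 4. The real rule may weight vowels or syllables differently.
    return [len(p.split()) for p in phonized_words]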
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 11:37:56 2020

@author: chenfish
"""

# Count the syllables per word, normalized by token counts.

from big_phoney import BigPhoney
import string
import pandas as pd
import os
import re

# initialize the syllable counter
phoney = BigPhoney()

#data_path = '/Users/chenfish/Desktop/Thesis/Project/data/mt_pe/dev/'
data_path = '/Users/yuwen/Desktop/Thesis/Project/data/ht_pe/all_no_split/mtht/'

for i in os.listdir(data_path):
    if i[-2:] == 'en':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
    else:
        print('Skip the file.', i)
        continue  # return to the top of the loop
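# --- Hedged sketch (added): the loop above only loads each pickle. The metric named in the
# header, syllables per word normalized by token counts, could be computed roughly as below;
# the column name 'text' is an assumption about the pickled dataframe, not taken from the source.
def syllables_per_token(sentence):
    tokens = sentence.split()
    if not tokens:
        return 0.0
    return phoney.count_syllables(sentence) / len(tokens)

# Hypothetical usage: data['syll_per_token'] = data['text'].apply(syllables_per_token)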
import random
import re

import nlpaug.augmenter.word as naw
from big_phoney import BigPhoney


def fit_lyrics(gen_lyrics, target_lyrics):
    print(gen_lyrics)
    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
    print('aug')
    phoney = BigPhoney()
    print('initialized phoney')

    # Counts the number of syllables in each line.
    def count_syls(text):
        schema = []
        for line in text:
            syls = phoney.count_syllables(line)
            schema.append(syls)
        return schema

    # Remove special characters and contractions from the line. This makes it easier to
    # augment lines that need augmentation.
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)
        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        phrase = re.sub('[!@#$?]', '', phrase)
        return phrase

    # Count the number of syllables in the generated and target texts.
    gen_schema = count_syls(gen_lyrics)
    target_schema = count_syls(target_lyrics)

    # Make the generated lyrics the same length as the target lyrics.
    target_len = len(target_schema)
    del gen_schema[target_len:]
    del gen_lyrics[target_len:]

    # Array for the new fitted lyrics.
    new_lyrics = []

    # Loop through each line and either keep it as-is, or augment/trim it until its
    # syllable count matches the corresponding target line.
    for num, line in enumerate(gen_lyrics):
        print("line in gen_lyrics:")
        print(line)

        # If the line is already the right length, add it to the new lyrics.
        if gen_schema[num] == target_schema[num]:
            new_lyrics.append(line)
            print("this line is good:")
            print(line)

        # If the line is not the right length, augment or delete.
        elif gen_schema[num] != target_schema[num]:
            line = decontracted(line)
            print("target syls:")
            print(target_schema[num])
            print("same line decontracted:")
            print(line)
            syls = gen_schema[num]

            # If we start with fewer syllables than we want, we augment.
            while syls < target_schema[num]:
                print("not enough syls")
                original_line = line
                line = aug.augment(line)
                line = re.sub(r'[^\w\s]', '', line)
                print(line)
                syls = phoney.count_syllables(line)
                # In case we overshoot (add too many syllables), revert and try again.
                if syls > target_schema[num]:
                    print("Oops we overshot")
                    print(line)
                    line = original_line
                    syls = phoney.count_syllables(line)

            new_line = line
            syls = gen_schema[num]
            words = line.split(" ")
            while syls > target_schema[num]:
                print("too many syls")
                original_words = words.copy()  # copy so the line can actually be restored below
                # Instead of deleting the last word, delete a word at random.
                words.pop(random.randrange(len(words)))
                new_line = ' '.join(words)
                syls = phoney.count_syllables(new_line)
                print("after removing one:")
                print(new_line)
                # In case too many syllables were deleted, restore and retry.
                if syls < target_schema[num]:
                    print("Oops, deleted too many")
                    print(line)
                    words = original_words
                    new_line = ' '.join(words)
                    syls = phoney.count_syllables(new_line)
            print(syls)
            print("the target was:")
            print(target_schema[num])
            new_lyrics.append(new_line)

    return new_lyrics
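# --- Hedged usage sketch (added): fit_lyrics reshapes generated lines so each one matches the
# syllable count of the corresponding target line. The two line lists below are hypothetical,
# made up only to illustrate the call.
# generated = ["i wandered through the city in the rain", "and thought of you"]
# targets = ["shall i compare thee to a summers day", "thou art more lovely"]
# fitted = fit_lyrics(generated, targets)
# for line in fitted:
#     print(line)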
import tensorflow as tf
import numpy as np
import os
import sys
import time
import re
import random

from big_phoney import BigPhoney

phoney = BigPhoney()

pre_text = open("lyrics.txt", 'r')
text = pre_text.read().lower()

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])

vocab = sorted(list(set(text)))
print('total chars:', len(vocab))
char_index = dict((c, i) for i, c in enumerate(vocab))
index_char = dict((i, c) for i, c in enumerate(vocab))

# load model
model = tf.keras.models.load_model('model.h5')
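# --- Hedged sketch (added): the snippet ends right after loading the model, so this generation
# loop is an assumption about how the character-level model is used. It assumes the model takes
# a one-hot (1, maxlen, len(vocab)) window and predicts a distribution over the next character,
# the classic Keras char-RNN setup; the original sampling code may differ.
def sample(preds, temperature=1.0):
    # Re-weight the predicted distribution by temperature and draw one character index.
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.argmax(np.random.multinomial(1, preds, 1))


def generate(seed, length=200, temperature=0.8):
    # Repeatedly feed the last maxlen characters back in and append the sampled next character.
    generated = seed
    for _ in range(length):
        x = np.zeros((1, maxlen, len(vocab)))
        for t, char in enumerate(generated[-maxlen:]):
            x[0, t, char_index[char]] = 1.0
        preds = model.predict(x, verbose=0)[0]
        generated += index_char[sample(preds, temperature)]
    return generated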
from big_phoney import BigPhoney
import random
import pickle

bp = BigPhoney()

#word_file = open("saved_objects/words/fict_words.txt", "r")
#word_file = open("saved_objects/words/lost_words_phrontristery.txt", "r")
word_file = open("saved_objects/words/rare_words_phrontristery.txt", "r")
#word_file = open("saved_objects/words/one_syll_wonders.txt", "r")
lines = word_file.readlines()
word_file.close()

syll_file = open("saved_objects/ob_syll_dict.txt", "a")
one_syll_file = open("saved_objects/words/two_syll_wonders.txt", "a")

type_to_pos = {
    'v': "VB",
    'adj': "JJ",
    'n': "NN",
    'npl': "NNS",
    'vz': "VBZ",
    'vd': "VBD",
    'ving': "VBG",
    'vbp': "VBP",
    "adv": "RB"
}

postag_dict = pickle.load(open("saved_objects/ob_postag_dict.p", "rb"))
ob_pos_to_words = postag_dict[0]