Example #1
    def phone_synthesize_solution(self, text):
        """
        Synthesize speech from text by concatenating phonemes
        selected from the database
        """
        # Convert all words to lower case
        words = [word.lower() for word in text.split()]
        phones = []
        for word in words:
            try:
                # Use cmudict to get the phonemic representation
                phones.extend(cmudict.dict()[word][0])
            except (KeyError, IndexError):
                # If word not found in dictionary, use g2p instead
                g2p = G2p()
                phones.extend(g2p(word))
        print(phones)

        # Initialize an empty audio segment
        result = AudioSegment.empty()
        # Concatenate phonemes selected from PHONEMES_DIR
        for phone in phones:
            # Ignore accent marker
            phone = phone[0:-1] if phone[-1].isdigit() else phone
            # Look up phoneme wav file using phone_map
            sound_label = phone_map[phone]
            sound_path = PHONEMES_DIR + str(sound_label) + ".wav"
            audio = AudioSegment.from_wav(sound_path)
            result += audio
        # Write the synthesized .wav file to DST_DIR
        result.export(DST_DIR + "gen.wav", format="wav")
        play(result)
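Note: the lookup-and-fallback above rebuilds cmudict.dict() for every word. A minimal standalone sketch of the same idea (assuming G2p comes from the g2p_en package, as the example implies) caches both objects once:

import cmudict
from g2p_en import G2p  # assumption: the package the example relies on

_cmu = cmudict.dict()
_g2p = G2p()

def word_to_phones(word):
    """Return the first CMUdict pronunciation, falling back to g2p for unknown words."""
    prons = _cmu.get(word.lower())
    if prons:
        return prons[0]
    return _g2p(word)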
Example #2
def percentage_not_in_CMU(filename, verbose):
    total_words = 0
    n_not_in_CMU = 0
    cmu_dict = cmudict.dict()
    total_songs = 0
    if verbose:
        print('The following words are not in the CMU dictionary:')
    with open(filename, 'r') as input_file:
        data = json.load(input_file)
        for song in data:
            if verbose:
                print(f"SONG: {song['title']}")
            total_songs += 1
            for line in song['lyrics']:
                last_word = line.strip().split(' ')[-1]
                last_word = last_word.lower()
                # Remove punctuation.
                last_word = re.sub(r"[^\w\d'\s]+", '', last_word)
                if last_word == '':
                    continue
                pron = cmu_dict.get(last_word)
                if not pron:
                    if verbose:
                        print(last_word)
                    n_not_in_CMU += 1
                total_words += 1
    print(f'Analyzed {total_songs} songs.')
    return n_not_in_CMU / total_words
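A hypothetical call (the JSON path is a placeholder; the file is assumed to hold a list of songs with 'title' and 'lyrics' keys, as the code above expects):

ratio = percentage_not_in_CMU('songs.json', verbose=False)
print(f'{ratio:.1%} of line-final words are missing from the CMU dictionary')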
Example #3
def test_estimate():
    EXPECTED_ACCURACY = .75
    hits = []
    misses = []

    d = cmudict.dict()
    for word in d:
        phones = d[word][0]
        cmudict_syllables = 0
        for phone in phones:
            if re.match(r"\w*[012]$", phone):
                cmudict_syllables += 1
        estimated_syllables = syllables.estimate(word)
        if cmudict_syllables == estimated_syllables:
            hits.append(word)
        else:
            misses.append(word)

    hit = len(hits)
    miss = len(misses)
    total = hit + miss
    ACCURACY = hit / total
    if (ACCURACY < EXPECTED_ACCURACY):
        raise AssertionError(
            'syllables.estimate(): Expected accuracy of {0}, got {1}.'.format(
                EXPECTED_ACCURACY, ACCURACY))
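The test counts one syllable per phone that ends in a stress digit. The same idea as a small standalone helper (using only the first pronunciation, an arbitrary choice when a word has several):

import cmudict

_cmu = cmudict.dict()

def cmudict_syllables(word):
    # In ARPAbet, vowel phones carry a trailing stress digit (0, 1 or 2).
    phones = _cmu[word.lower()][0]
    return sum(1 for p in phones if p[-1].isdigit())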
Example #4
def create_CMU_encoding_dictionary():
    """
    Create an encoding CMU dictionary.

    Returns:
        dict: CMU encoding Dictionary
    """
    return cmudict.dict()
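For reference, the mapping returned by cmudict.dict() goes from a lowercase word to a list of pronunciations, each a list of ARPAbet phones. A quick check (expected output shown for a recent cmudict release, so exact values may differ):

import cmudict

cmu = cmudict.dict()
print(cmu['hello'])      # [['HH', 'AH0', 'L', 'OW1']]
print(len(cmu['read']))  # 2 -- present- and past-tense pronunciations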
Example #5
	def __init__(self, fileName = "lyrics.csv"):
		self.removePunct = re.compile(":|\[|\(|chorus|verse")
		self.actualFile = pd.read_csv(fileName, encoding="ISO-8859-1", engine='python')
		self.cleanedLyrics = list()

		self.rhymingDict = cmudict.dict()
		self.keysInDict = sorted(self.rhymingDict)
		self.markovString = ""
Example #6
def test_dict():
    EXPECTED_SIZE = 125997
    d = cmudict.dict()
    SIZE = len(d)
    if (EXPECTED_SIZE != SIZE):
        raise AssertionError(
            'cmudict.dict(): Expected {0} keys, got {1}.'.format(
                EXPECTED_SIZE, SIZE))
Example #7
    def __init__(self,
                 data_file,
                 character_level=None,
                 phoneme_level=None,
                 vocabulary=None,
                 transform=None):
        self.data_file = data_file
        self.data = joblib.load(open(self.data_file, 'rb'))
        self.character_level = character_level
        self.phoneme_level = phoneme_level
        self.transcription_processor = lambda words: words

        if self.character_level:
            characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
            characters += [' ']
            character_vocab = Vocabulary()
            for character in characters:
                character_vocab.add_word(character)
            self.vocabulary = character_vocab
            self.transcription_processor = self._character_level_transcription_processor
        elif self.phoneme_level:
            cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
            cmu_phones += [' ']
            phones_vocab = Vocabulary(custom_unk_word=' ')
            for phone in cmu_phones:
                phones_vocab.add_word(phone)
            self.vocabulary = phones_vocab
            self.phones_dict = cmudict.dict()
            self.transcription_processor = self._phone_level_transcription_processor
        elif vocabulary is None:
            data_file_dir = os.path.dirname(self.data_file)
            data_file_prefix = os.path.splitext(self.data_file)[0]
            pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
            pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
            if not os.path.isfile(pickle_file_path):
                dataset_info = self.build_vocabulary_from_dataset(self.data)
                pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
            else:
                dataset_info = pickle.load(open(pickle_file_path, 'rb'))
            self.vocabulary = dataset_info['vocabulary']
        else:
            self.vocabulary = vocabulary
        self.transform = transform
        self.max_transcription_length = max([
            len(transcription)
            for transcription in self.data['transcription_tokens']
        ])
        self.max_input_length = max([
            spectrogram.shape[1]
            for spectrogram in self.data['audio_spectrograms']
        ])
Example #8
def getPseudoKeyword(target_phone, already_present):
    a = cmudict.dict()
    b = cmudict.words()
    found = False
    for word in b:
        for lst in a[word]:
            for phone in lst:
                if target_phone == phone and already_present.get(word) is None:
                    already_present[word] = 1
                    return word

                if re.search(target_phone, phone) and len(target_phone) != 1 and already_present.get(word) is None:
                    already_present[word] = 1
                    return word
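A hypothetical call to the search above (the target phone is an ARPAbet symbol; the function returns None when no unused word matches, and it reloads the dictionary on every call, which is slow):

seen = {}
keyword = getPseudoKeyword('AE1', seen)
print(keyword)  # first dictionary word whose pronunciation contains AE1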
Example #9
    def __init__(self):
        self.phones_for_word = cmudict.dict()
        self.id_to_word = list(self.phones_for_word.keys())
        self.word_to_id = {word: i for i, word in enumerate(self.id_to_word)}
        self.id_to_phoneme = list(map(lambda x: x[0], cmudict.phones()))
        self.phoneme_to_id = {
            phoneme: i
            for i, phoneme in enumerate(self.id_to_phoneme)
        }
        self.root_node = {}
        for word, phoneme_sequences in self.phones_for_word.items():
            word_id = self.word_to_id[word]
            for phoneme_sequence in phoneme_sequences:
                phoneme_sequence = map(self.remove_phoneme_numerals,
                                       phoneme_sequence)
                phoneme_sequence = list(phoneme_sequence)

                current_node = self.root_node

                for phoneme in phoneme_sequence[:-1]:
                    phoneme_id = self.phoneme_to_id[phoneme]
                    if not phoneme_id in current_node:
                        current_node[phoneme_id] = {
                            PhonemeTrie.WORDS_AT_PHONE: [word_id]
                        }
                    else:
                        current_node[phoneme_id][
                            PhonemeTrie.WORDS_AT_PHONE].append(word_id)
                    current_node = current_node[phoneme_id]

                terminating_phoneme = phoneme_sequence[-1]
                terminating_phoneme_id = self.phoneme_to_id[
                    terminating_phoneme]

                if not terminating_phoneme_id in current_node:
                    current_node[terminating_phoneme_id] = {
                        PhonemeTrie.WORDS_AT_PHONE: [word_id]
                    }
                else:
                    current_node[terminating_phoneme_id][
                        PhonemeTrie.WORDS_AT_PHONE].append(word_id)

                if not PhonemeTrie.TERMINATING_WORDS in current_node[
                        terminating_phoneme_id]:
                    current_node[terminating_phoneme_id][
                        PhonemeTrie.TERMINATING_WORDS] = []

                current_node[terminating_phoneme_id][
                    PhonemeTrie.TERMINATING_WORDS].append(word_id)
Example #10
def main():
    print("Start time: {0}".format(datetime.now().time()))
    cmu = cmudict.dict()
    all_transcribed_lines = dict()

    load_and_transcribe_poetry(all_transcribed_lines, cmu)
    print("Poetry Transcribed: {0}".format(datetime.now().time()))
    load_and_transcribe_lyrics(all_transcribed_lines, cmu)
    print("Lyrics Transcribed: {0}".format(datetime.now().time()))

    with open('transcribed_data.csv', mode='w', encoding='utf-8', newline='') as out_file:
        out_writer = csv.writer(out_file, delimiter=',', quotechar='"')
        for line, val in all_transcribed_lines.items():
            out_writer.writerow([line, val])
    print("Written to file: {0}".format(datetime.now().time()))
Example #11
def test_dict_comments():
    DICT = cmudict.dict()
    EXPECTED_DICT = {
        "d'artagnan": [['D', 'AH0', 'R', 'T', 'AE1', 'NG', 'Y', 'AH0', 'N']],
        "danglar": [['D', 'AH0', 'NG', 'L', 'AA1', 'R']],
        "danglars": [['D', 'AH0', 'NG', 'L', 'AA1', 'R', 'Z']],
        "gdp": [['G', 'IY1', 'D', 'IY1', 'P', 'IY1']],
        "hiv": [['EY1', 'CH', 'AY1', 'V', 'IY1']],
        "porthos": [['P', 'AO0', 'R', 'T', 'AO1', 'S']],
        "spieth": [['S', 'P', 'IY1', 'TH'], ['S', 'P', 'AY1', 'AH0', 'TH']]
    }
    for TEST_WORD in EXPECTED_DICT:
        EXPECTED_PRONUNCIATION = EXPECTED_DICT[TEST_WORD]
        PRONUNCIATION = DICT[TEST_WORD]
        if EXPECTED_PRONUNCIATION != PRONUNCIATION:
            raise AssertionError(
                'cmudict.dict(): Expected "{0}", got "{1}".'.format(
                    EXPECTED_PRONUNCIATION, PRONUNCIATION))
Example #12
def percentage_of_numbers_to_all_OOD(filename, verbose):
    n_not_in_CMU = 0
    n_only_numbers = 0
    n_contain_numbers = 0
    only_numbers = []
    contain_numbers = []
    cmu_dict = cmudict.dict()
    total_songs = 0
    if verbose:
        print('The following words are not in the CMU dictionary and ARE NOT NUMBERS:')
    with open(filename, 'r') as input_file:
        data = json.load(input_file)
        for song in data:
            total_songs += 1
            for line in song['lyrics']:
                last_word = line.strip().split(' ')[-1]
                last_word = last_word.lower()
                # Remove punctuation.
                last_word = re.sub(r"[^\w\d'\s]+", '', last_word)
                if last_word == '':
                    continue
                pron = cmu_dict.get(last_word)
                if not pron:
                    if last_word.isdecimal():
                        n_only_numbers += 1
                        only_numbers.append(last_word)
                    elif re.search(r"\d", last_word):
                        n_contain_numbers += 1
                        contain_numbers.append(last_word)
                    else:
                        if verbose:
                            print(last_word)
                    n_not_in_CMU += 1
    if verbose:
        print("Following words contain a number:")
        for n in contain_numbers:
            print(n)
        print("Following words are numbers:")
        for n in only_numbers:
            print(n)
    print(f'Analyzed {total_songs} songs.')
    return (n_contain_numbers +
            n_only_numbers) / n_not_in_CMU, n_only_numbers / n_not_in_CMU
Example #13
 def __init__(self):
     self.filename = "LCP/lcpr_i.sav"
     self.cmudict = cmudict.dict()
     self.wnlp = WonderlicNLP()
     self.embeddings_index = {}
     self.wiki_top10 = [
         word[0].split()[0]
         for word in pd.read_csv("LCP/wiki_top10.csv").values
     ][:10001]
     self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
     self.infersent_model_params = {
         'bsize': 64,
         'word_emb_dim': 300,
         'enc_lstm_dim': 2048,
         'pool_type': 'max',
         'dpout_model': 0.0,
         'version': 1
     }
     self.infersent = InferSent(self.infersent_model_params)
     self.model = RandomForestRegressor(n_estimators=100)
Example #14
    def __init__(self):
        self.cmudict = cmudict.dict()

        self.title_bank = None
        self.folder = os.path.dirname(os.path.realpath(__file__))

        # Try reading content for the title_bank
        
        try:
            with open(os.path.join(self.folder, "data", "titles.pickle"), "rb") as f:
                self.title_bank = pickle.load(f)
        
        except FileNotFoundError:
            from title_scrape import download_gutenberg, gutenberg_preprocess
        
            download_gutenberg()
            gutenberg_preprocess()
        
            with open(os.path.join(self.folder, "data", "titles.pickle"), "rb") as f:
                self.title_bank = pickle.load(f)
Example #15
class transliterate:
    try:
        # Build an English dictionary with pronunciations
        import cmudict
        english_dict = cmudict.dict()
    except ModuleNotFoundError:
        print("Need the CMU pronouncing dictionary. Try 'pip install cmudict'")
        raise

    def __init__(self, arpa_kata_csv_file_path):
        # ARPAbet syllable-to-katakana pronunciation dictionary
        import pandas as pd

        arpa_kata_df = pd.read_csv(arpa_kata_csv_file_path)
        self.arpa_kata_dict = {}
        for ak in arpa_kata_df.values:
            c1 = ak[0].replace("'", '').strip()
            v1 = ak[1].strip().split(':')
            self.arpa_kata_dict[c1] = v1
            if not c1[-1].isnumeric(): continue
            for st in ['0', '1', '2']:
                if c1[:-1] + st in self.arpa_kata_dict: continue
                self.arpa_kata_dict[c1[:-1] + st] = v1

    # Word-break algorithm based on the English dictionary
    @staticmethod
    def breakings(w):
        def wordbreak(s, i):
            if not s[i:]: return []
            p = ''
            a = []
            for c in s[i:-1]:
                p += c
                i += 1
                if not transliterate.english_dict.get(p) or len(transliterate.english_dict.get(p)) == 0: continue
                a += [[p] + arr for arr in wordbreak(s, i) if arr]
            p += s[-1]
            if not transliterate.english_dict.get(p) or len(transliterate.english_dict.get(p)) == 0: return a
            a += [[p]]
            return a
        a = wordbreak(w, 0)
        return a

    # Combines nested choice lists into strings, e.g. [['1'], ['2', '3'], ['5']] => ['125', '135']
    @staticmethod
    def merger(l):
        if not l: return []
        if all([type(ll) is not list for ll in l]):
            return l
        if len(l) == 1: return transliterate.merger(l[0])
        b = transliterate.merger(l[0:int(len(l)/2)])
        c = transliterate.merger(l[int(len(l)/2):])
        if not c: return b
        if not b: return c
        return [w1 + w2 for w1 in b for w2 in c]

    def con_pronounciation_katakana(self, arpa_list):
        b_1 = []
        for c_b_i in range(len(arpa_list)):
            c_b = arpa_list[c_b_i]
            b_2 = []
            ps = transliterate.english_dict.get(c_b)
            for p in ps:
                i = 0
                k_w = []
                while i<len(p):
                    arpa_key = p[i]
                    if p[i][-1].isnumeric():
                        i += 1
                    elif i+1<len(p) and str(p[i+1][-1]).isnumeric():
                        arpa_key +=  ' ' + p[i+1]
                        i += 2
                    elif c_b_i != len(arpa_list)-1 and 'ッ' != self.arpa_kata_dict[p[i]][0] and i == len(p)-1:
                        k_w.append(['ッ'])
                        i += 1
                    else:
                        i += 1
                    k_w.append(self.arpa_kata_dict[arpa_key])
                b_2 += transliterate.merger(k_w)
            b_1.append(b_2)
        return b_1

    def english_to_katakana(self, english_words):
        english_kata_dict = {}
        ii = 0
        for w in english_words:
            english_kata_dict[w] = set()
            ii += 1
            if ii % 50 == 0: print(ii)
            w_splits = [[w]]  if transliterate.english_dict.get(w) and len(transliterate.english_dict.get(w)) != 0 else transliterate.breakings(w)
            max_split = 0
            max_split_arr = []
            for f in w_splits:
                ff = [w for w in f if len(w) > 2]
                if len(''.join(ff)) > max_split:
                    max_split = len(''.join(ff))
                    max_split_arr = [ff]
                elif len(''.join(ff)) == max_split:
                    max_split_arr.append(f)
            for b in max_split_arr:
                b_1 = self.con_pronounciation_katakana(b)
                english_kata_dict[w] = english_kata_dict[w].union(transliterate.merger(b_1))
        return english_kata_dict
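A quick sanity check of the merger helper against the comment above it (merger is a staticmethod, so no CSV path or instance is needed):

print(transliterate.merger([['1'], ['2', '3'], ['5']]))  # ['125', '135']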
Example #16
            prev_vowel = True
        else:
            prev_vowel = False

    # y at the end of the word adds one syllable to the count
    if word[-1] == 'y' and not prev_vowel:
        count += 1

    # e at the end of the word is usually silent
    if word[-1] == 'e' and not prev_vowel:
        count -= 1

    return count


rhyming_dict = cmudict.dict()


# My function for determining if two words rhyme
def is_rhyme(a, b):
    vowels = ['a', 'e', 'i', 'o', 'u']
    test_a = rhyming_dict[a]
    test_b = rhyming_dict[b]
    last_a = 0
    last_b = 0

    # This handles the case where either word is missing from the CMU pronouncing dictionary.
    # In that case I say they rhyme if the last syllable of each word is the same.
    if len(test_a) == 0 or len(test_b) == 0:
        for i in range(len(a) - 1, -1, -1):
            if a[i] in vowels:
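The fragment above is cut off before the dictionary-based branch. A common way to finish the idea (a sketch, not the original author's code) is to compare the phones from the last stressed vowel onward:

import cmudict

_cmu = cmudict.dict()

def rhyme_part(pron):
    # Phones from the last stressed vowel (stress mark 1 or 2) to the end.
    for i in range(len(pron) - 1, -1, -1):
        if pron[i][-1] in '12':
            return tuple(pron[i:])
    return tuple(pron)

def cmu_rhyme(a, b):
    # Two words rhyme here if any of their pronunciations share a rhyme part.
    parts_a = {rhyme_part(p) for p in _cmu.get(a.lower(), [])}
    parts_b = {rhyme_part(p) for p in _cmu.get(b.lower(), [])}
    return bool(parts_a & parts_b)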
Example #17
import cmudict
from g2p.data import DoubleBets

a2a_dataset = [[
    DoubleBets.alphabet.tseq2iseq(word),
    DoubleBets.arpabet.tseq2iseq(pron)
] for word, prons in cmudict.dict().items() for pron in prons]
Example #18
import cmudict

vowels = ["A", "E", "I", "O", "U"]
the_dict = cmudict.dict()


def get_syllables(word):
    """Get the syllables from looking up the CMU dict"""
    try:
        phonemes = the_dict[word]
        count = 0
        for p in phonemes[0]:
            if p[0] in vowels:
                count += 1
        return count
    except Exception:
        count = _syllables(word)
        return count


def _syllables(word):
    """Otherwise, if the CMU dict does not contain the entries, manually check
    the syllables, this is not completely correct, however,
    since there are no true standards beyond the standard dictionary,this is
    our assumption that the general factoring rules apply simply on several
    rules"""
    vowels = 'aeiouAEIOU'
    count = 0
    word = word.rstrip()
    if len(word) <= 3:
        """if the word is small, if the word ends with y and starts with a vowel
Example #19
 def __init__(self, df_lyrics):
     self.df_lyrics = df_lyrics
     self.syllables = cmudict.dict()
Example #20
import pandas as pd
import re
import numpy as np
import cmudict
import textdistance
import sklearn
import pickle
import nltk
# Load vocab
# file = open('vocab', 'rb')
# vocab = pickle.load(file)
unigrams_df = pd.read_csv('unigram_freq.csv', index_col='word')
pron_dict = cmudict.dict()


# The class for Word
class Word:
    # Constructor
    def __init__(self, token):
        self.token = re.sub(r'\W+', '', str(token)).lower()

    # Basic features
    def length(self):
        if not self.token:
            return 0
        return len(self.token)

    def frequency(self):
        try:
            return np.log(unigrams_df.loc[self.token]['count'])
        except:
Example #21
class CANLTK:
    stops = stopwords.words('english')
    arpabet = cmudict.dict()
    twitter_tokenizer = TweetTokenizer()

    @staticmethod
    def n_lower_chars(string):
        return sum(map(str.islower, string))

    @staticmethod
    def n_upper_chars(string):
        return sum(map(str.isupper, string))

    @staticmethod
    def n_isspace_chars(string):
        return sum(map(str.isspace, string))

    @staticmethod
    def n_vowels_chars(string):
        return sum(map(string.lower().count, "aeiou"))

    @staticmethod
    def n_count_and_print_alphabets(CONFIG, string):
        CONFIG['feature_count']
        d = {}
        uppercaseAlphabetArray = [
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
        ]
        lowercaseAlphabetArray = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
        ]
        #     print(len(uppercaseAlphabetArray))
        #     print(len(lowercaseAlphabetArray))
        for i in range(26):
            combo = uppercaseAlphabetArray[i] + lowercaseAlphabetArray[i]
            CONFIG['feature_count'] += 1
            d["f" + (str(CONFIG['feature_count'])) + "-" + combo] = len(
                re.findall('[' + combo + ']', string))
        return d

    @staticmethod
    def n_special_chars(string):
        return sum(map(string.lower().count, ".,?!<>@#$%&()[]:;\'\""))

    @staticmethod
    def n_count_and_print_special_chars(CONFIG, string):
        CONFIG['feature_count']
        d = {}
        i = 0
        specialChars = [
            ".", ",", "—", "–", "’", "‘", "?", "!", "<", ">", "@", "#", "$",
            "%", "&", "(", ")", "[", "]", ":", ";", "\'", "\""
        ]
        specialCharsNamed = [
            "fullstop", "comma", "em-dash", "en-dash",
            "right-single-quotation-mark", "left-single-quotation-mark",
            "question-mark", "exclamation", "less-than-sign",
            "greate-than-sign", "at-sign", "hash", "dollar", "percentage",
            "ampersand", "open-brackets", "closing-brackets",
            "open-sq-brackets", "close-sq-brackets", "colan", "semi-colan",
            "single-quotes", "double-quotes"
        ]
        for c in specialChars:
            CONFIG['feature_count'] += 1
            d["f" + str(CONFIG['feature_count']) + "-p-" +
              specialCharsNamed[i]] = len(re.findall('[' + c + ']', string))
            i += 1
        return d

    @staticmethod
    def extract_emojis(str):
        return ''.join(c for c in str if c in emoji.UNICODE_EMOJI)

    @staticmethod
    def n_long_words(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) > 6, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def n_words_le_3(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) <= 3, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def n_words_le_2(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) <= 2, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def avg_words(wordTokens):
        count = 0
        for token in wordTokens:
            count += len(token)
        if len(wordTokens) != 0:
            return count / len(wordTokens)
        else:
            return 0

    @staticmethod
    def n_lowercase_sentences(sentTokens):
        count = 0
        for token in sentTokens:
            if token[0].islower():
                count += 1
        return count

    @staticmethod
    def n_uppercase_sentences(sentTokens):
        count = 0
        for token in sentTokens:
            if token[0].isupper():
                count += 1
        return count

    @staticmethod
    def n_each_emoticons(string):
        d = {}
        # pylint: disable=unused-variable
        for key, value in emot.EMOTICONS.items():
            d[key] = string.count(key)
        return d

    @staticmethod
    def n_each_emojis(string):
        d = {}
        # pylint: disable=unused-variable
        for key, value in emoji.UNICODE_EMOJI.items():
            d[key] = string.count(key)
        return d

    @staticmethod
    def n_misspelled_words(sentTokens):
        spell = SpellChecker()
        # A single proper punctuation mark isn't regarded as misspelled; multiple stray punctuation marks are treated as misspelled.
        #     misspelled = spell.unknown(['somessthing', 'is', 'hapenning', 'here', '!', '👍', ',', ':)'])
        misspelled = spell.unknown(sentTokens)
        return len(misspelled)

    @staticmethod
    def n_total_punctuations(text):
        count = 0
        for p in string.punctuation:
            count += text.count(p)
        return count

    @staticmethod
    def print_n_each_punctuation(CONFIG, text):
        CONFIG['feature_count']
        # pylint: disable=unused-variable
        d = {}
        count = 0
        i = 0
        specialCharsNamed = [
            "exclamation", "double-quotes", "hash", "dollar", "percentage",
            "ampersand", "single-quotes", "open-brackets", "closing-brackets",
            "asterix", "plus", "comma", "dash", "fullstop", "slash", "colan",
            "semi-colan", "less-than-sign", "equal-sign", "greater-than-sign",
            "question-mark", "at-sign", "open-sq-brackets", "back-slash",
            "close-sq-brackets", "caret", "underscore", "grave-accent",
            "open-curly-brace", "vertical-bar", "close-curly-brace", "tilde"
        ]
        for p in string.punctuation:
            CONFIG['feature_count'] += 1
            d["f" + str(CONFIG['feature_count']) + "-p-" +
              specialCharsNamed[i]] = text.count(p)
            i += 1
        return d

    @staticmethod
    def avg_syllables(sentTokens):
        #     print(sentTokens)
        # Emoticons like O.o skew the data, and tokens like Okay👍👍🏿 are counted as a single word, so syllables aren't counted there; the input needs to be cleansed first.
        validWords = 0
        count = 0
        for token in sentTokens:
            token = token.lower()
            if token in CANLTK.arpabet:
                #             print(token)
                #             print(arpabet[token])
                # Count vowel phones (those ending in a stress digit), i.e. syllables.
                count += sum(1 for p in CANLTK.arpabet[token][0] if p[-1].isdigit())
                validWords += 1
        if validWords != 0:
            return count / validWords
        else:
            return 0

    # https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules

    @staticmethod
    def n_punctuation(string):
        # need to cleanse emojis
        count = 0
        for c in string:
            if "P" in unicodedata.category(c):
                #             print(c, unicodedata.category(c))
                count += 1
        return count

    @staticmethod
    def prune_emojis_emoticons(string):
        # At times this doesn't work, especially when an emoticon comes after an unregistered emoji-like token such as O.o
        if "location" in emot.emoji(string):
            for loc in reversed(emot.emoji(string)['location']):
                string = string[0:loc[0]] + string[loc[1] + 1::]
        #     print(emot.emoticons(string))
        if "location" in emot.emoticons(string):
            for loc in reversed(emot.emoticons(string)['location']):
                string = string[0:loc[0]] + string[loc[1] + 1::]
        return string

    @staticmethod
    def n_function_words(string):
        count = 0
        for f in string:
            if f in CANLTK.stops:
                count += 1
        return count

    @staticmethod
    def n_context_words(string):
        count = 0
        for f in string:
            if f not in CANLTK.stops:
                count += 1
        return count

    # def count__each_most_common_words(string):
    #     d = {}
    #     freq_dist = FreqDist(wordTokens)
    #     for li in freq_dist.most_common(10):
    #         li

    @staticmethod
    def n_total_emoticons(string):
        if any("value" in d for d in emot.emoticons(string)):
            return len(emot.emoticons(string)["value"])
        else:
            return 0

    @staticmethod
    def n_total_emojis(string):
        if any("value" in d for d in emot.emoji(string)):
            return len(emot.emoji(string)["value"])
        else:
            return 0

    @staticmethod
    def prune_punctuations_special_characters(text_arr):
        # Iterate over a copy so removing items doesn't skip elements.
        for text in list(text_arr):
            if text in string.punctuation:
                text_arr.remove(text)
        return text_arr

    @staticmethod
    def prune_function_words(string_arr):
        items_to_remove = []
        for string in string_arr:
            string_lowercase = string.lower()
            if string_lowercase in CANLTK.stops:
                items_to_remove.append(string)
        for rm in items_to_remove:
            string_arr.remove(rm)
        return string_arr

    @staticmethod
    def n_words(string, string_arr):
        count = 0
        # for f in string:
        if string in string_arr:
            count += 1
        return count

    @staticmethod
    def replace_keys(string):
        return string.replace("!", "exclamation").replace(
            "\"", "double-quotes"
        ).replace("$", "dollar").replace("%", "percentage").replace(
            "&", "ampersand").replace("'", "single-quotes").replace(
                "(", "open-brackets").replace(")", "closing-brackets").replace(
                    "*", "asterix").replace("+", "plus").replace(
                        ",", "comma").replace("-", "dash").replace(
                            ".", "fullstop").replace("/", "slash").replace(
                                ":",
                                "colan").replace(";", "semi-colan").replace(
                                    "<", "less-than-sign"
                                ).replace(">", "greater-than-sign").replace(
                                    "=", "equal-sign"
                                ).replace("?", "question-mark").replace(
                                    "@", "at-sign"
                                ).replace("[", "open-sq-brackets").replace(
                                    "\\", "back-slash"
                                ).replace("]", "close-sq-brackets").replace(
                                    "^", "caret"
                                ).replace("_", "underscore").replace(
                                    "`", "grave-accent").replace(
                                        "{", "open-curly-brace").replace(
                                            "|", "vertical-bar").replace(
                                                "}", "close-curly-brace"
                                            ).replace("~", "tilde").replace(
                                                "#", "hash").replace(
                                                    "•",
                                                    "bullet-sign").replace(
                                                        " ", "-")

    @staticmethod
    def get_top_from_dictionary(dictionary, min_prune_value=1):
        items_to_remove = []
        for key, value in dictionary.items():
            if value < min_prune_value:
                items_to_remove.append(key)
        for item in items_to_remove:
            del dictionary[item]
        return dictionary

    @staticmethod
    def misspelled_word_list(CONFIG, word_tokens):
        misspelled_freq_dist = {}
        spell = SpellChecker()
        # Words like I'll and I'm are flagged as misspelled; that's fine, since it reflects how that person spells them.
        # misspelled = spell.unknown(['somessthing', 'is', 'hapenning', 'here', '!', '👍', ',', ':)', 'somessthing'])
        # misspelled = spell.unknown(['hamzas', 'emails', "I'll", "I'm"])
        misspelled = spell.unknown(word_tokens)
        word_tokens_lowercase = []
        for w in word_tokens:
            word_tokens_lowercase.append(w.lower())
        for w in misspelled:
            misspelled_freq_dist[w] = word_tokens_lowercase.count(w)
        # sorted(misspelled_freq_dist.items(), key=lambda x: x[1], reverse=True)
        misspelled_freq_dist = CANLTK.get_top_from_dictionary(
            misspelled_freq_dist, CONFIG['MISSPELLED_MIN_PRUNE_VALUE'])
        return misspelled_freq_dist

    # Old method
    # line = f2.readline().rstrip('\n')
    # while line:
    #     generate_values(line)
    #     line = f2.readline().rstrip('\n')
    # f2.close()

    # def preprocessing_fn(inputs):
    #     modified_inputs = {}
    #     for key, value in inputs.items():
    #         modified_inputs[key] = tft.scale_to_0_1(value)
    #     return modified_inputs

    # def normalize_data(data):
    #       # Ignore the warnings
    #     fieldnames = data[0].keys()
    #     datatypes = {}
    #     for fn in fieldnames:
    #         datatypes[fn] = tf.io.FixedLenFeature([], tf.float32)
    #     raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(datatypes))
    #     with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    #         transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
    #             (data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
    #                 preprocessing_fn))

    #     transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable
    #     return transformed_data

    @staticmethod
    def lexical_ttr(tokens):
        return lex_div.ttr(tokens)

    @staticmethod
    def lexical_msttr(tokens):
        return lex_div.msttr(tokens)

    @staticmethod
    def lexical_mattr(tokens):
        return lex_div.mattr(tokens)

    @staticmethod
    def lexical_hdd(tokens):
        return lex_div.hdd(tokens)

    @staticmethod
    def lexical_mtld(tokens):
        return lex_div.mtld(tokens)

    @staticmethod
    def n_word_extensions(tokens):
        count = 0
        for token in tokens:
            try:
                corrected_word = pytypo.correct(token)
                if not (corrected_word == token):
                    count = count + 1
            except Exception:
                pass
        return count

    @staticmethod
    def most_common_word_extensions(CONFIG, tokens):
        fd = nltk.FreqDist(tokens)
        extend_word_list_dict = []
        for k, v in fd.items():
            try:
                corrected_word = pytypo.correct(k)
                if not (corrected_word == k):
                    extend_word_list_dict.append({"word": k, "count": v})
            except Exception:
                pass
        extend_word_list_dict_sorted = sorted(extend_word_list_dict,
                                              key=lambda k: k['count'],
                                              reverse=True)
        top_extend_word_list_dict = extend_word_list_dict_sorted[:int(
            CONFIG['NO_OF_MOST_FREQ_WORD_EXTENSIONS'])]
        return top_extend_word_list_dict

    @staticmethod
    def count_each_word_extension(extend_word_list_dict, tokens):
        each_extend_word_list_dict = []
        for extended_word_d in extend_word_list_dict:
            count = 0
            if extended_word_d["word"] in tokens:
                count = count + 1
            each_extend_word_list_dict.append({
                "word": extended_word_d["word"],
                "count": count
            })
        return each_extend_word_list_dict

    @staticmethod
    def most_common_bigrams(CONFIG, tokens):
        #Create your bigrams
        bgs = nltk.bigrams(tokens)

        #compute frequency distribution for all the bigrams in the text
        bigrams_freq_dist = nltk.FreqDist(bgs)
        top_list = list(
            filter(lambda x: x[1] >= int(CONFIG['BIGRAMS_MIN_PRUNE_VALUE']),
                   bigrams_freq_dist.items()))
        converted_list = []
        for i in top_list:
            converted_list.append({"ngrams": list(i[0]), "count": i[1]})
        return converted_list

    @staticmethod
    def most_common_trigrams(CONFIG, tokens):
        #Create your trigrams
        tgs = nltk.trigrams(tokens)

        #compute frequency distribution for all the trigrams in the text
        trigrams_freq_dist = nltk.FreqDist(tgs)
        top_list = list(
            filter(lambda x: x[1] >= int(CONFIG['TRIGRAMS_MIN_PRUNE_VALUE']),
                   trigrams_freq_dist.items()))
        converted_list = []
        for i in top_list:
            converted_list.append({"ngrams": list(i[0]), "count": i[1]})
        return converted_list

    @staticmethod
    def count_each_ngrams(tokens, freq_dist_dict):
        ngs = nltk.bigrams(tokens)

        ngrams_freq_dist = nltk.FreqDist(ngs)
        ngram_freq_dist_count = []
        for li in freq_dist_dict:
            freq_count = 0
            for bigram, count in ngrams_freq_dist.items():
                if bigram == li:
                    freq_count = count
                    break
            ngram_freq_dist_count.append({
                "ngrams": li["ngrams"],
                "count": freq_count
            })
        return ngram_freq_dist_count

    @staticmethod
    def n_grammar_errors(string):
        tool = language_check.LanguageTool('en-US')
        matches = tool.check(string)
        count = 0
        for m in matches:
            if m.category == "Grammar":
                count = count + 1
        return count

    @staticmethod
    def normalize_data(data):
        raw = []
        for i in data:
            r2 = []
            for k, v in i.items():
                r2.append(v)
            raw.append(r2)

        # r_normalized = preprocessing.normalize(raw, norm='l2',axis=0)
        min_max_scaler = preprocessing.MinMaxScaler()
        r_normalized = min_max_scaler.fit_transform(raw)

        normalized_list = []
        for val in r_normalized:
            normalized_dict = {}
            heading_names = list(data[0])
            for index, inner_val in enumerate(val):
                normalized_dict[heading_names[index]] = inner_val
            normalized_list.append(normalized_dict)
        return normalized_list


# main()
# write code to generate negative results for analysis using same params

# ADDITIONAL REFERENCES
# https://pc.net/emoticons/
# TODO do CBOW for additional accuracy
# TODO use LIWC API or maybe develop some of those features http://www.utpsyc.org/TAT/LIWCTATresults.php https://liwc.wpengine.com/compare-dictionaries/
# TODO https://github.com/LSYS/lexicalrichness https://sp1718.github.io/nltk.pdf
# TODO https://github.com/adeshpande3/LSTM-Sentiment-Analysis
#

# TODO Setiment Analysis
# https://github.com/axelnine/Sentiment-Analysis
# https://github.com/shubhi-sareen/Sentiment-Analysis
# https://github.com/ian-nai/Simple-Sentiment-Analysis
# https://github.com/changhuixu/sentiment-analysis-using-python
# https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis
# https://github.com/MohamedAfham/Twitter-Sentiment-Analysis-Supervised-Learning

# TODO Lexical richness
# https://pypi.org/project/lexicalrichness/

# TODO Typo and word extensions
# https://pypi.org/project/pytypo/
# https://stackoverflow.com/questions/20170022/elongated-word-check-in-sentence

# TODO British or American English
# https://datascience.stackexchange.com/questions/23236/tokenize-text-with-both-american-and-english-words
# https://stackoverflow.com/questions/42329766/python-nlp-british-english-vs-american-english
# https://github.com/hyperreality/American-British-English-Translator
Example #22
# -*- coding: utf-8 -*-
"""


@author: Dan
"""

import nltk
import sys
import cmudict
from string import punctuation
import json
cmudict = cmudict.dict()

path = (r'Desktop\python code')
file = 'txt cmu.txt'

f = open(r'C:\Users\Dan\Desktop\python code\zip\train.txt')
text = f.read()

with open('missing_words.json') as f:
    missing_words = json.load(f)


def count_syllables(words):
    words = words.replace('-', ' ')
    words = words.lower().split()
    num_sylls = 0
    for word in words:
        word = word.strip(punctuation)
        if word.endswith("'s") or word.endswith("’s"):
Example #23
def get_syllables(word):
    d = cmudict.dict()
    return [
        len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
    ]
Example #24
def lookup_word(word_s):
    return cmudict.dict().get(word_s)  # standard dict access
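Because of .get, lookup_word returns None for out-of-vocabulary input. A quick usage check (word choices are arbitrary):

print(lookup_word('cat'))     # [['K', 'AE1', 'T']]
print(lookup_word('qwzrty'))  # None -- not in the dictionary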
Example #25
	Input:
		input_phonemes  List of the phonemes that make up the word

	Output:
		syllable_count  Number of syllables for that word
"""
def calculate_number_of_syllables(input_phonemes):
	phonemes_string = "".join(input_phonemes[0])
	syllable_count = 0
	for letter in phonemes_string:
		if (letter.isdigit()):
			syllable_count += 1
	return syllable_count

if __name__ == "__main__":
	pronunciation_dictionary = cmudict.dict()

	word_count = 0
	output_dictionary = {}
	start_time = time.time()
	for key, value in pronunciation_dictionary.items():
		# Check if it has valid letters (Ignore words with apostrophe's in them)
		is_valid = True
		for index in key:
			if (not index.islower()):
				is_valid = False
				break
		
		number_of_syllables = calculate_number_of_syllables(value)

		# Ignore unwanted words
Example #26
# -*- coding: utf-8 -*-
"""


@author: Dan
"""

import nltk
import sys
import json
import cmudict 
from string import punctuation
cmu = cmudict.dict()

path = (r'Desktop\python code')
file = 'txt cmu.txt'

f = open(r'C:\Users\Dan\Desktop\python code\zip\train.txt')
text = f.read()


def main():
    haiku = load_haiku(text)
    exceptions = cmudict_missing(haiku)
    missing_words_dict = make_exceptions_dict(exceptions)
    save_exceptions(missing_words_dict)


def load_haiku(file):
    haiku = set(in_file.read().replace('-', ' ').split())
Example #27
import cmudict
from functools import reduce
import re
import numpy as np

from lib.features import features, feature_weights

# We store the cmudict as an object in memory so that we don't have to reload
# it every single time we call word_to_phonemes.
cmudict_cache = cmudict.dict()

# Maps diphthongs to their monophthong parts. Also "ER" for no reason.
diphthong_pairs = {
    "AW": ["AE", "UH"],
    "AY": ["AE", "IH"],
    "ER": ["R"],
    "EY": ["E", "IH"],
    "OW": ["O", "UH"],
    "OY": ["AO", "IH"],
}


def expand_phoneme(phoneme):
    """
    Expands a phoneme into potentially multiple phonemes. Used to map diphthongs
    to their monophthongs in sequence.
    """
    return diphthong_pairs.get(phoneme, [phoneme])


def word_to_phonemes(word):
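The listing is cut off at word_to_phonemes. A plausible way to finish it (a sketch that reuses the cached dictionary and expand_phoneme above, not the original code):

def word_to_phonemes(word):
    # Look up the first pronunciation, drop stress digits, then expand
    # diphthongs into monophthongs via expand_phoneme.
    prons = cmudict_cache.get(word.lower())
    if not prons:
        return []
    stripped = [p.rstrip('012') for p in prons[0]]
    return [m for p in stripped for m in expand_phoneme(p)]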
Example #28
from text.cmuToKorean import CMUToKorean
import cmudict  # dup in this dir

ret = CMUToKorean.convert('seventeen',
                          " ".join(cmudict.dict()["seventeen"][0]))
print(ret)
Example #29
#!/usr/bin/env python

from utils import rhyme_pattern, slant_distance, n_syllables

if __name__ == "__main__":
    """
    Create neo4j database to house the words and patterns.
    """
    import os
    import cmudict
    from neo4j import GraphDatabase
    from itertools import product

    D = cmudict.dict()

    USERNAME = os.environ["NEO4J_USERNAME"]
    PASSWORD = os.environ["NEO4J_PASSWORD"]
    URI = os.environ["NEO4J_URI"]

    driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

    with open('words.txt', 'r') as fh:
        words = fh.read().strip().split('\n')

    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n;")
        for word in words:
            for pron_list in D[word]:
                pron_str = ' '.join(pron_list)
                rp_pron_str = rhyme_pattern(pron_str)
Example #30
 def __init__(self, lang='en'):
     if lang == 'en':
         self.cmu_dict = cmudict.dict()