Python Hyphenator.syllables示例，hyphen.Hyphenator.syllables Python示例

示例#1

0

显示文件

文件： hyphenation.py 项目： visualspace/django-vspace-utils

def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (``&shy;``) between the syllables of long words.

    ``arg`` is an optional ``"<language>[,<minimal length>]"`` string. When it
    is missing, the language reported by ``get_language()`` is used and the
    minimal length stays at 6. Only purely alphabetic words that are longer
    than the minimal length are hyphenated; everything else is passed through
    untouched. The re-joined text is returned through ``mark_safe``.

    NOTE(review): ``autoescape`` is accepted but never consulted, and the
    input is marked safe without being escaped -- confirm that callers only
    feed trusted text to this filter.
    """
    # Default minimal length
    minlen = 6

    if arg:
        args = arg.split(u',')
        code = args[0]

        # Override minimal length, if specified
        if len(args) > 1:
            minlen = int(args[1])
    else:
        # No language specified, use Django's current
        code = get_language()

    # Normalize the locale code, ignoring a potential encoding suffix
    lang = locale.normalize(code).split('.')[0]

    # Make sure the proper language is installed
    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # An empty syllable list would join to u'' and silently drop the
            # word from the output, so fall back to the unsplit word.
            new.append(u'&shy;'.join(h.syllables(word) or [word]))
        else:
            new.append(word)

    return mark_safe(u' '.join(new))

示例#2

0

显示文件

def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon

    :param timestamps: flat sequence read at ``[2 * i]`` and ``[2 * i + 1]``
        for the i-th word (presumably start/end times -- confirm with caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        not in it get a sentiment of 0
    :return: list of tuples ``(word, timestamps[2i], timestamps[2i + 1],
        syllable count, sentiment, trailing punctuation, t2n.text2num(word))``
    '''
    h_en = Hyphenator('en_US')

    # Splitting yields an empty fragment only when the sentence starts or ends
    # with a separator. Filtering (instead of ``words.remove('')``) avoids a
    # ValueError when there is none and drops all of them when there are two.
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    info_list = []
    for ind, word in enumerate(words):
        c_sentiment = sent_dict[word] if word in sent_dict else 0

        # When the whitespace-delimited token differs from the bare word, its
        # last character is recorded as the punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        num = t2n.text2num(word)
        # NOTE(review): the syllable count is 0 whenever the hyphenator
        # returns an empty list for a word -- confirm consumers expect that.
        info_list.append(
            (word, timestamps[ind * 2], timestamps[ind * 2 + 1],
             len(h_en.syllables(unicode(word))), c_sentiment, punct, num))
    return info_list

示例#3

0

显示文件

文件： textgenerator.py 项目： rowhawn/PilosopherPunchOut

class TextGenerator:
	"""Emit Markov-chain generated text one syllable at a time.

	A ``MarkovChain`` is trained on ``trainString`` with prefix length
	``prefixLength``. Generated words are stripped of non-word characters,
	split into syllables and buffered in a queue that
	``get_next_syllable`` drains.
	"""

	def __init__(self, generatorName, trainString, prefixLength):
		self.generatorName = generatorName
		self.chain = MarkovChain()
		self.chain.generateDatabase(trainString, n=prefixLength)
		self.currState = []
		self.hyphenator = Hyphenator('en_US')
		self.syllableQ = Queue()
		# Raw string: '\W' in a plain literal is an invalid escape sequence.
		self.stripPattern = re.compile(r'[\W_]+')
		# Keep generating until a full prefix is available as start state.
		# NOTE(review): this never terminates if the chain cannot produce
		# enough words -- confirm the training text guarantees it.
		while (len(self.currState) < prefixLength):
			self.currState = self.chain.generateString().split()[-(prefixLength+1):-1]

	def load_next_word(self):
		"""Generate the next non-empty word and queue its syllables."""
		nextword = ""
		try:
			# Words that are empty after stripping are skipped, but they
			# still advance the chain state.
			while nextword == "":
				nextword = self.stripPattern.sub('', self.chain._nextWord(self.currState))
				self.currState = self.currState[1:]
				self.currState.append(nextword)
			if len(nextword) < 4: # because hyphenator doesnt work for words less than 4 letters
				self.syllableQ.put(nextword)
			else:
				for syllable in self.hyphenator.syllables(nextword):
					self.syllableQ.put(syllable)
		except UnicodeEncodeError:
			print("unicode error")

	def get_next_syllable(self):
		"""Return the next queued syllable, loading a new word if needed."""
		if (self.syllableQ.empty()):
			self.load_next_word()
		return self.syllableQ.get()

示例#4

0

显示文件

def syllablize(poem):
    """Return the poem as a list of lines, each a list of syllabified words.

    Every word is replaced by its syllables joined with single spaces; a word
    for which the hyphenator returns nothing is kept as is. Lines that
    contain no words are left out of the result.
    """
    # Make sure the dictionary for the module-level ``language`` is available.
    if not is_installed(language):
        install(language)
    hyphenator = Hyphenator(language)

    result = []
    for line in poem:
        spaced_words = []
        for word in line.split():
            parts = hyphenator.syllables(word)
            spaced = " ".join(parts) if len(parts) != 0 else word
            spaced_words.append(spaced.strip())

        if spaced_words:
            result.append(spaced_words)

    return result

示例#5

0

显示文件

文件： en_syllab_sorted_tokenizer.py 项目： maks5507/cognitive-complexity

class EnSyllabSortedTokenizer():
    """Tokenizer that emits each syllable with its characters sorted."""

    def __init__(self, stopwords):
        self.preprocessor = preprocessor.Preprocessing(stopwords)
        self.syllbler = Hyphenator('en_US')

    def tokenize(self,
                 text,
                 use_preproc=False,
                 use_stem=False,
                 use_lemm=False,
                 check_length=True,
                 check_stopwords=True):
        """Split ``text`` into syllables and sort the letters of each one.

        When ``use_preproc`` is true the text first goes through
        ``self.preprocessor.preproc`` with the remaining flags forwarded;
        otherwise it is used unchanged. Returns a flat list of strings.
        """
        source = text
        if use_preproc:
            source, _ = self.preprocessor.preproc(
                text,
                use_lemm=use_lemm,
                use_stem=use_stem,
                check_stopwords=check_stopwords,
                check_length=check_length)

        return [
            ''.join(sorted(piece))
            for word in source.split()
            for piece in self.syllbler.syllables(word)
        ]

示例#6

0

显示文件

def syllabizeNames(nameList):
    """Return the en_US syllable list of every name in ``nameList``."""
    # Install the en_US dictionary on first use.
    if not is_installed('en_US'):
        install('en_US')
    hyphenator = Hyphenator('en_US')
    return [hyphenator.syllables(name) for name in nameList]

示例#7

0

显示文件

文件： __main__.py 项目： jsonnet/ConcurrentTex

def main(arguments: List[str] = None):
    """Dispatch the sub-command selected on the command line.

    ``arguments`` is handed to ``parser.parse_args``. Supported commands are
    ``export_font``, ``tester``, ``hyphenate`` and ``render``; any other
    value falls through without doing anything.
    """
    namespace = parser.parse_args(arguments)
    command = namespace.command

    if command == 'export_font':
        from .pdf import PDF

        # Start from the built-in glyph set and add every character found in
        # the files matched by the optional text globs.
        glyphs = set(GLYPHS)
        if namespace.text is not None:
            cwd = pathlib.Path('.')
            for text_glob in namespace.text:
                for text_file in cwd.glob(text_glob):
                    print(f'Taking glyphs from:\n  {text_file}')
                    glyphs.update(text_file.read_text('utf-8'))
        font = PDF.font(namespace.font_name, namespace.font_size,
                        glyphs=glyphs)
        font.export(namespace.output)

    elif command == 'tester':
        from .tester import main as run_tester
        run_tester(namespace)

    elif command == 'hyphenate':
        raw_text = namespace.input.read().decode()
        hyphenator = Hyphenator(language=namespace.language)
        for token_type, token in tokenize(raw_text):
            if token_type is TokenType.WORD:
                # Words without a syllable split are written unchanged.
                chunk = '-'.join(hyphenator.syllables(token) or [token])
            else:
                chunk = token
            namespace.output.write_chunk(chunk.encode())

    elif command == 'render':
        import json

        from .printer import Page, FontSpec, Fragment
        from .pdf import PDF

        # Pages arrive as JSON documents separated by "\0\n"; empty segments
        # are ignored.
        pages = []
        for raw_page in namespace.input.read().split('\0\n'):
            if raw_page:
                page_data = json.loads(raw_page)
                spec = page_data['font_spec']
                pages.append(
                    Page(FontSpec(spec['name'], spec['size']),
                         page_data['paper_width'],
                         page_data['paper_height'],
                         [Fragment(**fragment)
                          for fragment in page_data['fragments']]))

        pdf = PDF(namespace.output)
        pdf.render(pages)
        pdf.finish()

示例#8

0

显示文件

文件： mnemo2.py 项目： icodeiexist/codeprojects

 def encode(self, word):
     """Encode ``word`` as a string of digits.

     The word is split into es_MX syllables. For every syllable, the index
     of each pattern group in ``self.patterns`` is appended once per regex
     of that group that matches the start of the syllable.
     """
     digits = []
     syllabifier = Hyphenator('es_MX')
     for syllable in syllabifier.syllables(unicode(word)):
         for index, regex_group in enumerate(self.patterns):
             digits.extend(str(index)
                           for regex in regex_group
                           if re.match(regex, syllable))
     return "".join(digits)

示例#9

0

显示文件

文件： helpers.py 项目： Delejnr/yesterday-tomorrow

def get_syllables(lyrics):
    """Return one syllable list per space-separated word of ``lyrics``.

    A word for which the hyphenator yields nothing is represented by a
    one-element list holding the word itself.
    """
    hyphenator = Hyphenator()
    return [hyphenator.syllables(word) or [word]
            for word in lyrics.split(" ")]

示例#10

0

显示文件

文件： histogram.py 项目： Idorobots/word-histograms

def by_syllable(input_gen, lang, install_lang_p):
    """Yield the syllables of every word produced by ``input_gen``.

    The dictionary for ``lang`` is installed first when ``install_lang_p``
    is true and it is not installed yet.
    """
    needs_install = install_lang_p and not dictools.is_installed(lang)
    if needs_install:
        dictools.install(lang)

    splitter = Hyphenator(lang)

    for word in input_gen:
        parts = splitter.syllables(word)
        logging.debug("syllables: {}".format(parts))
        for part in parts:
            yield part

示例#11

0

显示文件

文件： test_hyphenator.py 项目： robinwhittleton/PyHyphen

    def test_beautiful(self):
        """Check pairs, wrapping and syllables of 'beautiful' for en_US."""
        hyphenator = Hyphenator('en_US')
        word = 'beautiful'

        # All possible two-part splits.
        expected_pairs = [['beau', 'tiful'], [u'beauti', 'ful']]
        self.assertEqual(expected_pairs, hyphenator.pairs(word))

        # Wrapping at widths 6 and 7 picks different break points.
        self.assertEqual(['beau-', 'tiful'], hyphenator.wrap(word, 6))
        self.assertEqual(['beauti-', 'ful'], hyphenator.wrap(word, 7))

        # Full syllable split.
        self.assertEqual(['beau', 'ti', 'ful'], hyphenator.syllables(word))

示例#12

0

显示文件

文件： util.py 项目： goptavares/shakespeare-hmm

def getUniqueSyllables(sonnets):
    """Collect the distinct en_GB syllables found in ``sonnets``.

    ``sonnets`` is iterated as sonnet -> sentence -> word. Words are
    lower-cased first; a word with fewer than two syllables is added whole.
    Returns a list in arbitrary order.
    """
    hyphenator = Hyphenator('en_GB')
    unique = set()
    for sonnet in sonnets:
        for sentence in sonnet:
            for word in sentence:
                lowered = unicode(word.lower())
                parts = hyphenator.syllables(lowered)
                if len(parts) >= 2:
                    unique.update(parts)
                else:
                    unique.add(lowered)
    return list(unique)

示例#13

0

显示文件

    def sylTokenizer(text):
        """Tokenize ``text`` into n-grams of syllables.

        Words come from ``wordTokenizer``. Each word is split into syllables
        with the splitter matching the enclosing ``language`` ('en' uses the
        en_US Hyphenator, 'te' uses ``Syllabifier.syllabify_te``, anything
        else uses ``Syllabifier.syllabify_hi``). Words of length 1, and words
        for which the splitter returns nothing, count as a single syllable.
        Every run of ``ngrams`` consecutive syllables of a word is joined into
        one token; a word with fewer syllables than ``ngrams`` yields a single
        token made of all of its syllables.

        Returns a flat list of tokens, which is empty when there are no words.
        """
        words = wordTokenizer(text)

        # The three languages differ only in the splitter, so pick it once.
        if language == 'en':
            split_word = Hyphenator('en_US').syllables
        elif language == 'te':
            split_word = Syllabifier().syllabify_te
        else:
            split_word = Syllabifier().syllabify_hi

        tokens = []
        for word in words:
            # Split each word once; single-character words are not split.
            syllables = split_word(word) if len(word) > 1 else []
            if len(syllables) == 0:
                syllables = [word]

            window_count = max(len(syllables) - ngrams + 1, 1)
            tokens.extend("".join(syllables[i:i + ngrams])
                          for i in range(window_count))
        return tokens

示例#14

0

显示文件

文件： nomen.py 项目： Bankq/nomen

class Nomen:
    """Look up a name by matching its syllables against a lexicon.

    ``load`` must be called before ``get`` so that ``self.data`` exists.
    """

    def __init__(self):
        self.hyphen = Hyphenator('en_US')

    def load(self, file="./data/data-1.txt"):
        """ load training data

        NOTE(review): ``file`` is accepted but not used; ``Data()`` is built
        without arguments.
        """
        self.data = Data()
        if self.data:
            print("Data loaded success")
            print(str(self.data))

    def train(self):
        pass

    def get(self, en_name):
        """Return the concatenated best lexicon matches for ``en_name``."""
        en_name = en_name.lower()
        # lookup = self.data.find(en_name)
        # if lookup:
        #     return lookup

        syll = self.hyphen.syllables(en_name)
        # These helpers are called only for their effect on ``syll``
        # (presumably they refine the split in place -- TODO confirm).
        split_onsets(syll)
        split_codas(syll)
        split_glides(syll)
        split_mcs(syll)
        expand_dipththongs(syll)
        print("Syllables: %s" % (syll,))
        return self.backward_max_matching(0, syll)

    def backward_max_matching(self, i, syll):
        """Match the longest suffix ``syll[i:]`` found in the lexicon.

        On a hit, the part before the suffix is matched recursively and the
        top-ranked candidate for the suffix is appended. On a miss, the
        suffix is shortened from the left. Returns '' once nothing is left.
        """
        if not syll or i >= len(syll):
            return ''
        lx = self.data.lexicons
        key = ''.join(syll[i:])
        print("try: %s" % (key,))
        if key in lx:
            # Rank once and reuse the result for both the log line and the
            # return value.
            candidate = self.rank(lx[key], 0)
            print("find: %s %s" % (key, candidate))
            return self.backward_max_matching(0, syll[0:i]) + candidate
        return self.backward_max_matching(i + 1, syll)

    def rank(self, l, i):
        """Return item 0 of the ``i``-th entry of ``l`` ordered by item 1, descending."""
        rl = sorted(l, reverse=True, key=lambda x: x[1])
        return rl[i][0]

示例#15

0

显示文件

def tokenize_word_to_syllables(word, lang):
    """Return the syllables of ``word``, or ``[word]`` if none are found.

    The module-level ``hyphenator`` is created on first use and then reused.
    NOTE(review): once it exists, ``lang`` is no longer consulted, so later
    calls with another language still use the first one.
    """
    global hyphenator
    if hyphenator is None:
        print('Initializing Hyphenator (' + lang + ')...')
        hyphenator = Hyphenator(lang)

    parts = hyphenator.syllables(word)

    # The hyphenator returns nothing for one-syllable words, so those are
    # handed back whole.
    return parts if len(parts) != 0 else [word]

示例#16

0

显示文件

文件： hyphenate.py 项目： prvn16/staff-app-demo

def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (``&shy;``) between the syllables of long words.

    ``arg`` is an optional ``"<language>[,<minimal length>]"`` string. When it
    is missing, ``settings.LANGUAGE_CODE`` is used and the minimal length
    stays at 7. Only purely alphabetic words longer than the minimal length
    are hyphenated.

    When ``autoescape`` is true the input goes through ``conditional_escape``
    before the hyphens are inserted, so the result can be marked safe. If no
    locale can be derived from the language code, ``value`` is returned
    unchanged (and not marked safe).
    """
    if autoescape:
        esc = conditional_escape
    else:
        esc = lambda x: x

    minlen = 7

    if arg:
        args = arg.split(u',')
        code = args[0]
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    #
    # The language code may arrive as 'xx-YY' or as a bare 'xx'. PyHyphen
    # needs a full locale, so a bare code is expanded through a small table.
    #

    # TODO: This should probably be a lookup against a dict in settings?
    default_regions = {u'en': u'US', u'bg': u'BG'}

    s = code.split(u'-')

    if len(s) == 1:
        region = default_regions.get(s[0].lower())
        if region is None:
            # No known region for this bare language code: leave the text
            # alone instead of failing with an IndexError.
            return value
        s.append(region)

    lang = s[0].lower() + u'_' + s[1].upper()

    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)

    new = []

    # Escape the whole value once, up front. Escaped entities contain
    # non-alphabetic characters, so the loop below never splits them.
    for word in esc(value).split(u' '):
        if len(word) > minlen and word.isalpha():
            # An empty syllable list would join to u'' and drop the word.
            new.append(u'&shy;'.join(h.syllables(word) or [word]))
        else:
            new.append(word)

    return mark_safe(u' '.join(new))

示例#17

0

显示文件

文件： preprocess.py 项目： miguelarocao/Shakespearebot_9000

    def parse_word(self, word):
        """Return ``(syllable count, stress list)`` for ``word``.

        The lower-cased word is looked up in ``self.cmu_dict`` first; every
        phoneme ending in a digit counts as one syllable and that digit is
        its stress. When the word is missing there, the count comes from the
        en_GB hyphenator (at least 1) and the stress list is ``None``.
        """
        syl_stress = None
        try:
            # Only the first listed pronunciation is considered.
            phonemes = self.cmu_dict[word.lower()][0]
            syl_stress = [int(p[-1]) for p in phonemes if p[-1].isdigit()]
            syl_num = len(syl_stress)
        except KeyError:
            fallback = Hyphenator('en_GB')
            syl_num = len(fallback.syllables(unicode(word))) or 1

        return syl_num, syl_stress

示例#18

0

显示文件

文件： hyphenator.py 项目： matteo-pagliari/polispell

class hyphenator:
    """Thin wrapper around a PyHyphen ``Hyphenator`` (default ``it_IT``)."""

    def __init__(self, language='it_IT'):
        self.h = Hyphenator(language)

    def split_syllables(self, word):
        """Return the syllables of ``word`` after ``utils.check_unicode``."""
        return self.h.syllables(utils.check_unicode(word))

    def split_word(self, word):
        """Return the hyphenation pairs of ``word`` after ``utils.check_unicode``."""
        return self.h.pairs(utils.check_unicode(word))

示例#19

0

显示文件

def syllablize(line):
    """
    Split a line into a flat list of en_US syllables.

    Hyphens separate words; commas, colons and semicolons are removed. A
    word for which the hyphenator returns nothing is kept whole.
    """
    hyphenator = Hyphenator('en_US')
    pieces = []
    # Treat hyphenated compounds as separate words.
    for raw_word in line.replace("-", " ").split():
        word = raw_word
        for mark in (",", ":", ";"):
            word = word.replace(mark, "")

        parts = hyphenator.syllables(word)
        if parts:
            pieces.extend(parts)
        else:
            # pyhyphen sometimes hands back an empty list for short words.
            pieces.append(word)
    return pieces

示例#20

0

显示文件

文件： PrepareText.py 项目： yohannesaf/Computational-Approach-to-Analyze-and-Visualize-Rap-Lyrics

 def word_phonic_dict_func(self):
     '''
     Fill the per-word lookup tables for every word in
     ``self.lyrics_tokenized``. Nothing is returned; both are updated in place:

         self.arpabet_dict  - word -> first entry of pr.phones_for_word(word)
         self.word_syl_dict - word -> en_US syllable list, or [word] when the
                              hyphenator returns nothing

     Words already present in ``self.arpabet_dict`` are skipped. Any error
     for a word is printed and processing continues with the next word.
     '''
     h_en = Hyphenator('en_US')
     for line in self.lyrics_tokenized:
         for word in line:
             # Test membership on the dict itself rather than on .keys().
             if word in self.arpabet_dict:
                 continue
             try:
                 self.arpabet_dict.update(
                     {word: pr.phones_for_word(word)[0]})
                 temp = h_en.syllables(unicode(word))
                 if len(temp) > 0:
                     self.word_syl_dict.update({word: temp})
                 else:
                     self.word_syl_dict.update({word: [unicode(word)]})
             except Exception as e:
                 # Deliberately best-effort: report the problem, keep going.
                 print(e)

示例#21

0

显示文件

文件： Utilities.py 项目： Juncai/SpeechTextLabeler

def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon

    :param timestamps: flat sequence read at ``[2 * i]`` and ``[2 * i + 1]``
        for the i-th word (presumably start/end times -- confirm with caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        not in it get a sentiment of 0
    :return: list of tuples ``(word, timestamps[2i], timestamps[2i + 1],
        syllable count, sentiment, trailing punctuation, t2n.text2num(word))``
    '''
    h_en = Hyphenator('en_US')

    # Splitting yields an empty fragment only when the sentence starts or ends
    # with a separator. Filtering (instead of ``words.remove('')``) avoids a
    # ValueError when there is none and drops all of them when there are two.
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    info_list = []
    for ind, word in enumerate(words):
        c_sentiment = sent_dict[word] if word in sent_dict else 0

        # When the whitespace-delimited token differs from the bare word, its
        # last character is recorded as the punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        num = t2n.text2num(word)
        # NOTE(review): the syllable count is 0 whenever the hyphenator
        # returns an empty list for a word -- confirm consumers expect that.
        info_list.append((word,
                          timestamps[ind * 2],
                          timestamps[ind * 2 + 1],
                          len(h_en.syllables(unicode(word))),
                          c_sentiment,
                          punct,
                          num))
    return info_list

示例#22

0

显示文件

def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon

    :param title: passed to ``SentenceData`` together with the sentence
    :param timestamps: flat sequence read at ``[2 * i]`` and ``[2 * i + 1]``
        for the i-th word and converted to float (presumably start/end
        times -- confirm with caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        not in it get a sentiment of 0
    :return: a SentenceData object contain text-based information about the sentence
    '''
    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')

    # Splitting yields an empty fragment only when the sentence starts or ends
    # with a separator. Filtering (instead of ``words.remove('')``) avoids a
    # ValueError when there is none and drops all of them when there are two.
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        c_sentiment = sent_dict[word] if word in sent_dict else 0

        # When the whitespace-delimited token differs from the bare word, its
        # last character is recorded as the punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        # text2num reports -1 for "not a number"; store that as ''.
        num = t2n.text2num(word)
        num = '' if num == -1 else str(num)

        w = WordData(word, float(timestamps[ind * 2]),
                     float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s

示例#23

0

显示文件

文件： views.py 项目： beall49/paDjangoApp

def getSyllables(request, strParm):
    """Return a JSON response with the syllable split of each word.

    ``strParm`` is a comma-separated list of words; newlines are removed and
    surrounding whitespace is stripped. For every non-empty word two strings
    are added to the ``words`` list of the response: the hyphen-joined
    syllables followed by ';', and '<n> syllable(s)' followed by a newline.
    """
    from hyphen import Hyphenator

    # your language english
    h_en = Hyphenator('en_US')

    # encoding used when a word arrives as a byte string
    style = 'utf-8'

    # The word list comes from the request parameter (the original read an
    # unassigned local here).
    wordList = [word.strip() for word in strParm.replace("\n", "").split(",")]

    words = []
    for word in wordList:
        if not word:
            continue
        if isinstance(word, bytes):
            word = word.decode(style)

        # The hyphenator may return nothing for one-syllable words; treat
        # those as a single syllable instead of an empty string.
        syllables = h_en.syllables(word) or [word]
        brokenUpWord = '-'.join(syllables)
        countOfSyllables = len(syllables)

        # list.extend takes a single iterable, so pass both strings together.
        words.extend([
            brokenUpWord + ';',
            str(countOfSyllables) + ' syllable' +
            ('s' if countOfSyllables > 1 else '') + '\n',
        ])

    return HttpResponse(json.dumps({'words': words}), content_type='application/json')

示例#24

0

显示文件

文件： surpisehaiku.py 项目： anabranch/SurpiseHaiku

class HyphenatorAlgorithm(object):
    """
    Small wrapper around the en_US Hyphenator that counts syllables.
    Conforms to the same return type as the HyphenatorDictionary class
    """

    def __init__(self):
        """
        Create the underlying en_US hyphenator
        """
        self._hyphenator = Hyphenator('en_US')

    def syllables(self, word):
        """
        Return the number of syllables of ``word``; never less than 1,
        because every word counts as at least one syllable.
        """
        count = len(self._hyphenator.syllables(unicode(word)))
        return count or 1

示例#25

0

显示文件

文件： Utilities.py 项目： Juncai/SpeechTextLabeler

def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon

    :param title: passed to ``SentenceData`` together with the sentence
    :param timestamps: flat sequence read at ``[2 * i]`` and ``[2 * i + 1]``
        for the i-th word and converted to float (presumably start/end
        times -- confirm with caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        not in it get a sentiment of 0
    :return: a SentenceData object contain text-based information about the sentence
    '''
    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')

    # Splitting yields an empty fragment only when the sentence starts or ends
    # with a separator. Filtering (instead of ``words.remove('')``) avoids a
    # ValueError when there is none and drops all of them when there are two.
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        c_sentiment = sent_dict[word] if word in sent_dict else 0

        # When the whitespace-delimited token differs from the bare word, its
        # last character is recorded as the punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        # text2num reports -1 for "not a number"; store that as ''.
        num = t2n.text2num(word)
        num = '' if num == -1 else str(num)

        w = WordData(word, float(timestamps[ind * 2]), float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s

示例#26

0

显示文件

文件： data.py 项目： Bankq/nomen

    def parse(self):
        """Read ``self.filepath`` and fill the parallel per-entry lists.

        Every line must hold exactly two whitespace-separated fields. The
        first goes to ``self.ch`` and its pinyin to ``self.pinyin``; the
        second is lower-cased into ``self.en`` and its syllables go to
        ``self.syllables``. ``self.count`` is advanced once per line.

        Raises ValueError on a line that does not have exactly two fields.
        """
        p = Pinyin()
        s = Hyphenator('en_US')
        with codecs.open(self.filepath, encoding='utf-8', mode='r') as f:
            for line in f:
                self.count = self.count + 1
                # split() already discards the line terminator. Slicing off
                # the last character first would cut a real character from a
                # final line that has no trailing newline.
                words = line.split()
                if len(words) != 2:
                    print("Error on line %s" % (self.count,))
                    raise ValueError(
                        "expected 2 fields on line %s, found %d"
                        % (self.count, len(words)))
                c = words[0]
                e = words[1].lower()

                self.ch.append(c)
                self.pinyin.append(p.get_pinyin(c, ' ').split())

                self.en.append(e)
                # Names of three letters or fewer are kept as one syllable.
                if len(e) > 3:
                    syll = s.syllables(e)
                    syll = self.sub_syllables(e, c, syll)
                else:
                    syll = [e]
                self.syllables.append(syll)

示例#27

0

显示文件

文件： Test.py 项目： afcarl/SpeechTextLabeler

# pitch_array = np.array(pitch_info[1])
# print np.mean(pitch_array)
# print np.std(pitch_array)

# sent_dict = build_sentiment_dict(sentiment_dict_path)
#
# for key in sent_dict:
#     print key
#
# ts = get_time_stamp(bml_path)
#
# for t in ts:
#     print t

# words = re.split('\W+', 'OK, well, shall we start? Welcome to Finnmore Associates!')
# words.remove('')
# for ind, word in enumerate(words):
#     print word, ind

# Smoke check: print how many syllables the en_US dictionary finds in a word.
h_en = Hyphenator('en_US')
print(len(h_en.syllables(u'beautiful')))
#
# alist = [1, 2, 4, 2, 5, 6, 9.1]
# barray = np.array(alist)
# dev = np.std(barray)
# m = np.mean(barray)
# print m
# print dev

示例#28

0

显示文件

文件： heuristic.py 项目： fsiddiqi/haiku-generator

def count_syllables(word):
	"""Return the en_US syllable count of ``word``, never less than 1."""
	parts = Hyphenator('en_US').syllables(word)
	return len(parts) or 1

示例#29

0

显示文件

def getPoem():
    """Turn the text of a posting into a 17-syllable, three-line poem.

    The posting URL is read from the ``post_link`` request argument and
    fetched. The place name and coordinates come from the page's
    ``geo.placename`` and ``geo.position`` meta tags. Words are taken from
    the last child of the ``postingbody`` element and appended until 17
    syllables are reached, with a ``<br>`` inserted at cumulative syllable
    counts 5 and 12 (splitting a word with '-' when it straddles a boundary).

    Returns a JSON string with the keys ``poem``, ``url``, ``placeName`` and
    ``coordinates``.

    NOTE(review): the URL comes straight from the request and is fetched
    without validation -- confirm this endpoint is not exposed to untrusted
    callers.
    """
    # TODO Get URL, check if the database already has the poem info
    # conn = sqlite3.connect('example.db')

    # TODO scan head for metadata

    post_link = request.args.get('post_link')
    post_req = get(post_link)

    post_soup = BeautifulSoup(post_req.text, 'html.parser')
    post_head = post_soup.head  # not used below

    # Indexing the result fails if either meta tag is missing from the page.
    placeName = post_soup.find("meta", {'name': "geo.placename"})["content"]
    coordinates = post_soup.find("meta", {'name': "geo.position"})["content"]

    poem_body = post_soup.find(id='postingbody')
    # Run through all children so that ``item`` ends up as the last one.
    item = ''
    for item in poem_body.children:
        pass
    words = item.split()
    haiku_syllables = 0  # syllables consumed so far
    ind_syllables = 0  # incremented once per word, never read
    debug_text = ''  # per-word trace of the branch taken, not returned
    poem_text = ''

    newline = '<br>'

    from hyphen import Hyphenator
    h_en = Hyphenator('en_US')

    for word in words:
        if haiku_syllables >= 17:
            break
        syl = h_en.syllables(word)
        old_haiku_syllables = haiku_syllables
        num_syls = len(syl)
        haiku_syllables += num_syls
        # A word without a syllable split counts as one syllable.
        if num_syls == 0:
            debug_text += '0'
            num_syls = 1
            haiku_syllables += 1
        if num_syls > 0:
            # Cumulative syllable counts at which a line ends.
            syllable_bounds = [5,12]
            did_bound_op = False
            for bd in syllable_bounds:
                # The word ends exactly on a boundary: break after it.
                if haiku_syllables == bd:
                    did_bound_op = True
                    poem_text += word + newline
                    debug_text += 'a'
                # The word straddles a boundary: emit the syllables up to
                # the boundary, a hyphen and a line break, then the rest.
                if old_haiku_syllables < bd and haiku_syllables > bd:
                    debug_text += 'b'
                    did_bound_op = True
                    syl_index = 0
                    while syl_index + old_haiku_syllables < bd:
                        poem_text += syl[syl_index]
                        syl_index += 1
                    poem_text += '-{}'.format(newline)
                    while syl_index + old_haiku_syllables < haiku_syllables:
                        poem_text += syl[syl_index]
                        syl_index += 1
                    poem_text += ' '
                if did_bound_op:
                    break
            # No boundary involved: append the word followed by a space.
            if not did_bound_op:
                debug_text += 'c'
                poem_text += word + ' '
        ind_syllables += 1
    obj = dict()
    obj['poem'] = poem_text
    obj['url'] = post_link
    obj['placeName'] = placeName
    obj['coordinates'] = coordinates
    import json
    return json.dumps(obj)

示例#30

0

显示文件

文件： contextexceptions.py 项目： SzieberthAdam/SzieberthAdam.github.io

            if k == "főszöveg":
                főszöveg[-1].append(sor)
        előző_sor = sor

    for k in záró_állapotok:
        főszöveg.pop()

    főszöveg_s = "\n\n".join("\n".join(bekezdes) for bekezdes in főszöveg)

    szavak = sorted(s2 for s2 in set(re.split(r'\W+', főszöveg_s))
                    if s2 and not s2.isdecimal())
    elválasztások = []

    for szó in szavak:
        print(szó)
        h_szótagok = h.syllables(szó)
        if len(h_szótagok) < 2:
            continue
        mtxrun = subprocess.check_output(['mtxrun'] + mtxrun_args +
                                         [szó]).decode("utf8")
        mtx_patterns = [
            sor for sor in mtxrun.split("\n") if sor.startswith("mtx-patterns")
        ][0]
        mtx1, mtx2 = [s.strip() for s in mtx_patterns.split(":")[-2:]]
        mtx_szótagok = mtx2.split("-")
        if h_szótagok != mtx_szótagok:
            e = contextexceptions(szó, h_szótagok)
            print(f'kivétel: {e}')
            elválasztások.append(e)

    pki = p.parent / f'{p.stem}_hyp.tex'

示例#31

0

显示文件

文件： colorsyllables.py 项目： alex03122016/learny

def color_syllables(inputtext):
    """Write ``inputtext`` to a .docx with every second syllable in red.

    The text is split on whitespace and each word is split into de_DE
    syllables. Within a word, even-numbered syllables (2nd, 4th, ...) are
    printed with ``color="red"``. A word without a syllable split is printed
    whole. The document is saved to the path returned by
    ``docxprint.docx_print(..., save='colorsyllables')``. Returns None.
    """
    import subprocess
    import os
    try:
        import docxprint
    except ImportError:
        from learny import docxprint
    # python-docx module: pip install python-docx
    import docx
    from docx.shared import RGBColor

    # PyHyphen: pip install PyHyphen
    from hyphen import Hyphenator
    de_DE = Hyphenator('de_DE')

    print('color syllables')
    print('input Text: ' + inputtext)

    # Fixed worksheet texts (German, user-facing).
    Aufgabe = {
        "Kopfzeile": "Name: 				Klasse: 				Datum:  \n ",
        "Titel": "",
        "1. Aufgabe": "Lies den Text! Tippe bei jeder Silbe auf den Tisch!\n",
        "Hinweise": "Hier sind die Wörter aus den Lücken: \n",
        "Rätselwörter": "Hier ein paar Rätselwörter aus dem Text: \n",
    }

    # Create the document and ask docxprint where it will be saved.
    doc = docx.Document()
    save_path = docxprint.docx_print(Doc=doc, save='colorsyllables')

    text = inputtext
    # NOTE(review): searching for the empty string always succeeds, so this
    # branch is always taken -- a marker character may have been lost here.
    if text.find('') != -1:
        # Header, task line, then one paragraph that receives all words.
        docxprint.docx_print(printText=Aufgabe["Kopfzeile"], Doc=doc)
        docxprint.docx_print(printText=Aufgabe["1. Aufgabe"],
                             Bold=True,
                             Doc=doc)
        paragraph = docxprint.docx_print(Doc=doc)

        for w in text.split():
            parts = de_DE.syllables(w)
            if parts == []:
                # No split available: print the word as a whole.
                docxprint.docx_print(printText=w + ' ',
                                     Paragraph=paragraph,
                                     Doc=doc)
            for position, syl in enumerate(parts, start=1):
                if position % 2 == 0:
                    docxprint.docx_print(printText=syl,
                                         color="red",
                                         Paragraph=paragraph,
                                         Doc=doc)
                else:
                    docxprint.docx_print(printText=syl,
                                         Paragraph=paragraph,
                                         Doc=doc)
            # Word separator.
            docxprint.docx_print(printText=' ', Paragraph=paragraph,
                                 Doc=doc)

    # Save the result under the path obtained above.
    doc.save(save_path)

示例#32

0

显示文件

文件： hyphenate.py 项目： nathanharper/glibreviews

import sys
import json

""" 2.7 and up version is capitalized (annoying) """
# Command-line helper: print a JSON object that maps every argument to its
# en_US syllable list.

# The class is imported as ``Hyphenator`` on Python 2.7 and later and as
# ``hyphenator`` on older interpreters.
if sys.version_info >= (2, 7):
    from hyphen import Hyphenator, dictools
    hy = Hyphenator('en_US')
else:
    from hyphen import hyphenator, dictools
    hy = hyphenator('en_US')

try:
    json_object = {}
    for word in sys.argv[1:]:
        json_object[word] = hy.syllables(unicode(word))
    print json.dumps(json_object)
# NOTE(review): slicing sys.argv does not raise IndexError, so this handler
# looks unreachable from the loop above -- confirm before relying on exit
# status 1.
except IndexError:
    sys.exit(1)

sys.exit(0)

示例#33

0

显示文件

文件： urban_dictionary.py 项目： mullikine/this-word-does-not-exist

    print(f"Total removed {100 * (1 - len(ret) / len(words)):.2f}%")

    return ret

from hyphen import Hyphenator
# Hyphenator used to attach an en_US syllable list to every word below.
h_en = Hyphenator('en_US')

# Rebuild each entry of ``words`` as a ``Word`` with a cleaned example and
# its syllables, tagged as not existing and as part of the UD_UNFILTERED
# dataset.
wi = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
            syllables=h_en.syllables(w.word),
            probably_exists=False,
            dataset_type=wordservice_pb2.DatasetType.UD_UNFILTERED,

        ) for w in words

    ]
)
# Write the index to disk; the key comes from the FERNET_ENCRYPTION_KEY
# environment variable (``None`` is passed when it is not set).
wi.dump_encrypted("../website/data/words_ud_unfiltered.enc.gz", fernet_key=os.environ.get("FERNET_ENCRYPTION_KEY"))

wg = WordGenerator(
    device="cuda:0",
    forward_model_path="/mnt/evo/projects/title-maker-pro/models/urban_dictionary_250_cleaned_lr_00005_b9_seed4/checkpoint-140000",
    inverse_model_path=None,
    blacklist_path="/mnt/evo/projects/title-maker-pro/models/blacklist.pickle",
    quantize=False,

示例#34

0

显示文件

文件： modelhmm_reverse_counting.py 项目： YerongLeopard/PoetryGeneration

def poem_generate(num_of_hidden_states, num_pairs):
    """
    Train one HMM per rhyme group (A-G) and write 14-line poems to disk.

    For every group the corpus '../data/grouping1/group<X>.txt' is vectorized,
    each line is reversed, a per-word syllable count is computed, and
    ``ntrial`` models are trained.  Whenever a trial's log-probability beats
    all earlier trials, the model is saved and ``num_pairs`` line pairs are
    regenerated from it.  The per-group log-probabilities go to
    '../probability/prob_num<N>.txt' and the assembled poems to
    '../poems_counting/reverse_<N>.txt'.

    :param num_of_hidden_states: number of hidden states passed to modelhmm
    :param num_pairs: number of line pairs per group, i.e. number of poems
    :return: None
    """
    print "Number of hidden states:", num_of_hidden_states
    print "Number of poems to generate:", num_pairs
    # how many pairs to generate
    ending_words_dict = sample_ending_word(num_pairs)
    poems_dict = dict()

    h_en = Hyphenator('en_US')
    prondict = nltk.corpus.cmudict.dict()
    prob_file_name = '../probability/prob_num'+str(num_of_hidden_states)+'.txt'
    # NOTE(review): opened without a context manager; an exception below
    # leaves the file unclosed.
    fwrite = open(prob_file_name, 'w')
    ###
    for ind in ['A','B','C','D','E','F','G']:
    ### for ind in ['A']:
        print "Group:", ind
        # get ending words
        ending_words = ending_words_dict[ind]

        # preprocess data
        corpusname = '../data/grouping1/group' + ind + '.txt'
        corpus = importasline(corpusname, ignorehyphen=False)

        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        # Y: every corpus line as a list of vocabulary ids
        Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
        # ending_tokens: vocabulary ids of the ending words of every pair
        ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))]
        # print(Y)
        words = vectorizer.get_feature_names()
        print "Number of words:", len(words)
        # train in a reverse direction
        for i, line in enumerate(Y):
            Y[i] = line[::-1]
        # print(Y)

        # generate number of syllables for every word
        words_num_syllables = np.zeros(len(words), dtype=int)
        for wordid, word in enumerate(words):
            # First choice: the first CMU pronunciation of the word; fall back
            # to the hyphenator when that fails for any reason.
            # NOTE(review): the bare except also hides unrelated errors.
            try:
                phon = prondict[word][0]
                words_num_syllables[wordid] = sum(map(hasNumbers, phon))
            except:
                words_num_syllables[wordid] = len(h_en.syllables(unicode(word)))
            # Last resort when the count is still zero.
            if not words_num_syllables[wordid]:
                words_num_syllables[wordid] = count_syllables(word)

        # train model
        ntrial = 10
        logp = np.zeros(ntrial) # logp is an 1-D array
        subpoems = [None]*num_pairs
        for i in range(ntrial):
            modelname = 'modelnhiddengroup'+ind+'_'+str(num_of_hidden_states)+'_trial'+str(i)
            hmm = modelhmm(num_of_hidden_states, len(words), Y, words_num_syllables, modelname)
            logp[i] = hmm.trainHHM(Y)
            # Only the first trial, or a trial better than every earlier one,
            # replaces the saved model and the generated line pairs.
            if (i==0) or (i>0 and logp[i] > max(logp[0:i])):
                hmm.savemodel()
                hmm.loadmodel()

                # generate poems
                for pairid in range(num_pairs):
                    start_token = ending_tokens[pairid]
                    robotpoem0 = ''
                    line0,linew0 = hmm.generating_random_line_end(start_token[0])
                    # the word ids are walked in reverse to build the line text
                    for j in linew0[::-1]:
                        robotpoem0+=' '+words[j]+' '
                    print robotpoem0, 'robotpoem0'
                    robotpoem1 = ''
                    line1,linew1 = hmm.generating_random_line_end(start_token[1])
                    for j in linew1[::-1]:
                        robotpoem1+=' '+words[j]+' '
                    print(robotpoem1)
                    subpoems[pairid] = (robotpoem0, robotpoem1)

                hmm.analyzing_word(words)

        # add the best subpoem to poems_dict
        poems_dict[ind] = subpoems
        print>>fwrite, ind
        print>>fwrite, str(logp)
        print "List of log probability:", logp
    fwrite.close()

    # write down the poems
    poem_file_name = '../poems_counting/reverse_'+str(num_of_hidden_states)+'.txt'
    fwrite = open(poem_file_name, 'w')
    for poemid in range(num_pairs):
        # construct poems
        # Line order by group: A B A B / C D C D / E F E F / G G
        robotpoem = [None]*14
        robotpoem[0] = poems_dict['A'][poemid][0]
        robotpoem[2] = poems_dict['A'][poemid][1]
        robotpoem[1] = poems_dict['B'][poemid][0]
        robotpoem[3] = poems_dict['B'][poemid][1]
        robotpoem[4] = poems_dict['C'][poemid][0]
        robotpoem[6] = poems_dict['C'][poemid][1]
        robotpoem[5] = poems_dict['D'][poemid][0]
        robotpoem[7] = poems_dict['D'][poemid][1]
        robotpoem[8] = poems_dict['E'][poemid][0]
        robotpoem[10] = poems_dict['E'][poemid][1]
        robotpoem[9] = poems_dict['F'][poemid][0]
        robotpoem[11] = poems_dict['F'][poemid][1]
        robotpoem[12] = poems_dict['G'][poemid][0]
        robotpoem[13] = poems_dict['G'][poemid][1]

        robotpoem = Format(robotpoem)

        # write into file
        print>>fwrite, str(poemid)
        for lineid in range(len(robotpoem)):
            print>>fwrite, robotpoem[lineid]
    fwrite.close()

示例#35

0

显示文件

def _split_on_vowel_pair(sy, ton):
    """
    Split ``sy`` in two after the first qualifying pair of adjacent vowels.

    A pair qualifies when both letters are in a/e/o, or -- only when ``ton``
    is truthy -- when one letter is u/i and the other is a/e/o.  Returns
    ``[head, tail]`` on a match and ``[sy]`` otherwise.
    """
    strong = ("a", "e", "o")
    weak = ("u", "i")
    for pos in range(len(sy) - 1):
        left, right = sy[pos], sy[pos + 1]
        both_strong = left in strong and right in strong
        mixed = ((left in weak and right in strong) or
                 (left in strong and right in weak))
        if both_strong or (ton and mixed):
            return [sy[:pos + 1], sy[pos + 1:]]
    return [sy]


def syllable_division(phrase):
    """
    Split every word of a phrase into syllables.

    input: string
    output: result of group_sy applied to a list of lists (one list of
            syllables per word); an empty list when the phrase has no words
    """
    h_it = Hyphenator('it_IT')
    word_list = phrase.split()
    if not word_list:
        return []

    sillabe_frase = []
    for w in word_list:
        ton = tonale(w)
        size = len(w)

        if size < 3:
            # very short words are kept whole
            sillabe_custom = [w]
        elif size == 3:
            # three-letter words skip the hyphenator and are only checked
            # for a splitting vowel pair
            sillabe_custom = _split_on_vowel_pair(w, ton)
        else:
            sillabe_custom = []
            # A leading vowel followed by consonant+vowel, or by "s"/"g" and
            # a consonant, is detached as its own syllable first.
            if (w[0] in vocali) and (
                    ((w[1] in consonanti) and (w[2] in vocali)) or
                    ((w[1] == "s") and (w[2] in consonanti)) or
                    ((w[1] == "g") and (w[2] in consonanti))):
                sillabe_custom.append(w[0])
                w = w[1:]

            middle_division = h_it.syllables(w)
            if not middle_division:
                sillabe_custom.append(w)
            else:
                # control to recognize wrong syllables
                for sy in middle_division:
                    if len(sy) < 3:
                        sillabe_custom.append(sy)
                    else:
                        sillabe_custom.extend(_split_on_vowel_pair(sy, ton))

        sillabe_frase.append(sillabe_custom)
    # else: sillabe_frase.append(h_it.syllables(w))
    return group_sy(sillabe_frase)

示例#36

0

显示文件

from hyphen import Hyphenator
from hyphen.dictools import *
import sys

for lang in ['es_MX', 'es_SP']:
    if not is_installed(lang): install(lang)
h_mx = Hyphenator('es_MX')
for word in sys.argv[1:]:
    print h_mx.syllables(unicode(word.encode('utf-8')))

示例#37

0

显示文件

文件： autotab.py 项目： brmdv/AutoTab

        ## find word positions
        prevchar = ''
        splitchars = ' \t.,;:\'"-()[]{}'
        for i, char in enumerate(textline):
            if char not in splitchars and prevchar in splitchars:
                wordstarts.append(i)
            elif char in splitchars and prevchar not in splitchars:
                wordstarts.append(i)
            elif char == ' ' and prevchar in splitchars[2:]:
                wordstarts.append(i)
            prevchar = char

        ## find syllable positions
        if not args.full_words:
            for idx in range(len(wordstarts) - 1):
                sylls = hyph.syllables(
                    textline[wordstarts[idx]:wordstarts[idx + 1]])
                cursor = wordstarts[idx]
                if len(sylls) > 1:
                    for s in sylls[:-1]:
                        cursor += len(s)
                        syllstarts.append(cursor)
                    pass

        # join text into parts that are separated by chords
        ## find chord positions
        prevchar = ''
        current_chord = ''
        position = -1
        break_positions = list(set(syllstarts + wordstarts))
        chorded_parts = []
        for i, char in enumerate(chordline):

示例#38

0

显示文件

文件： model2rdMM_reverse_counting_format.py 项目： YerongLeopard/PoetryGeneration

def poem_generate(num_pairs):
    """
    Build one Markov model per rhyme group (A-G) and write 14-line poems.

    For every group the corpus '../data/grouping2/group<X>.txt' is vectorized,
    each line is reversed, a per-word syllable count is computed, a ``Markov``
    model is built and ``num_pairs`` line pairs are generated from the sampled
    ending words.  The assembled poems are written to
    '../poems2rdMM/reverse_with_punctuations.txt'.

    :param num_pairs: number of line pairs per group, i.e. number of poems
    :return: None
    """
    print "We are doing the 2rd order Markov model!"
    print "Number of poems to generate:", num_pairs
    # how many pairs to generate
    ending_words_dict = sample_ending_word(num_pairs)
    poems_dict = dict()

    h_en = Hyphenator('en_US')
    prondict = nltk.corpus.cmudict.dict()

    for ind in ['A','B','C','D','E','F','G']:
        print "Group:", ind
        # get ending words
        ending_words = ending_words_dict[ind]

        # preprocess data
        corpusname = '../data/grouping2/group' + ind + '.txt'
        corpus = importasline(corpusname, ignorehyphen=False)

        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        # Y: every corpus line as a list of vocabulary ids
        Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
        # ending_tokens: vocabulary ids of the ending words of every pair
        ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))]
        # print(Y)
        words = vectorizer.get_feature_names()
        print "Number of words:", len(words)
        # train in a reverse direction
        for i, line in enumerate(Y):
            Y[i] = line[::-1]
        # print(Y)

        # generate number of syllables for every word
        words_num_syllables = np.zeros(len(words), dtype=int)
        for wordid, word in enumerate(words):
            # First choice: the first CMU pronunciation of the word; fall back
            # to the hyphenator when that fails for any reason.
            # NOTE(review): the bare except also hides unrelated errors.
            try:
                phon = prondict[word][0]
                words_num_syllables[wordid] = sum(map(hasNumbers, phon))
            except:
                words_num_syllables[wordid] = len(h_en.syllables(unicode(word)))
            # Last resort when the count is still zero.
            if not words_num_syllables[wordid]:
                words_num_syllables[wordid] = count_syllables(word)

        # train model
        modelname = 'model2rdMMgroup' + ind
        hmm = Markov( len(words), Y, words_num_syllables, modelname)
        print(len(hmm.inversetable))

        # generate poems
        subpoems = [None]*num_pairs
        for pairid in range(num_pairs):
            start_token = ending_tokens[pairid]
            robotpoem0 = ''
            line0,linew0 = hmm.generating_random_line_end(start_token[0])
            # [-2::-1] walks the ids in reverse and skips the last element.
            # NOTE(review): why the last element is dropped is not visible
            # here -- confirm against generating_random_line_end.
            for j in linew0[-2::-1]:
                robotpoem0+=' '+words[j]+' '
            print(robotpoem0)
            robotpoem1 = ''
            line1,linew1 = hmm.generating_random_line_end(start_token[1])
            for j in linew1[-2::-1]:
                robotpoem1+=' '+words[j]+' '
            print(robotpoem1)
            subpoems[pairid] = (robotpoem0, robotpoem1)

        # add the best subpoem to poems_dict
        poems_dict[ind] = subpoems

    # write down the poems
    poem_file_name = '../poems2rdMM/reverse_with_punctuations.txt'
    # NOTE(review): opened without a context manager; an exception below
    # leaves the file unclosed.
    fwrite = open(poem_file_name, 'w')
    for poemid in range(num_pairs):
        # construct poems
        # Line order by group: A B A B / C D C D / E F E F / G G
        robotpoem = [None]*14
        robotpoem[0] = poems_dict['A'][poemid][0]
        robotpoem[2] = poems_dict['A'][poemid][1]
        robotpoem[1] = poems_dict['B'][poemid][0]
        robotpoem[3] = poems_dict['B'][poemid][1]
        robotpoem[4] = poems_dict['C'][poemid][0]
        robotpoem[6] = poems_dict['C'][poemid][1]
        robotpoem[5] = poems_dict['D'][poemid][0]
        robotpoem[7] = poems_dict['D'][poemid][1]
        robotpoem[8] = poems_dict['E'][poemid][0]
        robotpoem[10] = poems_dict['E'][poemid][1]
        robotpoem[9] = poems_dict['F'][poemid][0]
        robotpoem[11] = poems_dict['F'][poemid][1]
        robotpoem[12] = poems_dict['G'][poemid][0]
        robotpoem[13] = poems_dict['G'][poemid][1]

        robotpoem = Format(robotpoem)

        # write into file
        print>>fwrite, str(poemid)
        # NOTE(review): assumes Format returns at least 14 lines -- confirm.
        for lineid in range(14):
            print>>fwrite, robotpoem[lineid]
    fwrite.close()

示例#39

0

显示文件

文件： Test.py 项目： Juncai/SpeechTextLabeler

# pitch_array = np.array(pitch_info[1])
# print np.mean(pitch_array)
# print np.std(pitch_array)

# sent_dict = build_sentiment_dict(sentiment_dict_path)
#
# for key in sent_dict:
#     print key
#
# ts = get_time_stamp(bml_path)
#
# for t in ts:
#     print t

# words = re.split('\W+', 'OK, well, shall we start? Welcome to Finnmore Associates!')
# words.remove('')
# for ind, word in enumerate(words):
#     print word, ind

# Ad-hoc check: print how many syllables the en_US hyphenator finds in
# the word 'beautiful'.
h_en = Hyphenator('en_US')
#
print len(h_en.syllables(unicode(u'beautiful')))
#
# alist = [1, 2, 4, 2, 5, 6, 9.1]
# barray = np.array(alist)
# dev = np.std(barray)
# m = np.mean(barray)
# print m
# print dev

示例#40

0

显示文件

def resultado(request):
    """
    Transcribe the session's WAV file, measure the speech rate and render
    the result page.

    The recording named by ``request.session['filename']`` is sent to Google
    speech recognition (pt-BR), the transcript is split into syllables with a
    pt_BR Hyphenator, and the rate in syllables per minute is compared, as a
    z value, with the mean / standard deviation of the speaker's age group
    taken from ``pop_idade``.  Intermediate values are stored in the session.

    :param request: Django request whose session holds 'filename' and 'idade'
    :return: the response produced by render_to_response for
             'portal/resultado.html'
    """
    # load the file name from the session
    filename = request.session['filename']
    print("filename -"), filename

    # create the speech_recognition recognizer object
    r = sr.Recognizer()
    with sr.WavFile(filename) as source:
        tam = source.DURATION
        audio = r.record(source)  # read the entire WAV file

    print "instanciou sr"

    erro = "OK"
    texto = ""
    silabas = []
    vel = 0

    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        # NOTE(review): the key below is a placeholder string, not a real key.
        texto = r.recognize_google(audio,
                                   key="aqui vai a chave do google api key",
                                   language="pt-BR")
        # print("Debug - Google text: " + texto)
    except sr.UnknownValueError:
        erro = "Google Speech Recognition não pode entender o que você disse ... :("
        #print(erro)
    except sr.RequestError as e:
        erro = "Não foi possível obter os resultados... :( - {0}".format(e)
        #print(erro)

    s = ""
    sil = 0

    if texto != "":
        # store the recognized text in the session
        request.session['texto'] = texto

        # from hyphen.dictools import is_installed, install
        from hyphen import Hyphenator

        h_br = Hyphenator('pt_BR',
                          directory='/caminho/ate/matebot/matebotweb/')

        palavras = texto.split(" ")
        for palavra in palavras:
            silabas = h_br.syllables(palavra)
            # print (palavra, silabas)
            # A word the hyphenator returns no syllables for counts as one
            # syllable and is appended whole.
            if len(silabas) == 0:
                s = s + palavra + "-"
                sil += 1
            else:
                s = s + "-".join(silabas) + "-"
                sil = len(silabas) + sil

        # print "syllable list -", s
        request.session['silabas'] = s
        request.session['numsil'] = sil
        request.session['tamanho'] = tam
        # speed in syllables per minute
        # NOTE(review): if tam is an int this is integer division under
        # Python 2, and a zero duration raises ZeroDivisionError -- confirm
        # the type of source.DURATION.
        vel = (sil / tam) * 60
        print "Vel -", vel
        request.session['vel'] = vel

    dados = ""

    # No transcript: show the error text instead and return early.
    if texto == "":
        texto = erro
        dados = "Houve um erro no serviço tente novamente mais tarde..."
        # disable the button
        btntxt = "Desativado"
        return render_to_response('portal/resultado.html', {
            'tam': ceil(tam),
            'texto': texto,
            'total': sil,
            'vel': round(vel, 2),
            'velocidade': vel,
            'dados': dados,
            'btntxt': btntxt
        },
                                  context_instance=RequestContext(request))

    idade = request.session['idade']
    print "idade -", idade

    # iterate over the dictionary looking for the age group
    # Each pop_idade value is indexed as [0] -> (min age, max age) and
    # [1] -> (mean, standard deviation).
    media = ""
    dp = ""
    for key in pop_idade:
        idmin, idmax = pop_idade[key][0]
        if int(idmin) <= int(idade) and int(idade) <= int(idmax):
            media, dp = pop_idade[key][1]
            media, dp = float(media), float(dp)
            break

    print "media e dp -", media, dp
    # z value of the measured speed against the age group
    # NOTE(review): when no age group matches, media and dp are still "" and
    # this line raises TypeError; dp == 0 raises ZeroDivisionError.
    z = (vel - media) / dp

    print "z -", z
    # test 1
    # H0: the person is normal
    # H1: the person is stressed (sample mean greater than the population's)
    if z < 1.64:
        # H0 - the person is not stressed, but check whether they are relaxed
        # test 2
        # H0: the person is normal
        # H1: the person is relaxed (sample mean smaller than the population's)
        if -1.64 < z:
            # H0 - normal person
            dados = "Você está bem..."
        else:
            # H1 - relaxed person
            dados = "Você está relaxado!"
    else:
        # H1 - stressed person
        dados = "Você está estressado!"

    return render_to_response('portal/resultado.html', {
        'tam': ceil(tam),
        'texto': texto,
        'total': sil,
        'vel': round(vel, 2),
        'velocidade': vel,
        'dados': dados
    },
                              context_instance=RequestContext(request))

示例#41

0

显示文件

文件： hypyen.py 项目： IanMulvany/colour-reader

from hyphen import Hyphenator
# Module-level hyphenator shared by the functions below.
h_en = Hyphenator('en_US')

# Import-time demo: print the syllable split of 'longer'.
output = h_en.syllables('longer')
print(output)


def get_syllables(word):
    """
    Split *word* with the module-level hyphenator.

    When the hyphenator yields an empty list, the word itself is returned as
    a one-element list; otherwise the hyphenator's result is returned as is.
    """
    parts = h_en.syllables(word)
    return [word] if parts == [] else parts

def get_coloured_para(para):
    """
    Return a list with color_word applied to every item of *para*,
    in the original order.
    """
    return [color_word(word) for word in para]

示例#42

0

显示文件

文件： tweetManip.py 项目： d-m/bmorehackathon2014

class checkTweet():
    """
    Clean up a tweet and cut it to a prefix with an exact syllable count.

    The text is stripped of non-ASCII characters and split on whitespace.
    ``qualityControl`` rewrites / removes unwanted words in place and rejects
    tweets containing forbidden items; ``checkSylbls`` returns the leading
    words that add up to exactly the requested number of syllables.
    """

    def __init__(self, text = 'Defualt Tweet'):
        # only keep ASCII characters (code points 0x00-0x7F)
        self.rawText = re.sub(u'[^\\x00-\\x7F]', u'', text)
        self.textWords = self.rawText.split()
        self.h_en = Hyphenator('en_US')
        self.badSymbols = ['http:', 'https:', '&']
        self.forbiddenThings = ['@'] # random syms
        self.forbiddenWords = ['el', 'la', 'en', 'tu', # spanish
            'Et', 'le', 'aux', 'les', 'de', 'des', 'du', 'il', 'Elle',
            'ses', 'sa', 'ces', 'cela', 'est', 'vous', 'tous', 'nous',
            'allez', 'alons'] # french
        self.forbiddenEnds = ['the', 'and', 'a', 'an', 'for', 'at', 'except', 'or', 'has',
            'my', 'your', 'their', 'his', 'hers', 'her\'s', 'get', 'it\'ll', 'to', 'like',
            'is', 'I']

    def qualityControl(self):
        """Clean ``self.textWords`` in place.

        Returns False when a forbidden symbol or word is still present after
        cleaning, True otherwise.
        """
        self.replaceText()
        self.remove_at_symbol_first()
        self.remove_symbolWords()
        if self.check_forbiddenThings():
            return False
        # Single-argument print: same output as the former print statement
        # (items were separated by one space), valid on Python 2 and 3.
        print("post QC tweet:  " + " ".join(self.textWords))
        return True

    def replaceText(self):
        """Replace every '#' inside a word with the text 'hashtag '."""
        self.textWords = [w.replace('#', 'hashtag ') for w in self.textWords]

    def remove_at_symbol_first(self):
        """Drop a leading word containing 'RT', then a leading word with '@'.

        Safe on an empty word list, including the case where the first
        removal empties it.
        """
        if self.textWords and re.search('RT', self.textWords[0]):
            del self.textWords[0]
        if self.textWords and re.search('@', self.textWords[0]):
            del self.textWords[0]

    def remove_symbolWords(self):
        """Remove every word that matches one of ``self.badSymbols``."""
        # Filter in one pass instead of deleting while enumerating: deleting
        # during iteration skipped the word right after each removed one.
        # Slice assignment keeps the same list object.
        self.textWords[:] = [
            word for word in self.textWords
            if not any(re.search(s, word) for s in self.badSymbols)
        ]

    def words_no_vowels(self, wordList):
        """Return True as soon as a word without a/e/i/o/u/y is found."""
        for word in wordList:
            if not re.search("([aeiouyAEIOUY]+)", word):
                print('%s  - did not contain any vowels' % (word,))
                return True
        return False

    def check_forbiddenThings(self):
        """Return True when any forbidden symbol or whole word is present."""
        for s in self.forbiddenThings:
            if any(re.search(s, word) for word in self.textWords):
                print('the forbidden thing:  ' + s + '  was found')
                return True
        for s in self.forbiddenWords:
            # anchored, case-insensitive match: the whole word must equal s
            if any(re.search('^'+s+'$', word, re.IGNORECASE) for word in self.textWords):
                print('the forbidden word:  ' + s + '  was found')
                return True
        return False

    def checkSylbls(self, Nsyls):
        """Return the leading words totalling exactly ``Nsyls`` syllables.

        An empty list is returned when no such prefix exists, when one of its
        words has no vowel, or when its last word is in ``forbiddenEnds``.
        """
        finalWords = self.confirmSylsCounts(Nsyls)
        if (not finalWords or self.words_no_vowels(finalWords)
                or finalWords[-1] in self.forbiddenEnds):
            return list()
        print("%s syls found... final text:  %s" % (Nsyls, finalWords))
        return finalWords

    def confirmSylsCounts(self, Nsyls):
        """Count syllables word by word until ``Nsyls`` is reached.

        A word's count is taken from the hyphenator when it agrees with the
        (min, max) estimate of ``count_syllables``, from that estimate when
        min == max, and otherwise the tweet is given up on.  Returns the
        prefix of ``self.textWords`` that sums to exactly ``Nsyls``, or an
        empty list.
        """
        nWords = len(self.textWords)
        i = 0
        sylsCount = 0
        # loop until the end of the word list or until Nsyls is reached
        while i < nWords and sylsCount < Nsyls:
            word = self.textWords[i]
            if len(word) >= 100: #hyphenator will break and something is crazy
                return list()
            # the hyphenator returns an empty list for some words: count 1
            libreSyls = max(len(self.h_en.syllables(word)), 1)
            simplSyls = self.count_syllables(word)
            if libreSyls in simplSyls:
                sylsCount = sylsCount + libreSyls
            elif simplSyls[0] == simplSyls[1]:
                sylsCount = sylsCount + simplSyls[1]
            else: # this tweet is too hard
                return list()
            i += 1
        if sylsCount == Nsyls:
            return self.textWords[:i]
        return list()

    def count_syllables(self, word):
        """Estimate the syllables of ``word`` from its vowel groups.

        Returns a ``(minsyl, maxsyl)`` tuple; ``(0, 0)`` for an empty word.
        """
        if not word:
            return 0, 0
        vowels = ['a', 'e', 'i', 'o', 'u']

        on_vowel = False
        in_diphthong = False
        minsyl = 0
        maxsyl = 0
        lastchar = None

        word = word.lower()
        for c in word:
            is_vowel = c in vowels

            # y is a special case
            if c == 'y':
                is_vowel = not on_vowel

            if is_vowel:
                if not on_vowel:
                    # We weren't on a vowel before.
                    # Seeing a new vowel bumps the syllable count.
                    minsyl += 1
                    maxsyl += 1
                elif not in_diphthong and c != lastchar:
                    # We were already in a vowel.
                    # Don't increment anything except the max count.
                    # in_diphthong is never reset, so this happens at most
                    # once per word.
                    in_diphthong = True
                    maxsyl += 1

            on_vowel = is_vowel
            lastchar = c

        # Some special cases:
        if word[-1] == 'e':
            minsyl -= 1
        # if it ended with a consonant followed by y, count that as a syllable.
        if word[-1] == 'y' and not on_vowel:
            maxsyl += 1

        return minsyl, maxsyl

示例#43

0

显示文件

文件： all_texts_maker.py 项目： sshuster/obama

def syllables(word):
	"""Split a UTF-8 encoded word into UTF-8 encoded syllables (en_US)."""
	decoded = word.decode('utf-8')
	hyphenator = Hyphenator('en_US')
	encoded_parts = []
	for part in hyphenator.syllables(decoded):
		encoded_parts.append(part.encode('utf-8'))
	return encoded_parts

示例#44

0

显示文件

文件： shakespeare_syllables.py 项目： goptavares/shakespeare-hmm

def _detokenize_sonnet(sonnet, tokens):
    """Turn a sonnet of syllable-token ids back into lists of word strings.

    ``sonnet`` is a list of sentences, each a list of words, each a list of
    indices into ``tokens``.  The first word of every sentence gets its first
    character upper-cased.
    """
    detokenizedSonnet = []
    for sentence in sonnet:
        detokenizedSentence = []
        for w, word in enumerate(sentence):
            if w == 0:
                first = tokens[word[0]]
                parts = [first[0].upper() + first[1:]]
                parts.extend(tokens[syll] for syll in word[1:])
            else:
                parts = [tokens[syll] for syll in word]
            detokenizedSentence.append(''.join(parts))
        detokenizedSonnet.append(detokenizedSentence)
    return detokenizedSonnet


def main():
    """Train a syllable-level HMM on the sonnets and write generated ones.

    Every word of the data set is tokenized into syllable ids (the whole
    lower-cased word is used when the hyphenator yields fewer than two
    syllables), the model is trained on those per-word sequences, and two
    artificial sonnets are generated and written out: one unconstrained and
    one restricted to words that occur in the data set.
    """
    # Load Shakespeare dataset.
    sonnets = util.loadShakespeareSonnets()
    tokens = util.getUniqueSyllables(sonnets)
    numObs = len(tokens)
    numStates = 4
    model = hmm_end_state.HMM(numStates, numObs)

    # Train model on tokenized dataset.
    h = Hyphenator('en_GB')
    words = []
    for sonnet in sonnets:
        for sentence in sonnet:
            for word in sentence:
                lowered = unicode(word.lower())
                syllables = h.syllables(lowered)
                if len(syllables) < 2:
                    tokenizedWord = [tokens.index(lowered)]
                else:
                    tokenizedWord = [tokens.index(syllable)
                                     for syllable in syllables]
                # Append inside the word loop so every word is a training
                # sequence; appending once per sentence kept only the last
                # word and reused a stale value for empty sentences.
                words.append(tokenizedWord)
    model.train(words, maxIter=4)

    # Generate artificial sonnet with any generated words and detokenize it.
    artificialSonnet = model.generateSonnetFromSyllables(numSentences=14,
                                                         numWordsPerSentence=8)

    # Write detokenized sonnet to text file.
    util.writeSonnetToTxt(_detokenize_sonnet(artificialSonnet, tokens))

    # Generate artificial sonnet with only valid words and detokenize it.
    artificialSonnet = model.generateSonnetFromSyllables(
        numSentences=14, numWordsPerSentence=8,
        validWords=util.getUniqueWords(sonnets), tokens=tokens)

    # Write detokenized sonnet to text file.
    # NOTE(review): same writer call as above; whether it overwrites the
    # first sonnet depends on util.writeSonnetToTxt -- confirm.
    util.writeSonnetToTxt(_detokenize_sonnet(artificialSonnet, tokens))

示例#45

0

显示文件

文件： LSTM_train.py 项目： skansi/chatbot-unk

    # For every file: read it, lower-case it, split it into syllables and
    # keep only the syllables that are in VOCAB.
    for f in files:
        print('Working on file:', f)
        SOURCE = str(subdir) + '/' + str(f)

        # load text and covert to lowercase
        # NOTE(review): the file object is never closed explicitly.
        text = open(SOURCE, encoding='utf-8').read()
        text = text.lower()
        text_list = text.split()

        syllables_list = []

        # split the data on syllables
        print('\n> Splitting data to syllables...')
        for word in text_list:
            try:
                l = h_en.syllables(word)
                for s in l:
                    # NOTE(review): this check can never be true inside the
                    # loop -- an empty l means the body does not run, so
                    # words without syllables add nothing to the list.
                    if l == []:
                        s = ' '
                    # Append a space to the word's last syllable.
                    # NOTE(review): l.index(s) finds the FIRST occurrence, so
                    # a last syllable that also appears earlier in the word
                    # gets no trailing space.
                    if l.index(s) == (len(l) - 1):
                        s = s + ' '
                    syllables_list = syllables_list + [s]
            except ValueError:
                # presumably raised by h_en.syllables for some inputs --
                # TODO confirm; the offending word is printed and skipped
                print(word)
        print('Done!\n')

        print(
            '> Changing data to be written only with syllables from vocabulary...'
        )
        syllables_list = [i for i in syllables_list if i in VOCAB]
        print('Done!\n')

示例#46

0

显示文件

文件： hyphen_syllables.py 项目： icodeiexist/codeprojects

from hyphen import Hyphenator
from hyphen.dictools import *
import sys
 
for lang in ['es_MX', 'es_SP']:
        if not is_installed(lang): install(lang)
h_mx = Hyphenator('es_MX')
for word in sys.argv[1:]:
    print h_mx.syllables(unicode(word.encode('utf-8')))

示例#47

0

显示文件

文件： process_data.py 项目： tmadhuri/controlled-text-generation

def _char_ngrams(word, size):
    """Return every ``size``-character window of ``word`` (one window if it is shorter)."""
    return [word[i:i + size] for i in range(max(1, len(word) - size + 1))]


def _syllable_ngrams(word, syllabify, size):
    """Return ``size``-syllable windows of ``word``, marked with ``<`` at its start and ``>`` at its end."""
    # One-character words are never syllabified, and a word for which the
    # syllabifier returns nothing is kept whole.
    syllables = list(syllabify(word)) if len(word) > 1 else []
    if not syllables:
        syllables = [word]
    syllables[0] = u"<" + syllables[0]
    syllables[-1] = syllables[-1] + u">"
    return [
        "".join(syllables[i:i + size])
        for i in range(max(len(syllables) - size + 1, 1))
    ]


def _tokenize(words, syllabify):
    """Convert ``words`` to character or syllable n-grams as selected by the module-level CHAR and SYL."""
    if CHAR != 0 and SYL == 0:
        pieces = []
        for word in words:
            pieces.extend(_char_ngrams(word, CHAR))
        return pieces
    if SYL != 0:
        pieces = []
        for word in words:
            pieces.extend(_syllable_ngrams(word, syllabify, SYL))
        return pieces
    return words


def _add_example(revs, vocab, text, label, split, dataset, clean_string,
                 syllabify):
    """Tokenize ``text``, count its distinct tokens in ``vocab`` and append its record to ``revs``."""
    cleaned = clean_str(text, dataset) if clean_string else text.lower()
    words = _tokenize(cleaned.split(), syllabify)
    # Each example counts a token at most once.
    for word in set(words):
        vocab[word] += 1
    revs.append({
        "y": label,
        "text": words,
        "num_words": len(words),
        "split": split,
    })


def build_data_cv(dataset, cv=10, clean_string=True):
    """
    Load a labelled text dataset, tokenize it and assign every example a split.

    The dataset family is chosen from the ``dataset`` name, checked in this
    order: names whose first two characters are "mr" (any case), names ending
    in "TeSA", names ending in "HiSA", and names starting with "TREC". The
    data file names are fixed or derived from ``dataset`` and are opened
    relative to the current working directory.

    Tokens are whole words by default. When the module-level ``CHAR`` is
    non-zero and ``SYL`` is zero they are character n-grams of width ``CHAR``;
    when ``SYL`` is non-zero they are n-grams of ``SYL`` syllables with ``<``
    and ``>`` marking the word boundaries. A line that yields no words
    produces an example with an empty token list in every mode.

    :param dataset: dataset name, e.g. "mr", "mr_te", "TeSA", "HiSA",
        "TREC_En"
    :param cv: number of folds; "mr", "TeSA" and "HiSA" examples get
        ``np.random.randint(0, cv)`` as their split
    :param clean_string: if true, pass each text through ``clean_str``;
        otherwise only lowercase it
    :return: ``(revs, vocab)``. ``revs`` is a list of dicts with the keys
        "y" (label), "text" (token list), "num_words" and "split" (for TREC:
        1 for the training file, 0 for the test file). ``vocab`` maps each
        token to the number of examples containing it. ``None`` is returned
        when ``dataset`` matches no known family.
    """
    revs = []
    vocab = defaultdict(float)

    if dataset[:2].lower() == "mr":
        data_folder = ["rt-polarity.pos", "rt-polarity.neg"]
        if dataset == "mr_te":
            data_folder = ["mr_te.pos", "mr_te.neg"]
        if dataset == "mr_hi":
            data_folder = ["mr_hi.pos", "mr_hi.neg"]

        # Built once instead of once per line.
        # NOTE(review): the English hyphenator is also used for the "mr_te"
        # and "mr_hi" variants, as before -- confirm this is intended.
        syllabify = Hyphenator('en_US').syllables if SYL != 0 else None

        counts = [0, 0]  # indexed by label: [negative, positive]
        for path, label in ((data_folder[0], 1), (data_folder[1], 0)):
            with open(path, "rb") as f:
                for line in f:
                    text = line.decode("utf8", "ignore").strip().lower()
                    _add_example(revs, vocab, text, label,
                                 np.random.randint(0, cv), dataset,
                                 clean_string, syllabify)
                    counts[label] += 1
        print(counts)
        return revs, vocab

    elif dataset[-4:] == "TeSA":
        data_file = "ACTSA_telugu_polarity_annotated_UTF.txt"
        if dataset != "TeSA":
            data_file = dataset + ".txt"

        syllabify = Syllabifier().syllabify_te if SYL != 0 else None

        counts = [0, 0, 0]
        with open(data_file, "rb") as f:
            for line in f:
                fields = line.decode('utf-8', 'ignore').strip().split(" ", 1)
                label = int(fields[0].strip())
                # Remap the file's labels: 0 -> 2, -1 -> 0, 1 stays 1.
                if label == 0:
                    label = 2
                elif label == -1:
                    label = 0
                if label not in (0, 1, 2):
                    print(label)

                counts[label] += 1
                _add_example(revs, vocab, fields[1].strip(), label,
                             np.random.randint(0, cv), dataset, clean_string,
                             syllabify)
        print(counts)
        return revs, vocab

    elif dataset[-4:] == "HiSA":
        data_file = "5001-end.txt"
        if dataset != "HiSA":
            data_file = dataset + ".txt"

        syllabify = Syllabifier().syllabify_hi if SYL != 0 else None

        tag_to_label = {'p': 1, 'n': 0, 'o': 2}
        counts = [0, 0, 0]
        with open(data_file, "rb") as f:
            for line in f:
                fields = line.decode('utf-8', 'ignore').strip().split("\t", 1)
                # Skip lines without a tab-separated tag or with an unknown tag.
                if len(fields) < 2 or fields[1].strip() not in tag_to_label:
                    continue
                label = tag_to_label[fields[1].strip()]

                counts[label] += 1
                _add_example(revs, vocab, fields[0].strip(), label,
                             np.random.randint(0, cv), dataset, clean_string,
                             syllabify)

        print(counts)
        return revs, vocab

    elif dataset[:4] == "TREC":
        data_file = []
        if dataset[-2:] == "En":
            data_file = ["train_5500.label", "TREC_10.label"]
        elif dataset[-2:] == "Hi":
            data_file = ["train_5500.hi.label", "TREC_10.hi.label"]
        elif dataset[-3:] == "w2w":
            data_file = [
                "train_5500.hi_w2w_en.label", "TREC_10.hi_w2w_en.label"
            ]

        # An unrecognised TREC suffix leaves data_file empty, so this raises
        # IndexError (unchanged behaviour).
        train_file = data_file[0]
        test_file = data_file[1]

        classes = {
            "DESC": 0,
            "ENTY": 1,
            "ABBR": 2,
            "HUM": 3,
            "NUM": 4,
            "LOC": 5
        }

        syllabify = None
        if SYL != 0:
            if dataset[-2:] == "En":
                syllabify = Hyphenator('en_US').syllables
            else:
                syllabify = Syllabifier().syllabify_hi

        train_counts = [0, 0, 0, 0, 0, 0]
        test_counts = [0, 0, 0, 0, 0, 0]
        # The split is fixed by the source file: 1 for training, 0 for test.
        for path, split, counts in ((train_file, 1, train_counts),
                                    (test_file, 0, test_counts)):
            with open(path, "rb") as f:
                for line in f:
                    fields = line.decode('utf-8').strip().split(" ", 1)
                    # The label looks like "COARSE:fine"; only the part
                    # before the colon selects the class.
                    coarse = fields[0].split(":")[0]
                    if len(fields) < 2 or coarse not in classes:
                        continue
                    label = classes[coarse]

                    counts[label] += 1
                    _add_example(revs, vocab, fields[1].strip(), label, split,
                                 dataset, clean_string, syllabify)

        print("%s %s" % (train_counts, test_counts))
        return revs, vocab

示例#48

0

显示文件

class checkTweet():
    """Clean up a tweet's words and pick a leading phrase with an exact syllable count.

    The constructor strips non-ASCII characters and splits the text on
    whitespace. ``qualityControl`` filters the word list in place, and
    ``checkSylbls`` returns the leading words that add up to a requested
    number of syllables.
    """

    def __init__(self, text='Defualt Tweet'):
        """Store ``text`` stripped of non-ASCII characters and split into words."""
        # Drop every character outside the 7-bit ASCII range.
        self.rawText = re.sub(u'[^\\x00-\\x7F]', u'', text)
        self.textWords = self.rawText.split()
        self.h_en = Hyphenator('en_US')
        # Words matching any of these patterns are removed from the tweet.
        self.badSymbols = ['http:', 'https:', '&']
        # A word matching any of these patterns rejects the whole tweet.
        self.forbiddenThings = ['@']  # random syms
        # A word equal (ignoring case) to any of these rejects the whole tweet.
        self.forbiddenWords = [
            'el',
            'la',
            'en',
            'tu',  # spanish
            'Et',
            'le',
            'aux',
            'les',
            'de',
            'des',
            'du',
            'il',
            'Elle',
            'ses',
            'sa',
            'ces',
            'cela',
            'est',
            'vous',
            'tous',
            'nous',
            'allez',
            'alons'
        ]  # french
        # A selected phrase may not end with any of these words.
        self.forbiddenEnds = [
            'the', 'and', 'a', 'an', 'for', 'at', 'except', 'or', 'has', 'my',
            'your', 'their', 'his', 'hers', 'her\'s', 'get', 'it\'ll', 'to',
            'like', 'is', 'I'
        ]

    def qualityControl(self):
        """Clean the word list in place; return False if the tweet must be rejected."""
        self.replaceText()
        self.remove_at_symbol_first()
        self.remove_symbolWords()
        if self.check_forbiddenThings():
            return False
        print("%s %s" % ("post QC tweet: ", " ".join(self.textWords)))
        return True

    def replaceText(self):
        """Replace every '#' in every word with the text 'hashtag '."""
        self.textWords = [w.replace('#', 'hashtag ') for w in self.textWords]

    def remove_at_symbol_first(self):
        """Drop a leading word containing 'RT', then a leading word containing '@'.

        An empty word list, including one emptied by the first removal, is
        left alone instead of raising IndexError.
        """
        if self.textWords and re.search('RT', self.textWords[0]):
            del self.textWords[0]
        if self.textWords and re.search('@', self.textWords[0]):
            del self.textWords[0]

    def remove_symbolWords(self):
        """Remove every word that matches one of ``badSymbols``.

        The list is rebuilt in one pass. Deleting entries while enumerating
        it skipped the word following each deletion.
        """
        self.textWords[:] = [
            word for word in self.textWords
            if not any(re.search(s, word) for s in self.badSymbols)
        ]

    def words_no_vowels(self, wordList):
        """Return True, after reporting it, if any word contains none of a/e/i/o/u/y."""
        for word in wordList:
            if not re.search("([aeiouyAEIOUY]+)", word):
                print("%s %s" % (word, ' - did not contain any vowels'))
                return True
        return False

    def check_forbiddenThings(self):
        """Return True if any word matches a forbidden pattern or equals a forbidden word."""
        for s in self.forbiddenThings:
            if any(re.search(s, word) for word in self.textWords):
                print("%s %s %s" % ('the forbidden thing: ', s, ' was found'))
                return True
        for s in self.forbiddenWords:
            # Anchored, case-insensitive match: the whole word must be ``s``.
            pattern = '^' + s + '$'
            if any(
                    re.search(pattern, word, re.IGNORECASE)
                    for word in self.textWords):
                print("%s %s %s" % ('the forbidden word: ', s, ' was found'))
                return True
        return False

    def checkSylbls(self, Nsyls):
        """Return the leading words totalling ``Nsyls`` syllables, or [] if none qualify.

        The phrase is rejected when no exact fit exists, when one of its words
        has no vowel, or when its last word is listed in ``forbiddenEnds``.
        """
        finalWords = self.confirmSylsCounts(Nsyls)
        if not finalWords or self.words_no_vowels(finalWords) \
        or any(finalWords[-1] == s for s in self.forbiddenEnds):
            return list()
        print("%s %s %s" % (Nsyls, "syls found... final text: ", finalWords))
        return finalWords

    def confirmSylsCounts(self, Nsyls):
        """Return the shortest leading run of words with exactly ``Nsyls`` syllables, else [].

        A word's count is accepted when the hyphenator agrees with either
        bound from ``count_syllables``, or when both bounds agree with each
        other. Otherwise the tweet is given up on.
        """
        sylsCount = 0
        used = 0
        for word in self.textWords:
            if sylsCount >= Nsyls:
                break
            if len(word) >= 100:  # hyphenator will break and something is crazy
                return list()
            libreSyls = max(len(self.h_en.syllables(word)), 1)
            minSyls, maxSyls = self.count_syllables(word)
            if libreSyls == minSyls or libreSyls == maxSyls:
                sylsCount += libreSyls
            elif minSyls == maxSyls:
                sylsCount += maxSyls
            else:  # this tweet is too hard
                return list()
            used += 1
        if sylsCount == Nsyls:
            return self.textWords[:used]
        return list()

    def count_syllables(self, word):
        """Estimate the syllables in ``word`` from its vowel groups.

        Returns a ``(minsyl, maxsyl)`` pair; an empty word gives ``(0, 0)``.
        """
        if not word:
            return 0, 0
        vowels = ['a', 'e', 'i', 'o', 'u']

        on_vowel = False
        in_diphthong = False
        minsyl = 0
        maxsyl = 0
        lastchar = None

        word = word.lower()
        for c in word:
            is_vowel = c in vowels

            # y is a special case: it counts as a vowel only when the
            # previous character was not one.
            if c == 'y':
                is_vowel = not on_vowel

            if is_vowel:
                if not on_vowel:
                    # We weren't on a vowel before.
                    # Seeing a new vowel bumps the syllable count.
                    minsyl += 1
                    maxsyl += 1
                elif not in_diphthong and c != lastchar:
                    # We were already in a vowel.
                    # Don't increment anything except the max count.
                    # NOTE(review): in_diphthong is never reset, so this
                    # happens at most once per word -- confirm that is the
                    # intent.
                    in_diphthong = True
                    maxsyl += 1

            on_vowel = is_vowel
            lastchar = c

        # Some special cases:
        if word[-1] == 'e':
            minsyl -= 1
        # A final y that was not counted as a vowel (it followed a vowel)
        # may still be its own syllable.
        if word[-1] == 'y' and not on_vowel:
            maxsyl += 1

        return minsyl, maxsyl

示例#49

0

显示文件

文件： heuristic.py 项目： gs0510/GimmeHaiku-bot

def count_syllables(word):
    """Return how many syllables the en_US hyphenator finds in ``word``, never less than one."""
    syllables = Hyphenator('en_US').syllables(word)
    # An empty result still counts as one syllable.
    return max(len(syllables), 1)

示例#50

0

显示文件

def split_lyrics_to_syllables(selected_song, user_lyrics):
    """
    Split the user's lyrics into exactly as many pieces as the song's modifiable region holds.

    The lyrics text in original music scores is split into syllables, and each syllable is paired with one or more keys/beats in the song.
    For example, in the "Happy Birthday" song, the word "happy" is split into "hap" and "py", and each syllable corresponds to one beat.
    The user's lyrics therefore have to be split into the same number of pieces as the modifiable region of the score.

    The fitting strategy is:
    1. Split every word into syllables with a Hyphenator. If there are no more syllables than slots, use them and pad with "" up to the slot count.
    2. Otherwise fall back to whole words, padding with "" if there are too few.
    3. If there are still too many words, repeatedly merge the first two words into one until the count matches.

    Lyrics that are empty or contain only whitespace produce a list of empty strings (previously this raised ZeroDivisionError).

    Arguments:
    selected_song - A mapping describing the selected song. The keys "startEditPos" and "endEditPos" (inclusive bounds of the modifiable region)
                    and "language" are read.
    user_lyrics   - A string with the lyrics text that will replace the original lyrics in the modifiable portion of the score

    Exceptions raised:
    ValueError - Raised when the song language is neither "en_US" nor "es"
    RuntimeError - Raised when the split user lyrics cannot fit into the song modifiable region

    Return:
    split_user_lyrics - A list of strings whose length equals the length of the modifiable region in the music score; each string
                        replaces one syllable in that region
    """
    # retrieve the position of the modifiable lyrics region and the song language
    start_edit_pos = selected_song["startEditPos"]
    end_edit_pos = selected_song["endEditPos"]
    song_language = selected_song["language"]

    # total number of syllables that can be modified in the music score file
    xml_edit_num = end_edit_pos - start_edit_pos + 1

    # create the Hyphenator for the song language
    if song_language not in ("en_US", "es"):
        raise ValueError(
            "Song language not supported, currently only support English and Spanish."
        )
    h = Hyphenator(song_language)

    user_lyrics_words = user_lyrics.split()

    # split each word into its syllables; an empty result from the
    # hyphenator means the word is kept whole
    user_lyrics_syllables = []
    for word in user_lyrics_words:
        user_lyrics_syllables.extend(h.syllables(word) or [word])

    # Lengths are compared directly rather than through a ratio, so empty
    # lyrics cannot cause a division by zero.
    if len(user_lyrics_syllables) <= xml_edit_num:
        # the syllables fit, possibly with slots left over
        split_user_lyrics = user_lyrics_syllables
    else:
        # too many syllables: fall back to whole words and, if there are
        # still too many, merge the first two words until they fit
        split_user_lyrics = user_lyrics_words
        while (len(split_user_lyrics) > xml_edit_num
               and len(split_user_lyrics) > 1):
            split_user_lyrics[0:2] = [''.join(split_user_lyrics[0:2])]

    # fill any remaining slots with empty strings (a no-op when full)
    split_user_lyrics.extend([""] * (xml_edit_num - len(split_user_lyrics)))

    if len(split_user_lyrics) == xml_edit_num:
        return split_user_lyrics
    raise RuntimeError(
        'Fail to fit user lyrics into the song modifiable region')

示例#51

0

显示文件

文件： util.py 项目： goptavares/shakespeare-hmm

def getSentenceSyllCount(sentence):
    """Return the total syllable count of the words in ``sentence``.

    Each word is converted with ``unicode`` and split by the en_GB
    hyphenator; every word counts for at least one syllable.
    """
    hyphenator = Hyphenator('en_GB')
    return sum(
        max(len(hyphenator.syllables(unicode(token))), 1)
        for token in sentence)