Example #1
 def __init__(self):
     latin = deepcopy(LATIN)
     exceptions = self.get_exceptions()
     latin['exceptions'] = exceptions
     latin['diphthongs'] = ["ae", "au", "oe"]  # Not: eu, ei
     latin['mute_consonants_and_f'].append('h')
     self.syllabifier = Syllabifier(latin)
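This constructor customizes the stock CLTK Latin profile before building the syllabifier: chant-specific exceptions, a reduced diphthong set, and 'h' treated as a mute consonant. A minimal standalone sketch of the same idea (assuming, as this code implies, that the LATIN config dict lives in the same module as Syllabifier and that Syllabifier accepts it positionally):

from copy import deepcopy
from cltk.stem.latin.syllabifier import LATIN, Syllabifier  # LATIN import assumed

latin = deepcopy(LATIN)
latin['diphthongs'] = ["ae", "au", "oe"]   # drop eu, ei
latin['mute_consonants_and_f'].append('h')
syllabifier = Syllabifier(latin)
print(syllabifier.syllabify('caelum'))     # 'ae' kept as a single nucleus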
Example #2
 def test_latin_syllabifier(self):
     """Test Latin syllabifier."""
     word = 'sidere'
     syllabifier = Syllabifier()
     syllables = syllabifier.syllabify(word)
     target = ['si', 'de', 're']
     self.assertEqual(syllables, target)
Example #3
    def third_declension(self):

        from cltk.stem.latin.syllabifier import Syllabifier

        syllabifier = Syllabifier()

        vowels = ['a', 'e', 'i', 'o', 'u', 'ā', 'ē', 'ī', 'ō', 'ū']

        base = DeclineNoun.id_declension(self)[1]

        # Case order: nom, gen, dat, acc, abl, voc (singular, then plural).
        forms = [self.nom, self.gen]

        # The fourth ending is a placeholder: the vocative singular slot
        # is overwritten with the nominative just below.
        endings = [
            'ī', 'em', 'e', '', 'ēs', 'um', 'ibus', 'ēs', 'ibus', 'ēs'
        ]

        for ending in endings:
            forms.append(base + ending)

        forms[5] = self.nom  # vocative singular = nominative

        # i-stem detection: parisyllabic nouns in -is/-es, or nouns in
        # -x/-s whose stem ends in two consonants.
        nom_syllable = len(syllabifier.syllabify(self.nom))
        gen_syllable = len(syllabifier.syllabify(self.gen))

        i_stem = False

        if nom_syllable == gen_syllable:
            if self.nom[-2:] in ['is', 'es']:
                i_stem = True
        elif self.nom[-1] in ['x', 's']:
            if base[-1] not in vowels and base[-2] not in vowels:
                i_stem = True

        if i_stem:
            # genitive plural -um -> -ium
            forms[7] = forms[7][:-2] + 'i' + forms[7][-2:]

        if self.gender == 'neutrum':
            # Neuters: accusative and vocative match the nominative, and
            # the nom/acc/voc plural ends in -a; pure i-stems (nominatives
            # in -e, -al, -ar) instead take abl. sg. -ī, gen. pl. -ium,
            # and plural -ia.
            forms[5] = self.nom
            forms[3] = self.nom
            forms[6] = base + 'a'
            forms[9] = base + 'a'
            forms[11] = base + 'a'
            if self.nom[-1] == 'e' or self.nom[-2:] in ['al', 'ar']:
                forms[4] = base + 'ī'
                forms[6] = base + 'ia'
                forms[7] = base + 'ium'
                forms[9] = base + 'ia'
                forms[11] = base + 'ia'
        return forms
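The i-stem test above encodes the two textbook rules: parisyllabic nominatives in -is/-es, and nominatives in -x/-s whose stem ends in two consonants. Isolated as a hedged sketch (the helper name and the syllabify parameter are hypothetical stand-ins for Syllabifier().syllabify):

def looks_like_i_stem(nom, gen, base, syllabify):
    # Rule 1: equal syllable counts in nom./gen. singular, ending -is/-es.
    if len(syllabify(nom)) == len(syllabify(gen)):
        return nom[-2:] in ('is', 'es')
    # Rule 2: nominative in -x/-s with a stem ending in two consonants.
    vowels = 'aeiouāēīōū'
    if nom[-1] in ('x', 's'):
        return base[-1] not in vowels and base[-2] not in vowels
    return False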
Example #4
File: test_stem.py Project: vierth/cltk
 def test_latin_syllabifier(self):
     """Test Latin syllabifier."""
     word = 'sidere'
     syllabifier = Syllabifier()
     syllables = syllabifier.syllabify(word)
     target = ['si', 'de', 're']
     self.assertEqual(syllables, target)
     # tests for macronized words
     macronized_word = 'audītū'
     macronized_syllables = syllabifier.syllabify(macronized_word)
     macronized_target = ['au', 'dī', 'tū']
     self.assertEqual(macronized_syllables, macronized_target)
     macronized_word2 = 'conjiciō'
     macronized_syllables2 = syllabifier.syllabify(macronized_word2)
     macronized_target2 = ['con', 'ji', 'ci', 'ō']
     self.assertEqual(macronized_syllables2, macronized_target2)
     macronized_word3 = 'ā'
     macronized_syllables3 = syllabifier.syllabify(macronized_word3)
     macronized_target3 = ['ā']
     self.assertEqual(macronized_syllables3, macronized_target3)
Example #5
from cltk.stem.latin.syllabifier import Syllabifier
from cltk.tokenize.word import WordTokenizer

cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
word_tokenizer = WordTokenizer('latin')
cato_word_tokens = word_tokenizer.tokenize(cato_agri_praef)
cato_word_tokens_no_punct = [
    token for token in cato_word_tokens if token not in ['.', ',', ':', ';']
]

# print(cato_word_tokens_no_punct)

syllabifier = Syllabifier()

# for word in cato_word_tokens_no_punct:
#     syllables = syllabifier.syllabify(word)
#     print(word, syllables)

############################################################

# Use the macronizer
from cltk.prosody.latin.macronizer import Macronizer

macronizer = Macronizer('tag_ngram_123_backoff')

text = 'Quo usque tandem, O Catilina, abutere nostra patientia?'

prose_text = macronizer.macronize_text(text)
print(prose_text)
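The two halves of this script compose naturally: the macronized string can be fed back through the tokenizer and syllabifier so syllable boundaries are computed on vowel-length-marked forms. A minimal sketch reusing word_tokenizer, syllabifier, and prose_text from above:

macronized_tokens = word_tokenizer.tokenize(prose_text)
for token in macronized_tokens:
    if token not in ['.', ',', ':', ';', '?']:
        print(token, syllabifier.syllabify(token))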
Example #6
class ChantSyllabifier(metaclass=Singleton):
    def __init__(self):
        latin = deepcopy(LATIN)
        exceptions = self.get_exceptions()
        latin['exceptions'] = exceptions
        latin['diphthongs'] = ["ae", "au", "oe"]  # Not: eu, ei
        latin['mute_consonants_and_f'].append('h')
        self.syllabifier = Syllabifier(latin)

    def get_exceptions(self):
        # See notebook "Identify syllabification errors" for background:
        # We checked the most frequent under/oversegmentations, and
        # manually corrected those

        undersegmented = {
            'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],  # 29.43%
            'quia': ['qui', 'a'],  # 13.91%
            'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],  # 6.65%
            'israel': ['is', 'ra', 'el'],  # 2.64%
            'cui': ['cu', 'i'],  # 1.74%
            'michael': ['mic', 'ha', 'el'],  # 0.84%
            #'qui': ['qui'],                                   # 0.50%
            'requiem': ['re', 'qui', 'em'],  # 0.41%
            'huic': ['hu', 'ic'],  # 0.41%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],           # 0.38%
            # 'alleluia': ['al', 'le', 'lu', 'ia'],             # 0.27%
            #'noe': ['noe'],                                   # 0.22%
            'requiescet': ['re', 'qui', 'es', 'cet'],  # 0.21%
            'exiit': ['ex', 'i', 'it'],  # 0.17%
            'exierunt': ['ex', 'i', 'e', 'runt'],  # 0.13%
            'eloquium': ['e', 'lo', 'qui', 'um'],  # 0.12%
            'exiet': ['ex', 'i', 'et'],  # 0.12%
            # 'gelboe': ['gel', 'boe'],                         # 0.11%
            'ierit': ['i', 'e', 'rit'],  # 0.10%
            'christi': ['chris', 'ti'],  # 0.10%
            'saul': ['sa', 'ul'],  # 0.09%
            'colloquiis': ['col', 'lo', 'qui', 'is'],  # 0.09%
            'israelita': ['is', 'ra', 'e', 'li', 'ta'],  # 0.09%
            'michaele': ['mic', 'ha', 'e', 'le'],  # 0.08%
            'requiescit': ['re', 'qui', 'es', 'cit'],  # 0.08%
            'obsequia': ['ob', 'se', 'qui', 'a'],  # 0.07%
            # 'jesus': ['je', 'sus'],                           # 0.07%
            'nicolaum': ['ni', 'co', 'laum'],  # 0.06%
            'requies': ['re', 'qui', 'es'],  # 0.06%
            'requiescunt': ['re', 'qui', 'es', 'cunt'],  # 0.06%
            'exierit': ['ex', 'i', 'e', 'rit'],  # 0.06%
            'michaelis': ['mic', 'ha', 'e', 'lis'],  # 0.05%
            'requiescent': ['re', 'qui', 'es', 'cent'],  # 0.05%
        }

        # Recurring issues are "guen" and "quu"
        oversegmented = {
            'sanguine': ['san', 'gui', 'ne'],  # 1.45%
            'sanguinem': ['san', 'gui', 'nem'],  # 1.43%
            'lingua': ['lin', 'gua'],  # 1.11%
            'alleluya': ['al', 'le', 'lu', 'ya'],  # 0.88%
            'sanguis': ['san', 'guis'],  # 0.83%
            'est*': ['est*'],  # 0.64%
            #'eleison': ['e', 'le', 'i', 'son'],               # 0.59%
            'linguis': ['lin', 'guis'],  # 0.59%
            'linguae': ['lin', 'guae'],  # 0.47%
            'sequuntur': ['se', 'quun', 'tur'],  # 0.42%
            'sanguinis': ['san', 'gui', 'nis'],  # 0.40%
            #'euge': ['e', 'u', 'ge'],                         # 0.29%
            'eleemosynam': ['e', 'lee', 'mo', 'sy', 'nam'],  # 0.27%
            'iniquum': ['in', 'i', 'quum'],  # 0.23%
            'sunt*': ['sunt*'],  # 0.23%
            'unguenti': ['un', 'guen', 'ti'],  # 0.21%
            'persequuntur': ['per', 'se', 'quun', 'tur'],  # 0.20%
            'unguentum': ['un', 'guen', 'tum'],  # 0.20%
            'unguentorum': ['un', 'guen', 'to', 'rum'],  # 0.16%
            'urbs': ['urbs'],  # 0.16%
            'equuleo': ['e', 'quu', 'le', 'o'],  # 0.15%
            #'perpetuum': ['per', 'pe', 'tu', 'um'],           # 0.14%
            #'antiquus': ['an', 'ti', 'qu', 'us'],             # 0.14%
            'sanguinibus': ['san', 'gui', 'ni', 'bus'],  # 0.13%
            'eleemosyna': ['e', 'lee', 'mo', 'sy', 'na'],  # 0.13%
            'linguam': ['lin', 'guam'],  # 0.13%
            'stirps': ['stirps'],  # 0.11%
            #'ait': ['a', 'it'],                               # 0.11%
            'languores': ['lan', 'guo', 'res'],  # 0.11%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],           # 0.10%
            'loquuntur': ['lo', 'quun', 'tur'],  # 0.09%
            # 'tuum': ['tu', 'um'],                             # 0.09%
            # 'ideoque': ['i', 'de', 'o', 'que'],               # 0.09%
            'annuntiaverunt*': ['an', 'nun', 'ti', 'a', 've',
                                'runt*'],  # 0.09%
            'linguarum': ['lin', 'gua', 'rum'],  # 0.09%
            'in*': ['in*'],  # 0.09%
            'unguento': ['un', 'guen', 'to'],  # 0.09%
            'urguentes': ['ur', 'guen', 'tes'],  # 0.09%
            'langueo': ['lan', 'gue', 'o'],  # 0.08%
            'sanguinum': ['san', 'gui', 'num'],  # 0.08%
            'ihesum': ['ihe', 'sum'],  # 0.08%
            'languoribus': ['lan', 'guo', 'ri', 'bus'],  # 0.07%
            'probaverunt': ['pro', 'ba', 've', 'runt'],  # 0.07%
            'faciam': ['fa', 'ci', 'am'],  # 0.07%
            #'equum': ['e', 'qu', 'um'],                       # 0.07%
            #'jerusalem*': ['je', 'ru', 'sa', 'lem*'],         # 0.07%
            'moyses': ['moy', 'ses'],  # 0.07%
            'pinguedine': ['pin', 'gue', 'di', 'ne'],  # 0.07%
            'linguas': ['lin', 'guas'],  # 0.06%
            #'erue': ['e', 'ru', 'e'],                         # 0.06%
            'galaaditim': ['ga', 'laa', 'di', 'tim'],  # 0.06%
            'languentium': ['lan', 'guen', 'ti', 'um'],  # 0.05%
            'mansuetudinem': ['man', 'sue', 'tu', 'di', 'nem'],  # 0.05%
            #'iniquus': ['in', 'i', 'quus'],               # 0.05%
            #'filiis': ['fi', 'li', 'is'],                     # 0.05%
            'gloria*': ['glo', 'ri', 'a*'],  # 0.05%
            'leyson': ['ley', 'son'],  # 0.05%
            'moysi': ['moy', 'si'],  # 0.05%
            #'suavitatis': ['su', 'a', 'vi', 'ta', 'tis'],     # 0.05%
            'accipite': ['ac', 'ci', 'pi', 'te'],  # 0.05%
            'exsurgens*': ['ex', 'sur', 'gens*'],  # 0.05%
        }

        js_cantus_exceptions = {
            # Exceptions from the alignment algorithm used on the
            # Cantus website
            #'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],
            #'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],
            #'alleluya': ['al', 'le', 'lu', 'ya'],
            'hierusalem': ['hie', 'ru', 'sa', 'lem'],
            'hiesum': ['hie', 'sum'],
            'kyrieleison': ['ky', 'ri', 'e', 'lei', 'son'],
            'xpisteleison': ['xpi', 'ste', 'lei', 'son'],
            'eleison': ['e', 'lei', 'son'],
        }

        exceptions = dict(LATIN['exceptions'], **undersegmented,
                          **oversegmented, **js_cantus_exceptions)
        return exceptions

    def syllabify(self, text):
        """
        Syllabify the text, lowercased first.

        Lowercased because the CLTK syllabifier does not handle
        uppercase input well.
        """
        return self.syllabifier.syllabify(text.lower())
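Usage is then a thin wrapper around CLTK. A hedged sketch (the Singleton metaclass comes from elsewhere in this project and is assumed to be importable):

chant_syllabifier = ChantSyllabifier()
print(chant_syllabifier.syllabify('Sanguine'))  # ['san', 'gui', 'ne'], via the override
print(chant_syllabifier.syllabify('euouae'))    # ['e', 'u', 'o', 'u', 'a', 'e']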
Example #7
File: chant.py Project: eyequeue/chant
def _recalculate():

    from cltk.stem.latin.syllabifier import Syllabifier
    syllabifier = Syllabifier()

    corpus = lmloCorpus()

    # populate chant data frame

    translate_subcorpus = {
        'Feast': 'LF',
        'Saint': 'LS',
        'Humbert': 'H',
        'Humbert Sanct.': 'HS',
        'Humbert Temp.': 'HT',
    }

    _data = defaultdict(list)
    for i, c in enumerate(corpus.chants):
        _data['chantID'].append(i)
        _data['corpus'].append('L')
        _subcorpus = c.office.split(']')[0][1:]
        _data['subcorpus'].append(translate_subcorpus[_subcorpus])
        _data['Modus'].append(c.mode)
        _data['modus'].append(c.mode.lower())
        if c.mode[0] in ['1','2']:
            _data['maneria'].append('protus')
        elif c.mode[0] in ['3','4']:
            _data['maneria'].append('deuterus')
        elif c.mode[0] in ['5','6']:
            _data['maneria'].append('tritus')
        elif c.mode[0] in ['7','8']:
            _data['maneria'].append('tetrardus')
        else:
            _data['maneria'].append('unknown')
        if c.mode[1] == c.mode[1].upper():
            _data['ambitus'].append('excessive')
        elif c.mode[0] in ['1','3','5','7']:
            _data['ambitus'].append('authentic')
        elif c.mode[0] in ['2','4','6','8']:
            _data['ambitus'].append('plagal')
        else:
            _data['ambitus'].append('unknown')


        _data['office'].append(' '.join(c.office.split()[1:]))

        # The lmlo module's Service/service and Genre/genre names are
        # swapped here for consistency with Modus/modus: the capitalized
        # field is the more granular grouping.

        _data['Service'].append(c.service)
        _data['service'].append(c.Service)
        _data['ordinal'].append(c.index)
        _data['Genre'].append(c.genre)
        _data['genre'].append(c.Genre)
        _data['text'].append(c.fulltext)
        # _data['lmloHeader'].append(c.header)
        # _data['lmloEncoding'].append(c.lmloEncoding)
        _data['volpiano'].append(c.volpiano)

    chants = pd.DataFrame(_data)
    chants.to_pickle('chantData.zip', protocol=4)

    # populate note data frame

    # first some utils we'll use in the loop below

    def pindex(sd):
        # Map a scale-degree string (register digit, degree digit) to a
        # single index on a seven-step scale, so intervals are subtractions.
        return int(sd[0]) * 7 + int(sd[1])

    def intclass(interval):
        # Classify an interval by absolute size: repetition, step,
        # slip (two steps), or anything larger as a leap.
        interval = abs(interval)
        if interval == 0:
            return 'rep'
        elif interval == 1:
            return 'step'
        elif interval == 2:
            return 'slip'
        else:
            return 'leap'


    _data = defaultdict(list)
    for i_c, c in enumerate(corpus.chants):
        i = 1
        for i_w, w in enumerate(c.words):
            for i_s, s in enumerate(w.syllables):
                for i_n, n in enumerate(s.notes):

                    # identify note's location in the corpus

                    _data['chantID'].append(i_c)
                    _data['word'].append(i_w)
                    _data['syll'].append(i_s)
                    _data['note'].append(i_n)


                    # identify syllable and word boundaries at each edge:
                    # 0 = internal, 1 = syllable boundary, 2 = word boundary
                    # (a word boundary doubles the syllable flag to 2)

                    initial = 0
                    if i_n == 0:
                        initial += 1
                    if i_s == 0:
                        initial *= 2
                    final = 0
                    if i_n == len(s.notes) - 1:
                        final += 1
                    if i_s == len(w.syllables) - 1:
                        final *= 2
                    _data['boundary_before'].append(initial)
                    _data['boundary_after'].append(final)

                    # extract pitch and register features

                    _data['reg_abs'].append(n.letter[0])
                    _data['pc_abs'].append(n.letter[1])
                    _data['pitch_abs'].append( n.letter[0] + '.' + n.letter[1])
                    _data['reg_rel'].append(n.sd[0])
                    _data['pc_rel'].append(n.sd[1])
                    _data['pitch_rel'].append( n.sd[0] + '.' + n.sd[1])
                    

                    # calculate intervallic context
                    # (99 is a sentinel at the chant's edges)

                    if i == 1:
                        _data['lint'].append(99)
                        _data['lint_class'].append('edge')
                        _data['lint_dir'].append('edge')
                    else:
                        interval = int(pindex(c.flatSD[i]) - pindex(c.flatSD[i-1]))
                        _data['lint'].append(interval)
                        _data['lint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['lint_dir'].append('up')
                        elif interval < 0:
                            _data['lint_dir'].append('down')
                        else:
                            _data['lint_dir'].append('rep')

                    if i == len(c.flatSD)-2:
                        _data['rint_class'].append('edge')
                        _data['rint_dir'].append('edge')
                        _data['rint'].append(99)
                    else:
                        interval = int(pindex(c.flatSD[i+1]) - pindex(c.flatSD[i]))
                        _data['rint'].append(interval)
                        _data['rint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['rint_dir'].append('up')
                        elif interval < 0:
                            _data['rint_dir'].append('down')
                        else:
                            _data['rint_dir'].append('rep')
                        

                    i += 1

    notes = pd.DataFrame(_data)
    modekey = chants.merge(notes).query("word == 0 and syll == 0 and note == 0").set_index('chantID').modus.to_frame()
    notes = notes.join(modekey.modus, on='chantID', how='inner')
    notes.to_pickle('noteData.zip', protocol=4)
    
    syllables = defaultdict(list)

    # Manual syllabification overrides for words CLTK gets wrong.
    override = {
        'eius': ['e', 'ius'],
        'dei': ['de', 'i'],
        'deus': ['de', 'us'],
        'quia': ['qui', 'a'],
        'christi': ['chris', 'ti'],
        'christe': ['chris', 'te'],
        'eum': ['e', 'um'],
        'deum': ['de', 'um'],
        'meum': ['me', 'um'],
        'meus': ['me', 'us'],
        'christo': ['chris', 'to'],
        'christus': ['chris', 'tus'],
        'christum': ['chris', 'tum'],
        'mei': ['me', 'i'],
        'ei': ['e', 'i'],
        'cui': ['cu', 'i'],
        'israel': ['is', 'ra', 'el'],
        'sanguine': ['san', 'gui', 'ne'],
        'meis': ['me', 'is'],
        'eis': ['e', 'is'],
        'fidei': ['fi', 'de', 'i'],
        'sanguinem': ['san', 'gui', 'nem'],
        'lingua': ['lin', 'gua'],
        'thronum': ['thro', 'num'],
        'pulchra': ['pul', 'chra'],
        'oleum': ['o', 'le', 'um'],
        'adiutor': ['ad', 'iu', 'tor'],
        'sanguis': ['san', 'guis'],
        'sanguinis': ['san', 'gui', 'nis'],
        'huic': ['hu', 'ic'],
        'alleluia': ['al', 'le', 'lu', 'ia'],
        'michael': ['mi', 'cha', 'el'],
        'noe': ['no', 'e'],
    }
    
    
    for i, c in chants.iterrows():
        if c.modus not in basicModes + ['6c']:
            continue
        words = c.text.lower().split()
        vwords = c.volpiano[4:-3].split('--')
        if len(words) != len(vwords):
            # Word counts disagree: assume the last volpiano word was
            # split one token too many and rejoin the final pair.
            vwords[-2] = vwords[-2] + '-' + vwords[-1]
            vwords.pop(-1)
        for j in range(len(words)):
            if words[j] in override:
                sylls = override[words[j]]
            else:
                sylls = syllabifier.syllabify(words[j].lower())
            vsylls = vwords[j].split('-')
            if len(sylls) != len(vsylls):
                # Syllable counts disagree: fall back to the whole word,
                # bracketed, repeated once per volpiano syllable.
                sylls = [f'[{words[j]}]'] * len(vsylls)
            for k in range(len(vsylls)):
                syllables['chantID'].append(c.chantID)
                syllables['syllable'].append(sylls[k])
                syllables['last_syll'].append(k + 1 == len(vsylls))
                v = vsylls[k]
                syllables['n_notes'].append(len(v))
                syllables['volpiano'].append(v)
                notes = ''
                for vchar in v:
                    notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
                syllables['notes'].append(notes)
                syllables['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])
                syllables['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
                syllables['t_type'].append(v2r(v))
                syllables['e_type'].append(v2r(v[0] + v[-1]))
                syllables['c_type'].append(v2c(v))


    syllables = pd.DataFrame(syllables)
    syllables = syllables.join(modekey.modus, on='chantID', how='inner')
    syllables['extrema'] = syllables['pitch_initial'] + '-' + syllables['pitch_final']
    syllables.to_pickle('syllableData.zip', protocol=4)


    ngrams = defaultdict(list)
    
    
    for i, c in chants.iterrows():
        v = c.volpiano.replace('-', '')

        # V = v with consecutive duplicate pitches removed
        V = v[0]
        for k in range(1, len(v)):
            if v[k] != V[-1]:
                V += v[k]

        for n in range(1, n_limit + 1):
            for k in range(1, len(V) - n):
                v = V[k:k + n]
                ngrams['chantID'].append(c.chantID)
                ngrams['pos'].append(k)
                ngrams['n_notes'].append(len(v))
                ngrams['volpiano'].append(v)
                notes = ''
                for vchar in v:
                    notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
                ngrams['notes'].append(notes)
                ngrams['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])            
                ngrams['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
                ngrams['t_type'].append(v2r(v))
                ngrams['e_type'].append(v2r(v[0]+v[-1]))
                ngrams['c_type'].append(v2c(v))
    
    ngrams = pd.DataFrame(ngrams)
    ngrams = ngrams.join(modekey.modus, on='chantID', how='inner')
    
    ngrams['extrema'] = ngrams['pitch_initial'] + '-' + ngrams['pitch_final']
    
    print('making pickles')
    ngrams.to_pickle('ngramData.zip', protocol=4)
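The consecutive-duplicate removal above is the one non-obvious step in the n-gram pass. As a standalone sketch with a usage check (the function name is hypothetical):

def dedup_consecutive(pitches):
    # Collapse runs of repeated characters: 'ffgga' -> 'fga'.
    out = pitches[0]
    for ch in pitches[1:]:
        if ch != out[-1]:
            out += ch
    return out

assert dedup_consecutive('ffgga') == 'fga'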