Example #1
def save_intervalframe_to_textgrid(framedict, filepath, encoding='utf-8'):
    """Write a dict of IntervalFrames in a textgrid-File.

       Arguments:
       framedict    --  Dictionary of dataframes. The keys become tier
                        names in the textgrid file
       filepath     --  Path + filename of the file to be written.

       Keyword arguments:
       encoding: character encoding to save textgrid file

    """

    if len(framedict) < 1:
        print "invalid data!"
        return
    mytextgrid = tgt.TextGrid()
    for tier_name in framedict.keys():
        newtier = framedict[tier_name]
        if len(newtier.columns) == 3:
            mytier = tgt.IntervalTier(name=tier_name)
            for row in newtier.index:
                myinterval = tgt.Interval(newtier[newtier.columns[0]][row],
                                          newtier[newtier.columns[1]][row],
                                          newtier[newtier.columns[2]][row])
                mytier.add_interval(myinterval)
        elif len(newtier.columns) == 2:
            mytier = tgt.PointTier(name=tier_name)
            for row in newtier.index:
                mypoint = tgt.Point(newtier[newtier.columns[0]][row],
                                    newtier[newtier.columns[1]][row])
                mytier.add_point(mypoint)
        mytextgrid.add_tier(mytier)
    tgt.write_to_file(mytextgrid, filepath, encoding=encoding, format="long")
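A minimal usage sketch for the function above (the DataFrame contents, column names and output path are illustrative assumptions): a three-column frame becomes an interval tier with (start, end, text) rows, a two-column frame becomes a point tier with (time, text) rows.

import pandas as pd

words = pd.DataFrame({'start': [0.0, 0.5], 'end': [0.5, 1.2],
                      'text': ['hello', 'world']})
peaks = pd.DataFrame({'time': [0.3, 0.9], 'label': ['H*', 'L%']})

# Keys of the dict become tier names; column order, not column names, is what matters.
save_intervalframe_to_textgrid({'words': words, 'peaks': peaks},
                               'example.TextGrid')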
Example #2
def main():

    # Parse the command-line arguments.
    args = parse_arguments()
    tg_path = args['tg_path']
    offset_start = args['offset_start']
    offset_end = args['offset_end']
    outpath = args['outpath']

    # Read the TextGrid
    tg = tgt.read_textgrid(tg_path)
    tg_part = tgt.TextGrid()

    if offset_start is None and offset_end is None:
        raise Exception(
            'At least one of offset_start and offset_end must be specified.')
    elif offset_start is None:
        offset_start = tg.start_time
    elif offset_end is None:
        offset_end = tg.end_time

    for tr in tg:
        intr_part = tr.get_annotations_between_timepoints(
            offset_start, offset_end)
        tier_part = tgt.IntervalTier(name=tr.name,
                                     start_time=tr.start_time,
                                     end_time=tr.end_time,
                                     objects=intr_part)
        tg_part.add_tier(tier_part)

    if outpath is None:
        tg_dirname, tg_filename = os.path.split(tg_path)
        outpath = os.path.splitext(tg_filename)[0] + '_part.TextGrid'

    tgt.write_to_file(tg_part, outpath)
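parse_arguments() is not shown in this example; a minimal sketch of what it might look like, assuming an argparse interface that returns a plain dict with the keys used in main().

import argparse

def parse_arguments():
    ap = argparse.ArgumentParser(
        description='Extract a part of a TextGrid between two offsets.')
    ap.add_argument('tg_path', help='path to the input TextGrid')
    ap.add_argument('--offset-start', type=float,
                    help='start offset in seconds')
    ap.add_argument('--offset-end', type=float,
                    help='end offset in seconds')
    ap.add_argument('--outpath', help='path of the output TextGrid')
    return vars(ap.parse_args())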
Example #3
    def save_annotations(self, filename, tiers=['cycles', 'holds'],
                         filetype='textgrid', merge_holds=False):
        """Save annotations to file."""

        if filetype not in ['textgrid', 'eaf', 'table']:
            raise ValueError('Unsupported file type: {}'.format(filetype))

        tg = tgt.TextGrid()

        if 'holds' in tiers or merge_holds:
            # holds = tgt.IntervalTier(name='holds')
            # for start, end in self.holds:
            #     holds.add_interval(tgt.Interval(start, end, 'hold'))
            if not merge_holds:
                tg.add_tier(self.holds)

        if 'cycles' in tiers:
            if merge_holds:
                segments_merged = tgt.IntervalTier(name='cycles')  # note: created but not used below
                tg.add_tier(self.merge_holds(self.segments, self.holds))
            else:
                tg.add_tier(self.segments)

        if len(tg.tiers):
            filetype = 'short' if filetype == 'textgrid' else filetype
            tgt.write_to_file(tg, filename, format=filetype)
Example #5
def add_lemmas(title, input1_path, output_path):

    # Load textgrid
    tg = tgt.read_textgrid(os.path.join(input1_path, title + '.TextGrid'))
    tier_names = tg.get_tier_names()

    # Load pos tier
    pos_tier_name = [name for name in tier_names if 'pos' in name][0]
    pos_tier = tg.get_tier_by_name(pos_tier_name)

    # Load words tier
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Start empty lemmas tier
    lemmas_tier = tgt.IntervalTier()
    lemmas_tier_name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'lemmas')
    lemmas_tier.name = lemmas_tier_name

    # Generate lemma intervals
    lemmas_intervals = [
        tgt.Interval(w_interval.start_time, w_interval.end_time,
                     lemmatize_word(w_interval.text, pos_tier[i].text))
        for i, w_interval in enumerate(words_tier)
    ]

    # Add lemmas to tier
    lemmas_tier.add_annotations(lemmas_intervals)
    tg.add_tier(lemmas_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
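lemmatize_word() is assumed above but not shown; a minimal sketch (an assumption, the original helper may differ) using NLTK's WordNetLemmatizer, which requires the NLTK WordNet data, mapping Penn Treebank POS tags onto WordNet POS classes.

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def lemmatize_word(word, pos_tag):
    # Map the first letter of a Penn Treebank tag to a WordNet POS; default to noun.
    tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB,
               'N': wordnet.NOUN, 'R': wordnet.ADV}
    wn_pos = tag_map.get(pos_tag[:1], wordnet.NOUN)
    return _lemmatizer.lemmatize(word.lower(), pos=wn_pos)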
Example #6
def main():
    args = parse_arguments()
    
    for input_path in args['input_files']:

        # Parse the input file
        annotation_root = etree.parse(input_path).getroot()
        
        # Only do the conversion if neither the --list-features nor the --list-tracks
        # command-line arguments have been passed.
        if not (args['list_features'] or args['list_tracks']):

            # Unless an output_dir is specified, save output files to the directory of the input file
            input_dir, input_filename = os.path.split(input_path)
            if input_filename[-4:] == '.xml':
                output_filename = input_filename[:-4] + '.TextGrid'
            else:
                output_filename = input_filename + '.TextGrid'
            if args['output_dir'] is not None:
                output_path = os.path.join(args['output_dir'][0], output_filename)
            else:
                output_path = os.path.join(input_dir, output_filename)

            textgrid = higgins2tg(annotation_root, args['features_included'], args['features_excluded'],
                               args['tracks_included'], args['tracks_excluded'])
            tgt.write_to_file(textgrid, output_path, encoding=args['output_encoding'], format=args['textgrid_type'])            

        elif args['list_features']:
            features_all = get_all_features(annotation_root)
            sys.stdout.write('%s -- Features: ' % input_filename + ', '.join(features_all) + '\n')

        else:
            tracks = get_tracks(annotation_root)
            sys.stdout.write('%s -- Tracks: ' % input_filename + ', '.join(tracks) + '\n')
Example #7
def add_onsets_rhymes(title, input_path, output_path):

    # Load the textgrid
    tg = tgt.read_textgrid(os.path.join(input_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'syllables'
    sylls_tier_name = [name for name in tier_names if 'sylls' in name][0]
    sylls_tier = tg.get_tier_by_name(sylls_tier_name)

    # Select a tier whose name contains 'phones'
    phones_tier_name = [name for name in tier_names if 'phones' in name][0]
    phones_tier = tg.get_tier_by_name(phones_tier_name)

    # Start an empty tier for onset-rhymes
    onset_rhyme_tier = tgt.IntervalTier()
    onset_rhyme_tier_name = [name for name in tier_names
                             if 'words' in name][0].replace('words', 'OR')
    onset_rhyme_tier.name = onset_rhyme_tier_name

    onset_rhyme_intervals = []

    for syll in sylls_tier._get_annotations():

        #print(syll)
        phs = phones_tier.get_annotations_between_timepoints(
            syll.start_time, syll.end_time)

        nucleus_index = calculate_nucleus_index(phs)

        # If the first phone contains a number then it means the whole syll has no onset, so we only add a rhyme
        if nucleus_index == 0:
            onset_rhyme_intervals.append(
                tgt.Interval(syll.start_time, syll.end_time, 'R'))

        # If the onset is present add onset and rhyme intervals
        else:
            onset_rhyme_intervals.append(
                tgt.Interval(syll.start_time, phs[nucleus_index - 1].end_time,
                             'O'))

            onset_rhyme_intervals.append(
                tgt.Interval(phs[nucleus_index].start_time, syll.end_time,
                             'R'))

    # Add all the intervals to the onset rhyme tier
    onset_rhyme_tier.add_annotations(onset_rhyme_intervals)

    # Add the onset rhyme tier to the TextGrid
    tg.add_tier(onset_rhyme_tier)

    # Move syll tier after the onset_rhyme_tier
    tg.delete_tier(sylls_tier_name)
    tg.add_tier(sylls_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
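calculate_nucleus_index() is not shown; a sketch under the convention implied by the comments above, namely that the nucleus is the first phone whose label carries a stress digit (as in ARPAbet-style forced alignments, e.g. 'AH0', 'IY1').

def calculate_nucleus_index(phones):
    # Return the index of the first phone containing a digit (the vowel/nucleus).
    for i, ph in enumerate(phones):
        if any(ch.isdigit() for ch in ph.text):
            return i
    # Fallback: treat the first phone as the nucleus if no stress digit is found.
    return 0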
Example #8
def main():
	ap = argparse.ArgumentParser()
	ap.add_argument(
		'shift',
		help='offset by which to shift the boundaries (positive or negative)',
		type=float)
	ap.add_argument(
		'file',
		help='the textgrid file',
		type=str)
	ap.add_argument(
		'-e', '--encoding',
		help='file encoding (default "utf-8")',
		default='utf-8',
		type=str)
	ap.add_argument(
		'-f', '--format',
		help='the output format (default "short")',
		default='short',
		type=str)
	ap.add_argument(
		'-o', '--outfile',
		help='the output file (defaults to inputfile.shifted.Extension)',
		type=str)
	arguments = ap.parse_args()

	# Read file
	try:
		tg = tgt.read_textgrid(
				filename=arguments.file,
				encoding=arguments.encoding)
	except IOError:
		print('An error occurred reading file {file}'.
				format(file=arguments.file))
		sys.exit(1)
	# Create new textgrid
	if arguments.outfile is None:
		basename, extension = os.path.splitext(arguments.file)
		output_filename = basename + '.shifted' + extension
	else:
		output_filename = arguments.outfile
	tg_shifted = tgt.TextGrid(filename=output_filename)
	# Shift boundaries
	for tier in tg:
		ts = tgt.util.shift_boundaries(tier, arguments.shift, 0)
		tg_shifted.add_tier(ts)
	# Write file
	tgt.write_to_file(
		textgrid=tg_shifted,
		filename=tg_shifted.filename,
		format=arguments.format,
		encoding=arguments.encoding)
Example #11
    def write_grids(self, output_path, columns, one_grid_per_match, sound_path,
                    left_padding, right_padding, remember_time, file_prefix):

        if "coquery_invisible_origin_id" not in self.df.columns:
            one_grid_per_match = True

        self.output_path = output_path
        grids = self.fill_grids(columns, one_grid_per_match, sound_path,
                                left_padding, right_padding, remember_time)

        textgrids = collections.defaultdict(list)

        self.n = 0

        for x in grids:
            grid = grids[x]
            for i, tier in enumerate(grid.tiers):
                try:
                    hashed, tab, feature = self.resource.split_resource_feature(
                        tier.name)
                    if hashed is not None:
                        link, res = get_by_hash(hashed)
                        label = getattr(res, "{}_{}".format(tab, feature))
                        tier.name = "{}.{}".format(res.name, label)
                    else:
                        # try to retrieve a resource label for the tier name:
                        tier.name = getattr(self.resource, tier.name)
                except (ValueError, AttributeError):
                    # Failed to retrieve the tier name. This may happen if
                    # it's not a resource feature name, but for example an
                    # entry from the Query output branch.
                    pass

            if one_grid_per_match:
                match_fn, match_id = x
                basename, _ = os.path.splitext(os.path.basename(match_fn))
                filename = "{}_id{}".format(basename, match_id)
            else:
                match_fn, = x
                basename, _ = os.path.splitext(os.path.basename(match_fn))
                filename = basename
            target = os.path.join(
                output_path, "{}{}.TextGrid".format(file_prefix, filename))
            tgt.write_to_file(grid, target)
            self.n += 1
            textgrids[basename].append((grid, filename, self._offsets[x]))

        if sound_path:
            import wave
            from .sound import extract_sound

            # FIXME:
            # there should be a resource method that matches sound file names
            # and text grid names.
            for root, _, files in os.walk(sound_path):
                for file_name in files:
                    basename, _ = os.path.splitext(file_name)
                    if basename in textgrids:
                        source = os.path.join(root, file_name)
                        for grid, grid_name, offset in textgrids[basename]:
                            target = os.path.join(
                                output_path,
                                "{}{}.wav".format(file_prefix, grid_name))
                            start = max(0, offset - left_padding)
                            end = offset - left_padding + grid.end_time

                            try:
                                extract_sound(source, target, start, end)
                            except wave.Error:
                                pass
Example #12
import tgt

grid = tgt.TextGrid(filename="test")
tier = tgt.IntervalTier(start_time=0, end_time=5, name="mot")
label = tgt.core.Interval(2, 3, "word")
tier.add_annotation(label)
grid.add_tier(tier)
tgt.write_to_file(grid, "/home/leferrae/Desktop/These/test.textgrid")
Example #13
def add_silences(title, input_path, output_path):

    tg = tgt.read_textgrid(os.path.join(input_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'phones'
    phones_tier_name = [name for name in tier_names if 'phones' in name][0]
    phones_tier = tg.get_tier_by_name(phones_tier_name)

    # Replace all sil and sp intervals with the <sil> tag.
    # Store these intervals in a list so that we can add them to the other tiers.
    sil_intervals = []
    for interval in phones_tier:
        if interval.text == 'sil' or interval.text == 'sp':
            interval.text = '<sil>'
            sil_intervals.append(interval)

    # WORDS TIER
    # Select a tier whose name contains 'words'
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Add <sil> to words tier
    words_tier.add_annotations(sil_intervals)

    # LEMMAS TIER
    # Select a tier whose name contains 'lemmas'
    lemmas_tier_name = [name for name in tier_names if 'lemmas' in name][0]
    lemmas_tier = tg.get_tier_by_name(lemmas_tier_name)

    # Add <sil> to lemmas tier
    lemmas_tier.add_annotations(sil_intervals)

    # SYLLABLES TIER
    # Select a tier whose name contains 'sylls'
    sylls_tier_name = [name for name in tier_names if 'sylls' in name][0]
    sylls_tier = tg.get_tier_by_name(sylls_tier_name)

    # Add <sil> to syllables tier
    sylls_tier.add_annotations(sil_intervals)

    # POS TIER
    # Select a tier whose name contains 'pos'
    pos_tier_name = [name for name in tier_names if 'pos' in name][0]
    pos_tier = tg.get_tier_by_name(pos_tier_name)

    # Add <sil> to pos tier
    pos_tier.add_annotations(sil_intervals)

    # OR TIER
    # Select a tier whose name contains 'OR'
    onset_rhyme_name = [name for name in tier_names if 'OR' in name][0]
    onset_rhyme_tier = tg.get_tier_by_name(onset_rhyme_name)

    # Add <sil> to OR tier
    onset_rhyme_tier.add_annotations(sil_intervals)

    # BP TIER
    # Select a tier whose name contains 'bp'
    bp_tier_name = [name for name in tier_names if 'bp' in name][0]
    bp_tier = tg.get_tier_by_name(bp_tier_name)

    # Add <sil> to bp tier
    bp_tier.add_annotations(sil_intervals)

    # FP TIER
    # Select a tier whose name contains 'fp'
    fp_tier_name = [name for name in tier_names if 'fp' in name][0]
    fp_tier = tg.get_tier_by_name(fp_tier_name)

    # Add <sil> to fp tier
    fp_tier.add_annotations(sil_intervals)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
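The seven tier-specific blocks inside add_silences repeat a single pattern; a more compact, behaviour-equivalent sketch of that part of the function (using the same local names) is:

    for key in ('words', 'lemmas', 'sylls', 'pos', 'OR', 'bp', 'fp'):
        tier_name = [name for name in tier_names if key in name][0]
        tg.get_tier_by_name(tier_name).add_annotations(sil_intervals)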
Example #14
        print(sys.exc_info()[0])
        raise

    for tier in textgrid.tiers:
        if tier.tier_type() == 'IntervalTier':
            for i, interval in enumerate(tier.intervals):
                try:
                    interval.end_time = t_out[int(interval.end_time * 1000)]
                except:
                    print('Setting interval.end_time to t_out')
                    interval.end_time = t2[-1]
                interval.start_time = t_out[int(interval.start_time * 1000)]
        else:
            for point in tier.points:
                point.time = t_out[int(point.time * 1000)]
        try:
            tier.end_time = t_out[int(tier.end_time * 1000)]
        except:
            print('Setting tier.end_time to t_out')
            tier.end_time = t2[-1]

    tgt.write_to_file(textgrid,
                      data_folder + textgrid_dest,
                      format='long',
                      encoding='utf-16')

#%% wrap up
end_time = datetime.now()
dif_time = end_time - start_time
print('Finished all in {}'.format(dif_time))

Example #15
def createTargetGrid(textGrid):
    targetInterval = findMatchInterval(textGrid, findTarget(textGrid))
    st = targetInterval.start_time
    et = targetInterval.end_time
    targetTier = tgt.IntervalTier(start_time=st,
                                  end_time=et,
                                  name="Target Word")
    targetTier.add_interval(targetInterval)
    textGrid.add_tier(targetTier)
    return textGrid


allAnnotatedGrid = [
    f for f in listdir(annotatedDir) if re.search(r'TextGrid', f)
]
allAlignedGrid = [f for f in listdir(alignedDir) if re.search(r'TextGrid', f)]

list.sort(allAnnotatedGrid)
list.sort(allAlignedGrid)

for i in range(len(allAnnotatedGrid)):
    annotatedTextGrid = tgt.read_textgrid(annotatedDir + allAnnotatedGrid[i])
    alignedTextGrid = tgt.read_textgrid(alignedDir + allAlignedGrid[i])
    outGrid = annotate(alignedTextGrid, annotatedTextGrid)
    outGrid = createTargetGrid(outGrid)
    print(allAlignedGrid[i])
    tgt.write_to_file(outGrid,
                      outDir + allAlignedGrid[i] + '_merged' + '.TextGrid',
                      format='short')
Example #16
def annotate(title, xml_path, textgrid_path, annotations_path):
    try:

        tree = ET.parse(os.path.join(xml_path, title + '.xml'))
        root = tree.getroot()

        # Both lists are filled from the XML file.
        # stress_phone_seq format: [[ph1, ph2, ...], [ph1, ph2, ...], ...]
        # stress_seq format: [[0], [2], [1], ...]
        stress_phone_seq = []
        stress_seq = []
        for p in root[0]:
            for s in p:
                for phrase in s:
                    for word in phrase:
                        # get rid of words in xml that lack a phonemic counterpart in the textGrid
                        if word.text not in ('!', ',', '-', '.', '..', '...',
                                             ':', '?'):
                            for syllable in word:
                                stress_phone_group = []
                                stress_group = []
                                stress_group.append(syllable.attrib['stress'])
                                stress_seq.append(stress_group)
                                for ph in syllable:
                                    stress_phone_group.append(ph.attrib['p'])
                                stress_phone_seq.append(stress_phone_group)

        tg = tgt.read_textgrid(os.path.join(textgrid_path,
                                            title + '.TextGrid'))
        phones_tier = tg.get_tier_by_name('phones')
        word_tier = tg.get_tier_by_name('words')

        #word_durations = [w for w in word_tier._get_annotations() if w.text != '-'] # use this instead of the next snippet if you remove '-' from the vocabulary. Atm '-' is mapped to 'min@s'
        word_durations = []
        dash_intervals = []
        for w in word_tier._get_annotations():
            if w.text == '-':
                dash_intervals.append(w)
            else:
                word_durations.append(w)
        for dash in dash_intervals:
            # Delete all phone annotations that are read out as "minus"; otherwise
            # the alignment gets messed up.
            phones_tier.delete_annotations_between_timepoints(
                dash.start_time,
                dash.end_time,
                left_overlap=False,
                right_overlap=False)

        phone_durations = [
            p for p in phones_tier._get_annotations() if p.text != 'sil'
        ]

        # Gather the phone durations into the same nested format as stress_phone_seq,
        # i.e. [[ph_dur1, ph_dur2, ...], [ph_dur1, ph_dur2, ...], ...]

        #print([j for i in stress_phone_seq for j in i])
        #print([i.text for i in phone_durations])

        l = []
        k = -1
        for i in range(0, len(stress_phone_seq)):
            m = []
            for j in range(0, len(stress_phone_seq[i])):
                k += 1
                m.append(phone_durations[k])
            l.append(m)
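        # (The loop above regroups phone_durations so that it mirrors the nested
        #  shape of stress_phone_seq; an equivalent, more compact version would be:
        #      it = iter(phone_durations)
        #      l = [[next(it) for _ in group] for group in stress_phone_seq]
        #  )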

        # Go through this nested list and keep, for every syllable, the start time
        # of its first phone and the end time of its last phone.
        syl_durations = [(syl[0].start_time, syl[-1].end_time) for syl in l]
        syllable_tier = tgt.IntervalTier()
        syllable_tier.name = 'syllables'
        syllable_tier.start_time = phones_tier.start_time
        syllable_tier.end_time = phones_tier.end_time
        syllable_intervals = [
            tgt.Interval(syl_durations[i][0], syl_durations[i][1],
                         str(stress_seq[i][0]))
            for i in range(0, len(syl_durations))
        ]
        syllable_tier.add_annotations(syllable_intervals)

        for phone in phones_tier:
            phone.text = phone.text.replace('Q', '@@').replace('ts',
                                                               't').replace(
                                                                   'sp', 'sil')

        vowels = [
            '@', '@@', 'a', 'aa', 'ai', 'au', 'e', 'e@', 'ei', 'i', 'i@', 'ii',
            'o', 'oi', 'oo', 'ou', 'u', 'u@', 'uh', 'uu'
        ]

        for phone in phones_tier:
            if phone.text in vowels:

                phone_centre = phone.start_time + (phone.end_time -
                                                   phone.start_time) / 2
                phone.text = phone.text + syllable_tier.get_annotations_by_time(
                    phone_centre)[0].text

        # For now, write the modified TextGrids to the same folder as the old ones.
        # Later, send the new files to a separate folder.
        newTitle = os.path.join(annotations_path, title + '.TextGrid')
        tgt.write_to_file(tg, newTitle, format='short')
    except Exception:
        # Skip files that fail to parse or align; errors are silently ignored.
        pass
Example #17
def add_punctuation(title, textgrid_path, txt_path, output_path):

    txt = ''
    with open(os.path.join(txt_path, title + '.txt'), 'r',
              encoding='utf-8') as f:
        for l in f:
            l = ' '.join(l.split())
            for char in l.replace('\n', ' ').replace('\t', ' ').lower():
                txt += char

    word_non_words = [detect_non_words(w) for w in txt.split()]

    # Exclude non-words such as -' , - etc.
    txt_words = [w for w in word_non_words if w != '<punct>']

    # Strip words of punctuation before and after the first/last alphanum
    txt_words = [clean_word(w, title, txt) for w in txt_words]

    tg = tgt.read_textgrid(os.path.join(textgrid_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'words'
    word_tier_name = [name for name in tier_names if 'words' in name][0]
    word_tier = tg.get_tier_by_name(word_tier_name)
    word_list = [w.text for w in word_tier._get_annotations()]

    if len(word_list) == len(txt_words):

        w_indices = []
        w_indices.append(0)
        start = 0
        for lw in txt_words:
            idx = txt.find(lw, start, len(txt))
            start = idx + len(lw)
            w_indices.append(idx)
            w_indices.append(idx + len(lw))
        w_indices.append(len(txt))

        p_indices = [[w_indices[i], w_indices[i + 1]]
                     for i in range(0,
                                    len(w_indices) - 1, 2)]
        punctuation = [txt[i[0]:i[1]].replace(' ', '') for i in p_indices]
        punctuation[0] = 'start' + punctuation[0]
        punctuation[-1] = punctuation[-1] + 'end'
        punctuation = [p if p != '' else '_' for p in punctuation]

        bp = punctuation[0:-1]
        fp = punctuation[1:]

        word_durations = []
        for w in word_tier._get_annotations():
            word_durations.append(w)

        # Build the bp and fp tiers over the word intervals: bp holds the
        # punctuation preceding each word, fp the punctuation following it.
        bp_tier = tgt.IntervalTier()
        bp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'bp')
        bp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, bp[i])
            for i in range(0, len(word_durations))
        ]
        bp_tier.add_annotations(bp_intervals)
        tg.add_tier(bp_tier)

        fp_tier = tgt.IntervalTier()
        fp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'fp')
        fp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, fp[i])
            for i in range(0, len(word_durations))
        ]
        fp_tier.add_annotations(fp_intervals)
        tg.add_tier(fp_tier)

    else:

        word_durations = []
        for w in word_tier._get_annotations():
            word_durations.append(w)

        bp = ['start'] + ['<unk>' for i in range(len(word_durations) - 1)]
        fp = ['<unk>' for i in range(len(word_durations) - 1)] + ['end']

        bp_tier = tgt.IntervalTier()
        bp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'bp')
        bp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, bp[i])
            for i in range(0, len(word_durations))
        ]
        bp_tier.add_annotations(bp_intervals)
        tg.add_tier(bp_tier)

        fp_tier = tgt.IntervalTier()
        fp_tier.name = [name for name in tier_names
                        if 'words' in name][0].replace('words', 'fp')
        fp_intervals = [
            tgt.Interval(word_durations[i].start_time,
                         word_durations[i].end_time, fp[i])
            for i in range(0, len(word_durations))
        ]
        fp_tier.add_annotations(fp_intervals)
        tg.add_tier(fp_tier)

    # For now, write the modified TextGrids to the same folder as the old ones.
    # Later, send the new files to a separate folder.
    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
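detect_non_words() and clean_word() are helpers that are not shown in this example; minimal sketches consistent with how they are used above (the title/txt parameters of clean_word are accepted but ignored here, an assumption about the real helpers).

import re

def detect_non_words(token):
    # Tokens with no alphanumeric character at all count as punctuation.
    return token if re.search(r'\w', token) else '<punct>'

def clean_word(word, title=None, txt=None):
    # Strip punctuation before the first and after the last alphanumeric character.
    match = re.search(r'\w.*\w|\w', word)
    return match.group(0) if match else word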
Example #19
# Usage: python segment_laughter.py <input_audio_file> <stored_model_path> <output_folder> <save_to_textgrid>

if __name__ == '__main__':
    if parse_inputs():
        (input_path, model_path, output_path, threshold, min_length,
         save_to_textgrid) = parse_inputs()
        min_length = seconds_to_frames(min_length)

        laughs = laugh_segmenter.segment_laughs(input_path, model_path,
                                                output_path, threshold,
                                                min_length, save_to_textgrid)
        print("found %d laughs." % (len(laughs)))

        if not save_to_textgrid:
            for laugh in laughs:
                print(laugh)
        else:
            tg = tgt.TextGrid()
            laughs_tier = tgt.IntervalTier(
                name='laughter',
                objects=[
                    tgt.Interval(l['start'], l['end'], 'laugh') for l in laughs
                ])
            tg.add_tier(laughs_tier)
            fname = os.path.splitext(os.path.basename(input_path))[0]
            tgt.write_to_file(
                tg, os.path.join(output_path, fname + '_laughter.TextGrid'))

            print('Saved laughter segments in {}'.format(
                os.path.join(output_path, fname + '_laughter.TextGrid')))
Example #20
def add_pos(title, input1_path, input2_path, output_path):

    # Load the textgrid
    tg = tgt.read_textgrid(os.path.join(input1_path, title + '.TextGrid'))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select a tier whose name contains 'words'
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Start an empty tier for POS_tags
    pos_tier = tgt.IntervalTier()
    pos_tier_name = [name for name in tier_names
                     if 'words' in name][0].replace('words', 'pos')
    pos_tier.name = pos_tier_name

    # Extract words intervals
    word_intervals = [w for w in words_tier._get_annotations()]

    # Extract words
    words = [w.text for w in words_tier._get_annotations()]

    # Load text
    txt = ''
    with open(os.path.join(input2_path, title + '.txt'), 'r',
              encoding='utf-8') as f:
        for l in f:
            l = ' '.join(l.split())
            for char in l.replace('\n', ' ').replace('\t', ' '):
                txt += char

    # Try to use my own tagger from txt and see if it matches the words in the original word tier
    # If they don't match just use the list of words from the tier and feed them to the tagger (this option is less accurate)

    my_tags = my_tagger(txt)
    if len(my_tags) == len(words):

        # True for every mismatch between words in words_tier and words produced by my_tagger
        mismatches = [
            True for i, tag in enumerate(my_tags) if tag[0] != words[i]
        ]

        # If everything matches up we can use my_tags, else we resort to the vanilla nltk one
        if True not in mismatches:
            POS_tags = my_tags

        else:
            POS_tags = nltk.pos_tag(words)

    else:
        print(title)
        POS_tags = nltk.pos_tag(words)

    pos_intervals = [
        tgt.Interval(interval.start_time, interval.end_time, POS_tags[i][1])
        for i, interval in enumerate(word_intervals)
    ]

    pos_tier.add_annotations(pos_intervals)

    tg.add_tier(pos_tier)

    tgt.write_to_file(tg,
                      os.path.join(output_path, title + '.TextGrid'),
                      format='short')
Example #21
def stitch_textgrid(batch_title, sequenced_title, input2b_path, input2_path,
                    output3_path):
    combined_intervals = []

    new_tg = tgt.TextGrid()

    new_phone_tier = tgt.IntervalTier()
    final_phone_tier = tgt.IntervalTier()
    new_word_tier = tgt.IntervalTier()

    last_dur = 0.0

    for i, title in enumerate(sequenced_title):

        wave_file = wave.open(os.path.join(input2b_path, title + '.wav'), 'rb')
        frameRate = wave_file.getframerate()
        n_frames = wave_file.getnframes()
        dur = n_frames / frameRate

        f0_start_time = 0.0
        f0_end_time = dur

        tg = tgt.read_textgrid(os.path.join(input2_path, title + '.TextGrid'))

        # Load name of all tiers
        tier_names = tg.get_tier_names()

        words_tier_name = [name for name in tier_names if 'words' in name][0]
        words_tier = tg.get_tier_by_name(words_tier_name)

        phones_tier_name = [name for name in tier_names if 'phones' in name][0]
        phones_tier = tg.get_tier_by_name(phones_tier_name)

        word_annotations = words_tier.get_annotations_between_timepoints(
            f0_start_time, f0_end_time)
        phone_annotations = phones_tier.get_annotations_between_timepoints(
            f0_start_time, f0_end_time)

        word_intervals = []
        for interval in word_annotations:
            interval.end_time = interval.end_time + last_dur
            interval.start_time = interval.start_time + last_dur
            word_intervals.append(interval)
        if word_intervals[-1].end_time > last_dur + f0_end_time:
            word_intervals[-1].end_time = last_dur + f0_end_time

        phone_intervals = []
        for j, interval in enumerate(phone_annotations):
            interval.end_time = interval.end_time + last_dur
            interval.start_time = interval.start_time + last_dur

            if interval.text != 'sil' and interval.text != 'sp':
                phone_intervals.append(interval)

            elif i == len(sequenced_title) - 1 and j == len(
                    phone_annotations) - 1:
                phone_intervals.append(interval)
        if phone_intervals[-1].end_time > last_dur + f0_end_time:
            phone_intervals[-1].end_time = last_dur + f0_end_time

        new_word_tier.add_annotations(word_intervals)
        new_phone_tier.add_annotations(phone_intervals)

        last_dur += dur

    phones_tier_copy = new_phone_tier.get_copy_with_gaps_filled(
        start_time=None, end_time=None, empty_string='')

    # Label the empty gap intervals as 'sil'.
    # Store both groups in lists so they can be added to the final phone tier.
    sil_intervals = []
    phone_intervals = []
    for interval in phones_tier_copy:
        if interval.text == '':
            interval.text = 'sil'
            sil_intervals.append(interval)
        else:
            phone_intervals.append(interval)

    final_phone_tier.add_annotations(phone_intervals)
    final_phone_tier.add_annotations(sil_intervals)

    final_phone_tier.name = phones_tier_name
    new_word_tier.name = words_tier_name

    new_tg.add_tier(new_word_tier)
    new_tg.add_tier(final_phone_tier)

    tgt.write_to_file(new_tg,
                      os.path.join(output3_path, batch_title + '.TextGrid'),
                      format='short')
Example #22
def add_syllables(title, input_path, syllabification_file_path, output_path):

	# Load language syllable structure for the syllabifier
	with open(syllabification_file_path) as f:   
		language_syllables =  json.load(f)


	# Load the textgrid
	tg = tgt.read_textgrid(os.path.join(input_path,title+'.TextGrid'))

	# Load name of all tiers
	tier_names = tg.get_tier_names()

	# Select a tier whose name contains 'words'
	words_tier_name = [name for name in tier_names if 'words' in name][0]
	words_tier = tg.get_tier_by_name(words_tier_name)

	# Select a tier whose name contains 'phones'
	phones_tier_name = [name for name in tier_names if 'phones' in name][0]
	phones_tier = tg.get_tier_by_name(phones_tier_name)

	# Start an empty tier for syllables
	syllable_tier = tgt.IntervalTier()
	syll_tier_name = [name for name in tier_names if 'words' in name][0].replace('words', 'sylls')
	syllable_tier.name = syll_tier_name

	# Syllabify one word at a time
	for w in words_tier._get_annotations():
		
		# For the current word, get all of its phones
		phs = phones_tier.get_annotations_between_timepoints(w.start_time, w.end_time)
		for ph in phs: 
			if ph.text == 'spn':
				ph.text = 'aa1'


		# Transform the string of phones into a string of syllables
		# Format: ph1 ph2 . ph3 ph4 ph5 . ph6 etc.
		s = stringify(syllabify(' '.join([ph.text for ph in phs]), language_syllables))

		# From a string of syllables to nested lists of phone indices
		# Format: [[ph1_idx, ph2_idx, etc.], [ph3_idx, ph4_idx, etc.], etc.]

		sylls = [syll.split() for syll in s.split('.')]
		i = 0
		sylls_indeces = []
		for j, syll in enumerate(sylls):
			syll_indeces = []
			for k in range(0, len(syll)):
				syll_indeces.append(int(i))
				i += 1
			sylls_indeces.append(syll_indeces)

		# Extract the relevant intervals using the indices
		sylls_intervals = [[phs[index] for index in ph_group] for ph_group in sylls_indeces]

		# Extract the stress for each syllable:
		# Format: [['0'], ['1'], etc.]
		sylls_stresses = [[char for char in ''.join(ph_group) if char.isdigit()==True] for ph_group in sylls]
		sylls_stresses = [ph_group if ph_group != [] else ['0'] for ph_group in sylls_stresses]

		#print(w)
		#print(sylls_indeces)
		#print(sylls_stresses)
		#print(sylls_intervals)

		syllable_intervals = [tgt.Interval(interval[0].start_time, interval[-1].end_time, str(sylls_stresses[i][0])) for i, interval in enumerate(sylls_intervals)]

		#print(syllable_intervals)
		syllable_tier.add_annotations(syllable_intervals)

	tg.add_tier(syllable_tier)

	tgt.write_to_file(tg, os.path.join(output_path,title+'.TextGrid'), format='short')