예제 #1
0
 def dumpSCP(self, f, targetdir):
     """ Dump SCP to file...

         One line is produced for every entry of self.wavfilelist:
         the entry joined onto self.wavlocation, a single space, then
         'targetdir' joined with "<parse_path(entry)[2]>.<MFCC_EXT>",
         terminated by a newline.

         f         -- either an object with a write() method (it is
                      written to and flushed, but not closed) or a
                      filename, which is opened for writing as UTF-8.
         targetdir -- directory used for the second path on each line.
     """
     # Build the lines before touching 'f', so that an AttributeError
     # raised here (e.g. a missing attribute on self) is reported as-is
     # instead of being mistaken for "f is not a file object".
     lines = [os.path.join(self.wavlocation, filename) + " " +
              os.path.join(targetdir, ".".join([parse_path(filename)[2], MFCC_EXT])) + "\n"
              for filename in self.wavfilelist]

     if hasattr(f, "write"):
         for line in lines:
             f.write(line)
         f.flush()
     else:
         # No write() method: treat 'f' as a filename.
         with codecs.open(f, "w", encoding="utf-8") as outfh:
             for line in lines:
                 outfh.write(line)
예제 #2
0
 def dumpSCP(self, f, targetdir):
     """ Dump SCP to file...

         Writes, for each name in self.wavfilelist, the line
         "<self.wavlocation/name> <targetdir/KEY.MFCC_EXT>\n" where KEY
         is parse_path(name)[2].

         f         -- an object providing write() (written to and then
                      flushed; never closed here), or otherwise a
                      filename that is opened for writing as UTF-8.
         targetdir -- directory part of the second path on each line.
     """
     # The lines are prepared up front: only the question "does 'f'
     # behave like a file?" decides which branch runs, so unrelated
     # AttributeErrors are no longer swallowed by a fallback path.
     scplines = []
     for filename in self.wavfilelist:
         source = os.path.join(self.wavlocation, filename)
         target = os.path.join(targetdir, ".".join([parse_path(filename)[2], MFCC_EXT]))
         scplines.append(source + " " + target + "\n")

     if hasattr(f, "write"):
         for scpline in scplines:
             f.write(scpline)
         f.flush()
     else:
         # 'f' has no write(): interpret it as a filename.
         with codecs.open(f, "w", encoding="utf-8") as outfh:
             for scpline in scplines:
                 outfh.write(scpline)
예제 #3
0
    def _loadpath_word(self, path):
        """ Read word level transcriptions from a directory...

            Every entry of os.listdir(path) that type_files() selects
            for LAB_EXT is read as UTF-8 and split on whitespace; the
            words are re-joined with single spaces and stored under the
            key parse_path(filename)[2].

            Returns the resulting dict. Raises Exception when a file
            contains no words, when a key occurs twice, or when nothing
            was loaded at all.
        """
        log.debug(
            unicode(self) + " loading transcriptions from multiple files.")

        transcriptions = {}
        for labname in type_files(os.listdir(path), LAB_EXT):
            labpath = os.path.join(path, labname)
            with codecs.open(labpath, encoding="utf-8") as labfh:
                #words are taken to be whitespace delimited:
                words = labfh.read().split()
            if not words:
                raise Exception("File '%s' is empty..." % (labpath))

            #basenames are expected to be unique...
            label = parse_path(labname)[2]
            if label in transcriptions:
                raise Exception("basename '%s' is not unique..." % (label))
            transcriptions[label] = " ".join(words)

        if not transcriptions:
            raise Exception("No transcriptions found in '%s'..." % (path))
        return transcriptions
예제 #4
0
    def allLabelsInTranscr(self, audiofeats):
        """ Go through filelabels in 'audiofeats', checking whether all labels
            are present in the transcriptions...

            The transcription labels are the keys of self.wordlevel, or
            of self.phonelevel when self.wordlevel is None. The file
            labels are parse_path(name)[2] for every name returned by
            audiofeats.getWavFilelist().

            Prints a warning when the two counts differ and lists every
            file label without a transcription. Returns True when no
            label is missing, False otherwise.
        """
        if self.wordlevel is not None:
            translabels = set(self.wordlevel.keys())
        else:
            translabels = set(self.phonelevel.keys())
        filelabels = [parse_path(filename)[2]
                      for filename in audiofeats.getWavFilelist()]

        if len(filelabels) != len(translabels):
            print("WARNING: %s transcriptions and %s audio files" %
                  (len(translabels), len(filelabels)))

        # A set gives constant time lookups; testing against a list of
        # keys made this check quadratic.
        missinglabels = [filelabel for filelabel in filelabels
                         if filelabel not in translabels]
        if missinglabels:
            print("MISSING TRANSCRIPTIONS:")
            for lin in missinglabels:
                print("\t" + lin)
            return False
        return True
예제 #5
0
    def _load_mlffile(self, filepath):
        """ Load from HTK MLF style transcriptions file...
            TODO: Also read time information...

            The file is read as UTF-8, line by line:
              - a line starting with a double-quoted string begins a
                new label; its key is parse_path(<unquoted text>)[2],
              - a line consisting of a single "." ends the label and
                stores the collected items, joined by spaces,
              - for any other non-blank line the last whitespace
                separated field is collected as an item.

            Returns a dict mapping keys to item strings. Raises
            Exception when a key that was already stored starts again.
        """
        log.debug(unicode(self) + " loading transcriptions from mlf file.")

        quoted = re.compile('".*"')

        t_items = {}
        items = []

        with codecs.open(filepath, encoding="utf-8") as infh:
            for line in infh:
                if quoted.match(line):  #new label...
                    items = []
                    key = parse_path(line.strip().strip('"'))[2]
                    if key in t_items:
                        raise Exception("Non unique names present...")
                elif line.rstrip("\r\n") == ".":  #end of label...
                    # Comparing the stripped line also recognises a
                    # final "." without newline and CRLF line endings,
                    # which the pattern '\.\n' silently missed.
                    t_items[key] = " ".join(items)
                else:  #item on line...
                    fields = line.split()
                    if fields:  #blank lines carry no item
                        #ignores time information at present
                        items.append(fields[-1])

        return t_items
예제 #6
0
    def _loadpath_phone(self, path):
        """ Load from multiple files...

            Collects the entries of os.listdir(path) selected by
            type_files() for each extension in Utterance.SUPPORTED_EXTS
            and loads each one as an Utterance, in sorted filename
            order. Files are keyed by parse_path(filename)[2].

            Returns a tuple (phonelevel, boundaries):
              phonelevel -- dict: key -> entry[1] of all utterance
                            entries, joined by spaces,
              boundaries -- dict: key -> list of
                            float_to_htk_int(entry[0]) values, or None
                            when any of those values is falsy.

            Raises Exception when two files share the same key; errors
            from os.listdir() propagate unchanged.
        """
        log.debug(
            unicode(self) + " loading transcriptions from multiple files.")

        phonelevel = {}
        boundaries = {}

        # List the directory once so that every extension is matched
        # against the same snapshot.
        dirlisting = os.listdir(path)
        filenames = []
        for ext in Utterance.SUPPORTED_EXTS:
            filenames.extend(type_files(dirlisting, ext))
        filenames.sort()

        # Uniqueness must hold for the keys and not just the filenames:
        # two files differing only in extension would otherwise silently
        # overwrite each other below.
        keys = [parse_path(filename)[2] for filename in filenames]
        if len(set(keys)) != len(keys):
            raise Exception("Non unique basenames exist....")

        for key, filename in zip(keys, filenames):
            utt = Utterance(os.path.join(path, filename))
            phonelevel[key] = " ".join([entry[1] for entry in utt.entries])
            b = [float_to_htk_int(entry[0]) for entry in utt.entries]
            # NOTE(review): any falsy value discards all boundaries for
            # this key; presumably meant for missing times, but a value
            # of 0 has the same effect -- confirm.
            if not all(b):
                boundaries[key] = None
            else:
                boundaries[key] = b

        return phonelevel, boundaries
예제 #7
0
    def _loadpath_word(self, path):
        """ Load word transcriptions, one label file per utterance...

            The listing of 'path' is filtered through type_files() with
            LAB_EXT. Each selected file is decoded as UTF-8, its text is
            split on whitespace and the words are stored, joined by
            single spaces, under parse_path(filename)[2].

            Returns a dict of key -> transcription. An Exception is
            raised for a file without words, for a repeated key and for
            an empty result.
        """
        log.debug(unicode(self) + " loading transcriptions from multiple files.")

        loaded = {}
        labfiles = type_files(os.listdir(path), LAB_EXT)

        for labfile in labfiles:
            fullpath = os.path.join(path, labfile)
            with codecs.open(fullpath, encoding="utf-8") as infh:
                content = infh.read()

            #whitespace separates the words:
            tokens = content.split()
            if not tokens:
                raise Exception("File '%s' is empty..."
                                % (fullpath))

            #each basename may only be seen once...
            basename = parse_path(labfile)[2]
            if basename in loaded:
                raise Exception("basename '%s' is not unique..." % (basename))
            loaded[basename] = " ".join(tokens)

        if loaded:
            return loaded
        raise Exception("No transcriptions found in '%s'..."
                        % (path))
예제 #8
0
    def allLabelsInTranscr(self, audiofeats):
        """ Go through filelabels in 'audiofeats', checking whether all labels
            are present in the transcriptions...

            Transcription labels come from the keys of self.wordlevel
            (or self.phonelevel when self.wordlevel is None); file
            labels are parse_path(name)[2] for each name returned by
            audiofeats.getWavFilelist().

            A warning is printed when the counts differ, and each file
            label lacking a transcription is printed. Returns False when
            at least one label is missing, True otherwise.
        """
        if self.wordlevel is not None:
            translabels = set(self.wordlevel.keys())
        else:
            translabels = set(self.phonelevel.keys())
        filelabels = [parse_path(filename)[2]
                      for filename in audiofeats.getWavFilelist()]

        if len(filelabels) != len(translabels):
            print("WARNING: %s transcriptions and %s audio files" % (len(translabels), len(filelabels)))

        # Lookups go against a set: with a list of keys every test was
        # a linear scan, making the whole check quadratic.
        missinglabels = [filelabel for filelabel in filelabels
                         if filelabel not in translabels]
        if missinglabels:
            print("MISSING TRANSCRIPTIONS:")
            for lin in missinglabels:
                print("\t" + lin)
            return False
        return True
예제 #9
0
    def _load_mlffile(self, filepath):
        """ Load from HTK MLF style transcriptions file...
            TODO: Also read time information...

            Reads 'filepath' as UTF-8. A line that starts with a
            double-quoted string opens a label whose key is
            parse_path(<unquoted text>)[2]; a line holding only "."
            closes it and stores the items gathered so far, joined by
            spaces. From every other non-blank line the last whitespace
            separated field is kept as an item.

            Returns a dict of key -> item string. Raises Exception when
            a label starts whose key has already been stored.
        """
        log.debug(unicode(self) + " loading transcriptions from mlf file.")

        quoted = re.compile('".*"')

        t_items = {}
        items = []

        with codecs.open(filepath, encoding="utf-8") as infh:
            for line in infh:
                if quoted.match(line): #new label...
                    items = []
                    key = parse_path(line.strip().strip('"'))[2]
                    if key in t_items:
                        raise Exception("Non unique names present...")
                elif line.rstrip("\r\n") == ".":  #end of label...
                    # Line endings are stripped before comparing, so a
                    # closing "." is also found on the last line of a
                    # file without trailing newline and in CRLF files.
                    t_items[key] = " ".join(items)
                else:                      #item on line...
                    fields = line.split()
                    if fields:             #nothing to collect on blank lines
                        items.append(fields[-1])   #ignores time information at present

        return t_items
예제 #10
0
    def _loadpath_phone(self, path):
        """ Load from multiple files...

            For every extension in Utterance.SUPPORTED_EXTS the listing
            of 'path' is filtered with type_files(); each resulting
            file is loaded as an Utterance (in sorted filename order)
            and keyed by parse_path(filename)[2].

            Returns (phonelevel, boundaries), two dicts sharing the
            same keys: phonelevel holds entry[1] of all utterance
            entries joined by spaces; boundaries holds the list of
            float_to_htk_int(entry[0]) values, or None if any value in
            that list is falsy.

            Raises Exception when two files map to the same key. Errors
            raised by os.listdir() are not handled here.
        """
        log.debug(unicode(self) + " loading transcriptions from multiple files.")

        phonelevel = {}
        boundaries = {}

        # One listing serves all extensions.
        dirlisting = os.listdir(path)
        filenames = []
        for ext in Utterance.SUPPORTED_EXTS:
            filenames.extend(type_files(dirlisting, ext))
        filenames.sort()

        # The check is done on the dictionary keys: comparing complete
        # filenames let files that differ only in their extension pass
        # and then overwrite one another.
        keys = [parse_path(filename)[2] for filename in filenames]
        if len(set(keys)) != len(keys):
            raise Exception("Non unique basenames exist....")

        for key, filename in zip(keys, filenames):
            utt = Utterance(os.path.join(path, filename))
            phonelevel[key] = " ".join([entry[1] for entry in utt.entries])
            b = [float_to_htk_int(entry[0]) for entry in utt.entries]
            # NOTE(review): a converted value of 0 is falsy as well and
            # therefore drops the boundaries of the whole file --
            # verify that this is intended.
            if not all(b):
                boundaries[key] = None
            else:
                boundaries[key] = b

        return phonelevel, boundaries