def words(self, fileids=None):
    """
    Collect the words of the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words and punctuation symbols.
    :rtype: list(str)
    """
    per_file = []
    for fileid in self.__fileids(fileids):
        # One MTEFileReader per file, rooted at this reader's corpus root.
        reader = MTEFileReader(os.path.join(self._root, fileid))
        per_file.append(reader.words())
    return concat(per_file)
def raw(self, fileids=None):
    """
    Read the selected file(s) and return their raw contents.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a single string.
    :rtype: str
    """
    contents = []
    for fileid in self.__fileids(fileids):
        stream = self.open(fileid)
        try:
            contents.append(stream.read())
        finally:
            # Close each stream as soon as it has been read so that file
            # handles are not left open until garbage collection.
            stream.close()
    return concat(contents)
def lemma_words(self, fileids=None):
    """
    Collect (word, lemma) pairs from the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words, the corresponding lemmas
             and punctuation symbols, encoded as tuples (word, lemma)
    :rtype: list(tuple(str,str))
    """
    collected = []
    for fileid in self.__fileids(fileids):
        path = os.path.join(self._root, fileid)
        collected.append(MTEFileReader(path).lemma_words())
    return concat(collected)
def paras(self, fileids=None):
    """
    Collect the paragraphs of the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of paragraphs, each encoded as a
             list of sentences, which are in turn encoded as lists of word
             strings
    :rtype: list(list(list(str)))
    """

    def read_paras(fileid):
        # Paragraphs of a single corpus file.
        return MTEFileReader(os.path.join(self._root, fileid)).paras()

    return concat([read_paras(fileid) for fileid in self.__fileids(fileids)])
def sents(self, fileids=None):
    """
    Collect the sentences of the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of sentences or utterances, each
             encoded as a list of word strings
    :rtype: list(list(str))
    """
    sentence_lists = []
    for name in self.__fileids(fileids):
        file_reader = MTEFileReader(os.path.join(self._root, name))
        sentence_lists.append(file_reader.sents())
    return concat(sentence_lists)
def lemma_paras(self, fileids=None):
    """
    Collect the paragraphs of the selected file(s) with lemma information.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of paragraphs, each encoded as a
             list of sentences, which are in turn encoded as a list of
             tuples of the word and the corresponding lemma (word, lemma)
    :rtype: list(list(list(tuple(str, str))))
    """

    def read_lemma_paras(fileid):
        # Lemma-annotated paragraphs of a single corpus file.
        location = os.path.join(self._root, fileid)
        return MTEFileReader(location).lemma_paras()

    return concat(
        [read_lemma_paras(fileid) for fileid in self.__fileids(fileids)]
    )
def words(self, fileids=None):
    """
    Collect the words of the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words and punctuation symbols.
    :rtype: list(str)
    """
    # NOTE(review): a second ``words`` definition with the same body appears
    # nearby; if both live in the same class the later one wins -- consider
    # removing one of them.

    def read_words(fileid):
        # Words of a single corpus file.
        return MTEFileReader(os.path.join(self._root, fileid)).words()

    return concat([read_words(fileid) for fileid in self.__fileids(fileids)])
def lemma_words(self, fileids=None):
    """
    Collect (word, lemma) pairs from the selected file(s).

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words, the corresponding lemmas
             and punctuation symbols, encoded as tuples (word, lemma)
    :rtype: list(tuple(str,str))
    """
    # NOTE(review): a second ``lemma_words`` definition with the same body
    # appears nearby; if both live in the same class the later one wins --
    # consider removing one of them.

    def read_lemma_words(fileid):
        # (word, lemma) pairs of a single corpus file.
        return MTEFileReader(os.path.join(self._root, fileid)).lemma_words()

    return concat(
        [read_lemma_words(fileid) for fileid in self.__fileids(fileids)]
    )
def tagged_sents(self, fileids=None, tagset="msd", tags=""):
    """
    Collect the tagged sentences of the selected file(s).

    If ``tagset`` is neither "universal" nor "msd", a message is printed and
    ``None`` is returned.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: The tagset that should be used in the returned object,
                   either "universal" or "msd", "msd" is the default
    :param tags: An MSD Tag that is used to filter all parts of the used
                 corpus that are not more precise or at least equal to the
                 given tag
    :return: the given file(s) as a list of sentences or utterances, each
             encoded as a list of (word,tag) tuples
    :rtype: list(list(tuple(str, str)))
    """
    # Guard clause: reject anything other than the two supported tagsets.
    if tagset != "universal" and tagset != "msd":
        print("Unknown tagset specified.")
        return None

    tagged = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        tagged.append(reader.tagged_sents(tagset, tags))
    return concat(tagged)