Example No. 1
 def header(self, fileids=None, **kwargs):
     """
     Returns header(s) of specified fileids.
     """
     return concat([self._view(self.add_root(fileid),
                               mode=NKJPCorpusReader.HEADER_MODE, **kwargs).handle_query()
                    for fileid in fileids])
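The NKJP methods in these examples iterate directly over `fileids`, so callers pass a list of fileids rather than a single string. A minimal usage sketch, assuming a locally downloaded NKJP sample; the root path and the fileid are hypothetical placeholders:

from nltk.corpus.reader.nkjp import NKJPCorpusReader

# Hypothetical location of a local NKJP sample.
nkjp = NKJPCorpusReader(root="/path/to/nkjp/", fileids=".*")
# header() concatenates the per-file header queries.
headers = nkjp.header(fileids=["some_fileid"])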
Example No. 2
 def sents(self, fileids=None, **kwargs):
     """
     Returns sentences in specified fileids.
     """
     return concat([self._view(self.add_root(fileid),
                               mode=NKJPCorpusReader.SENTS_MODE, **kwargs).handle_query()
                    for fileid in fileids])
Example No. 3
 def words(self, fileids=None, **kwargs):
     return concat(
         [
             self._view(fileid, tags=False, **kwargs)
             for fileid in self._list_morph_files(fileids)
         ]
     )
Example No. 4
 def tagged_paras(self, fileids=None, **kwargs):
     return concat(
         [
             self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
             for fileid in self._list_morph_files(fileids)
         ]
     )
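Examples No. 3 and No. 4 come from the IPI PAN reader, whose public methods all delegate to `_view` over the corpus's morphosyntax files. A minimal usage sketch, assuming a local copy of the IPI PAN sample; the root and the fileid pattern are illustrative:

from nltk.corpus.reader.ipipan import IPIPANCorpusReader

# Hypothetical root; the pattern targets the per-text *morph.xml files.
ipipan = IPIPANCorpusReader("/path/to/ipipan/", r".*morph\.xml")
print(ipipan.words()[:10])           # untagged word tokens
print(ipipan.tagged_paras()[0][0])   # first sentence of the first tagged paragraph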
Example No. 5
 def raw(self, fileids=None, **kwargs):
     """
     Returns the raw text of the specified fileids.
     """
     return concat([self._view(self.add_root(fileid),
                               mode=NKJPCorpusReader.RAW_MODE, **kwargs).handle_query()
                    for fileid in fileids])
Example No. 6
 def parsed_sents2(self, fileids=None):
   return concat([JapaneseCorpusView(fileid, enc,
                                     False, False, False, True,
                                     self._syntax_parser,
                                     self._word_tokenizer,
                                     self._sent_tokenizer,
                                     self._case_parser)
                  for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 7
 def raw(self, fileids=None):
     """
     :return: the given file(s) as a single string.
     :rtype: str
     """
     if fileids is None: fileids = self._fileids
     elif isinstance(fileids, str): fileids = [fileids]
     return concat([self.open(f).read() for f in fileids])
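These raw() implementations all rely on the same building block: read each file, then let concat from nltk.corpus.reader.util join the pieces. A rough illustration of its behaviour on eager inputs (lazy corpus views are instead wrapped in a lazy concatenation):

from nltk.corpus.reader.util import concat

# Lists are joined into one list, strings into one string; a single-element
# input is returned unchanged.
print(concat([[1, 2], [3], [4, 5]]))   # [1, 2, 3, 4, 5]
print(concat(["foo ", "bar"]))         # foo bar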
Example No. 8
def fixed_parsed_sents(self, fileids=None, top_label="root"):
    from nltk.corpus.reader.util import concat
    from nltk.corpus.reader.dependency import DependencyCorpusView
    from nltk.parse import DependencyGraph

    sents = concat([DependencyCorpusView(fileid, False, True, True, encoding=enc)
                    for fileid, enc in self.abspaths(fileids, include_encoding=True)])
    return [DependencyGraph(sent, top_relation_label=top_label, cell_separator="\t")
            for sent in sents]
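fixed_parsed_sents takes self, so it reads as a drop-in replacement for DependencyCorpusReader.parsed_sents that additionally exposes the top relation label and forces a tab cell separator. A hedged sketch of how it might be wired up; the monkey-patching, the root path and the fileid pattern are assumptions, not taken from the source:

from nltk.corpus.reader.dependency import DependencyCorpusReader

# Assumption: bind the helper as a method and call it on a tab-separated
# dependency corpus; root and pattern are hypothetical.
DependencyCorpusReader.fixed_parsed_sents = fixed_parsed_sents
reader = DependencyCorpusReader("/path/to/deps/", r".*\.dp")
graphs = reader.fixed_parsed_sents(top_label="ROOT")  # list of DependencyGraph objects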
Example No. 9
 def tagged_words(self, fileids=None, **kwargs):
     """
     Call with specified tags as a list, e.g. tags=['subst', 'comp'].
     Returns tagged words in specified fileids.
     """
     tags = kwargs.pop('tags', [])
     return concat([self._view(self.add_root(fileid),
                               mode=NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs).handle_query()
                    for fileid in fileids])
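The docstring above spells out the calling convention for the tags filter. A short sketch, reusing the hypothetical nkjp reader from the sketch under Example No. 1; the fileid is again a placeholder:

# tags restricts the result to the listed NKJP tag classes, as in the docstring.
filtered = nkjp.tagged_words(fileids=["some_fileid"], tags=["subst", "comp"])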
Example No. 10
 def raw(self, fileids=None):
     """
     Return the corpora in their raw form.
     """
     if fileids is None:
         fileids = self._fileids
     elif isinstance(fileids, string_types):
         fileids = [fileids]
     return concat([self.open(f).read() for f in fileids])
Example No. 11
 def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
     """A helper function that instantiates BNCWordViews or the list of words/sentences."""
     f = BNCWordView if self._lazy else self._words
     return concat(
         [
             f(fileid, sent, tag, strip_space, stem)
             for fileid in self.abspaths(fileids)
         ]
     )
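This helper is what the BNC reader's public words/sents/tagged_* methods ultimately go through. A minimal usage sketch, assuming a local BNC-XML copy; the root and the fileid regex are placeholders in the style of the reader's own documentation:

from nltk.corpus.reader.bnc import BNCCorpusReader

bnc = BNCCorpusReader(root="/path/to/BNC/Texts/", fileids=r"[A-K]/\w*/\w*\.xml")
print(bnc.words()[:10])   # plain tokens, produced lazily via BNCWordView
print(bnc.sents()[0])     # first sentence as a list of tokens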
Example No. 12
 def sents(self, fileids=None, **kwargs):
     return concat(
         [
             self._view(
                 fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
             )
             for fileid in self._list_morph_files(fileids)
         ]
     )
Example No. 13
 def aligned_sents(self, fileids=None):
     """
     :return: the given file(s) as a list of AlignedSent objects.
     :rtype: list of C{AlignedSent}
     """
     return concat([AlignedSentCorpusView(fileid, enc, True, True,
                                          self._word_tokenizer,
                                          self._sent_tokenizer,
                                          self._alignedsent_block_reader)
                    for (fileid, enc) in self.abspaths(fileids, True)])
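The same aligned-corpus view also backs the words() and sents() variants shown further below (Examples No. 15, 16, 18 and 20). The bundled comtrans corpus is a convenient way to exercise it, assuming the NLTK data package is installed:

from nltk.corpus import comtrans

# One of comtrans's standard English-French alignment files.
als = comtrans.aligned_sents("alignment-en-fr.txt")
first = als[0]
print(first.words)      # source-side tokens
print(first.mots)       # target-side tokens
print(first.alignment)  # word alignment between the two sides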
Example No. 14
 def parsed_docs(self, fileids=None):
     """
     @return: A list of parsed corpus documents.
     @rtype: C{list} of C{StreamBackedCorpusView}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression
     """        
     return concat([StreamBackedCorpusView(fileid,
                                           self._read_parsed_block,
                                           encoding=enc)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 15
 def words(self, fileids=None):
     """
     @return: the given file(s) as a list of words
         and punctuation symbols.
     @rtype: C{list} of C{str}
     """
     return concat([self._alignedsent_corpus_view(fileid, enc, False, False,
                                          self._word_tokenizer,
                                          self._sent_tokenizer,
                                          self._alignedsent_block_reader)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 16
 def words(self, fileids=None):
     """
     :return: the given file(s) as a list of words
         and punctuation symbols.
     :rtype: list of str
     """
     return concat([AlignedSentCorpusView(fileid, enc, False, False,
                                          self._word_tokenizer,
                                          self._sent_tokenizer,
                                          self._alignedsent_block_reader)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 17
    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat([self.CorpusView(path, self._read_tweets, encoding=enc)
                       for (path, enc, fileid) in self.abspaths(fileids, True, True)])
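The bundled twitter_samples corpus uses this reader, so docs() can be exercised directly once the data is downloaded; a short sketch:

from nltk.corpus import twitter_samples

# Requires nltk.download('twitter_samples'); fileids include
# 'positive_tweets.json' and 'negative_tweets.json'.
docs = twitter_samples.docs("positive_tweets.json")
print(docs[0]["text"])   # each item is the full Tweet dictionary parsed from JSON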
Example No. 18
 def sents(self, fileids=None):
     """
     :return: the given file(s) as a list of
         sentences or utterances, each encoded as a list of word
         strings.
     :rtype: list of (list of str)
     """
     return concat([AlignedSentCorpusView(fileid, enc, False, True,
                                          self._word_tokenizer,
                                          self._sent_tokenizer,
                                          self._alignedsent_block_reader)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 19
 def raw(self, fileids=None):
     """
     @return: A list of corpus file contents.
     @rtype: C{list} of C{str}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression
     """
     if fileids is None:
         fileids = self._fileids
     elif isinstance(fileids, str):
         fileids = [fileids]
     return concat([self.open(f).read() for f in fileids])
Example No. 20
 def sents(self, fileids=None):
     """
     @return: the given file(s) as a list of
         sentences or utterances, each encoded as a list of word
         strings.
     @rtype: C{list} of (C{list} of C{str})
     """
     return concat([self._alignedsent_corpus_view(fileid, enc, False, True,
                                          self._word_tokenizer,
                                          self._sent_tokenizer,
                                          self._alignedsent_block_reader)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example No. 21
def lemmatized_sents(corpus, fileids=None):
    """
    Returns trees whose leaves are (word, lemma) pairs.
    """
    from nltk import tree
    from nltk.util import LazyMap
    from nltk.corpus.reader.util import concat

    def lemmatized(element):
        if element:
            # Internal node: recurse into the children, dropping empty results.
            subtrees = map(lemmatized, element)
            subtrees = [t for t in subtrees if t]
            return tree.Tree(element.tag, subtrees)
        elif element.get('elliptic') == 'yes':
            # Elliptic (empty) leaves are skipped entirely.
            return None
        else:
            label = element.get('pos') or element.get('ne') or 'unk'
            return tree.Tree(label, [(element.get('wd'), element.get('lem'))])

    if not fileids:
        fileids = corpus.xmlreader.fileids()
    return LazyMap(lemmatized, concat([list(corpus.xmlreader.xml(fileid)) for fileid in fileids]))
Example No. 22
 def sents(self, fileids=None, speaker='ALL', sent=True, stem=False, 
         relation=None, pos=False, strip_space=True, replace=False):
     """
     @return: the given file(s) as a list of sentences
     @rtype: C{list} of (C{list} of C{str})
     
     @param speaker: If specified, select specific speakers defined in
         the corpus. Default is 'ALL'. Common choices are 'CHI' (all 
         children) and 'MOT' (mothers)
     @param stem: If true, then use word stems instead of word strings.
     @param relation: If true, then return tuples of C{(str,relation_list)}
     @param pos: If true, then return tuples of C{(stem, part_of_speech)}
     @param strip_space: If true, then strip trailing spaces from word 
         tokens. Otherwise, leave the spaces on the tokens.
     @param replace: If true, then use the replaced word instead 
         of the original word (e.g., 'wat' will be replaced with 'watch')
     """
     return concat([self._get_words(fileid, speaker, sent, stem, relation, 
         pos, strip_space, replace) for fileid in self.abspaths(fileids)])
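Example No. 22 and the three CHILDES variants that follow (Examples No. 23 to 25) are thin wrappers around the reader's _get_words dispatcher; only the sent/pos/relation flags differ. A minimal usage sketch, assuming a locally downloaded CHILDES XML corpus; the root path and fileid pattern are hypothetical:

from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = "/path/to/childes/Eng-USA/"       # hypothetical local path
reader = CHILDESCorpusReader(corpus_root, r".*\.xml")
print(reader.sents(speaker="CHI")[:2])          # the target child's utterances
print(reader.tagged_sents(speaker=["MOT"])[:1]) # mother's utterances as (word, tag) pairs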
Example No. 23
    def words(self, fileids=None, speaker='ALL', sent=None, stem=False,
            relation=False, pos=False, strip_space=True, replace=False):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If a list is specified, select specific speakers defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are ['CHI'] (all children), ['MOT'] (mothers), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return concat([self._get_words(fileid, speaker, sent, stem, relation,
            pos, strip_space, replace) for fileid in self.abspaths(fileids)])
Example No. 24
    def tagged_sents(self, fileids=None, speaker='ALL', sent=True, stem=False,
            relation=None, pos=True, strip_space=True, replace=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If a list is specified, select specific speakers defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are ['CHI'] (all children), ['MOT'] (mothers), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return tuples of
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        return concat([self._get_words(fileid, speaker, sent, stem, relation,
            pos, strip_space, replace) for fileid in self.abspaths(fileids)])
Example No. 25
    def tagged_words(self, fileids=None, speaker='ALL', stem=False,
            relation=False, strip_space=True, replace=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = True
        return concat([self._get_words(fileid, speaker, sent, stem, relation,
            pos, strip_space, replace) for fileid in self.abspaths(fileids)])
Example No. 26
 def tagged_sents(self, fileids=None):
     if not fileids:
         fileids = self.xmlreader.fileids()
     return LazyMap(tagged, concat([list(self.xmlreader.xml(fileid)) for fileid in fileids]))
Example No. 27
 def raw(self, fileids=None):
     if fileids is None: fileids = self._fileids
     elif isinstance(fileids, string_types): fileids = [fileids]
     return concat([self.open(f).read() for f in fileids])
Example No. 28
	def stemmed_words(self, fileids=None):
		return { t[0].lower(): t[1].lower() for t in concat(self.stemmed_sents(fileids)) }
Example No. 29
 def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
     """A helper function that instantiates BNCWordViews or the list of words/sentences."""
     f = BNCWordView if self._lazy else self._words
     return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)])
Example No. 30
 def freqs(self, fileids=None):
     '''
     Return trigram frequencies for a language from the corpus.
     '''
     return concat([self.CorpusView(path, self._read_trigram_block) 
                    for path in self.abspaths(fileids=fileids)])
Example No. 31
 def tagged_words(self, fileids=None):
     # XXX: use LazyConcatenation?
     return concat(self.tagged_sents(fileids))
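The XXX comment raises a fair point: concat over already-realised sentence lists is eager, while much of the reader machinery is lazy. A sketch of the lazy alternative the comment hints at, assuming tagged_sents returns a sequence of sentence lists:

from nltk.util import LazyConcatenation

def tagged_words(self, fileids=None):
    # Flatten the sentence lists lazily instead of materialising all words up front.
    return LazyConcatenation(self.tagged_sents(fileids))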
Example No. 32
 def sents(self, fileids=None, **kwargs):
     return concat([self._view(fileid,
         mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs)
         for fileid in self._list_morph_files(fileids)])
Example No. 33
		def freqs(self, fileids=None):
			return concat([self.CorpusView(path, self._read_trigram_block) for path in self.abspaths(fileids=fileids)])
Example No. 34
 def tagged_paras(self, fileids=None, **kwargs):
     return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE,
         **kwargs)
         for fileid in self._list_morph_files(fileids)])
Example No. 35
 def sents(self, fileids=None):
     # FIXME: not lazy!
     if not fileids:
         fileids = self.xmlreader.fileids()
     return LazyMap(untagged, concat([list(self.xmlreader.xml(fileid)) for fileid in fileids]))
Example No. 36
 def tagged_sents(self, fileids=None):
     if not fileids:
         fileids = self.xmlreader.fileids()
     return LazyMap(
         tagged,
         concat([list(self.xmlreader.xml(fileid)) for fileid in fileids]))
Example No. 37
 def tagged_words(self, fileids=None):
     return concat(self.tagged_sents(fileids))
Example No. 38
 def tagged_words(self, fileids=None):
     return concat(self.tagged_sents(fileids))
Example No. 39
 def tagged_words(self, fileids=None, **kwargs):
     return concat(
         [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
     )