Exemplo n.º 1
0
    def parse(self, corenlppath=False, operations=False, copula_head=True,
              speaker_segmentation=False, memory_mb=False, *args, **kwargs):
        """
        Parse an unparsed corpus, saving to disk

           >>> parsed = corpus.parse(speaker_segmentation = True)

        :param corenlppath: folder containing corenlp jar files
        :type corenlppath: str

        :param operations: which kinds of annotations to do
        :type operations: str

        :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from make import make_corpus
        from corpus import Corpus

        # parsing only makes sense for raw text input
        if self.datatype != 'plaintext':
            raise ValueError('parse method can only be used on plaintext corpora.')

        # drop any caller-supplied parse/tokenise flags: they are set
        # explicitly in the call below and would otherwise collide
        for conflicting in ('parse', 'tokenise'):
            kwargs.pop(conflicting, None)

        parsed_path = make_corpus(self.path, *args,
                                  parse=True,
                                  tokenise=False,
                                  corenlppath=corenlppath,
                                  operations=operations,
                                  copula_head=copula_head,
                                  speaker_segmentation=speaker_segmentation,
                                  memory_mb=memory_mb,
                                  **kwargs)
        return Corpus(parsed_path)
Exemplo n.º 2
0
    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        :raises ValueError: if the corpus is not plaintext
        """

        from corpkit import make_corpus
        from corpus import Corpus

        # tokenisation only makes sense for raw text input
        if self.datatype != 'plaintext':
            # fixed: message previously (and wrongly) named the parse method
            raise ValueError('tokenise method can only be used on plaintext corpora.')

        # parse/tokenise are set explicitly below; strip caller-supplied
        # duplicates so the call does not raise on a repeated keyword
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)

        return Corpus(make_corpus(self.path, parse=False, tokenise=True, *args, **kwargs))
Exemplo n.º 3
0
    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        :raises ValueError: if the corpus is not plaintext
        """

        from corpkit import make_corpus
        from corpus import Corpus

        # tokenisation only makes sense for raw text input
        if self.datatype != 'plaintext':
            # fixed: message previously (and wrongly) named the parse method
            raise ValueError(
                'tokenise method can only be used on plaintext corpora.')

        # parse/tokenise are set explicitly below; strip caller-supplied
        # duplicates so the call does not raise on a repeated keyword
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)

        return Corpus(
            make_corpus(self.path, parse=False, tokenise=True, *args,
                        **kwargs))
Exemplo n.º 4
0
    def parse(self,
              corenlppath=False,
              operations=False,
              copula_head=True,
              speaker_segmentation=False,
              memory_mb=False,
              *args,
              **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: folder containing corenlp jar files
        :type corenlppath: str

        :param operations: which kinds of annotations to do
        :type operations: str

        :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation = True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>


        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from make import make_corpus
        from corpus import Corpus

        # parsing only makes sense for raw text input
        if self.datatype != 'plaintext':
            raise ValueError(
                'parse method can only be used on plaintext corpora.')

        # drop any caller-supplied parse/tokenise flags: they are set
        # explicitly in the call below and would otherwise collide
        for conflicting in ('parse', 'tokenise'):
            kwargs.pop(conflicting, None)

        parsed_path = make_corpus(self.path, *args,
                                  parse=True,
                                  tokenise=False,
                                  corenlppath=corenlppath,
                                  operations=operations,
                                  copula_head=copula_head,
                                  speaker_segmentation=speaker_segmentation,
                                  memory_mb=memory_mb,
                                  **kwargs)
        return Corpus(parsed_path)
Exemplo n.º 5
0
    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

           >>> tok = corpus.tokenise()

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        :raises ValueError: if the corpus is not plaintext
        """

        from corpkit import make_corpus
        from corpus import Corpus

        # tokenisation only makes sense for raw text input
        if self.datatype != "plaintext":
            # fixed: message previously (and wrongly) named the parse method
            raise ValueError("tokenise method can only be used on plaintext corpora.")

        # parse/tokenise are set explicitly below; strip caller-supplied
        # duplicates so the call does not raise on a repeated keyword
        kwargs.pop("parse", None)
        kwargs.pop("tokenise", None)

        return Corpus(make_corpus(self.path, parse=False, tokenise=True, *args, **kwargs))