Exemplo n.º 1
0
    def ARPosTag(self, List):
        """Part-of-speech tag a list of Arabic tokens.

        Each token is stripped of tashkeel (diacritics) and of one known
        suffix (from ``self.s2``) before being matched against an ordered
        list of regex patterns; the first matching pattern supplies the
        tag, with a catch-all default of 'اسم'.

        :param List: list of Arabic word strings
        :return: list of (word, tag) tuples as produced by RegexpTagger.tag()
        """
        patterns = [
            ('^(الله|لله|ربنا|رب|إله)$','لفظ جلالة'),
            ('^(به|فيه|عنه|إليه|اليه|كل|بعض)$','حرف'),
            ('^(هذا|هذه|هذان|هاتان|هؤلاء|تلك|أولئك)$', 'اسم إشارة'),
            ('^(ثم|حتا|أو|أم|لكن|لا|مع)$', 'حرف عطف'),
            ('^(من|إلى|الى|عن|على|في|فى)$', 'حرف جر'),
            ('^(هى|هو|هي|هما|هم|هن)$', 'ضمير غائب'),
            ('^(أنت|أنتما|أنتم|أنتن|إياك|إياكما|إياكم|إياكن)$', 'ضمير متكلم'),
            ('^(كان|اصبح|أصبح|أمسى|امسى|ظل|اضحى|أضحى|بات|صار|ليس|ما زال|ما برح|ما انفك|ما دام|ما فتئ)$','كان وأخواتها'),
            ('^(إن|أن|ان|كأن|لكن|لعل|ليت)$','إن وأخواتها'),
            ('^(هل|من|أي|ما|ماذا|متى|أين|كيف|كم|لماذا|أنى|أيان)$', 'حرف /اسم استفهام'),
            ('^(حين|صباح|ظهر|ساعة|سنة|أمس|مساء)$', 'ظرف زمان'),
            ('^(فوق|تحت|أمام|وراء|حيث|دون)$', 'ظرف مكان'),
            ('^(الذي|التي|اللذان|اللتان|الذين|اللاتي|اللواتي|اللائي)$', 'اسم موصول'),
            ('([ا-ي]{3}ان)|([ا-ي]{3}ى)|([ا-ي]{3}ء)|[أا]حمر|[أا]صفر|[أا]خضر|رمادي|[أا]سود|[أا]زرق','صفة'),
            #('^([ا-ي]{2}ا[ا-ي])$|^([ا-ي]{2}و[ا-ي])$|^([ا-ي]{2}ي[ا-ي])$','صفة مشبهه باسم فاعل'),
            ('^([ا-ي]{3}ة)$|^(م[ا-ي]{2}و[ا-ي])$','اسم مفعول'),
            ('^(م[ا-ي]{3})$','اسمي الزمان والمكان'),
            ('^س?[نايت][ا-ي]{3,4}$|^[ا-ي]{3,4}$|^س?[نايت][ا-ي]ا[ا-ي]{2}$|^س?[نايت]ن[ا-ي]{3}$|^س?[نايت]ت[ا-ي]ا[ا-ي]{2}$|^[نايت]ست[ا-ي]{3}$|^[نايت]ت[ا-ي]{4}$','فعل'),
            ('^((وال)|(فال)|(بال)|(كال)|(ال)).+|^ت[ا-ي]{2}ي[ا-ي]$|^[ا-ي]{2}[واي][ا-ي]$', 'اسم'),
            ('.+((ائي)|(انك)|(انه)|(اؤك)|(اؤه)|(اءك)|(اءه)|(هما)|(كما)|(ات)|(ة))$|^[ا-ي]ا[ا-ي]{2}ة?$', 'اسم'),
            ('','اسم'),
        ]
        reg = RegexpTagger(patterns)

        tmpList = []
        for k in List:
            tmp = araby.strip_tashkeel(k)
            # BUG FIX: the original inner loop overwrote tmp2 on every
            # suffix test, so a successful match was thrown away unless it
            # happened to be the last entry of self.s2.  Strip at the
            # first matching suffix and stop.
            tmp2 = tmp
            for suffix in self.s2:
                if tmp.endswith(suffix):
                    # NOTE(review): always strips exactly 2 characters --
                    # assumes every entry of self.s2 is two characters
                    # long; TODO confirm.
                    tmp2 = tmp[:-2]
                    break
            tmpList.append(tmp2)
        return reg.tag(tmpList)
Exemplo n.º 2
0
class MaltParser(ParserI):
    """Legacy interface to the external MaltParser dependency parser.

    Sentences are exchanged with the malt jar through fixed-name CoNLL
    files in the system temp directory, and the jar is launched via
    ``os.system`` (see ``_execute``).
    """

    def __init__(self, tagger=None):
        """
        :param tagger: an object with a ``tag(words)`` method used to
            POS-tag input sentences; if None, a crude suffix-based
            English ``RegexpTagger`` is used instead.
        """
        self.config_malt()
        self.mco = 'malt_temp'   # base name of the .mco model file
        self._trained = False    # flipped to True by train_from_file()

        if tagger is not None:
            self.tagger = tagger
        else:
            # Fallback tagger: patterns are tried in order; the final
            # catch-all makes 'NN' the default tag.
            self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the C{malt} package.  This
        searches for a directory containing the malt jar.

        :param bin: The full path to the C{malt} binary.  If not
            specified, then nltk will search the system for a C{malt}
            binary; and if one is not found, it will raise a
            C{LookupError} exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by L{config_malt} when searching
        #: for the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/malt-1*',
                     '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: L{list} of L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: L{string}
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: L{list} of (word, tag) L{tuple}s.
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")

        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

        # Write the tagged sentence in 10-column CoNLL format with dummy
        # head/deprel fields ('0'/'a').  The 'with' block guarantees the
        # handle is closed before the jar reads the file.
        with open(input_file, 'w') as f:
            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')

        # Output redirection (when not verbose) is handled by _execute.
        cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(),
               '-c %s' % self.mco, '-i %s' % input_file, '-o %s' % output_file, '-m parse']

        self._execute(cmd, 'parse', verbose)

        return DependencyGraph.load(output_file)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of C{DependencyGraph}s

        :param depgraphs: list of C{DependencyGraph}s for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')

        # Dump the training graphs in 10-column CoNLL format.
        with open(input_file, 'w') as f:
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))

        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, extract its contents to a
        # regular temp file so the malt jar can read it from disk.
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            with open(conll_file, 'w') as f:
                f.write(conll_str)

        cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(),
               '-c %s' % self.mco, '-i %s' % conll_file, '-m learn']

        self._execute(cmd, 'train', verbose)

        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        # 'type' shadows the builtin but is kept for interface
        # compatibility.  When not verbose, stdout/stderr are redirected
        # to malt_<type>.out / malt_<type>.err in the temp directory; the
        # pieces are then joined and run through the shell via os.system.
        if not verbose:
            temp_dir = os.path.join(tempfile.gettempdir(), '')
            cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' % ((temp_dir, type)*2))
        malt_exit = os.system(' '.join(cmd))
Exemplo n.º 3
0
from nltk.tag import RegexpTagger

# Regex tag patterns: tried in order, first match wins, so the final
# catch-all '.*' entry supplies the default 'NN' tag.
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
]
rt = RegexpTagger(patterns)

# 'test_data', 'tokens' and 'train_data' are presumably defined earlier
# in the notebook/script -- TODO confirm.
# print() works in both Python 2 and Python 3 for a single argument.
print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Train the N-gram taggers on pre-tagged sentences.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
Exemplo n.º 4
0
class MaltParser(ParserI):
    """Interface to the external MaltParser dependency parser.

    Input and output are exchanged through temporary CoNLL files in
    ``working_dir``; the malt jar is launched as a subprocess (see
    ``_execute``).  A pre-trained ``.mco`` model may be supplied via the
    ``mco`` constructor argument, otherwise ``train()`` must be called
    before parsing.
    """

    def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
        """
        An interface for parsing with the Malt Parser.

        :param mco: The name of the pre-trained model. If provided, training
            will not be required, and MaltParser will use the model file in
            ${working_dir}/${mco}.mco.
        :type mco: str
        """
        self.config_malt()
        self.mco = 'malt_temp' if mco is None else mco
        self.working_dir = tempfile.gettempdir() if working_dir is None\
                           else working_dir
        self.additional_java_args = [] if additional_java_args is None else additional_java_args
        # A supplied pre-trained model means train() is not required.
        self._trained = mco is not None

        if tagger is not None:
            self.tagger = tagger
        else:
            # Fallback suffix-based English tagger; the final catch-all
            # pattern makes 'NN' the default tag.
            self.tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/malt-1*',
                     '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://www.maltparser.org/',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.batch_parse([sentence], verbose)[0]

    def batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentence. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentences: Input sentences to parse
        :type sentence: list(list(str))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        """
        tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
        return self.tagged_batch_parse(tagged_sentences, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        return self.tagged_batch_parse([sentence], verbose)[0]

    def tagged_batch_parse(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentence: list(list(tuple(str, str)))
        :return: list(``DependencyGraph``) the dependency graph representation
                 of each sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")

        # delete=False so the files survive close() for the java process
        # to read/write; both are removed in the 'finally' block below.
        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                 dir=self.working_dir,
                                                 delete=False)

        try:
            # Write each sentence in 10-column CoNLL format with dummy
            # head/deprel fields ('0'/'a').
            for sentence in sentences:
                for (i, (word, tag)) in enumerate(sentence, start=1):
                    input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                    input_file.write(input_str.encode("utf8"))
                # NOTE(review): each token line already ends in '\n', so
                # this leaves two blank lines between sentences -- confirm
                # whether a single blank line is intended.
                input_file.write(b'\n\n')
            input_file.close()

            cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco, '-i', input_file.name,
                   '-o', output_file.name, '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            return DependencyGraph.load(output_file.name)
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        try:
            input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
            input_file.write(input_str.encode("utf8"))
            input_file.close()
            self.train_from_file(input_file.name, verbose=verbose)
        finally:
            input_file.close()
            os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")

        # If conll_file is a ZipFilePathPointer, then we need to do some extra
        # massaging: extract the zipped training data to a real file on
        # disk and recurse with its path.
        if isinstance(conll_file, ZipFilePathPointer):
            input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
                                                     dir=self.working_dir,
                                                     delete=False)
            try:
                conll_str = conll_file.open().read()
                conll_file.close()
                input_file.write(conll_str)
                input_file.close()
                return self.train_from_file(input_file.name, verbose=verbose)
            finally:
                input_file.close()
                os.remove(input_file.name)

        cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
               '-c', self.mco, '-i', conll_file, '-m', 'learn']

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser training (%s) "
                            "failed with exit code %d" %
                            (' '.join(cmd), ret))

        self._trained = True

    @staticmethod
    def _execute(cmd, verbose=False):
        # Unless verbose, both stdout and stderr are captured into pipes
        # (and discarded); returns the subprocess exit code.
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()
Exemplo n.º 5
0
class MaltParser(ParserI):
    """Legacy interface to the external MaltParser dependency parser.

    Input/output is exchanged through fixed-name CoNLL files in the
    system temp directory and the malt jar is launched with
    ``os.system`` (see ``_execute``).
    """

    def __init__(self, tagger=None):
        """
        :param tagger: an object with a ``tag(words)`` method; defaults
            to a crude suffix-based English ``RegexpTagger``.
        """
        self.config_malt()
        self.mco = 'malt_temp'  # base name of the .mco model file
        self._trained = False  # set to True by train_from_file()

        if tagger is not None:
            self.tagger = tagger
        else:
            # Patterns are tried in order; the final catch-all makes
            # 'NN' the default tag.
            self.tagger = RegexpTagger([
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN')  # nouns (default)
            ])

    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar.

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        """
        #: A list of directories that should be searched for the malt
        #: executables.  This list is used by ``config_malt`` when searching
        #: for the malt executables.
        _malt_path = [
            '.', '/usr/lib/malt-1*', '/usr/share/malt-1*', '/usr/local/bin',
            '/usr/local/malt-1*', '/usr/local/bin/malt-1*',
            '/usr/local/malt-1*', '/usr/local/share/malt-1*'
        ]

        # Expand wildcards in _malt_path:
        malt_path = reduce(add, map(glob.glob, _malt_path))

        # Find the malt binary.
        self._malt_bin = find_binary(
            'malt.jar',
            bin,
            searchpath=malt_path,
            env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)

    def parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        words; it will be automatically tagged with this MaltParser instance's
        tagger.

        :param sentence: Input sentence to parse
        :type sentence: list(str)
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        taggedwords = self.tagger.tag(sentence)
        return self.tagged_parse(taggedwords, verbose)

    def raw_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged with this
        MaltParser instance's tagger.

        :param sentence: Input sentence to parse
        :type sentence: str
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        words = word_tokenize(sentence)
        return self.parse(words, verbose)

    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured.  Call config_malt() first."
            )
        if not self._trained:
            raise Exception(
                "Parser has not been trained.  Call train() first.")

        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

        # Write the tagged sentence in 10-column CoNLL format with dummy
        # head/deprel fields ('0'/'a').  The 'with' block guarantees the
        # handle is closed before the jar reads the file.
        with open(input_file, 'w') as f:
            for (i, (word, tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i + 1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')

        # Output redirection (when not verbose) is handled by _execute.
        cmd = [
            'java',
            '-jar %s' % self._malt_bin,
            '-w %s' % tempfile.gettempdir(),
            '-c %s' % self.mco,
            '-i %s' % input_file,
            '-o %s' % output_file, '-m parse'
        ]

        self._execute(cmd, 'parse', verbose)

        return DependencyGraph.load(output_file)

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        """
        input_file = os.path.join(tempfile.gettempdir(), 'malt_train.conll')

        # Dump the training graphs in 10-column CoNLL format.
        with open(input_file, 'w') as f:
            f.write('\n'.join([dg.to_conll(10) for dg in depgraphs]))

        self.train_from_file(input_file, verbose=verbose)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data
        """
        if not self._malt_bin:
            raise Exception(
                "MaltParser location is not configured.  Call config_malt() first."
            )

        # If conll_file is a ZipFilePathPointer, extract its contents to a
        # regular temp file so the malt jar can read it from disk.
        if hasattr(conll_file, 'zipfile'):
            zip_conll_file = conll_file
            conll_file = os.path.join(tempfile.gettempdir(),
                                      'malt_train.conll')
            conll_str = zip_conll_file.open().read()
            with open(conll_file, 'w') as f:
                f.write(conll_str)

        cmd = [
            'java',
            '-jar %s' % self._malt_bin,
            '-w %s' % tempfile.gettempdir(),
            '-c %s' % self.mco,
            '-i %s' % conll_file, '-m learn'
        ]

        self._execute(cmd, 'train', verbose)

        self._trained = True

    def _execute(self, cmd, type, verbose=False):
        # 'type' shadows the builtin but is kept for interface
        # compatibility.  When not verbose, stdout/stderr are redirected
        # to malt_<type>.out / malt_<type>.err in the temp directory; the
        # pieces are then joined and run through the shell via os.system.
        if not verbose:
            temp_dir = os.path.join(tempfile.gettempdir(), '')
            cmd.append(' > %smalt_%s.out 2> %smalt_%s.err' %
                       ((temp_dir, type) * 2))
        malt_exit = os.system(' '.join(cmd))
Exemplo n.º 6
0
# Patterns are tried top-down: the first regex that matches a word
# supplies its tag, so the catch-all '.*' entry at the end acts as the
# default tag
patterns = [
    (r".*ing$", "VBG"),  # Gerunds
    (r".*ed$", "VBD"),  # Simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # Modals
    (r".*'s$", "NN$"),  # Possessive nouns
    (r".*s$", "NNS"),  # Plural nouns
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN")  # Nouns (default)
]

rt = RegexpTagger(regexps=patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
#    Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
#    letters, characters or syllables. Shingles: n-grams where items are just words.
#    UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))
Exemplo n.º 7
0
# Build a discourse tester backed by MaltParser + DRT glue semantics.
depparser = nltk.MaltParser(
    tagger=tagger,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser-1.9.2')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(['Every dog chases a boy', 'He runs'], rc)
dt.readings()

# TypeError: 'RegexpTagger' object is not callable
# Presumably caused by an nltk version mismatch -- TODO confirm

import nltk

pattern = [(r'(March)$', 'MAR')]
tagger = nltk.RegexpTagger(pattern)
# NOTE(review): tagging a raw string tags each *character*; the
# word_tokenize variant below is the intended usage.
print(tagger.tag('He was born in March 1991'))
print(tagger.tag(nltk.word_tokenize('He was born in March 1991')))

# Below is the fix suggested on Zhihu; it still fails when tested.
# See https://www.zhihu.com/people/meng-hui-wei-lai-de-colin/activities
tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                       ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                       ('^(He)$', 'PRP')])
depparser = nltk.MaltParser(
    tagger=tagger.tag,
    parser_dirname='D:\\Users\\Administrator\\Library\\maltparser')
rc = nltk.DrtGlueReadingCommand(depparser=depparser)
dt = nltk.DiscourseTester(
    [sent.split() for sent in ['Every dog chases a boy']], reading_command=rc)
dt.readings()
Exemplo n.º 8
0
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NNS'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default)
]

# 'patterns', 'test_data', 'train_data' and 'sentence' are presumably
# defined earlier in the notebook -- TODO confirm.
rt = RegexpTagger(patterns)

# accuracy on test data
print(rt.evaluate(test_data))

# tagging our sample headline
rt.tag(nltk.word_tokenize(sentence))

#%%
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# NOTE(review): the three taggers are trained independently here -- no
# backoff chain is set up between them.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print('unigram tagger: ')
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))
import nltk

# Tokenize a cloze-style passage (blanks rendered as '__________').
text = nltk.word_tokenize("It is refreshing to read a book about our planet by an author who does not allow \
facts to be __________ by politics: well aware of the political disputes about \
the effects of human activities on climate and biodiversity, this author does not \
permit them to __________ his comprehensive description of what we know \
about our biosphere. He emphasizes the enormous gaps in our knowledge, the \
sparseness of our observations, and the __________, calling attention to the \
many aspects of planetary evolution that must be better understood before we \
can accurately diagnose the condition of our planet.")

# print(nltk.pos_tag(text))

from nltk.tag import RegexpTagger
# Regex tag patterns: tried in order, first match wins; the final '.*'
# entry supplies the default 'NN' tag.
patterns = [
        (r'.*who$', 'Clause'),
        (r'.*what$', 'Clause'),
        (r'.*It$', 'Clause'),
        (r'.*:$', 'Repeat'),
        (r'.*not$', 'Reverse'),
        (r'.*this$', 'Refer'),
        (r'.*them$', 'Refer'),
        (r'.*better$', 'Positive'),
        (r'.*dispute$', 'Negative'),
        (r'.*', 'NN')                     # nouns (default)
]
rt = RegexpTagger(patterns)

# print() works in both Python 2 and Python 3 for a single argument.
print(rt.tag(text))