def train(self, inst):
        """
        Learn trigger/descriptor relations from a single content item.

        `inst` is either a content instance or its pk: an `int` or a `str` is
        resolved through `config.content_model_getter`. The text is read from
        the attribute named by `config.SULCI_CONTENT_PROPERTY`; the human
        defined descriptors come from `config.descriptors_getter`.

        Each non-empty descriptor is fetched (or created) and its counter is
        incremented; the same is done with one trigger per keyentity found in
        the text. Finally every trigger is connected to every descriptor.

        Returns None. The item is skipped (with a log line) when it has no
        text or no descriptors. If the tagger raises `ValueError` the method
        returns silently, after the descriptor counters were updated.

        Original author's note: for the moment, human defined descriptors are
        a string with "," separator.
        """
        if isinstance(inst, (int, str)):
            # Not a model instance: guess it is a pk and load the content.
            inst = config.content_model_getter(inst)
        text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
        descriptors = config.descriptors_getter(inst)
        if not (descriptors and text):
            sulci_logger.info(u"Skipping item without data")
            return

        # Step 1: collect the descriptors attached to the item.
        known_descriptors = set()
        for name in descriptors:
            if name:
                # Descriptors missing from the thesaurus are created on the
                # fly, because article and thesaurus descriptors do not
                # always match (the original author flags this as temporary).
                # Two steps are disabled in the original code: normalising
                # the name (strip + curly apostrophe replacement) and
                # switching to the descriptor's `primeval` value.
                descriptor, is_new = Descriptor.get_or_connect(name=name)
                descriptor.count.hincrby(1)
                known_descriptors.add(descriptor)
                if is_new:
                    message = u"Lairning descriptor not in thesaurus : %s" % unicode(descriptor)
                    sulci_logger.info(message, "RED")

        # Step 2: extract the keyentities of the text.
        try:
            tagger = SemanticalTagger(
                text,
                thesaurus=self.thesaurus,
                pos_tagger=self.pos_tagger,
                lexicon=self.pos_tagger.lexicon
            )
            # While learning, try to filter out duplicated keyentities.
            tagger.deduplicate_keyentities()
        except ValueError:
            # SemanticalTagger raises ValueError when the text is empty.
            return

        # Step 3: one trigger per keyentity, retrieved or created.
        # (Storing `trigger_score` on the trigger is disabled in the
        # original code.)
        triggers = set()
        for keyentity in tagger.keyentities:
            trigger, _created = Trigger.get_or_connect(original=unicode(keyentity))
            triggers.add(trigger)
            trigger.count.hincrby(1)

        # Step 4: for now, simply create every trigger -> descriptor relation.
        for descriptor in known_descriptors:
            for trigger in triggers:
                trigger.connect(descriptor, 1)
예제 #2
0
 def handle(self, *args):
     """
     Tag the content identified by `self.PK` and log its scored descriptors.

     When `self.PK` is falsy only a reminder is logged. Otherwise the
     content is loaded through `config.content_model_getter`, its text
     (attribute named by `config.SULCI_CONTENT_PROPERTY`) is handed to a
     `SemanticalTagger`, and each `(descriptor, score)` pair of
     `S.descriptors` is logged.

     In both cases, an ipdb session is opened at the end when `self.IPDB`
     is truthy.
     """
     if self.PK:
         # NOTE(review): the single-letter names are kept as in the original;
         # `C` and `M` are never read here, presumably they are only bound so
         # they can be inspected from the ipdb session below -- confirm.
         C = Corpus()
         L = Lexicon()
         P = PosTagger(lexicon=L)
         M = Lemmatizer(L)
         a = config.content_model_getter(self.PK)
         t = getattr(a, config.SULCI_CONTENT_PROPERTY)
         T = Thesaurus()
         S = SemanticalTagger(t, T, P, lexicon=L)
         if __debug__:
             S.debug()
         sulci_logger.info(u"Scored descriptors", "YELLOW", True)
         for descriptor, score in S.descriptors:
             line = u"%s %f" % (unicode(descriptor), score)
             sulci_logger.info(line, "BLUE")
     else:
         sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")

     if self.IPDB:
         import ipdb
         ipdb.set_trace()
 def train(self, inst):
     """
     Feed the ngrams of one content item to `self.global_pmi`.

     `inst` is either a content instance or its pk: an `int` or a `str` is
     resolved through `config.content_model_getter`. The text is read from
     the attribute named by `config.SULCI_CONTENT_PROPERTY`.

     Returns None. Nothing is recorded when building or deduplicating the
     tagger raises `ValueError`.
     """
     if isinstance(inst, (int, str)):
         # Not a model instance: guess it is a pk and load the content.
         inst = config.content_model_getter(inst)
     text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
     try:
         tagger = SemanticalTagger(
             text,
             thesaurus=self.thesaurus,
             pos_tagger=self.pos_tagger,
             lexicon=self.pos_tagger.lexicon
         )
         # While learning, try to filter out duplicated keyentities.
         tagger.deduplicate_keyentities()
     except ValueError:
         # SemanticalTagger raises ValueError when the text is empty.
         return
     else:
         # Unigrams are wanted too, hence min_length=1.
         # Note (original author): the stopwords will not be returned.
         found = tagger.ngrams(min_length=1, max_length=5)
         for _key, entry in found.iteritems():
             self.global_pmi.add_ngram(entry['stemms'], amount=entry['count'])
예제 #4
0
파일: sulci_train.py 프로젝트: quatre/sulci
    def handle(self, *args, **options):
        """
        Run the training selected by the command flags.

        Everything happens inside `UseDB(config.TRAINING_DATABASE)`:

        * `self.LEXICON` rebuilds the lexicon (`L.make(self.FORCE)`).
        * `self.SUBPROCESSES` spawns that many `sulci_train.py` slave
          processes, then switches `self.MODE` to "master".
        * At most one trainer runs, the first flag set among LEXICAL,
          CONTEXTUAL, LEMMATIZER, PMI and SEMANTICAL.
        * `self.ADD_CANDIDATE` exports the content `self.PK` as a corpus
          candidate (a PK is required).
        * `self.IPDB` opens an ipdb session at the end.
        """
        with UseDB(config.TRAINING_DATABASE):
            sulci_logger.info(u"STARTING TRAINING WITH DATABASE «%s»" % config.TRAINING_DATABASE, "RED", True)
            # NOTE(review): the single-letter names are kept as in the
            # original, presumably so they can be inspected from the ipdb
            # session at the end -- confirm.
            C = Corpus()
            L = Lexicon()
            M = Lemmatizer(L)
            P = PosTagger(lexicon=L)
            if self.LEXICON:
                L.make(self.FORCE)

            if self.SUBPROCESSES:
                import subprocess

                # Command line flag handed to the slaves: the first training
                # flag that is set wins, contextual being the fallback.
                if self.LEXICAL:
                    training_kind = "-e"
                elif self.LEMMATIZER:
                    training_kind = "-r"
                elif self.SEMANTICAL:
                    training_kind = "-n"
                elif self.PMI:
                    training_kind = "-p"
                else:
                    training_kind = "-c"  # CONTEXTUAL

                # Create the slaves.
                for i in xrange(self.SUBPROCESSES):
                    sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                    sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                    if self.START is not None:
                        sub_args += ["--start=%s" % self.START]
                    subprocess.Popen(sub_args)

                # This process now drives the training.
                self.MODE = "master"
                # Leave the slaves some time to start.
                time.sleep(1)

            # Run the selected trainer (only one of them).
            if self.LEXICAL:
                T = LexicalTrainer(P, C, self.MODE)
                T.do()
            elif self.CONTEXTUAL:
                T = ContextualTrainer(P, C, self.MODE)
                T.do()
            elif self.LEMMATIZER:
                T = LemmatizerTrainer(M, self.MODE)
                T.do()
            elif self.PMI:
                T = Thesaurus()
                G = GlobalPMITrainer(T, P, self.MODE)
                G.do()
            elif self.SEMANTICAL:
                T = Thesaurus()
                S = SemanticalTrainer(T, P, self.MODE)
                if not self.PK:
                    if self.FORCE:
                        S.begin()
                    S.do(start=self.START)
                else:
                    # Should not have a PK when MODE == "master".
                    a = config.content_model_getter(self.PK)
                    S.train(a)
                # Disabled in the original code: calling
                # S.clean_connections() when in master mode with FORCE.

            if self.ADD_CANDIDATE:
                if self.PK:
                    a = config.content_model_getter(self.PK)
                    t = getattr(a, config.SULCI_CONTENT_PROPERTY)
                    T = TextCorpus()
                    T.prepare(t, P, M)
                    T.export(self.PK, self.FORCE, self.ADD_LEMMES)
                else:
                    print("A PK is needed. Use -k xxx")

            if self.IPDB:
                import ipdb

                ipdb.set_trace()