def test_default(self):
    """Only the ngram repeated across both sentences is kept."""
    tagger = SemanticalTagger("Une phrase avec un mot dingue. "
                              "Une autre phrase avec le même mot dingue.")
    # "mot dingue" occurs in both sentences, so it should be the
    # single filtered ngram returned.
    filtered = tagger.filtered_ngrams()
    self.assertEqual(len(filtered), 1)
    lemmes = tuple(stemm.main_occurrence.lemme for stemm in filtered[0][0])
    self.assertEqual(lemmes, (u"mot", u"dingue"))
def test_retrieve_also_unigrams(self):
    """
    Passing min_length=1, unigrams must be returned in addition to the
    default ngrams.
    """
    # Docstring fixed: it previously described "min_count=1 ... ngrams
    # >= bigram", which is the opposite of what this test checks.
    text = SemanticalTagger("Une phrase avec un mot dingue. "
                            "Une autre phrase avec le même mot dingue.")
    # We should get the same than by default, plus the
    # non stop words, so 5 + [phrase, mot, dingue] = 8
    ngrams = text.ngrams(min_length=1)
    self.assertEqual(len(ngrams), 8)
def test_should_not_return_ngrams_longer_than_max_length(self):
    """
    Passing max_length=4, no ngram longer than 4 stemms should be
    returned.
    """
    # Docstring fixed: it was copy-pasted from the min_count test and
    # did not describe this case at all.
    text = SemanticalTagger("Une phrase avec un mot dingue. "
                            "Une autre phrase avec le même mot dingue.")
    # Of the five default ngrams, only these fit within max_length=4:
    # - phrase, avec, un, mot
    # - mot, dingue
    ngrams = text.ngrams(max_length=4)
    self.assertEqual(len(ngrams), 2)
def __call__(self, request, *args, **kwargs):
    """
    Tag the POSTed content and return the descriptors (and optionally
    the keyentities and the captured debug log) as a JSON response.

    On GET, return the help text of each form field instead, so a
    client can introspect the expected parameters.
    """
    c = {}
    if request.method == "POST":
        form = SulciForm(request.POST)
        if form.is_valid():
            db_name = form.cleaned_data["corpus"]
            with UseDB(db_name):
                t1 = time.time()
                content = form.cleaned_data["content"]
                limit = form.cleaned_data["limit"]
                min_score = form.cleaned_data["min_score"]
                handler = None
                if form.cleaned_data["debug"]:
                    # Capture the tagger logs in memory so they can be
                    # sent back in the response.
                    debug = []
                    handler = MemoryStorageHandler(10, target=debug)
                    formatter = HTMLColorFormatter("%(message)s")
                    handler.setFormatter(formatter)
                    sulci_logger.addHandler(handler)
                try:
                    S = SemanticalTagger(content)
                    descriptors = [
                        (unicode(d), round(score, 2))
                        for d, score in S.get_descriptors(min_score)[:limit]
                    ]
                    if form.cleaned_data['keyentities']:
                        sorted_ke = sorted(
                            S.keyentities,
                            key=lambda k: k.frequency_relative_pmi_confidence,
                            reverse=True)
                        keyentities = [
                            (unicode(k),
                             round(k.frequency_relative_pmi_confidence * 100, 2))
                            for k in sorted_ke
                        ]
                    else:
                        keyentities = None
                    c = {
                        "descriptors": descriptors,
                        "keyentities": keyentities,
                    }
                    if form.cleaned_data["debug"]:
                        S.debug()
                        handler.flush()
                        c["debug"] = [handler.format(d) for d in debug]
                finally:
                    if handler is not None:
                        # BUGFIX: the handler was previously left attached
                        # to the module-level logger, accumulating one
                        # handler (and its buffered records) per debug
                        # request.
                        sulci_logger.removeHandler(handler)
                t2 = time.time()
                c['time'] = round(t2 - t1, 2)
        else:
            c = {'errors': form.errors}
    else:
        form = SulciForm()
        for field_name, field in form.fields.iteritems():
            c[field_name] = field.help_text
    return HttpResponse(json.dumps(c), content_type="application/json")
def train(self, inst):
    """
    Learn the association between one item's text and its
    human-validated descriptors.

    For the moment, human defined descriptors are a string with ","
    separator.
    """
    if isinstance(inst, (int, str)):
        # We guess we have a pk here, so fetch the real instance.
        inst = config.content_model_getter(inst)
    text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
    descriptors = config.descriptors_getter(inst)
    if not descriptors or not text:
        # Nothing can be learned without both text and descriptors.
        sulci_logger.info(u"Skipping item without data")
        return
    validated_descriptors = set()
    # Retrieve (or create) the Descriptor object for each raw name.
    for d in descriptors:
        if not d:
            continue
        # d = d.strip().replace(u"’", u"'")
        # We create the descriptor not in thesaurus for now
        # because descriptors in article and thesaurus are not
        # always matching. Will be improved.
        dsc, created = Descriptor.get_or_connect(name=d)
        dsc.count.hincrby(1)
        # Retrieve the primeval value
        # dsc = dsc.primeval
        validated_descriptors.add(dsc)
        if created:
            sulci_logger.info(u"Lairning descriptor not in thesaurus : %s" % unicode(dsc), "RED")
    # Retrieve keyentities by tagging the text:
    try:
        S = SemanticalTagger(
            text,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        S.deduplicate_keyentities()  # During learning, try to filter
    except ValueError:
        # SemanticalTagger raises ValueError if the text is empty.
        return
    current_triggers = set()
    for ke in S.keyentities:
        # Retrieve or create the trigger matching this keyentity.
        t, created = Trigger.get_or_connect(original=unicode(ke))
        current_triggers.add(t)
        t.count.hincrby(1)
        # t.current_score = ke.trigger_score
    # For now, only create all the (trigger, descriptor) relations,
    # each with an initial weight of 1.
    for d in validated_descriptors:
        for t in current_triggers:
            t.connect(d, 1)
def validate_file(self, filepath):
    """Check the tagger's keyentities for *filepath* against its expected output."""
    expected, content = self.split_file_content(filepath)
    tagger = SemanticalTagger(content)
    # Flatten each keyentity into a space-separated string of lemmes.
    computed = [
        " ".join(stemm.main_occurrence.lemme for stemm in ke)
        for ke in tagger.keyentities
    ]
    return self.compare_lists(expected, computed)
def test_default(self):
    """A default call returns exactly the five expected ngrams."""
    tagger = SemanticalTagger("Une phrase avec un mot dingue. "
                              "Une autre phrase avec le même mot dingue.")
    # Stop words at the beginning or the end of an ngram are skipped.
    expected = set([
        (u"phrase", u"avec", u"un", u"mot", u"dingue"),
        (u"phrase", u"avec", u"un", u"mot"),
        (u"phrase", u"avec", u"le", u"même", u"mot", u"dingue"),
        (u"phrase", u"avec", u"le", u"même", u"mot"),
        (u"mot", u"dingue"),
    ])
    ngrams = tagger.ngrams()
    self.assertEqual(len(ngrams), 5)
    flattened = set(
        tuple(stemm.main_occurrence.lemme for stemm in ngram)
        for ngram in ngrams
    )
    self.assertEqual(expected, flattened)
def handle(self, *args):
    """Tag the content identified by ``self.PK`` and log its scored descriptors."""
    if not self.PK:
        sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")
        return
    # NOTE(review): corpus and lemmatizer are never used afterwards;
    # kept because their constructors may load data as a side effect —
    # confirm before removing.
    corpus = Corpus()
    lexicon = Lexicon()
    pos_tagger = PosTagger(lexicon=lexicon)
    lemmatizer = Lemmatizer(lexicon)
    obj = config.content_model_getter(self.PK)
    content = getattr(obj, config.SULCI_CONTENT_PROPERTY)
    thesaurus = Thesaurus()
    semantical_tagger = SemanticalTagger(content, thesaurus, pos_tagger,
                                         lexicon=lexicon)
    if __debug__:
        semantical_tagger.debug()
    sulci_logger.info(u"Scored descriptors", "YELLOW", True)
    for descriptor, score in semantical_tagger.descriptors:
        sulci_logger.info(u"%s %f" % (unicode(descriptor), score), "BLUE")
    if self.IPDB:
        import ipdb; ipdb.set_trace()
def train(self, inst):
    """Feed the global PMI with the ngrams extracted from one content item."""
    if isinstance(inst, (int, str)):
        # Probably a pk: fetch the actual instance.
        inst = config.content_model_getter(inst)
    text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
    try:
        semantical_tagger = SemanticalTagger(
            text,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        # During learning, try to filter duplicated keyentities.
        semantical_tagger.deduplicate_keyentities()
    except ValueError:
        # SemanticalTagger raises ValueError when the text is empty.
        return
    # Unigrams are wanted too (min_length=1); stopwords are never
    # returned by ngrams().
    ngrams = semantical_tagger.ngrams(min_length=1, max_length=5)
    for key, values in ngrams.iteritems():
        self.global_pmi.add_ngram(values['stemms'], amount=values['count'])