Пример #1
0
    def testNotEqual(self):
        """Checks the KBTextualUnit inequality operator (__ne__).

        A unit with the same corpus/language/form as self._tu must not
        compare unequal; units differing in normalized form or in corpus
        name must compare unequal.
        """
        # Identical to the unit built in setUp: expected equal.
        tu1 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags
        # Different normalized form/tokens/lemmas/stems: expected unequal.
        tu2 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test2",  # normalized form
            ["test2"],  # tokens
            ["test2"],  # lemmas
            ["test2"],  # stems
            ["N"])  # POS tags
        # Different corpus name only: expected unequal.
        tu3 = model.KBTextualUnit(
            "test-corpus2",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags

        # failIf/failUnless are deprecated unittest aliases (removed in
        # Python 3.12); assertFalse/assertTrue evaluate the same predicate.
        self.assertFalse(self._tu != tu1)
        self.assertTrue(self._tu != tu2)
        self.assertTrue(self._tu != tu3)
Пример #2
0
 def setUp(self):
     """Builds the textual unit under test before each test method."""
     corpus_name = "test-corpus"
     language = "fr"
     normalized_form = "test"
     tokens = ["test"]
     lemmas = ["test"]
     stems = ["test"]
     pos_tags = ["N"]
     self._tu = model.KBTextualUnit(corpus_name,
                                    language,
                                    normalized_form,
                                    tokens,
                                    lemmas,
                                    stems,
                                    pos_tags)
Пример #3
0
    def testNotEqual(self):
        """Checks the KBTextualUnitCluster inequality operator (__ne__).

        A cluster with the same units and centroid as self._tuc must not
        compare unequal; clusters with a different centroid or a different
        set of units must compare unequal.
        """
        tuc1 = model.KBTextualUnitCluster()
        tuc2 = model.KBTextualUnitCluster()
        tuc3 = model.KBTextualUnitCluster()
        tu1 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags
        tu2 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test2",  # normalized form
            ["test 2"],  # tokens
            ["test 2"],  # lemmas
            ["test 2"],  # stems
            ["N"])  # POS tags
        tu3 = model.KBTextualUnit(
            "test-corpus2",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags

        # tuc1 mirrors self._tuc exactly: expected equal.
        self._tuc.addTextualUnit(tu1)
        self._tuc.addTextualUnit(tu2)
        self._tuc.centroid = tu2
        tuc1.addTextualUnit(tu1)
        tuc1.addTextualUnit(tu2)
        tuc1.centroid = tu2
        # tuc2 has the same units but a different centroid: expected unequal.
        tuc2.addTextualUnit(tu1)
        tuc2.addTextualUnit(tu2)
        tuc2.centroid = tu1
        # tuc3 has a different set of units: expected unequal.
        tuc3.addTextualUnit(tu2)
        tuc3.centroid = tu2

        # failIf/failUnless are deprecated unittest aliases (removed in
        # Python 3.12); assertFalse/assertTrue evaluate the same predicate.
        self.assertFalse(self._tuc != tuc1)
        self.assertTrue(self._tuc != tuc2)
        self.assertTrue(self._tuc != tuc3)
Пример #4
0
    def testSetWrongCentroid(self):
        """Setting a unit that does not belong to the cluster as centroid
        must raise a KBTextualUnitClusterException.
        """
        outsider = model.KBTextualUnit("test-corpus",
                                       "fr",
                                       "test",
                                       ["test"],
                                       ["test"],
                                       ["test"],
                                       ["N"])

        # The unit was never added to self._tuc, so the setter must fail.
        with self.assertRaises(exception.KBTextualUnitClusterException):
            self._tuc.centroid = outsider
Пример #5
0
    def testAddExistingTextualUnit(self):
        """Adding a textual unit equal to one already in the cluster must
        raise a KBTextualUnitClusterException.
        """
        first = model.KBTextualUnit("test-corpus",
                                    "fr",
                                    "test",
                                    ["test"],
                                    ["test"],
                                    ["test"],
                                    ["N"])
        duplicate = model.KBTextualUnit("test-corpus",
                                        "fr",
                                        "test",
                                        ["test"],
                                        ["test"],
                                        ["test"],
                                        ["N"])

        self._tuc.addTextualUnit(first)

        with self.assertRaises(exception.KBTextualUnitClusterException):
            self._tuc.addTextualUnit(duplicate)
Пример #6
0
    def testSetCentroid(self):
        """Checks that the centroid can be set to a unit of the cluster and
        read back."""
        tu = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags

        self._tuc.addTextualUnit(tu)
        self._tuc.centroid = tu

        # failUnless is a deprecated unittest alias (removed in Python 3.12);
        # assertEqual uses the same == check and reports both values on failure.
        self.assertEqual(self._tuc.centroid, tu)
Пример #7
0
    def testAddTextualUnit(self):
        """Checks that added textual units are counted and kept in insertion
        order."""
        tu1 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test",  # normalized form
            ["test"],  # tokens
            ["test"],  # lemmas
            ["test"],  # stems
            ["N"])  # POS tags
        tu2 = model.KBTextualUnit(
            "test-corpus",  # corpus name
            "fr",  # language
            "test2",  # normalized form
            ["test 2"],  # tokens
            ["test 2"],  # lemmas
            ["test 2"],  # stems
            ["N"])  # POS tags

        self._tuc.addTextualUnit(tu1)
        self._tuc.addTextualUnit(tu2)

        # failUnless is a deprecated unittest alias (removed in Python 3.12);
        # assertEqual uses the same == check and reports both values on failure.
        self.assertEqual(self._tuc.numberOfTextualUnits(), 2)
        self.assertEqual(self._tuc.textual_units, [tu1, tu2])
Пример #8
0
  def _updateCandidateDictionary(self,
                                 candidates,
                                 document,
                                 sentence_offset,
                                 starting_token,
                                 ending_token):
    """Adds or updates a newly extracted candidate form in a candidate
    dictionary.

    Args:
      candidates: The candidate dictionary to update. Keys are candidate
        identifiers (mixtures of candidate forms and POS tags).
      document: The C{KBDocument} where the candidate is extracted from.
      sentence_offset: The index of the sentence of the document where the
        candidate is extracted.
      starting_token: The index of the first token of the candidate within the
        sentence it is extracted from.
      ending_token: The index of the last token of the candidate within the
        sentence it is extracted from (used below as an exclusive slice
        bound -- NOTE(review): confirm callers pass it as such).
    """

    tool_factory = core.KBBenchmark.singleton().run_tools[self._run_name]
    normalizer = tool_factory.normalizer(document.language)
    #---------------------------------------------------------------------------
    tokenized_sentence = document.full_text_sentence_tokens[sentence_offset]
    pos_tagged_sentence = document.full_text_sentence_pos_tags[sentence_offset]
    #---------------------------------------------------------------------------
    candidate_string = " ".join(tokenized_sentence[starting_token:ending_token])
    candidate_seen_form = candidate_string # FIXME tokenized form :{
    candidate_normalized_form = normalizer.normalize(candidate_string)
    candidate_normalized_tokens = candidate_normalized_form.split(" ")
    candidate_normalized_lemmas = document.full_text_token_lemmas[sentence_offset][starting_token:ending_token]
    candidate_normalized_stems = document.full_text_token_stems[sentence_offset][starting_token:ending_token]
    candidate_pos_tags = pos_tagged_sentence[starting_token:ending_token]
    #---------------------------------------------------------------------------
    candidate = model.KBTextualUnit(document.corpus_name,
                                    document.language,
                                    candidate_normalized_form,
                                    candidate_normalized_tokens,
                                    candidate_normalized_lemmas,
                                    candidate_normalized_stems,
                                    candidate_pos_tags)

    # BUG FIX: the original body referenced two undefined names ('candidates'
    # while the parameter was misspelled 'candiates', and a bare 'identifier'),
    # so it raised NameError on every call. The parameter is now spelled as the
    # docstring documents it, and the identifier is bound explicitly.
    identifier = candidate.identifier
    if identifier not in candidates:
      candidates[identifier] = candidate
    candidates[identifier].addOccurrence(candidate_seen_form,
                                         sentence_offset,
                                         starting_token)
  def _candidateExtraction(self, document):
    """Extracts the candidates of a given document.

    French-specific refinement: every candidate whose trailing adjective is
    rejected by C{self._check_adjective} is replaced by a copy stripped of
    that adjective, keeping all of its occurrences.

    Args:
      document: The C{KBDocument} from which the candidates must be extracted.

    Returns:
      The C{list} of extracted, and filtered, candidates (C{KBTextualUnit}s).
    """

    candidates = super(FrenchRefinedNounPhraseExtractor,
                       self)._candidateExtraction(document)

    for index, candidate in enumerate(candidates):
      # WARNING works only for N+A? (one adjective at the right)
      # check if the adjective must be filtered out or not
      if not self._check_adjective(candidate):
        # create a new candidate without the trailing adjective
        candidate_normalized_tokens = candidate.normalized_tokens[:-1]
        candidate_normalized_form = " ".join(candidate_normalized_tokens)
        candidate_normalized_lemmas = candidate.normalized_lemmas[:-1]
        candidate_normalized_stems = candidate.normalized_stems[:-1]
        candidate_pos_tags = candidate.pos_tags[:-1]
        new_candidate = model.KBTextualUnit(document.corpus_name,
                                            document.language,
                                            candidate_normalized_form,
                                            candidate_normalized_tokens,
                                            candidate_normalized_lemmas,
                                            candidate_normalized_stems,
                                            candidate_pos_tags)

        # transfer every occurrence, dropping the last space-separated token
        # of each seen form (the removed adjective)
        seen_forms = candidate.seen_forms
        for seen_form in seen_forms:
          new_seen_form = " ".join(seen_form.split(" ")[:-1]) # FIXME tokenized form, not seen form :{

          for sentence_offset, inner_sentence_offsets in seen_forms[seen_form].items():
            for inner_sentence_offset in inner_sentence_offsets:
              new_candidate.addOccurrence(new_seen_form,
                                          sentence_offset,
                                          inner_sentence_offset)

        candidates[index] = new_candidate

    # BUG FIX: 'candidates' is a sequence -- it is enumerate()d and reassigned
    # by integer index above, and this method is documented to return a list --
    # so the original 'return candidates.values()' would raise AttributeError.
    return candidates