Exemplo n.º 1
0
    def do_process(self, ctm_sentences, docx_sentences, **_):
        """Compute pairwise cosine-similarity scores between the two sides.

        Args:
            ctm_sentences: sequence of embedding tensors for the ctm side
                (must be stackable along dim 0, i.e. same shape each).
            docx_sentences: sequence of embedding tensors for the docx side.

        Returns:
            The tensor returned by ``cosine_similarity`` on the two stacked
            (n, dim) matrices.
        """
        # BUG FIX: debug messages were misspelled ("desribe").
        print("describe ctm_s in do_process %s" % describe(ctm_sentences))
        print("describe docx_s in do_process %s" % describe(docx_sentences))

        # Stack the per-sentence embeddings into (n, dim) matrices so the
        # similarity is computed in one batched call.
        ctm_sentences = torch.stack(ctm_sentences, 0)
        docx_sentences = torch.stack(docx_sentences, 0)

        return cosine_similarity(ctm_sentences, docx_sentences)
Exemplo n.º 2
0
    def do_scatter(self, sentences, size, overlap):
        """Split ``sentences`` into overlapping windows and aggregate each.

        Args:
            sentences: sequence of prepared sentence representations.
            size: window length (number of sentences per window).
            overlap: number of sentences shared by consecutive windows;
                the stride is ``size - overlap``, so it must be positive
                (``range`` raises ValueError on a zero step).

        Returns:
            A ``(scattered, slices)`` pair: one aggregated value per window
            (via ``self.aggregate_fct``) and the slice that produced it.
        """
        n_sen = len(sentences)
        slices = []
        scattered = []
        for i in range(0, n_sen, size - overlap):
            # Windows are clipped at the end of the sequence.
            s = slice(i, min(n_sen, i + size))
            slices += [s]

            try:
                scattered += [self.aggregate_fct(sentences[s])]
            except Exception:
                # Dump debugging context, then propagate the failure.
                print("[in do_scatter] aggregate error")
                print("Aggregate fct: %s" % self.aggregate_fct.__name__)
                print("Sentences: %s" % str(sentences[s]))
                print("Scattered: %s" % str(scattered))

                # BUG FIX: was `raise e` followed by an unreachable `pass`;
                # a bare `raise` preserves the original traceback.
                raise

        print("described scattered: %s " % describe(scattered))
        return scattered, slices
Exemplo n.º 3
0
    def prepare(self,
                sequences,
                pooling_fct='sentence_sum_pooling',
                min_len=2,
                stopwords=None,
                with_scores=False,
                **_):
        """Embed each sequence and pool its word embeddings.

        Args:
            sequences: iterable of token sequences; when ``with_scores`` is
                True each item is a ``(word, score)`` pair.
            pooling_fct: pooling callable, or the name of one exported by
                ``autoalign.score.pooling``.
            min_len: unused here; kept for interface compatibility.
            stopwords: None, ``"fr"`` or ``"fr_big"`` — words dropped
                before pooling.
            with_scores: whether sequence items carry scores.

        Returns:
            List of pooled embedding tensors (a pooling function returning
            a list contributes several entries).

        Raises:
            ValueError: for an unknown stopword-list name.
            AttributeError: for an unknown pooling-function name.
        """
        if stopwords == "fr":
            stopwords = autoalign.stopwords.fr
        elif stopwords == "fr_big":
            stopwords = autoalign.stopwords.fr_big
        elif stopwords is not None:
            raise ValueError("No such stopwords '%s'" % stopwords)

        if isinstance(pooling_fct, str):
            try:
                pooling_fct = getattr(autoalign.score.pooling, pooling_fct)
            except AttributeError:
                # BUG FIX: the message formatted the undefined name
                # `sentence_sum_pooling`, which raised a NameError instead
                # of the intended AttributeError; report the requested name.
                raise AttributeError("Unknown pooling function '%s'" %
                                     pooling_fct)

        embs = []
        for seq in sequences:
            if with_scores:
                seq = [w for w, s in seq]

            if stopwords is not None:
                seq = [w for w in seq if w not in stopwords]

            emb = self.ses.seq2embs(seq)
            if len(emb) == 0:
                # Fall back to a zero vector so empty sequences still
                # contribute one embedding.
                emb += [torch.zeros(self.ses.n_dim)]

            # `emb` is guaranteed non-empty here, so pool unconditionally
            # (the former `else torch.zeros(...)` branch was unreachable).
            emb = pooling_fct(emb)
            if isinstance(emb, torch.Tensor):
                embs += [emb]
            elif isinstance(emb, list):
                embs += emb

        print("describing embs after SES.prepare %s" % describe(embs))
        return embs
Exemplo n.º 4
0
    def prepare(self,
                sentences,
                *args,
                lsa_components=10,
                with_scores=False,
                **kwargs):
        """Vectorize sentences with TF-IDF, reduce with LSA, and normalize.

        Args:
            sentences: list of strings, or list of token lists; when
                ``with_scores`` is True tokens are ``(word, score)`` pairs.
            lsa_components: number of TruncatedSVD components to keep.

        Returns:
            Float tensor of the normalized, reduced sentence vectors.
        """
        # A fresh LSA model is fitted on every call.
        self.lsa = TruncatedSVD(n_components=lsa_components)

        # Tokenized input is joined back into whitespace-separated strings.
        if isinstance(sentences[0], list):
            if with_scores:
                sentences = [" ".join([pair[0] for pair in sent])
                             for sent in sentences]
            else:
                sentences = [" ".join(sent) for sent in sentences]

        print("describe sentences in TFIDF prepare %s" % describe(sentences))

        # TF-IDF -> truncated SVD -> normalization, fitted in sequence.
        reduced = self.normalizer.fit_transform(
            self.lsa.fit_transform(
                self.vectorizer.fit_transform(sentences)))

        return torch.tensor(reduced).float()
Exemplo n.º 5
0
    def process(self,
                ctm_sentences,
                docx_sentences,
                *args,
                score_group=None,
                scatter_kwargs=None,
                ctm_scatter_kwargs=None,
                docx_scatter_kwargs=None,
                scorer_prepare_kwargs=None,
                score_threshold=None,
                **kwargs):
        """Prepare both sentence sets, optionally group them, and score.

        Args:
            ctm_sentences: raw ctm-side sentences (prepared with
                ``with_scores=True``).
            docx_sentences: raw docx-side sentences.
            score_group: optional grouping object providing
                ``scatter``/``gather``.
            scatter_kwargs: shared default kwargs used for a side whose
                specific kwargs dict is not given.
            ctm_scatter_kwargs: per-side override for the ctm scatter.
            docx_scatter_kwargs: per-side override for the docx scatter.
            scorer_prepare_kwargs: extra kwargs forwarded to ``prepare``.
            score_threshold: optional ``(min, replacement)`` pair; scores
                below ``min`` are set to ``replacement``.

        Returns:
            Dict with a single ``"scores"`` entry of size (n_ctm, n_docx).
        """
        # BUG FIX: the default for `scorer_prepare_kwargs` was the mutable
        # literal {}, shared across all calls; use a None sentinel instead.
        if scorer_prepare_kwargs is None:
            scorer_prepare_kwargs = {}

        if ctm_scatter_kwargs is None and scatter_kwargs is not None:
            ctm_scatter_kwargs = scatter_kwargs

        if docx_scatter_kwargs is None and scatter_kwargs is not None:
            docx_scatter_kwargs = scatter_kwargs

        # Scatter kwargs are only mandatory for non-padding groups.
        assert score_group is None or isinstance(score_group, autoalign.score.PaddingGroup) \
            or (ctm_scatter_kwargs is not None
                and docx_scatter_kwargs is not None)
        if docx_scatter_kwargs is None:
            docx_scatter_kwargs = {}
        if ctm_scatter_kwargs is None:
            ctm_scatter_kwargs = {}

        # Remember the original counts: gathering must map grouped scores
        # back to a (n_ctm, n_docx) matrix.
        n_ctm, n_docx = len(ctm_sentences), len(docx_sentences)
        ctm_sentences = self.prepare(ctm_sentences,
                                     with_scores=True,
                                     **scorer_prepare_kwargs)
        docx_sentences = self.prepare(docx_sentences, **scorer_prepare_kwargs)
        print("@Scorer.process after prepare ", describe(ctm_sentences))

        if score_group is not None:
            print("before ctm scatter: %d sentences" % len(ctm_sentences))
            ctm_sentences, ctm_group_slices = score_group.scatter(
                ctm_sentences, **ctm_scatter_kwargs)
            print("after ctm scatter: %d sentences, %d slices" %
                  (len(ctm_sentences), len(ctm_group_slices)))
            docx_sentences, docx_group_slices = score_group.scatter(
                docx_sentences, **docx_scatter_kwargs)

            n_ctm_groups = len(ctm_group_slices)
            n_docx_groups = len(docx_group_slices)
            assert len(ctm_sentences) == n_ctm_groups, "%d %d" % (
                len(ctm_sentences), n_ctm_groups)
            assert len(docx_sentences) == n_docx_groups

        scores = self.do_process(ctm_sentences, docx_sentences, *args,
                                 **kwargs)
        if score_threshold is not None:
            assert len(score_threshold) == 2
            # Clamp: every score below threshold[0] becomes threshold[1].
            scores_to_replace = scores.lt(score_threshold[0])
            scores[scores_to_replace] = score_threshold[1]
        # BUG FIX: `scores.size` passed the bound method object to
        # describe(); call it to describe the actual tensor size.
        print(describe(scores.size()))

        if score_group is not None:
            # Map group-level scores back to sentence-level scores.
            assert_size(scores, [n_ctm_groups, n_docx_groups])
            scores = score_group.gather(scores,
                                        ctm_slices=ctm_group_slices,
                                        docx_slices=docx_group_slices)
        assert_size(scores, [n_ctm, n_docx])
        return {"scores": scores}