def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Write pairing metadata (label, pair id, pair orientation) onto the corpus objects.

        Objects accepted by `selector` are split into positive and negative groups, paired up, and every pair
        is given an orientation. Both members of a pair then receive a label ("pos" or "neg"), the pair id and
        the pair orientation. Any object of the configured type that has no label attribute afterwards gets
        all three attributes set to None.

        :param corpus: target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns a bool (True = include)
        :return: the same Corpus, annotated in place
        """
        positives, negatives = self._get_pos_neg_objects(corpus, selector)
        pairs = self._pair_objs(positives, negatives)
        orientations = self._assign_pair_orientations(pairs)

        for pair_id, (pos_obj, neg_obj) in pairs.items():
            orientation = orientations[pair_id]
            for member, label in ((pos_obj, "pos"), (neg_obj, "neg")):
                member.add_meta(self.label_attribute_name, label)
                member.add_meta(self.pair_id_attribute_name, pair_id)
                member.add_meta(self.pair_orientation_attribute_name, orientation)

        for obj in corpus.iter_objs(self.obj_type):
            # objects without a label either failed the selector or were left out by the pairing step
            if self.label_attribute_name in obj.meta:
                continue
            for attribute_name in (self.label_attribute_name,
                                   self.pair_id_attribute_name,
                                   self.pair_orientation_attribute_name):
                obj.add_meta(attribute_name, None)

        return corpus
Exemplo n.º 2
0
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[CorpusComponent],
                           bool] = lambda x: True) -> Corpus:
        """
        Computes the vector matrix for the selected Corpus component objects and registers it on the Corpus
        under `self.vector_name`.

        Each selected object is also tagged with `self.vector_name` via `add_vector`. Objects rejected by the
        selector are not touched.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus component object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)

        # scikit-learn renamed get_feature_names() to get_feature_names_out() and removed the old name in 1.2.
        # Try the new accessor first so real feature names are kept on current releases, then the legacy one,
        # and only fall back to positional column indices for vectorizers that expose neither.
        try:
            column_names = self.vectorizer.get_feature_names_out()
        except AttributeError:
            try:
                column_names = self.vectorizer.get_feature_names()
            except AttributeError:
                column_names = np.arange(matrix.shape[1])

        corpus.set_vector_matrix(self.vector_name,
                                 matrix=matrix,
                                 ids=ids,
                                 columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus
    def transform(self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
        """
        Vectorize the text of the selected corpus objects and register the resulting matrix on the corpus
        under `self.vector_name`, with an optional selector that filters for objects to be transformed.

        Each selected object is tagged with `self.vector_name` via `add_vector`. Objects that are not selected
        are left untouched: they have no row in the matrix and receive no annotation.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = list(corpus.iter_objs(self.obj_type, selector))
        ids = [obj.id for obj in objs]
        docs = [self.text_func(obj) for obj in objs]

        matrix = self.vectorizer.transform(docs)

        # get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
        # Prefer the new accessor, keep the legacy one for old releases, and use positional indices
        # only when the vectorizer offers neither.
        try:
            column_names = self.vectorizer.get_feature_names_out()
        except AttributeError:
            try:
                column_names = self.vectorizer.get_feature_names()
            except AttributeError:
                column_names = np.arange(matrix.shape[1])

        corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids, columns=column_names)

        for obj in objs:
            obj.add_vector(self.vector_name)

        return corpus
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Write pairing metadata (label, pair_id, pair_orientation) onto the corpus objects.

        Paired objects receive "pos" / "neg" as label together with their pair id and pair orientation.
        Every object of the configured type that has no label afterwards gets all three attributes set to None.

        :param corpus: target Corpus
        :return: the same Corpus, annotated in place
        """
        positives, negatives = self._get_pos_neg_objects(corpus)
        pairs = self._pair_objs(positives, negatives)
        orientations = self._assign_pair_orientations(pairs)

        for pair_id, (pos_obj, neg_obj) in pairs.items():
            orientation = orientations[pair_id]
            for member, label in ((pos_obj, "pos"), (neg_obj, "neg")):
                member.add_meta(self.label_feat_name, label)
                member.add_meta(self.pair_id_feat_name, pair_id)
                member.add_meta(self.pair_orientation_feat_name, orientation)

        for obj in corpus.iter_objs(self.obj_type):
            # an object without a label was not placed in any pair above
            if self.label_feat_name in obj.meta:
                continue
            for feat_name in (self.label_feat_name,
                              self.pair_id_feat_name,
                              self.pair_orientation_feat_name):
                obj.add_meta(feat_name, None)

        return corpus
Exemplo n.º 5
0
    def fit(self,
            corpus: Corpus,
            selector: Callable[[CorpusComponent], bool] = lambda x: True,
            y=None):
        """
        Train the internal classifier on the stored vectors of the selected Corpus objects.

        Labels come from `self.labeller`; feature rows are fetched from the corpus vector matrix named
        `self.vector_name`, restricted to `self.columns`.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False
            (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.
        :param y: not used; the labels are always derived from `self.labeller`
        :return: the fitted VectorClassifier
        """
        selected = list(corpus.iter_objs(self.obj_type, selector))
        selected_ids = [obj.id for obj in selected]
        labels = [self.labeller(obj) for obj in selected]

        features = corpus.get_vectors(self.vector_name,
                                      ids=selected_ids,
                                      columns=self.columns)
        self.clf.fit(features, np.array(labels))
        return self
Exemplo n.º 6
0
    def fit(self, corpus: Corpus, y=None):
        """
        Fit the internal vectorizer on the texts of the objects accepted by `self.selector`.

        :param corpus: the target Corpus
        :param y: not used
        :return: this transformer, so that calls such as `fit(corpus).transform(corpus)` can be chained
        """
        # collect texts for vectorization
        docs = [self.text_func(obj) for obj in corpus.iter_objs(self.obj_type, self.selector)]

        self.vectorizer.fit(docs)
        return self
    def transform(self, corpus: Corpus, selector: Callable[[CorpusObject], bool] = lambda x: True) -> Corpus:
        """
        Annotate the corpus objects with the classifier prediction and prediction score, with an optional selector
        that filters for objects to be classified. Objects that are not selected will get a metadata value of 'None'
        instead of the classifier prediction.

        The score is column 1 of `predict_proba` (presumably the positive class of a binary classifier; confirm
        against the classifier in use).

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        objs = []
        vectors = []
        for obj in corpus.iter_objs(self.obj_type):
            if selector(obj):
                objs.append(obj)
                vectors.append(obj.meta[self.vector_name])
            else:
                obj.add_meta(self.clf_feat_name, None)
                obj.add_meta(self.clf_prob_feat_name, None)

        # Stacking an empty list fails, so when the selector accepted nothing there is nothing to classify.
        if not objs:
            return corpus

        X = vstack(vectors)
        clfs, clfs_probs = self.clf.predict(X), self.clf.predict_proba(X)[:, 1]

        for obj, clf, clf_prob in zip(objs, clfs, clfs_probs):
            obj.add_meta(self.clf_feat_name, clf)
            obj.add_meta(self.clf_prob_feat_name, clf_prob)
        return corpus
Exemplo n.º 8
0
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Store a vectorized representation of each object's text in its metadata under `self.vector_name`.

        Objects rejected by `self.selector` get None stored instead of a vector.

        :param corpus: the target Corpus
        :return: the target Corpus annotated
        """
        for obj in corpus.iter_objs(self.obj_type):
            if not self.selector(obj):
                obj.meta[self.vector_name] = None
                continue
            text = self.text_func(obj)
            # the vectorizer is called with a one-element list, once per object
            obj.meta[self.vector_name] = self.vectorizer.transform([text])

        return corpus
Exemplo n.º 9
0
 def fit(self, corpus: Corpus, y=None):
     """
     Train the internal classifier on the vectors stored in the metadata of the objects accepted by
     `self.selector`, using `self.labeller` to produce the target labels.

     :param corpus: the target Corpus
     :param y: not used; labels always come from `self.labeller`
     :return: this fitted classifier transformer
     """
     # read each object's vector and label together, in iteration order
     samples = [(obj.meta[self.vector_name], self.labeller(obj))
                for obj in corpus.iter_objs(self.obj_type, self.selector)]
     vectors = [vector for vector, _ in samples]
     labels = [label for _, label in samples]

     self.clf.fit(vstack(vectors), labels)
     return self
Exemplo n.º 10
0
    def summarize(self, corpus: Corpus, use_selector=True):
        """
        Build a DataFrame, indexed by object id, of the classifier predictions and scores stored in the
        object metadata, sorted by score in descending order.

        :param corpus: the annotated Corpus
        :param use_selector: if True, only objects accepted by `self.selector` are listed; otherwise all
            objects of the configured type are listed
        :return: a pandas DataFrame
        """
        obj_filter = self.selector if use_selector else (lambda _: True)

        rows = [
            (obj.id, obj.meta[self.clf_feat_name], obj.meta[self.clf_prob_feat_name])
            for obj in corpus.iter_objs(self.obj_type, obj_filter)
        ]

        table = pd.DataFrame(rows, columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
        table = table.set_index('id')
        return table.sort_values(self.clf_prob_feat_name, ascending=False)
    def fit(self, corpus: Corpus, y=None, selector: Callable[[CorpusComponent], bool] = lambda x: True):
        """
        Train the internal vectorizer on the texts of the selected Corpus objects.

        :param corpus: the target Corpus
        :param y: not used
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.
        :return: the fitted BoWTransformer
        """
        selected_objs = corpus.iter_objs(self.obj_type, selector)
        # materialise the texts into a list before handing them to the vectorizer
        texts = list(map(self.text_func, selected_objs))
        self.vectorizer.fit(texts)
        return self
    def _get_pos_neg_objects(self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True):
        """
        Get positively-labelled and negatively-labelled lists of objects

        An object is tested with `self.pos_label_func` first; `self.neg_label_func` is consulted only when the
        positive test fails, so an object matching both is treated as positive. Objects matching neither are
        left out of both lists.

        :param corpus: target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns a bool (True = include).
            Defaults to including every object of the configured type, so the method can also be called with
            just a corpus.
        :return: list of positive objects, list of negative objects
        """
        pos_objects = []
        neg_objects = []
        for obj in corpus.iter_objs(self.obj_type, selector):
            if self.pos_label_func(obj):
                pos_objects.append(obj)
            elif self.neg_label_func(obj):
                neg_objects.append(obj)
        return pos_objects, neg_objects
    def summarize(self, corpus: Corpus, selector: Callable[[CorpusObject], bool] = lambda x: True):
        """
        Build a DataFrame, indexed by object id, of the classifier predictions and scores stored in the
        object metadata, sorted by score in descending order.

        :param corpus: the annotated Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.
        :return: a pandas DataFrame
        """
        rows = [
            (obj.id, obj.meta[self.clf_feat_name], obj.meta[self.clf_prob_feat_name])
            for obj in corpus.iter_objs(self.obj_type, selector)
        ]

        table = pd.DataFrame(rows, columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
        indexed = table.set_index('id')
        return indexed.sort_values(self.clf_prob_feat_name, ascending=False)
Exemplo n.º 14
0
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Annotate the corpus objects accepted by `self.selector` with the classifier prediction and
        prediction score.

        Objects rejected by the selector get None for both classifier attributes. Their stored vector is
        left untouched, so the input features survive classification and every object carries the two
        attributes that `summarize` reads.

        The score is column 1 of `predict_proba` (presumably the positive class of a binary classifier; confirm
        against the classifier in use).

        :param corpus: the target Corpus
        :return: the target Corpus annotated
        """
        objs = []
        vectors = []
        for obj in corpus.iter_objs(self.obj_type):
            if self.selector(obj):
                objs.append(obj)
                vectors.append(obj.meta[self.vector_name])
            else:
                # mark the outputs as missing; do not overwrite the input vector
                obj.add_meta(self.clf_feat_name, None)
                obj.add_meta(self.clf_prob_feat_name, None)

        # Stacking an empty list fails, so when the selector accepted nothing there is nothing to classify.
        if not objs:
            return corpus

        X = vstack(vectors)
        clfs, clfs_probs = self.clf.predict(X), self.clf.predict_proba(X)[:, 1]

        for obj, clf, clf_prob in zip(objs, clfs, clfs_probs):
            obj.add_meta(self.clf_feat_name, clf)
            obj.add_meta(self.clf_prob_feat_name, clf_prob)
        return corpus
    def fit(self, corpus: Corpus, y=None, selector: Callable[[CorpusObject], bool] = lambda x: True):
        """
        Train the internal classifier on the vectors stored in the metadata of the selected Corpus objects,
        using `self.labeller` to produce the target labels.

        :param corpus: the target Corpus
        :param y: not used; labels always come from `self.labeller`
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.
        :return: the fitted BoWClassifier
        """
        # read each object's vector and label together, in iteration order
        samples = [(obj.meta[self.vector_name], self.labeller(obj))
                   for obj in corpus.iter_objs(self.obj_type, selector)]
        feature_rows = [vector for vector, _ in samples]
        targets = [label for _, label in samples]

        self.clf.fit(vstack(feature_rows), targets)
        return self
    def transform(
            self,
            corpus: Corpus,
            selector: Callable[[CorpusObject],
                               bool] = lambda x: True) -> Corpus:
        """
        Store a vectorized representation of each selected object's text in its metadata under
        `self.vector_name`. Objects rejected by the selector get a metadata value of None instead of the vector.

        :param corpus: the target Corpus
        :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). By default, the selector includes all objects of the specified type in the Corpus.

        :return: the target Corpus annotated
        """
        for obj in corpus.iter_objs(self.obj_type):
            if not selector(obj):
                obj.meta[self.vector_name] = None
                continue
            text = self.text_func(obj)
            # the vectorizer is called with a one-element list, once per object
            obj.meta[self.vector_name] = self.vectorizer.transform([text])

        return corpus
Exemplo n.º 17
0
    def transform(self, corpus: Corpus, y=None, selector: Callable[[CorpusComponent], bool] = lambda obj: True) -> Corpus:
        """
        Annotate corpus objects with scores and rankings.

        Selected objects are scored with `self.score_func` and sorted by score in descending order; rank 1 is
        the highest score. Objects of the configured type that were not selected get None for both attributes.

        :param corpus: target corpus
        :param y: not used
        :param selector: (lambda) function taking in a Corpus object and returning True / False; selects for Corpus objects to annotate.
        :return: annotated corpus
        """
        obj_iters = {"conversation": corpus.iter_conversations,
                     "speaker": corpus.iter_speakers,
                     "utterance": corpus.iter_utterances}
        obj_scores = [(obj.id, self.score_func(obj)) for obj in obj_iters[self.obj_type](selector)]
        df = pd.DataFrame(obj_scores, columns=["id", self.score_attribute_name]) \
            .set_index('id').sort_values(self.score_attribute_name, ascending=False)
        df[self.rank_attribute_name] = list(range(1, len(df) + 1))

        for obj in corpus.iter_objs(obj_type=self.obj_type):
            if obj.id in df.index:
                # Read each cell with .at rather than through a row Series: a row of a frame with a float
                # score column and an int rank column is upcast to float, which would store ranks as 1.0, 2.0, ...
                obj.add_meta(self.score_attribute_name, df.at[obj.id, self.score_attribute_name])
                obj.add_meta(self.rank_attribute_name, int(df.at[obj.id, self.rank_attribute_name]))
            else:
                obj.add_meta(self.score_attribute_name, None)
                obj.add_meta(self.rank_attribute_name, None)
        return corpus