예제 #1
0
    def _count_vocab(self, analyzed_docs):
        """Build a sparse document-term count matrix from analyzed documents.

        Every feature of every document is looked up in ``self.vocabulary_``;
        features that are missing from it are skipped.  Repeated features
        within a document are merged by summing their entries, and when
        ``self.binary`` is truthy every stored entry is overwritten with 1.

        Returns a CSR matrix with one row per document and
        ``len(self.vocabulary_)`` columns, created with ``dtype=self.dtype``.
        """
        vocab = self.vocabulary_
        columns = _make_int_array()
        row_starts = _make_int_array()
        row_starts.append(0)

        for tokens in analyzed_docs:
            for token in tokens:
                try:
                    column = vocab[token]
                except KeyError:
                    # Token is not part of the vocabulary: leave it out.
                    continue
                columns.append(column)
            # Mark where the next document's entries begin.
            row_starts.append(len(columns))

        columns = frombuffer_empty(columns, dtype=np.intc)
        row_starts = np.frombuffer(row_starts, dtype=np.intc)
        n_docs = len(row_starts) - 1
        ones = np.ones(len(columns))

        X = sp.csr_matrix((ones, columns, row_starts),
                          shape=(n_docs, len(vocab)),
                          dtype=self.dtype)
        # One entry was emitted per occurrence; summing yields the counts.
        X.sum_duplicates()

        if self.binary:
            X.data.fill(1)

        return X
예제 #2
0
    def _count_vocab(self, analyzed_docs):
        """Turn analyzed documents into a CSR matrix of feature counts.

        The vocabulary is taken from ``self.vocabulary_`` and is not modified
        here: a feature absent from it contributes nothing to the matrix.
        Duplicate (row, column) entries are summed.  If ``self.binary`` is
        truthy, all stored values are then replaced by 1.

        The result has ``len(self.vocabulary_)`` columns, one row for each
        document, and is constructed with ``dtype=self.dtype``.
        """
        term_to_col = self.vocabulary_
        col_buffer = _make_int_array()
        ptr_buffer = _make_int_array()
        ptr_buffer.append(0)

        for document in analyzed_docs:
            for term in document:
                try:
                    col = term_to_col[term]
                except KeyError:
                    # Unknown term: ignored.
                    pass
                else:
                    col_buffer.append(col)
            ptr_buffer.append(len(col_buffer))

        col_array = frombuffer_empty(col_buffer, dtype=np.intc)
        ptr_array = np.frombuffer(ptr_buffer, dtype=np.intc)
        data = np.ones(len(col_array))
        matrix_shape = (len(ptr_array) - 1, len(term_to_col))

        X = sp.csr_matrix((data, col_array, ptr_array),
                          shape=matrix_shape,
                          dtype=self.dtype)
        # Collapse repeated occurrences of a term in a row into one count.
        X.sum_duplicates()

        if self.binary:
            X.data.fill(1)

        return X
예제 #3
0
파일: bow.py 프로젝트: renanlage/flexical
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create the sparse feature matrix and, if needed, the vocabulary.

        Each document in ``raw_documents`` is iterated directly as a sequence
        of features.  Features found in ``self.stopwords`` are skipped.

        Parameters
        ----------
        raw_documents : iterable of sequences of features
        fixed_vocab : bool
            If true, ``self.vocabulary_`` is used and features missing from
            it are ignored.  Otherwise a new vocabulary is built, assigning
            each unseen feature the next free column index.

        Returns
        -------
        (vocabulary, X) : the vocabulary mapping and a CSR matrix with one
        row per document and ``len(vocabulary)`` columns.  When
        ``self.apply_socal_mask`` is truthy, the stored value of each kept
        feature is the entry of ``self.socal.mask(doc)`` at the feature's
        position in the document; otherwise it is 1.  Duplicate entries are
        summed.

        Raises
        ------
        ValueError
            If a new vocabulary was built and it ended up empty.
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        # Evaluate the flag once so that every use below agrees on it.  The
        # previous mix of ``is True`` and plain truthiness left ``doc_mask``
        # undefined for truthy values other than ``True``.
        use_mask = bool(self.apply_socal_mask)

        j_indices = _make_int_array()
        indptr = _make_int_array()
        values = _make_float_array() if use_mask else None
        indptr.append(0)

        for doc in raw_documents:
            doc_mask = self.socal.mask(doc) if use_mask else None

            for index, feature in enumerate(doc):
                if feature in self.stopwords:
                    continue
                if fixed_vocab and feature not in vocabulary:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

                # No try/except here: the lookup cannot fail after the guard
                # above, and a failing mask lookup must not be swallowed once
                # j_indices has grown, or values would fall out of step.
                j_indices.append(vocabulary[feature])
                if use_mask:
                    values.append(doc_mask[index])
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        if not values:
            # Masking disabled (or nothing collected): every occurrence is 1.
            values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
예제 #4
0
    def _count_analyzed_vocab(self, analyzed_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False.

        For consistency this is a slightly edited version of
        feature_extraction.text._count_vocab with the document analysis
        factored out.

        Parameters
        ----------
        analyzed_documents : iterable of iterables of features
            Documents that have already been analyzed.
        fixed_vocab : bool
            If true, ``self.vocabulary_`` is used and unknown features are
            ignored.  Otherwise a new vocabulary is built, giving each unseen
            feature the next free column index.

        Returns
        -------
        (vocabulary, X) : the vocabulary mapping and a CSR matrix of
        per-document feature counts with sorted column indices.  A newly
        built vocabulary is returned as a plain ``dict``.
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        j_indices = []
        indptr = _make_int_array()
        values = _make_int_array()
        indptr.append(0)
        for analyzed_doc in analyzed_documents:
            feature_counter = {}
            for feature in analyzed_doc:
                try:
                    feature_idx = vocabulary[feature]
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
                feature_counter[feature_idx] = (
                    feature_counter.get(feature_idx, 0) + 1)

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # Disable the defaultdict behaviour before handing the mapping
            # out; otherwise any later lookup of an unknown feature would
            # silently add it to the vocabulary.
            vocabulary = dict(vocabulary)

        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.frombuffer(values, dtype=np.intc)

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sort_indices()
        return vocabulary, X
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Build a float32 CSR matrix of number-derived feature weights.

        Each document is run through the analyzer.  A feature for which
        ``try_float`` gives a value ``v`` with ``0 < v <= 200`` is not stored;
        it becomes the pending number.  A later feature is only considered
        while a pending number is set: the first time its column appears in
        the document it receives ``pending / 200`` and the pending number is
        cleared.  If the column already has a weight, nothing changes and the
        pending number stays set.

        With ``fixed_vocab`` true, ``self.vocabulary_`` is used and unknown
        features are ignored; otherwise a vocabulary is built on the fly.

        Returns ``(vocabulary, X)``.  Raises ``ValueError`` if a newly built
        vocabulary is empty.
        """
        if not fixed_vocab:
            # Unknown keys get the current size as their index on first use.
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__
        else:
            vocabulary = self.vocabulary_

        tokenize = self.build_analyzer()
        columns = []
        row_ptr = _make_int_array()
        weights = _make_float_array()
        row_ptr.append(0)

        for document in raw_documents:
            row_weights = {}
            pending = 0
            for token in tokenize(document):
                number = try_float(token)
                if number > 0 and number <= 200:
                    # Remember the number; it weights a following feature.
                    pending = number
                    continue
                if pending == 0:
                    # No number waiting: skip before touching the vocabulary.
                    continue
                try:
                    column = vocabulary[token]
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
                if column not in row_weights:
                    row_weights[column] = pending / 200
                    pending = 0

            columns.extend(row_weights.keys())
            weights.extend(row_weights.values())
            row_ptr.append(len(columns))

        if not fixed_vocab:
            # Freeze the mapping so later lookups no longer add entries.
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        columns = np.asarray(columns, dtype=np.intc)
        row_ptr = np.frombuffer(row_ptr, dtype=np.intc)
        weights = np.frombuffer(weights, dtype=np.float32)
        n_rows = len(row_ptr) - 1

        X = sp.csr_matrix((weights, columns, row_ptr),
                          shape=(n_rows, len(vocabulary)),
                          dtype=np.float32)
        X.sort_indices()
        return vocabulary, X
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create the sparse count matrix and a new trie-backed vocabulary.

        A fresh vocabulary is created with ``self.trie_cls()``; each feature
        produced by the analyzer that is not yet in it is assigned the next
        free column index.

        Parameters
        ----------
        raw_documents : iterable of documents accepted by the analyzer
        fixed_vocab : bool
            Must be false; a fixed vocabulary is not supported here.

        Returns
        -------
        (vocabulary, X) : the vocabulary and a CSR matrix with one row per
        document, in which repeated features of a document are summed.

        Raises
        ------
        NotImplementedError
            If ``fixed_vocab`` is true.
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = self.trie_cls()

        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                # The membership test decides between lookup and insertion,
                # so the lookup cannot miss and needs no KeyError handler.
                if feature in vocabulary:
                    idx = vocabulary[feature]
                else:
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                j_indices.append(idx)
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            # Same dtype as the non-empty branch.
            j_indices = np.array([], dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Build a trie-backed vocabulary and the matching sparse count matrix.

        The vocabulary starts as an empty ``self.trie_cls()`` instance.  Every
        feature returned by the analyzer is either looked up in it or, when
        absent, inserted with the current vocabulary size as its column index.

        Parameters
        ----------
        raw_documents : iterable of documents accepted by the analyzer
        fixed_vocab : bool
            Only ``False`` is supported.

        Returns
        -------
        (vocabulary, X) : the populated vocabulary and a CSR matrix with one
        row per document; duplicate entries within a row are summed.

        Raises
        ------
        NotImplementedError
            If ``fixed_vocab`` is true.
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = self.trie_cls()

        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    # First occurrence: allocate the next column.
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                else:
                    # Known feature; the check above guarantees the key
                    # exists, so no KeyError handling is required.
                    idx = vocabulary[feature]
                j_indices.append(idx)
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            # Keep the index dtype identical to the non-empty case.
            j_indices = np.array([], dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
예제 #8
0
    def _count_vocab(self, raw_documents, fixed_vocab, y=None):
        """Count features overall and separately for positive/negative docs.

        Parameters
        ----------
        raw_documents : iterable of documents accepted by the analyzer
        fixed_vocab : bool
            If true, ``self.vocabulary_`` is used and unknown features are
            ignored; otherwise a new vocabulary is built.
        y : sequence of labels or None
            ``y[i]`` is the label of the i-th document.  Documents labelled
            ``1`` feed the positive counts, documents labelled ``-1`` the
            negative counts.  If ``y`` is None or empty, the call is
            delegated to the parent class.

        Returns
        -------
        Without labels: whatever the parent ``_count_vocab`` returns.
        With labels: ``(vocabulary, X, X_pos, X_neg)``, three CSR matrices of
        identical sparsity structure holding the total, positive-document and
        negative-document counts.

        Raises
        ------
        ValueError
            If a new vocabulary was built and it ended up empty.
        """
        # ``not y`` raises for multi-element numpy arrays, so check for
        # absence and emptiness explicitly.
        if y is None or len(y) == 0:
            return super(DeltaTfidfVectorizer,
                         self)._count_vocab(raw_documents, fixed_vocab)

        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = _make_int_array()
        values = _make_int_array()
        pos_values = _make_int_array()
        neg_values = _make_int_array()
        indptr.append(0)

        for i, doc in enumerate(raw_documents):
            # All three counters receive the same keys in the same order, so
            # their values() line up with feature_counter.keys().
            feature_counter = defaultdict(int)
            pos_feature_counter = defaultdict(int)
            neg_feature_counter = defaultdict(int)
            for feature in analyze(doc):
                # Only the vocabulary lookup is guarded: a KeyError raised by
                # ``y[i]`` must surface instead of leaving the three counters
                # with different keys.
                try:
                    feature_idx = vocabulary[feature]
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
                feature_counter[feature_idx] += 1
                pos_feature_counter[feature_idx] += int(y[i] == 1)
                neg_feature_counter[feature_idx] += int(y[i] == -1)

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            pos_values.extend(pos_feature_counter.values())
            neg_values.extend(neg_feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.frombuffer(values, dtype=np.intc)
        pos_values = np.frombuffer(pos_values, dtype=np.intc)
        neg_values = np.frombuffer(neg_values, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocabulary))

        def _sorted_csr(data):
            # copy=True gives every matrix private index arrays.
            # sort_indices() reorders indices and data in place; with a
            # shared index array the first sort would leave the other two
            # matrices with sorted indices but unsorted data.
            matrix = sp.csr_matrix((data, j_indices, indptr),
                                   shape=shape,
                                   dtype=self.dtype,
                                   copy=True)
            matrix.sort_indices()
            return matrix

        X = _sorted_csr(values)
        X_pos = _sorted_csr(pos_values)
        X_neg = _sorted_csr(neg_values)

        return vocabulary, X, X_pos, X_neg
    def _count_vocab(self, raw_documents, fixed_vocab, y=None):
        """Build total, positive-document and negative-document count matrices.

        Parameters
        ----------
        raw_documents : iterable of documents accepted by the analyzer
        fixed_vocab : bool
            If true, ``self.vocabulary_`` is used and unknown features are
            ignored; otherwise a new vocabulary is built.
        y : sequence of labels or None
            Label of each document by position.  A label equal to ``1``
            counts towards the positive matrix, ``-1`` towards the negative
            one.  When ``y`` is None or empty the parent implementation is
            used instead.

        Returns
        -------
        Without labels: the parent's return value.
        With labels: ``(vocabulary, X, X_pos, X_neg)`` where the three CSR
        matrices share the same sparsity structure.

        Raises
        ------
        ValueError
            If a new vocabulary was built and it ended up empty.
        """
        # Explicit check: ``not y`` is ambiguous (and raises) for numpy
        # arrays with more than one element.
        if y is None or len(y) == 0:
            return super(DeltaTfidfVectorizer, self)._count_vocab(raw_documents, fixed_vocab)

        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = _make_int_array()
        values = _make_int_array()
        pos_values = _make_int_array()
        neg_values = _make_int_array()
        indptr.append(0)

        for i, doc in enumerate(raw_documents):
            # The counters are always updated together, so they hold the
            # same keys in the same insertion order.
            feature_counter = defaultdict(int)
            pos_feature_counter = defaultdict(int)
            neg_feature_counter = defaultdict(int)
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
                # Outside the try block on purpose: a KeyError coming from
                # ``y[i]`` must not be swallowed half-way through the update.
                feature_counter[feature_idx] += 1
                pos_feature_counter[feature_idx] += int(y[i] == 1)
                neg_feature_counter[feature_idx] += int(y[i] == -1)

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            pos_values.extend(pos_feature_counter.values())
            neg_values.extend(neg_feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.frombuffer(values, dtype=np.intc)
        pos_values = np.frombuffer(pos_values, dtype=np.intc)
        neg_values = np.frombuffer(neg_values, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocabulary))

        # Each matrix is built with copy=True so that it owns its index
        # arrays.  sort_indices() works in place; sharing j_indices would
        # let the first sort scramble the index/data pairing of the others.
        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=shape, dtype=self.dtype, copy=True)
        X.sort_indices()

        X_pos = sp.csr_matrix((pos_values, j_indices, indptr),
                              shape=shape, dtype=self.dtype, copy=True)
        X_pos.sort_indices()

        X_neg = sp.csr_matrix((neg_values, j_indices, indptr),
                              shape=shape, dtype=self.dtype, copy=True)
        X_neg.sort_indices()

        return vocabulary, X, X_pos, X_neg