Example #1
from scipy.sparse import csr_matrix
from sklearn.base import TransformerMixin
from scipy.stats import norm
from logging import getLogger, StreamHandler
from numpy import ndarray, memmap
from typing import Union
from DocumentFeatureSelection import init_logger
import numpy as np
import joblib
import logging

logger = getLogger(init_logger.LOGGER_NAME)
logger = init_logger.init_logger(logger)


def bns(X: Union[memmap, csr_matrix],
        feature_index: int,
        sample_index: int,
        unit_distribution: np.ndarray,
        true_index: int = 0,
        verbose: bool = False):
    if true_index == 0:
        false_index = 1
    elif true_index == 1:
        false_index = 0
    else:
        raise Exception('true_index must be either 0 or 1')

    # number of times the feature appears under the true label
    # tp is frequency of features in the specified positive label
    tp = X[true_index, feature_index]
    # fp is frequency of the feature under the false label
    fp = X[false_index, feature_index]
    # total number of samples in the positive / negative class
    pos = unit_distribution[true_index]
    neg = unit_distribution[false_index]
    tpr = tp / pos
    fpr = fp / neg
    if verbose:
        logger.debug('tp={}, fp={}, tpr={}, fpr={}'.format(tp, fp, tpr, fpr))
    # clip the rates into (0, 1) so that norm.ppf stays finite;
    # these clipping bounds are an assumption, not taken from the source
    tpr = min(max(tpr, 0.0005), 0.9995)
    fpr = min(max(fpr, 0.0005), 0.9995)
    # Bi-Normal Separation score: |F^-1(tpr) - F^-1(fpr)|,
    # where F^-1 is the inverse CDF of the standard normal (norm.ppf)
    return abs(norm.ppf(tpr) - norm.ppf(fpr))
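A minimal usage sketch follows; the toy matrix, the class sizes, and the variable names below are illustrative assumptions, not values from the source.

if __name__ == '__main__':
    # rows = labels (0 = positive/true, 1 = negative/false), columns = features (assumed toy data)
    X_toy = csr_matrix(np.array([[4, 0],
                                 [1, 3]]))
    # unit_distribution[i] = number of documents carrying label i (assumed counts)
    unit_dist = np.array([5, 5])
    score = bns(X_toy, feature_index=0, sample_index=0,
                unit_distribution=unit_dist, true_index=0)
    print(score)  # higher scores mean the feature separates the two labels more sharply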
from collections import Counter
from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes, PersistentDict
from DocumentFeatureSelection import init_logger
from DocumentFeatureSelection.common.utils import init_cache_object
from sklearn.feature_extraction import DictVectorizer
from typing import Dict, List, Tuple, Any, Union
from sqlitedict import SqliteDict
import logging
import joblib
import itertools
import tempfile
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
N_FEATURE_SWITCH_STRATEGY = 1000000

'''
def decode_into_utf8(string:str)->bytes:
    """* what you can do
    - convert string into utf-8 bytes
    """
    return string.encode('utf-8')'''

def generate_document_dict(document_key:str,
                           documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]:
    """This function gets Document-frequency count in given list of documents
    """
    assert isinstance(documents, list)
    word_frequencies = [Counter(document) for document in documents]
    document_frequencies = Counter()
    for word_frequency in word_frequencies: document_frequencies.update(list(word_frequency.keys()))

    return (document_key, document_frequencies)
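A short usage sketch of generate_document_dict; the label name and the toy documents are assumptions made for illustration.

# toy call (not from the source): two tokenised documents under one label
label, doc_freq = generate_document_dict(
    document_key='positive',
    documents=[['good', 'fast', 'good'], ['good', 'cheap']])
# 'good' occurs in both documents, so its document frequency is 2;
# the repeated 'good' inside the first document is counted only once
assert doc_freq['good'] == 2 and doc_freq['fast'] == 1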
from collections import Counter
from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes
from DocumentFeatureSelection import init_logger
from sklearn.feature_extraction import DictVectorizer
from typing import Dict, List, Tuple, Any, Union
from sqlitedict import SqliteDict
import logging
import joblib
import itertools
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
N_FEATURE_SWITCH_STRATEGY = 1000000


def decode_into_utf8(string:str)->bytes:
    """* what you can do
    - convert string into utf-8 bytes
    """
    return string.encode('utf-8')

def generate_document_dict(document_key:str,
                           documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]:
    """This function gets Document-frequency count in given list of documents
    """
    assert isinstance(documents, list)
    word_frequencies = [Counter(document) for document in documents]
    document_frequencies = Counter()
    for word_frequency in word_frequencies: document_frequencies.update(word_frequency.keys())

    return (document_key, document_frequencies)
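The per-label Counters produced by generate_document_dict are a natural input for the DictVectorizer imported above; the wiring below into a label-by-feature sparse matrix is a hedged sketch of that step, not code taken from the source.

# assumed pipeline sketch: per-label document frequencies -> sparse label x feature matrix
from scipy.sparse import csr_matrix

label_counters = dict([
    generate_document_dict('positive', [['good', 'fast'], ['good']]),
    generate_document_dict('negative', [['slow', 'bad']]),
])
vectorizer = DictVectorizer(sparse=True)
# row order follows the insertion order of label_counters: 'positive' first, then 'negative'
label_feature_matrix = csr_matrix(vectorizer.fit_transform(label_counters.values()))
print(vectorizer.vocabulary_)          # feature name -> column index
print(label_feature_matrix.toarray())  # document frequencies per label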