Example No. 1
def is_transformer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an sklearn general transformer.
    If True, returns the valid (input, output) types.

    Examples:

    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> is_transformer(CountVectorizer)
    (True, (List(Sentence()), MatrixContinuousSparse()))
    >>> from sklearn.decomposition import PCA
    >>> is_transformer(PCA)
    (True, (MatrixContinuousDense(), MatrixContinuousDense()))

    """
    if not is_algorithm(cls, verbose=verbose):
        return False, None

    allowed_inputs = set()
    allowed_outputs = set()

    for input_type in [
            kb.MatrixContinuousDense(),
            kb.MatrixContinuousSparse(),
            kb.List(kb.Sentence()),
    ]:
        for output_type in [
                kb.MatrixContinuousDense(),
                kb.MatrixContinuousSparse(),
                kb.List(kb.Sentence()),
        ]:
            try:
                X = DATA_TYPE_EXAMPLES[input_type]

                clf = cls()
                X = clf.fit_transform(X)

                assert is_data_type(X, output_type)

                allowed_inputs.add(input_type)
                allowed_outputs.add(output_type)
            except Exception as e:
                if verbose:
                    warnings.warn(str(e))

    if len(allowed_outputs) != 1:
        return False, None

    inputs = combine_types(*allowed_inputs)

    if allowed_inputs:
        return True, (inputs, list(allowed_outputs)[0])
    else:
        return False, None
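A minimal usage sketch for the helper above, assuming `is_transformer` is in scope and scikit-learn is installed; the expected result for StandardScaler is an educated guess, not a guaranteed output:

from sklearn.preprocessing import StandardScaler

# StandardScaler rejects sparse input and raw strings with its default settings,
# so the inferred input type should end up being the dense matrix type only.
ok, types = is_transformer(StandardScaler, verbose=True)
if ok:
    input_type, output_type = types
    print(f"StandardScaler: {input_type} -> {output_type}")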
Example No. 2
def is_tagger(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk pos tagger.
    If True, returns the valid (input, output) types.

    Examples:

    >>> from nltk.tag import AffixTagger
    >>> from sklearn.linear_model import LogisticRegression
    >>> is_tagger(AffixTagger)
    (True, (List(List(Word())), List(List(Postag()))))
    >>> is_tagger(LogisticRegression)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.List(kb.Postag()))

    for input_type in [kb.List(kb.List(kb.Word()))]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            X_train = [[(word, word) for word in sentence] for sentence in X]

            tagger = cls(train=X_train)
            y = tagger.tag_sents(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        is_ptt = is_pretrained_tagger(cls, verbose)
        is_ckr = is_chunker(cls, verbose)

        if is_ptt[0]:
            return is_ptt
        if is_ckr[0]:
            return is_ckr
        return False, None
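A usage sketch for the helper above, assuming `is_tagger` is in scope and nltk is installed; the tagger is trained from the per-sentence (word, tag) pairs built by the `X_train` comprehension:

from nltk.tag import AffixTagger

ok, types = is_tagger(AffixTagger, verbose=True)
print(ok, types)  # expected to resemble (True, (List(List(Word())), List(List(Postag()))))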
Example No. 3
def is_chunker(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk chunker.
    If True, returns the valid (input, output) types.

    Examples:

    >>> from nltk.chunk.named_entity import NEChunkParserTagger
    >>> from nltk.tokenize import PunktSentenceTokenizer
    >>> is_chunker(NEChunkParserTagger)
    (True, (List(List(Tuple(Word(), Word()))), List(List(Tuple(Tuple(Word(), Word()), Word())))))
    >>> is_chunker(PunktSentenceTokenizer)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.List(kb.Chunktag()))

    for input_type in [kb.List(kb.List(kb.Postag()))]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            X_train = [[((word, postag), postag) for word, postag in sentence]
                       for sentence in X]

            chunker = cls(train=X_train)
            y = chunker.tag_sents(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
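For reference, this is the training-pair shape the loop above builds for a chunker: each POS-tagged token becomes ((word, postag), postag). A small standalone illustration:

tagged_corpus = [[("lorem", "NN"), ("ipsum", "NN")]]
train = [[((word, postag), postag) for word, postag in sentence]
         for sentence in tagged_corpus]
print(train)  # [[(('lorem', 'NN'), 'NN'), (('ipsum', 'NN'), 'NN')]]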
Example No. 4
def is_pretrained_tagger(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk sentence tokenizer.
    If True, returns the valid (input, output) types.

    Examples:

    >>> from nltk.tag import AffixTagger
    >>> from nltk.tag.perceptron import PerceptronTagger
    >>> is_pretrained_tagger(PerceptronTagger)
    (True, (List(Word()), List(Tuple(Word(), Word()))))
    >>> is_pretrained_tagger(AffixTagger)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.Postag())

    for input_type in [kb.List(kb.Word())]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            tagger = cls()

            y = tagger.tag(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
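A usage sketch, assuming `is_pretrained_tagger` is in scope, nltk is installed, and the averaged_perceptron_tagger model data has been downloaded:

from nltk.tag.perceptron import PerceptronTagger

# PerceptronTagger ships with a pretrained model, so it can tag a plain word list
# without a train= argument, which is exactly what this probe checks.
ok, types = is_pretrained_tagger(PerceptronTagger, verbose=True)
print(ok, types)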
Example No. 5
def is_sent_tokenizer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk sentence tokenizer.
    If True, returns the valid (input, output) types.

    Examples:

    >>> from sklearn.linear_model import LogisticRegression
    >>> from nltk.tokenize import PunktSentenceTokenizer
    >>> is_sent_tokenizer(PunktSentenceTokenizer)
    (True, (Document(), List(Sentence())))
    >>> is_sent_tokenizer(LogisticRegression)
    (False, None)

    """
    if not _is_sent_tokenizer(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.Sentence())

    for input_type in [kb.Document()]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            tokenizer = cls()
            y = tokenizer.tokenize(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
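A usage sketch, assuming `is_sent_tokenizer` is in scope and nltk is installed:

from nltk.tokenize import PunktSentenceTokenizer

ok, types = is_sent_tokenizer(PunktSentenceTokenizer, verbose=True)
print(ok, types)  # expected to resemble (True, (Document(), List(Sentence())))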
Example No. 6
def is_string_list(obj):
    """Determine if `obj` behaves like a list of strings with enough distinct values."""
    try:
        original_length = len(obj)
        obj = set(obj)

        return len(obj) > 0.1 * original_length and all(
            isinstance(x, str) for x in obj)
    except:
        return False


import numpy as np
import scipy.sparse as sp

from autogoal import kb

DATA_RESOLVERS = {
    kb.MatrixContinuousDense(): is_matrix_continuous_dense,
    kb.MatrixContinuousSparse(): is_matrix_continuous_sparse,
    kb.CategoricalVector(): is_categorical,
    kb.ContinuousVector(): is_continuous,
    kb.List(kb.Sentence()): is_string_list,
}

DATA_TYPE_EXAMPLES = {
    kb.MatrixContinuousDense(): np.random.rand(10, 10),
    kb.MatrixContinuousSparse(): sp.rand(10, 10),
    kb.CategoricalVector(): np.asarray(["A"] * 5 + ["B"] * 5),
    kb.ContinuousVector(): np.random.rand(10),
    kb.DiscreteVector(): np.random.randint(0, 10, (10, ), dtype=int),
    kb.List(kb.Sentence()): ["abc bcd def feg geh hij jkl lmn nop pqr"] * 10,
}


def is_algorithm(cls, verbose=False):
    if hasattr(cls, "fit") and hasattr(cls, "predict"):
        return "estimator"
Example No. 7
    @abc.abstractmethod
    def transform(self, X, y=None):
        pass


GENERATION_RULES = dict(
    LatentDirichletAllocation=dict(ignore_params=set(["evaluate_every"])),
    RadiusNeighborsClassifier=dict(ignore=True),
    KNeighborsTransformer=dict(ignore_params=set(["metric"])),
    RadiusNeighborsTransformer=dict(ignore_params=set(["metric"])),
    LocalOutlierFactor=dict(ignore_params=set(["metric"])),
    RadiusNeighborsRegressor=dict(ignore_params=set(["metric"])),
    LabelBinarizer=dict(
        ignore_params=set(["neg_label", "pos_label"]),
        input_annotation=kb.List(kb.Category()),
    ),
    HashingVectorizer=dict(ignore_params=set(
        ["token_pattern", "analyzer", "input", "decode_error"])),
    SpectralBiclustering=dict(ignore_params=set(["n_components", "n_init"])),
    SpectralCoclustering=dict(ignore_params=set(["n_init"])),
    KMeans=dict(ignore_params=set(["n_init"])),
    MiniBatchKMeans=dict(ignore_params=set(["batch_size", "n_init"])),
    DictionaryLearning=dict(ignore=True),
    MiniBatchDictionaryLearning=dict(ignore=True),
    LassoLars=dict(ignore_params=["alpha"]),
    TheilSenRegressor=dict(ignore_params=["max_subpopulation"]),
    TSNE=dict(ignore=True, ignore_params=["perplexity"]),
)
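A hypothetical sketch of how GENERATION_RULES could be consumed when generating wrappers: skip classes marked ignore and drop the listed constructor parameters. The helper name filter_constructor_params is illustrative, not part of autogoal's API:

import inspect

def filter_constructor_params(cls):
    # Look up per-class rules; unknown classes get no restrictions.
    rules = GENERATION_RULES.get(cls.__name__, {})
    if rules.get("ignore", False):
        return None  # class excluded from generation entirely
    ignored = set(rules.get("ignore_params", ()))
    signature = inspect.signature(cls.__init__)
    return [name for name in signature.parameters
            if name != "self" and name not in ignored]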

Example No. 8
import warnings
import inspect
import re
import numpy as np
import scipy.sparse as sp

from autogoal import kb
from autogoal.contrib.sklearn._utils import (
    is_matrix_continuous_dense,
    is_matrix_continuous_sparse,
    is_categorical,
    is_continuous,
    is_string_list,
)

DATA_TYPE_EXAMPLES = {
    # (str, str): a single tagged token
    kb.Postag(): ("lorem", "ipsum"),
    # [(str, str), ...]: a list of tagged tokens
    kb.List(kb.Postag()): [("lorem", "ipsum")] * 10,
    # [[(str, str), ...], ...]: a list of tagged sentences
    kb.List(kb.List(kb.Postag())): [[("lorem", "ipsum")] * 2],
    # ((str, str), str): a single IOB-tagged token
    kb.Chunktag(): (("lorem", "ipsum"), "ipsum"),
    # [((str, str), str), ...]: a list of IOB-tagged tokens
    kb.List(kb.Chunktag()): [(("lorem", "ipsum"), "ipsum")] * 10,
    # [[((str, str), str), ...], ...]: a list of IOB-tagged sentences
    kb.List(kb.List(kb.Chunktag())): [[(("lorem", "ipsum"), "ipsum")] * 2],
    kb.Stem(): "ips",
    kb.Word(): "ipsum",