Example #1
 def __init__(
     self,
     tokenizer: algorithm(Sentence(), List(Word())),
     feature_extractor: algorithm(Word(), Flags()),
     include_text: Boolean(),
 ):
     self.tokenizer = tokenizer
     self.feature_extractor = feature_extractor
     self.include_text = include_text
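
In these examples, `algorithm(X, Y)` is AutoGOAL's semantic-type annotation: it asks the framework to inject any registered algorithm whose `run` method maps `X` to `Y`. For reference, a minimal sketch of a class that would satisfy the `tokenizer` slot above, assuming the `autogoal.kb` types used throughout these examples (the class name and the whitespace-splitting logic are illustrative, not part of the library):

from autogoal.kb import Sentence, List, Word

class WhitespaceTokenizer:
    """Illustrative algorithm matching algorithm(Sentence(), List(Word()))."""

    def run(self, input: Sentence()) -> List(Word()):
        # Treat each whitespace-separated piece of the sentence as a Word.
        return input.split()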
Example #2
def test_meta_pipeline_graph():
    # Test List algorithm generation
    build_pipeline_graph(input=List(Word()),
                         output=List(Word()),
                         registry=[WordToWordAlgorithm])
    
    # Test Tuple breakdown feature
    build_pipeline_graph(input=Tuple(Word(), Matrix()),
                         output=Text(),
                         registry=[WordToWordAlgorithm])
    
    # Test Tuple breakdown feature and List algorithm generation
    build_pipeline_graph(input=Tuple(List(Word()), Matrix()),
                         output=List(Word()),
                         registry=[WordToWordAlgorithm])
Example #3
    def __init__(
            self,
            lowercase: Boolean(),
            stopwords_remove: Boolean(),
            binary: Boolean(),
            inner_tokenizer: algorithm(Sentence(), List(Word())),
            inner_stemmer: algorithm(Word(), Stem()),
            inner_stopwords: algorithm(List(Word()), List(Word())),
    ):
        self.stopwords_remove = stopwords_remove
        self.inner_tokenizer = inner_tokenizer
        self.inner_stemmer = inner_stemmer
        self.inner_stopwords = inner_stopwords

        SklearnTransformer.__init__(self)
        _CountVectorizer.__init__(self, lowercase=lowercase, binary=binary)
Example #4
def test_simple_pipeline_graph():
    graph = build_pipeline_graph(
        input=MatrixContinuousDense(),
        output=MatrixContinuousDense(),
        registry=[ExactAlgorithm, HigherInputAlgorithm, LowerOutputAlgorithm],
    ).graph
    assert_graph(graph, 3, 3, 6)

    graph = build_pipeline_graph(
        input=List(Text()),
        output=Document(),
        registry=[WordToWordAlgorithm,
                  TextToWordAlgorithm,
                  WordToWordListAlgorithm,
                  WordListToSentenceAlgorithm,
                  WordListToSentenceAlgorithm,
                  SentenceListToDocumentAlgorithm,
                  TextListToDocumentAlgorithm],
    ).graph
    assert_graph(graph, 2, 2, 12)

    graph = build_pipeline_graph(
        input=List(Word()),
        output=Document(),
        registry=[WordToWordAlgorithm,
                  TextToWordAlgorithm,
                  WordToWordListAlgorithm,
                  WordListToSentenceAlgorithm,
                  WordListToSentenceAlgorithm,
                  SentenceListToDocumentAlgorithm,
                  TextListToDocumentAlgorithm],
    ).graph
    assert_graph(graph, 2, 1, 10)
Example #5
 def run(
     self, input: Word(domain='general', language='spanish')) -> Summary():
     """Return the Wikipedia summary for the given word, or an empty string if the lookup fails."""
     try:
         return wikipedia.summary(input)
     except Exception:
         return ""
Example #6
 def run(
     self, input: Word(domain="general", language="spanish")
 ) -> ContinuousVector():
     """Use gensim's Word2Vec model to transform a word into an embedding vector."""
     try:
         return self.model.get_vector(input.lower())
     except KeyError:
         # Out-of-vocabulary words map to the zero vector.
         return np.zeros(400)
Example #7
 def __init__(
     self,
     extractors: Distinct(
         algorithm(Word(), Flags()), exceptions=["MultipleFeatureExtractor"]
     ),
     merger: algorithm(List(Flags()), Flags()),
 ):
     self.extractors = extractors
     self.merger = merger
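
Here `Distinct(...)` requests a set of distinct word-to-flags extractors, with `exceptions=["MultipleFeatureExtractor"]` presumably excluding the enclosing class itself so it cannot nest recursively. The `merger` slot expects something that collapses a list of flag dictionaries into one; a minimal sketch of such a merger (hypothetical class name; dictionary union is one plausible merge strategy):

class DictUnionMerger:
    """Illustrative algorithm matching algorithm(List(Flags()), Flags())."""

    def run(self, input: List(Flags())) -> Flags():
        merged = {}
        for flags in input:
            merged.update(flags)  # later extractors override duplicate keys
        return merged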
Example #8
def test_build_pipeline_graph():
    test_meta_pipeline_graph()
    test_simple_pipeline_graph()
    
    # Test failed graph generation
    assert_pipeline_graph_failed(Text(), Word(), [])
    assert_pipeline_graph_failed(Text(), 
                                 Document(), 
                                 [WordToWordAlgorithm, 
                                  TextToWordAlgorithm, 
                                  WordToWordListAlgorithm,
                                  SentenceListToDocumentAlgorithm,
                                  TextListToDocumentAlgorithm])
Example #9
    def run(self, input: Sentence()) -> Tuple(List(Word()), List(Flags())):
        tokenized = self.nlp(input)

        tokens = []
        flags = []

        for token in tokenized:
            token_flags = {}
            if self.extract_lemma:
                token_flags["lemma"] = token.lemma_
            if self.extract_pos_tag:
                token_flags["pos"] = token.pos_

                for kv in token.tag_.split("|"):
                    kv = kv.split("=")
                    if len(kv) == 2:
                        token_flags["tag_" + kv[0]] = kv[1]
                    else:
                        token_flags["tag_" + kv[0]] = True

            if self.extract_dep:
                token_flags["dep"] = token.dep_
            if self.extract_entity:
                token_flags["ent_type"] = token.ent_type_
                token_flags["ent_kb_id"] = token.ent_kb_id_
            if self.extract_details:
                token_flags["is_alpha"] = token.is_alpha
                token_flags["is_ascii"] = token.is_ascii
                token_flags["is_digit"] = token.is_digit
                token_flags["is_lower"] = token.is_lower
                token_flags["is_upper"] = token.is_upper
                token_flags["is_title"] = token.is_title
                token_flags["is_punct"] = token.is_punct
                token_flags["is_left_punct"] = token.is_left_punct
                token_flags["is_right_punct"] = token.is_right_punct
                token_flags["is_space"] = token.is_space
                token_flags["is_bracket"] = token.is_bracket
                token_flags["is_quote"] = token.is_quote
                token_flags["is_currency"] = token.is_currency
                token_flags["like_url"] = token.like_url
                token_flags["like_num"] = token.like_num
                token_flags["like_email"] = token.like_email
                token_flags["is_oov"] = token.is_oov
                token_flags["is_stop"] = token.is_stop
            if self.extract_sentiment:
                token_flags["sentiment"] = token.sentiment

            tokens.append(token.text)
            flags.append(token_flags)

        return tokens, flags
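
The `token.tag_` handling above is the least obvious step: spaCy packs morphological features into a single pipe-separated string. A standalone restatement of that parsing (the helper name is hypothetical):

def parse_tag(tag: str) -> dict:
    # Mirrors the token.tag_ handling above: "Gender=Masc|Number=Sing"
    # becomes {"tag_Gender": "Masc", "tag_Number": "Sing"}, and bare
    # values like "NN" become {"tag_NN": True}.
    flags = {}
    for kv in tag.split("|"):
        kv = kv.split("=")
        if len(kv) == 2:
            flags["tag_" + kv[0]] = kv[1]
        else:
            flags["tag_" + kv[0]] = True
    return flags

assert parse_tag("Gender=Masc|Number=Sing") == {"tag_Gender": "Masc", "tag_Number": "Sing"}
assert parse_tag("NN") == {"tag_NN": True}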
Example #10
    def generated_classifier_from_dataset(data: Collection,
                                          number_of_models: int = 5):
        models = []
        lines, classes = load_training_entities(data)
        unique_classes = reduce(lambda x, y: x | y, [set(c) for c in classes])

        for _ in range(number_of_models):
            classifier = AutoML(
                input=ag_List(ag_List(Word())),
                output=ag_List(ag_List(Postag())),
            )

            classifier.fit([[w.text for w in l] for l in lines], classes)
            models.append(classifier)

        return SentencesAnnotator(models=models,
                                  collection_base=data,
                                  unique_classes=unique_classes)
Example #11
 def __init__(self, tokenizer: algorithm(Sentence(), List(Word()))):
     self.tokenizer = tokenizer

 def run(self, input: List(Word())) -> List(Vector()):
     pass

 def run(self, input: Sentence()) -> List(Word()):
     pass
Example #14
 def run(self, input: Word()) -> Flags():
     r_exp = self._regex()
     # Match the whole word or search within it, depending on self.full.
     b = re.fullmatch(r_exp, input) if self.full else re.search(r_exp, input)
     return {f"is_{self._name}_regex": bool(b)}
Example #15
parser.add_argument("--token", default=None)
parser.add_argument("--channel", default=None)

args = parser.parse_args()

print(args)

# ## Experimentation

# Instantiate the classifier.
# Note that the input and output types here are defined to match the problem statement,
# i.e., entity recognition.

classifier = AutoML(
    search_algorithm=PESearch,
    input=List(List(Word())),
    output=List(List(Postag())),
    search_iterations=args.iterations,
    score_metric=meddocan.F1_beta,
    cross_validation_steps=1,
    search_kwargs=dict(
        pop_size=args.popsize,
        search_timeout=args.global_timeout,
        evaluation_timeout=args.timeout,
        memory_limit=args.memory * 1024 ** 3,
    ),
)

# This custom logger is used for debugging purposes, to be able later to recover
# the best pipelines and all the errors encountered in the experimentation process.
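
One plausible implementation of such a logger, modeled on the `error` and `update_best` hooks that `autogoal.search.Logger` exposes (the file names are illustrative; assuming this base class, the logger would typically be handed to `fit` through its `logger` argument):

from autogoal.search import Logger

class ExperimentLogger(Logger):
    def error(self, e: Exception, solution):
        # Record every failing pipeline together with the error it raised.
        with open("errors.log", "a") as fp:
            fp.write(f"solution={repr(solution)}\nerror={e}\n\n")

    def update_best(self, new_best, new_fn, *args):
        # Record each new best pipeline and its fitness.
        with open("best.log", "a") as fp:
            fp.write(f"solution={repr(new_best)}\nfitness={new_fn}\n\n")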
Example #16
 def run(self, input:Word()) -> List(Word()):
     pass
Example #17
 def __init__(self,
     tokenizer: algorithm(Sentence(), List(Word())),
     token_feature_extractor: algorithm(Word(), Flags()),
     # token_sentence_encoder: algorithm(Word(), )
 ):
     pass
Example #18
 def run(
     self, input: Tuple(Sentence(), List(Entity()))
 ) -> Tuple(List(Word()), List(Postag())):
     pass
Example #19
 def run(self, input:Text()) -> Word():
     pass
Example #20
 def run(self, input:List(Word())) -> Sentence():
     pass
Example #21
 def run(self, input: Word()) -> Flags():
     flags = [extractor.run(input) for extractor in self.extractors]
     return self.merger.run(flags)
Example #22
 def run(
     self, input: Word(domain='general', language='spanish')) -> Flags():
     """Flag whether the given word has any Wikipedia search results."""
     return dict(in_wikipedia=bool(wikipedia.search(input)))
Example #23
 def run(self, input:Word()) -> Word():
     pass