def stop_word_tagger_hub(input_dict):
    """Apply the *stop_word_tagger* object on the Annotated Document Corpus (*adc*):

    1. select only annotations of type Token Annotation *element_annotation*,
    2. apply the stop word tagger,
    3. create new annotations *output_feature* with the outputs of the stop word tagger.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param stop_word_tagger: A python dictionary containing the stop word tagger object and its arguments.
    :param element_annotation: Which annotated part of document to be searched for stopwords.
    :param output_features: How to annotate the newly discovered stop word features.
    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """
    tagger = input_dict['stop_word_tagger']
    if isinstance(tagger, LatinoObject):
        # Latino taggers are handled by the Latino backend, optionally on the
        # dedicated Windows worker queue.
        from ...latino.library_gen import latino_tag_adcstopwords
        input_dict['tagger'] = tagger  # TODO temporary
        if settings.USE_WINDOWS_QUEUE:
            return executeFunction.apply_async(
                [latino_tag_adcstopwords, input_dict], queue="windows").wait()
        return latino_tag_adcstopwords(input_dict)
    # Non-Latino taggers go through the generic word-tagger hub.
    return universal_word_tagger_hub(
        input_dict['adc'],
        tagger,
        input_dict['element_annotation'],
        input_dict['output_feature'])
def stop_word_tagger_hub(input_dict):
    """Apply the *stop_word_tagger* object on the Annotated Document Corpus (*adc*):

    1. select only annotations of type Token Annotation *element_annotation*,
    2. apply the stop word tagger,
    3. create new annotations *output_feature* with the outputs of the stop word tagger.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param stop_word_tagger: A python dictionary containing the stop word tagger object and its arguments.
    :param element_annotation: Which annotated part of document to be searched for stopwords.
    :param output_features: How to annotate the newly discovered stop word features.
    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """
    stop_word_tagger = input_dict['stop_word_tagger']
    if not isinstance(stop_word_tagger, LatinoObject):
        # Generic path: delegate to the universal word tagger hub.
        return universal_word_tagger_hub(
            input_dict['adc'],
            stop_word_tagger,
            input_dict['element_annotation'],
            input_dict['output_feature'])
    # Latino path: may be executed remotely on the Windows queue.
    from ...latino.library_gen import latino_tag_adcstopwords
    input_dict['tagger'] = stop_word_tagger  # TODO temporary
    if settings.USE_WINDOWS_QUEUE:
        async_result = executeFunction.apply_async(
            [latino_tag_adcstopwords, input_dict], queue="windows")
        return async_result.wait()
    return latino_tag_adcstopwords(input_dict)
def stem_lemma_tagger_hub(input_dict):
    """Apply the *tagger* object to the *element_annotation* tokens of the ADC,
    storing each result as the *output_feature* feature.

    Dispatches to the Latino backend when the tagger is a ``LatinoObject``,
    otherwise to :func:`universal_word_tagger_hub`.
    """
    tagger = input_dict['tagger']
    if isinstance(tagger, LatinoObject):  # check if this is a latino object
        from ...latino.library_gen import latino_tag_adc_stem_lemma
        if settings.USE_WINDOWS_QUEUE:
            return executeFunction.apply_async(
                [latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
        return latino_tag_adc_stem_lemma(input_dict)
    return universal_word_tagger_hub(
        input_dict['adc'],
        tagger,
        input_dict['element_annotation'],
        input_dict['output_feature'])
def stem_lemma_tagger_hub(input_dict):
    """Run *tagger* over the ADC's *element_annotation* annotations and record
    the outputs under *output_feature*.

    Latino taggers are routed to the Latino library (possibly via the Windows
    worker queue); all others go through the universal word tagger hub.
    """
    if not isinstance(input_dict['tagger'], LatinoObject):
        # Plain (non-Latino) tagger: use the generic hub.
        return universal_word_tagger_hub(
            input_dict['adc'],
            input_dict['tagger'],
            input_dict['element_annotation'],
            input_dict['output_feature'])
    # Latino object: import lazily and optionally execute on the Windows queue.
    from ...latino.library_gen import latino_tag_adc_stem_lemma
    if settings.USE_WINDOWS_QUEUE:
        return executeFunction.apply_async(
            [latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
    return latino_tag_adc_stem_lemma(input_dict)
def tokenizer_hub(input_dict): """ Apply the *tokenizer* object on the Annotated Document Corpus (*adc*): 1. first select only annotations of type *input_annotation*, 2. apply the tokenizer 3. create new annotations *output_annotation* with the outputs of the tokenizer. :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus) :param tokenizer: A python dictionary containing the Tokenizer object and its arguments. :param input_annotation: Which annotated part of document to be splitted. :param output_annotation: How to annotate the newly discovered tokens. :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus) """ tokenizer_dict = input_dict['tokenizer'] if type(tokenizer_dict) != dict: from workflows.tasks import executeFunction from tf_latino.latino.library_gen import latino_tokenize_words return latino_tokenize_words(input_dict) if not settings.USE_WINDOWS_QUEUE \ else executeFunction.apply_async([latino_tokenize_words,input_dict],queue="windows").wait() else: tokenizer = tokenizer_dict['object'] args = tokenizer_dict.get('args', []) kwargs = tokenizer_dict.get('kargs', {}) input_annotation = input_dict['input_annotation'] output_annotation = input_dict['output_annotation'] adc = input_dict['adc'] docs_count = len(adc.documents) for i, document in enumerate(adc.documents): if document.features['contentType'] == "Text": if not document.text: pass for annotation, subtext in document.get_annotations_with_text( input_annotation): #all annotations of this type new_token_spans = tokenizer.span_tokenize( subtext, *args, **kwargs) for starts_at, ends_at in new_token_spans: document.annotations.append( Annotation(annotation.span_start + starts_at, annotation.span_start + ends_at - 1, output_annotation)) if i % 100 == 0: print int((i + 1) * 1.0 / docs_count * 100) #widget.progress = int((i+1)*1.0/*100) #widget.save() return {'adc': adc}
def stem_lemma_tagger_hub(input_dict):
    """Apply *tagger* to the ADC's *element_annotation* annotations, writing
    results to *output_feature*; an optional *pos_annotation* is forwarded to
    the universal tagger hub.
    """
    # Compare by class name so LatinoObject need not be importable here.
    if input_dict['tagger'].__class__.__name__ == "LatinoObject":  # check if this is a latino object
        from tf_latino.latino.library_gen import latino_tag_adc_stem_lemma
        from workflows.tasks import executeFunction
        if settings.USE_WINDOWS_QUEUE:
            return executeFunction.apply_async(
                [latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
        return latino_tag_adc_stem_lemma(input_dict)
    return universal_word_tagger_hub(
        input_dict['adc'],
        input_dict['tagger'],
        input_dict['element_annotation'],
        input_dict['output_feature'],
        input_dict.get('pos_annotation'))
def tokenizer_hub(input_dict): """ Apply the *tokenizer* object on the Annotated Document Corpus (*adc*): 1. first select only annotations of type *input_annotation*, 2. apply the tokenizer 3. create new annotations *output_annotation* with the outputs of the tokenizer. :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus) :param tokenizer: A python dictionary containing the Tokenizer object and its arguments. :param input_annotation: Which annotated part of document to be splitted. :param output_annotation: How to annotate the newly discovered tokens. :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus) """ tokenizer_dict = input_dict['tokenizer'] if type(tokenizer_dict)!=dict: from ...latino.library_gen import latino_tokenize_words return latino_tokenize_words(input_dict) if not settings.USE_WINDOWS_QUEUE \ else executeFunction.apply_async([latino_tokenize_words,input_dict],queue="windows").wait() else: tokenizer=tokenizer_dict['object'] args=tokenizer_dict.get('args',[]) kwargs=tokenizer_dict.get('kargs',{}) input_annotation = input_dict['input_annotation'] output_annotation = input_dict['output_annotation'] adc = input_dict['adc'] docs_count=len(adc.documents) for i,document in enumerate(adc.documents): if document.features['contentType'] == "Text": if not document.text: pass for annotation,subtext in document.get_annotations_with_text(input_annotation): #all annotations of this type new_token_spans=tokenizer.span_tokenize(subtext,*args,**kwargs) for starts_at,ends_at in new_token_spans: document.annotations.append(Annotation(annotation.span_start+starts_at,annotation.span_start+ends_at-1,output_annotation)) if i%100==0: print int((i+1)*1.0/docs_count*100) #widget.progress = int((i+1)*1.0/*100) #widget.save() return {'adc': adc}
def pos_tagger_hub(input_dict): if isinstance(input_dict['pos_tagger'],LatinoObject): #check if this is a latino object from ...latino.library_gen import latino_pos_tag adc= executeFunction.apply_async([latino_pos_tag,input_dict],queue="windows").wait()['adc'] \ if settings.USE_WINDOWS_QUEUE else latino_pos_tag(input_dict) else: adc= universal_sentence_tagger_hub(input_dict)['adc'] number_of_letters=int(input_dict['num_of_letters']) if number_of_letters!=-1: element_annotation_name = input_dict['element_annotation'] output_annotation_name = input_dict['output_feature'] for doc in adc.documents: for annotation in doc.get_annotations(element_annotation_name): if not output_annotation_name in annotation.features: print input_dict['pos_tagger'],annotation.features print doc.features else: annotation.features[output_annotation_name]=annotation.features[output_annotation_name][0:number_of_letters] return {'adc': adc }
def pos_tagger_hub(input_dict): if input_dict['pos_tagger'].__class__.__name__=="LatinoObject": #check if this is a latino object from tf_latino.latino.library_gen import latino_pos_tag from workflows.tasks import executeFunction adc= executeFunction.apply_async([latino_pos_tag,input_dict],queue="windows").wait()['adc'] \ if settings.USE_WINDOWS_QUEUE else latino_pos_tag(input_dict) else: adc= universal_sentence_tagger_hub(input_dict)['adc'] number_of_letters=int(input_dict['num_of_letters']) if number_of_letters!=-1: element_annotation_name = input_dict['element_annotation'] output_annotation_name = input_dict['output_feature'] for doc in adc.documents: for annotation in doc.get_annotations(element_annotation_name): if not output_annotation_name in annotation.features: print input_dict['pos_tagger'],annotation.features print doc.features else: annotation.features[output_annotation_name]=annotation.features[output_annotation_name][0:number_of_letters] return {'adc': adc }