Example #1
 def __init__(self, language="es"):
     """
     Init method
     :param language: input language
     """
     # Note: the stemmer is hard-coded to Spanish, so the language argument
     # is effectively ignored here
     self.__stemmer = snowballstemmer.stemmer("spanish")
     Token.set_extension("stem", default="", force=True)
Example #2
    def __init__(self):
        super().__init__()

        if not Doc.has_extension(self.name):
            Doc.set_extension(self.name, default=[])

        if not Token.has_extension('is_lexical'):
            Token.set_extension('is_lexical', default=False)
Example #3
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    Span.set_extension('tokens_without_citations',
                       getter=get_span_tokens_without_citations,
                       force=True)
    Span.set_extension('text_without_citations',
                       getter=get_span_text_without_citations,
                       force=True)
    Span.set_extension('text_with_ws_without_citations',
                       getter=get_span_text_with_ws_wo_cites,
                       force=True)
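The three getters referenced above are not shown in the example; a plausible sketch of what they might look like (the names match the references, but the bodies are assumptions):

def get_span_tokens_without_citations(span):
    # drop tokens that were flagged as in-text citations
    return [t for t in span if not t._.is_in_text_citation]

def get_span_text_without_citations(span):
    return ' '.join(t.text for t in span._.tokens_without_citations)

def get_span_text_with_ws_wo_cites(span):
    return ''.join(t.text_with_ws for t in span._.tokens_without_citations)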
Example #4
 def __init__(self, nlp, name, source: str = None, domain: str = None):
     Token.set_extension(BabelnetAnnotator.__FIELD,
                         default=None,
                         force=True)
     self.__lang = nlp.lang
     self.__bn_lang = bn.Language.fromISO(nlp.lang)
     self.__source = source
     self.__domain = domain
     self.__bn_domain = None
     if domain:
         self.__bn_domain = bn.BabelDomain.valueOfName(domain)
     self.__bn_source = None
     if source:
         self.__bn_source = getattr(bn.BabelSenseSource, source)
Example #5
 def text_to_instance(self,
                      source_string: str,
                      target_string: str = None,
                      paragraph_id: str = None,
                      turn_id: int = 0) -> Instance:  # type: ignore
     # pylint: disable=arguments-differ
     tokenized_source = self._tokenizer.tokenize(source_string)
     tokenized_source.insert(0, Token(START_SYMBOL))
     tokenized_source.append(Token(END_SYMBOL))
     source_field = TextField(tokenized_source, self._token_indexers)
     if target_string is not None:
         tokenized_target = self._tokenizer.tokenize(target_string)
         tokenized_target.insert(0, Token(START_SYMBOL))
         tokenized_target.append(Token(END_SYMBOL))
         target_field = TextField(tokenized_target, self._token_indexers)
         return Instance({"source_tokens": source_field})
     else:
         return Instance({"source_tokens": source_field})
Example #6
 def __init__(self, language: str = "es"):
     """
     Init method
     :param language: language of the annotation
     """
     self.__sentiment_words = load_dict(language, "sentiment_words.csv")
     self.__boosters = load_dict(language, "boosters.csv")
     self.__negations = load_dict(language, "negations.csv")
     Span.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("negation_weight", default=1.0, force=True)
     Token.set_extension("booster_weight", default=0.0, force=True)
Example #7
 def __init__(self, nlp, name):
     self.__lang = nlp.lang
     Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
     load_wordnet_domains()
Example #8
    def _set_spacy_extensions(self):
        def synset_getter(token):

            if not Disambiguator._wn:
                from nltk.corpus import wordnet as wn
                Disambiguator._wn = wn

            else:
                wn = Disambiguator._wn

            # offset is a string whose last character is the WordNet POS and
            # whose characters after the 3-char prefix are the numeric offset
            offset = token._.offset
            if offset:
                return wn.synset_from_pos_and_offset(offset[-1],
                                                     int(offset[3:-1]))
            else:
                return None

        def bnid_getter(token):
            return babelnet_map.get(token._.offset)

        from spacy.tokens import Doc, Token
        Doc.set_extension('lang', default='en')

        Token.set_extension('lemma_preset_', default=None)
        Token.set_extension('pos_preset_', default=None)

        Token.set_extension('lemma_preset_else_spacy',
                            getter=lambda t: t._.lemma_preset_ or t.lemma_)
        Token.set_extension('pos_preset_else_spacy',
                            getter=lambda t: t._.pos_preset_ or t.pos_)

        Token.set_extension('offset', default=None)
        Token.set_extension('synset', getter=synset_getter)
        Token.set_extension('bnid', getter=bnid_getter)
        Token.set_extension('disambiguator_internals', default=None)
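A hedged usage sketch of the `offset`/`synset` pair; the offset string format is inferred from the slicing in `synset_getter`, and the NLTK WordNet corpus must already be downloaded:

import spacy
from nltk.corpus import wordnet as wn
from spacy.tokens import Token

# stand-alone re-registration of the two extensions for demonstration
Token.set_extension('offset', default=None, force=True)
Token.set_extension(
    'synset',
    getter=lambda t: wn.synset_from_pos_and_offset(
        t._.offset[-1], int(t._.offset[3:-1])) if t._.offset else None,
    force=True)

nlp = spacy.blank('en')
doc = nlp('dog')
doc[0]._.offset = 'wn:02084442n'  # assumed format: prefix + offset digits + POS
print(doc[0]._.synset)            # Synset('dog.n.01') under WordNet 3.0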
Example #9
 def __init__(self, lang: str = 'es'):
     Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
     load_wordnet_domains()
     self.__lang = lang
Example #10
import spacy
from spacy.tokens.token import Token

nlp = spacy.load('en_core_web_lg')

input_file = '../../../tasks/02-structural-linguistics/examiner-headlines.txt'
output_file = './2.1-correct_headlines.txt'

big_letter_tag = ['NN', 'PRP', 'VB', 'JJ', 'RB']
Token.set_extension('need_capitalize', default=False)


def _check_token_needs_capitalizing(token):
    pos_big_letter = any(token.tag_.startswith(tag) for tag in big_letter_tag)
    # distinguish prepositions from subordinate conjunctions
    pos_conjunction = token.pos_ == 'ADP' and len(list(token.children)) == 0
    # hyphen is handled separately
    return pos_big_letter or pos_conjunction


def mark_doc(line: str):
    doc = nlp(line)

    # first word capitalization
    doc[0]._.set('need_capitalize', True)

    for token in doc[1:-1]:
        if token.text != '-':
            needs_capitalization = _check_token_needs_capitalizing(token)
            if needs_capitalization:
                token._.set('need_capitalize', True)

    return doc
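A hypothetical driver that ties `mark_doc` to the file paths defined above; the re-capitalization step is an assumption, since the original snippet only marks tokens:

def correct_headlines():
    with open(input_file, encoding='utf-8') as fin, \
            open(output_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            doc = mark_doc(line.strip())
            # upper-case only the first letter so acronyms stay intact
            fixed = ''.join(
                (t.text[:1].upper() + t.text[1:] if t._.need_capitalize
                 else t.text) + t.whitespace_
                for t in doc)
            fout.write(fixed + '\n')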
Example #11
import spacy
from blingfire import text_to_sentences
from spacy.tokens import Span, Doc
from spacy.tokens.token import Token

from data_model import span_to_dict
from relationships_resolver import SimpleResolutionResolver, VicinityResolutionResolver

# Note: mutable defaults such as [] are shared by every Span/Token unless a
# component assigns a fresh object per item
Span.set_extension('id', default=None, force=True)
Span.set_extension('links', default=[], force=True)
Span.set_extension('linkable', default=False, force=True)
Span.set_extension('bounding_boxes', default=[], force=True)
Span.set_extension('formattedText', default="", force=True)

Token.set_extension('id', default=None, force=True)
Token.set_extension('links', default=[], force=True)
Token.set_extension('linkable', default=False, force=True)
Token.set_extension('bounding_boxes', default=[], force=True)
Token.set_extension('formattedText', default="", force=True)


def decode(response):
    try:
        return response.json()
    except ValueError as e:
        return "Error: " + str(e)


def entities_classes():
    return [
Example #12
 def __init__(self, nlp, name, K=10, xnet='wordnet'):
     self.__lang = nlp.lang
     self.__K = K
     Token.set_extension(self.__FIELD, default=None, force=True)
     Ares.load_data(nlp.lang, xnet)