def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attributes on the Token. We'll be overwriting these based on
        # the matches, so we only set a default value, not a getter. Recent
        # spaCy versions expect the default to be passed explicitly, so we use
        # default=None for the lookup attributes.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital', default=None)
        Token.set_extension('country_latlng', default=None)
        Token.set_extension('country_flag', default=None)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
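    # --- Added sketch (not part of the original excerpt) ---
    # The __init__ above only registers the matcher and the extensions; a
    # __call__ method is what fills them per document. The original __call__ is
    # not shown here, so this is a hedged, minimal sketch of how such a method
    # is typically written for this pattern. It assumes the 'capital', 'latlng'
    # and 'flag' fields of the REST Countries API response, and that Doc, Span
    # and Token are imported from spacy.tokens as in the original file.
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []  # collect the matched spans so they can be added as entities
        for _, start, end in matches:
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Copy the country data onto every token in the matched span
            data = self.countries[entity.text]
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', data['capital'])
                token._.set('country_latlng', data['latlng'])
                token._.set('country_flag', data['flag'])
        doc.ents = list(doc.ents) + spans
        return doc

    def has_country(self, tokens):
        """Getter for Doc and Span: True if any contained token is a country."""
        return any(t._.get('is_country') for t in tokens)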
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
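# --- Added usage sketch (not part of the original excerpt) ---
# `TechCompanyRecognizer` is a placeholder name for the enclosing class, which
# is assumed to define a __call__ that marks matched tokens. The spaCy v2-style
# add_pipe below matches the matcher.add(name, None, *patterns) call above.
import spacy

nlp = spacy.load('en_core_web_sm')
component = TechCompanyRecognizer(nlp, companies=['Apple', 'Alphabet Inc.', 'Netflix'])
nlp.add_pipe(component, last=True)

doc = nlp('Alphabet Inc. is the company behind Google.')
print(doc._.has_tech_org, [t.text for t in doc if t._.is_tech_org])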
Example #3
    def __init__(self,
                 nlp,
                 quickumls_fp,
                 best_match=True,
                 ignore_syntax=False,
                 **kwargs):
        """Instantiate SpacyQuickUMLS object

            This creates a QuickUMLS spaCy component which can be used in modular pipelines.  
            This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contain "similarity" and "semtypes" for matched concepts.

        Args:
            nlp: Existing spaCy pipeline.  This is needed to update the vocabulary with UMLS CUI values
            quickumls_fp (str): Path to QuickUMLS data
            best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True.
            ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False.
            **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
        """

        self.quickumls = QuickUMLS(
            quickumls_fp,
            # By default, the QuickUMLS object creates its own internal spaCy pipeline, but this is not needed
            # when we're using it as a component in a pipeline
            spacy_component=True,
            **kwargs)

        # save this off so that we can get vocab values of labels later
        self.nlp = nlp

        # keep these for matching
        self.best_match = best_match
        self.ignore_syntax = ignore_syntax

        # extend Span with the properties we want for matched concepts
        Span.set_extension('similarity', default=-1.0)
        Span.set_extension('semtypes', default=-1.0)
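# --- Added usage sketch (not part of the original snippet) ---
# It assumes the class above is importable as SpacyQuickUMLS, that a QuickUMLS
# data directory exists at the placeholder path below, and a spaCy v2-style
# pipeline (manual construction, as implied by the __init__ shown).
import spacy

nlp = spacy.load('en_core_web_sm')
quickumls_component = SpacyQuickUMLS(nlp, '/path/to/quickumls/install')
nlp.add_pipe(quickumls_component)

doc = nlp('The patient denies chest pain and shortness of breath.')
for ent in doc.ents:
    # the label is the UMLS CUI; similarity and semtypes come from the extensions above
    print(ent.text, ent.label_, ent._.similarity, ent._.semtypes)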
Example #4
    def __init__(self,
                 nlp,
                 attrs=('has_nfh', 'is_nfh', 'nfh', 'is_deter_nfh', 'nfh_head',
                        'is_implicit'),
                 force_extension=True):
        """Initialise the pipeline component.

        nlp (Language): The shared nlp object. Used to initialise the matcher
            with the shared `Vocab`, and create `Doc` match patterns.
        RETURNS (callable): A spaCy pipeline component.
        """
        download_models()

        home = path.expanduser("~")

        with open(path.join(home, NFH_DIR, IDENTIFICATION_NFH), 'rb') as f:
            self.identification = pickle.load(f)
            self.feature_extractor = FeatureExtractor(3)

        archive_model = load_archive(path.join(home, NFH_DIR, RESOLUTION_NFH))
        self.resolution_predictor = Predictor.from_archive(
            archive_model, 'nfh_classification')

        self._has_nfh, self._is_nfh, self._nfh, self._is_deter_nfh, \
            self._nfh_head, self._is_implicit = attrs
        self._nfh_items = 'nfh_items'

        # Add attributes
        Doc.set_extension(self._has_nfh,
                          getter=self.has_nfh,
                          force=force_extension)
        Span.set_extension(self._has_nfh,
                           getter=self.has_nfh,
                           force=force_extension)

        Doc.set_extension(self._nfh,
                          getter=self.iter_nfh,
                          force=force_extension)
        Span.set_extension(self._nfh,
                           getter=self.iter_nfh,
                           force=force_extension)

        Span.set_extension(self._is_nfh, default=False, force=force_extension)
        Token.set_extension(self._is_nfh, default=False, force=force_extension)
        Token.set_extension(self._is_deter_nfh,
                            default=False,
                            force=force_extension)
        Token.set_extension(self._nfh_head,
                            default=None,
                            force=force_extension)
        Token.set_extension(self._is_implicit,
                            default=False,
                            force=force_extension)

        Doc.set_extension(self._nfh_items, default=[], force=force_extension)
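# --- Added usage sketch (assumption, not from the original source) ---
# Assuming the component above has been added to an `nlp` pipeline under
# whatever name the project exposes, the extensions registered with the
# default attribute names in `attrs` would be read roughly like this:
doc = nlp('I bought five apples and she bought two.')
if doc._.has_nfh:
    for token in doc:
        if token._.is_nfh:
            # nfh_head defaults to None (see the registration above)
            print(token.text, token._.is_deter_nfh, token._.is_implicit, token._.nfh_head)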
Example #5
File: knp.py Project: shafiahmed/camphr
def install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in [
        K.bunsetsu.element,
        K.tag.element,
        K.bunsetsu.list_,
        K.morph.list_,
        K.tag.list_,
    ]:
        Span.set_extension(k, default=None, force=True)
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans,
                           getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent,
                           getter=get_knp_parent(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).children,
                           getter=get_knp_children(k))
Example #6
def _install_extensions():
    K = KNP_USER_KEYS
    Token.set_extension(K.morph.element, default=None, force=True)
    for k in ["bunsetsu", "tag"]:
        Token.set_extension(getattr(K.morph, k), getter=token_to_knp_span(k))
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["element", "list_"]:
            key = getattr(getattr(K, k), feature)
            Span.set_extension(key, default=None, force=True)
    for k in ["bunsetsu", "morph", "tag"]:
        for feature in ["spans", "list_"]:
            key = getattr(getattr(K, k), feature)
            Doc.set_extension(key, getter=get_all_knp_features_from_sents(k, feature))
    for k in [BUNSETSU, TAG]:
        Span.set_extension(getattr(KNP_USER_KEYS, k).spans, getter=get_knp_span(k))
        Span.set_extension(getattr(KNP_USER_KEYS, k).parent, getter=get_knp_parent(k))
        Span.set_extension(
            getattr(KNP_USER_KEYS, k).children, getter=get_knp_children(k)
        )
Example #7
    def __init__(self, spacy_instance, stop_words=None):
        self.nlp = spacy_instance
        self.stop_words = spacy_instance.Defaults.stop_words if stop_words is None else stop_words
        self.whitelist_words = {'pajamas'}
        self.whitelist_grammar = {'UPPERCASE_SENTENCE_START'}
        self.spell = SpellChecker()

        Span.set_extension('has_grammar_errors', default=False, force=True)
        Span.set_extension('grammar_recommendation', default=[], force=True)

        Span.set_extension('has_spelling_errors', default=False, force=True)
        Token.set_extension('correct_spelling_candidates',
                            default=[],
                            force=True)
Example #8
    def __init__(self,
                 nlp,
                 keywords,
                 label,
                 tokentag,
                 doctag=None,
                 spantag=None):
        nlp.vocab.strings.add(label)
        self.label = nlp.vocab.strings[label]
        self._label_str = label
        self._token_tag = tokentag
        self._doctag = doctag
        self._spantag = spantag
        self._keywordtag = "is_keyword"
        self._labeltag = "label_"
        # Set up the PhraseMatcher – it can take Doc objects as patterns,
        # so even if the list of keywords is long, it's very efficient
        patterns = [nlp(key) for key in keywords]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self._token_tag, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension(self._token_tag, default=False)
        if not Token.has_extension(self._keywordtag):
            Token.set_extension(self._keywordtag, default=False)
            Token.set_extension(self._labeltag, default=None)
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens has the token tag set to True.
        Doc.set_extension(self._doctag,
                          getter=lambda tokens: any(
                              [t._.get(self._token_tag) for t in tokens]))
        Span.set_extension(self._spantag,
                           getter=lambda tokens: any(
                               [t._.get(self._token_tag) for t in tokens]))
        if not Span.has_extension("dep_"):
            Span.set_extension("dep_", default="")
            Span.set_extension("head_", default=None)
Example #9
from spacy.lang.ja import Japanese
from spacy.tokens import Span

nlp = Japanese()


# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register to_html as the Span extension attribute "to_html"
Span.set_extension("to_html", method=to_html)

# Process the text and call the span's to_html method with the tag "strong"
doc = nlp("おはようございます、 これは文章です。")
span = doc[0:3]
print(span._.to_html("strong"))
# most of the code in this file has been written by Christian Overdijk.
import spacy
from spacy.matcher import Matcher

#from import_data_json import import_data_json
from spacy.tokens import Span

def entity_detector(doc):
    entity_list = []
    for d in doc:
        if d.text[:3] == 'LOC':
            entity_list.append(d)
    return entity_list

Span.set_extension("entities", method=entity_detector, force=True)

# Takes data as a pandas DataFrame (1 column)
# Needs a trained nlp model (spaCy)
# word_dist = number of tokens before and after found entities that are included in the result
# max_token_dist = maximum number of tokens between entities in one sentence before descriptions are considered different and split
# Returns: list of results split per input, and then per sentence
def get_location_descriptions_json(data, nlpmodel, word_dist=7, max_token_dist=14):
    results = []

    for article in data:
        article_results = []

        doc = nlpmodel(article)

        for sent in doc.sents:
            sentence_results = []
Example #11
    position -= 1
    while position >= 0:
        start = constituent_data.starts[position]
        end = constituent_data.ends[position]

        if start <= span.start and span.end <= end:
            return doc[start:end]
        if end < span.sent.start:
            break
        position -= 1

    return None


#%%

Span.set_extension('labels', getter=get_labels)
Span.set_extension('parse_string', getter=parse_string)
Span.set_extension('constituents', getter=get_subconstituents)
Span.set_extension('parent', getter=get_parent_span)
Span.set_extension('children', getter=get_child_spans)

Token.set_extension(
    'labels', getter=lambda token: get_labels(token.doc[token.i:token.i + 1]))
Token.set_extension(
    'parse_string',
    getter=lambda token: parse_string(token.doc[token.i:token.i + 1]))
Token.set_extension(
    'parent',
    getter=lambda token: get_parent_span(token.doc[token.i:token.i + 1]))
import spacy
from spacy.tokens import Span

nlp = spacy.load("de_core_news_sm")


def get_wikipedia_url(span):
    # Generate a Wikipedia URL if the span has one of these labels
    if span.label_ in ("PER", "ORG", "LOC"):
        entity_text = span.text.replace(" ", "_")
        return "https://de.wikipedia.org/w/index.php?search=" + entity_text


# Register the Span extension "wikipedia_url" with the getter function get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In seiner mehr als fünfzigjährigen Karriere und von seinen ersten Aufnahmen "
    "bis hin zu seinem letzten Album, gehörte David Bowie zu den Vorreitern der "
    "Gegenwartskultur.")
for ent in doc.ents:
    # Print the entity text and its Wikipedia URL
    print(ent.text, ent._.wikipedia_url)
Example #13
    # if token_span_doc.vocab.has_vector(token_span_doc.text):
    #     return token_span_doc.vocab.get_vector(token_span_doc.text)
    doc = token_span_doc.doc
    use_model_url = doc._.use_model_url
    preprocessor_url = doc._.preprocessor_url
    # if not use_model_url:
    model = UniversalSentenceEncoder.get_model(use_model_url, preprocessor_url)
    vector = model.embed_one(token_span_doc)
    return vector


# install/register the extensions
Doc.set_extension('use_model_url', default=None, force=True)
Doc.set_extension('preprocessor_url', default=None, force=True)
Token.set_extension('universal_sentence_encoding', getter=get_vector, force=True)
Span.set_extension('universal_sentence_encoding', getter=get_vector, force=True)
Doc.set_extension('universal_sentence_encoding', getter=get_vector, force=True)

# the pipeline stage factory
@Language.factory('universal_sentence_encoder', default_config={
    'use_model_url': None,
    'preprocessor_url': None,
    'model_name': None,
    'enable_cache': True,
    'debug': False
})
def use_model_factory(nlp, name, use_model_url, preprocessor_url, model_name, enable_cache, debug):
    preprocessor_url_config = None
    if debug:
        print('use_model_factory:', nlp, 'use_model_url', use_model_url, 'model_name', model_name)
    if use_model_url:
from spacy.tokens import Span


def get_anatomical_location(span):
    for modifier in span._.modifiers:
        if modifier.category == "ANATOMY":
            return modifier.span
    return None


Span.set_extension("anatomical_location", getter=get_anatomical_location)
Example #15
            nat.append(ent._.nationality)
    return nat


def extract_foreign(doc):
    is_foreign = []
    for ent in doc.ents:
        if ent._.travel_status:
            is_foreign.append({
                "place": acronym_to_country(ent.text),
                "is_foreign": not (ent.text in l),
            })
    return is_foreign


Span.set_extension("travel_status", getter=get_travel_status, force=True)
Span.set_extension("nationality", getter=get_nat, force=True)
Token.set_extension("relationship", getter=get_rel, force=True)

app = Flask(__name__)

default_result = {
    "nationality": [],
    "travel": [],
    "relationship": [],
    "place_attributes": [],
}


@functools.lru_cache(30000)
def record_processor(sent):
Example #16
    def __init__(self, nlp, name="my_pipe"):
        self.name = name
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)
#Define the getter function
def get_has_number(doc):
    #Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


#Register the Doc property extension 'has_number' with the getter get_has_number
Doc.set_extension('has_number', getter=get_has_number)

#Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print('has_number:', doc._.has_number)

#Part 2

from spacy.tokens import Span


#Define the method
def to_html(span, tag):
    #Wrap the span text in an HTML tag and return it
    return "<{tag}>{text}</{tag}>".format(tag=tag, text=span.text)


#Register the Span property extension 'to_html' with the method to_html
Span.set_extension('to_html', method=to_html)

#Process the text and call the to_html method on the span with the tag name 'strong'
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html('strong'))
Example #18
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple

import spacy
from spacy.tokens import Doc, Span, Token
from spacy.util import filter_spans
from toolz import curry
from typing_extensions import Literal

from camphr.consts import JUMAN_LINES
from camphr.utils import get_juman_command

from .consts import KNP_USER_KEYS
from .noun_chunker import knp_noun_chunker

LOC2IOB = {"B": "B", "I": "I", "E": "I", "S": "B"}
Span.set_extension(JUMAN_LINES, default=None)
SKIP_TOKENS = {"@"}


TAG = "tag"
BUNSETSU = "bunsetsu"
MORPH = "morph"
L_KNP_OBJ = Literal["tag", "bunsetsu", "morph"]


def _take_juman_lines(n: int, juman_lines: List[str]) -> Tuple[List[str], List[str]]:
    lines = []
    count = 0
    for line in juman_lines:
        lines.append(line)
        head = line.split(" ")[0]
Example #19
    if doc[matches[0][1] + 1].text == "hoogte":
        # print("------------------------------------------------")
        # print(doc[matches[0][1]+1], " : ", doc[matches[0][1]+1].pos_)
        doc[matches[0][1] + 1].pos_ = "ADP"
        # print(doc[matches[0][1]+1], " : ", doc[matches[0][1]+1].pos_)


def delete_match(matcher, doc, id, matches):
    if doc[matches[0][1]].text == "van" or doc[matches[0][1]].text == "met":
        # print("------------------------------------------------")
        # print(doc[matches[0][1]], " : ", doc[matches[0][1]].pos_)
        doc[matches[0][1]].pos_ = "X"
        # print(doc[matches[0][1]], " : ", doc[matches[0][1]].pos_)


Span.set_extension("entities", default=[])


# Takes data as a pandas DataFrame (1 column)
# Needs a trained nlp model (spaCy)
# word_dist = number of tokens before and after found entities that are included in the result
# max_token_dist = maximum number of tokens between entities in one sentence before descriptions are considered different and split
# Returns: list of results split per input, and then per sentence
def get_location_descriptions(data,
                              nlpmodel,
                              word_dist=4,
                              max_token_dist=8,
                              entity_filter=ENTITY_LIST):
    results = []

    for article in data:
        for match_id, start, end in matcher(doc)
    ]
    return doc


def get_pi_url(span):
    """Get a PI URL if the span has the DRUG label"""
    if span.label_ == 'DRUG':
        entity_text = span.text.replace(' ', '_')
        url = "https://www.ebs.tga.gov.au/ebs/picmi/picmirepository.nsf/PICMI?OpenForm&t=pi&q=" + entity_text
        web_list.append(url)
        return url


# Set the Span extension PI_url using the getter get_pi_url
Span.set_extension('PI_url', getter=get_pi_url, force=True)

# Add the component to the pipeline
nlp.add_pipe(drug_component)
print(nlp.pipe_names)

# Make the sentence lowercase, process the text and print the entity text, label and PI_url attributes
doc = nlp(sentence.lower())
print([(ent.text, ent.label_, ent._.PI_url) for ent in doc.ents])
for web in list(set(web_list)):
    # Note that I used 'set' here so that there are no repeats of urls
    # (e.g. when drugs are mentioned more than once in the text) in the 'web_list' list
    try:
        # Get the chrome driver location
        chrome_path = r"C:\Users\Andrew\path_to\chromedriver.exe"
        # add chrome_path to webdriver
Example #21
    def parse_conll_text_as_spacy(
        self,
        text: str,
        ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
        ner_map: Dict[str, str] = None,
    ) -> Doc:
        """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence section must be separated by a new line (\n\n).
        Note that we do our best to retain as much information as possible but that not all CoNLL-U fields are
        supported in spaCy. We add a Token._.conll_misc_field extension to save CoNLL-U MISC field, and a
        Token._.conll_deps_graphs_field extension to save CoNLL-U DEPS field. The metadata (lines starting with #)
        is saved in Span._.conll_metadata of sentence Spans.

        This method has been adapted from the work by spaCy.
        See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

        Multi-word tokens and empty nodes are not supported.

        :param text: CoNLL-U formatted text
        :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
        :param ner_map: Map old NER tag names to new ones, '' maps to O
        :return: a spacy Doc containing all the tokens and sentences from the CoNLL file including
         the custom CoNLL extensions
        """
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)

        docs = []
        for chunk in text.split("\n\n"):
            lines = [
                l for l in chunk.splitlines() if l and not l.startswith("#")
            ]
            words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
            heads, deps, deps_graphs = [], [], []
            for i in range(len(lines)):
                line = lines[i]
                parts = line.split("\t")

                if any(not p for p in parts):
                    raise ValueError(
                        "According to the CoNLL-U Format, fields cannot be empty. See"
                        " https://universaldependencies.org/format.html")

                id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts

                if any(" " in f
                       for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                    raise ValueError(
                        "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                        " spaces. See https://universaldependencies.org/format.html"
                    )

                if "." in id_ or "-" in id_:
                    raise NotImplementedError(
                        "Multi-word tokens and empty nodes are not supported in spacy_conll"
                    )

                words.append(word)

                if "SpaceAfter=No" in misc:
                    spaces.append(False)
                else:
                    spaces.append(True)

                id_ = int(id_) - 1
                lemmas.append(lemma)
                poses.append(pos)
                tags.append(pos if tag == "_" else tag)
                morphs.append(morph if morph != "_" else "")
                heads.append((int(head) - 1) if head not in ("0",
                                                             "_") else id_)
                deps.append("ROOT" if dep == "root" else dep)
                deps_graphs.append(deps_graph)
                miscs.append(misc)

            doc = Doc(
                self.nlp.vocab,
                words=words,
                spaces=spaces,
                tags=tags,
                pos=poses,
                morphs=morphs,
                lemmas=lemmas,
                heads=heads,
                deps=deps,
            )

            # Set custom Token extensions
            for i in range(len(doc)):
                doc[i]._.conll_misc_field = miscs[i]
                doc[i]._.conll_deps_graphs_field = deps_graphs[i]

            ents = get_entities(lines, ner_tag_pattern, ner_map)
            doc.ents = spans_from_biluo_tags(doc, ents)

            # The DEPREL relations ensure that this CoNLL chunk is parsed as one sentence.
            # DEPREL therefore cannot be empty, or each word is considered a separate sentence
            if len(list(doc.sents)) != 1:
                raise ValueError(
                    "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                    " requirements. See https://universaldependencies.org/format.html. Particularly make"
                    " sure that the DEPREL field is filled in.")

            # Save the metadata in a custom sentence Span attribute so that the formatter can use it
            metadata = "\n".join(
                [l for l in chunk.splitlines() if l.startswith("#")])
            # We really only expect one sentence
            for sent in doc.sents:
                sent._.conll_metadata = f"{metadata}\n" if metadata else ""

            docs.append(doc)

        # Add CoNLL custom extensions
        return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
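# --- Added usage sketch (not part of the original excerpt) ---
# Assumption: `parser` is an instance of the enclosing class, whose `self.nlp`
# pipeline already includes the "conll_formatter" component retrieved at the
# end of the method. A minimal two-token CoNLL-U fragment:
conll_str = (
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t0\troot\t_\tSpaceAfter=No\n"
)
doc = parser.parse_conll_text_as_spacy(conll_str)
for token in doc:
    # the MISC field is kept in the custom Token extension registered above
    print(token.text, token.dep_, token._.conll_misc_field)
for sent in doc.sents:
    # the "# ..." metadata lines are kept on the sentence Span
    print(repr(sent._.conll_metadata))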
Example #22
urls= ['http://www.gutenberg.org/cache/epub/29444/pg29444.txt',
       'http://www.gutenberg.org/cache/epub/31428/pg31428.txt',
       'http://www.gutenberg.org/cache/epub/4908/pg4908.txt',
       'http://www.gutenberg.org/cache/epub/33504/pg33504.txt',
       'http://www.gutenberg.org/cache/epub/39017/pg39017.txt',
       'http://www.gutenberg.org/cache/epub/37157/pg37157.txt',
       'http://www.gutenberg.org/cache/epub/15207/pg15207.txt',
       'http://www.gutenberg.org/cache/epub/10773/pg10773.txt',
       'http://www.gutenberg.org/cache/epub/31624/pg31624.txt',
       'http://www.gutenberg.org/cache/epub/5192/pg5192.txt',
       'http://www.gutenberg.org/cache/epub/40030/pg40030.txt',
       'http://www.gutenberg.org/cache/epub/50992/pg50992.txt',
       'http://www.gutenberg.org/cache/epub/50880/pg50880.txt',
       'http://www.gutenberg.org/cache/epub/36525/pg36525.txt',
       'http://www.gutenberg.org/cache/epub/47167/pg47167.txt',
       'http://www.gutenberg.org/cache/epub/47464/pg47464.txt',
       'http://www.gutenberg.org/cache/epub/33397/pg33397.txt',
       'http://www.gutenberg.org/cache/epub/29782/pg29782.txt',
       'http://www.gutenberg.org/cache/epub/32857/pg32857.txt',
       'http://www.gutenberg.org/cache/epub/26262/pg26262.txt']
docs = [requests.get(u).text for u in urls]

lxr = LexRank(docs, stopwords=STOPWORDS['en'])
nlp =  spacy.load('en_core_web_md')


# Despite the name, this scores a span by its average token log-probability (token.prob)
def tfidf(span):
    return sum(tok.prob for tok in span) / len(span)

Span.set_extension("weight", getter=tfidf, force=True)

# Indices of the doc's sentences sorted by descending weight
def ranktfidf(doc):
    return np.argsort(np.array([sent._.weight for sent in doc.sents]))[::-1]

Doc.set_extension("ranktfidf", getter=ranktfidf, force=True)
Example #23
import spacy
import lemminflect
import logging
import typing

from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from lemminflect import getInflection

logging.basicConfig(level=logging.INFO)

# DO NOT SET MANUALLY
MOD_CONSERVATIVE = False

Doc.set_extension("clauses", default=[], force=True)
Span.set_extension("clauses", default=[], force=True)

dictionary = {
    "non_ext_copular":
    """die walk""".split(),
    "ext_copular":
    """act
appear
be
become
come
come out
end up
get
go
grow
        if adu[idx] == 0:
            result.append(s)
    return result


def get_features(span):
    return span.doc._.Features[span._.index]


def get_mc(doc):
    for idx, val in enumerate(list(doc.sents)):
        if doc._.MC_List[idx] == 1:
            return val


Span.set_extension("Label", getter=get_sentence_label)
Span.set_extension("CLPR_Label", getter=get_sentence_label)
Span.set_extension("index", getter=get_index)
Span.set_extension("Feature", getter=get_features)
Span.set_extension("mc", default=0)

Token.set_extension("Label", getter=get_token_label)

Doc.set_extension("ADU_Sents", getter=get_ADU)
Doc.set_extension("Claim_Sents", getter=get_CL)
Doc.set_extension("Premise_Sents", getter=get_PR)
Doc.set_extension("MC_List", default=[])
Doc.set_extension("MajorClaim", getter=get_mc)
Doc.set_extension("sentences", getter=get_sentences)
Doc.set_extension("Labels", default=[0])
Doc.set_extension("CLPR_Labels", default=[0])
Example #25
}
all_pronouns = set()
inv_pronoun_map = collections.defaultdict(list)
for k, pronoun_list in pronoun_lists.items():
  for v in pronoun_list:
    inv_pronoun_map[v].append(k)
    all_pronouns.add(v)

path = MODELS_DIR

USE_NOUN_CHUNKS = False

import neuralcoref
neuralcoref.add_to_pipe(nlp)

Span.set_extension("fused_type", default="")
Span.set_extension("is_pronoun", default=False)

models = []
if CANDIDATE_RECALL:
  models.append((None, None, collections.defaultdict(lambda: (0,0))))
elif os.path.exists(path):
  for model_name in listdir(path):
    sess = tf.Session(graph=tf.Graph())
    tf.saved_model.loader.load(sess, ["serve"], os.path.join(path,model_name))
    #print([n.name for n in sess.graph.as_graph_def().node][:10])
    header_file = os.path.join(path,model_name,"header.txt")
    model_preds = []
    predicate_thresholds = {}
    if os.path.exists(header_file):
      with open(header_file, 'r') as file:
Example #26
    def __init__(self, cfg):
        # TODO fix force=True (isn't supposed to work that way)
        self.__name__ = 'linker'
        Span.set_extension("link", default=None, force=True)
        self.LABEL_URL_MAPPER = cfg['EntityLinker']
        for match_id, start, end in structurematcher(span.as_doc())
    ]


def flagtypematched(span):
    return [
        nlp.vocab.strings[match_id]
        for match_id, start, end in flagtypematcher(span.as_doc())
    ]


def isflagmatched(span):
    return len(isflagmatcher(span.as_doc())) > 0


Span.set_extension("structurematched", getter=structurematched, force=True)
Span.set_extension("flagtypematched", getter=flagtypematched, force=True)
Span.set_extension("isflagmatched", getter=isflagmatched, force=True)

### Specify regular expressions

# Disinformation, fake news, clickbait, unreliable sources
regex_flag = re.compile('|'.join([r'\b' + o
                                  for o in objects_all + attributes]))
fake_regex = re.compile('|'.join(
    [r'\b' + o for o in objects_full + attributes + ['conspir']]))

# Sarcasm and irony
sarcasm_regex = "\\\s\\b|\\/s\\b"
irony_regex = '|'.join([
    "[*'\"“”‘’`´˝˶]" + s
Token.set_extension('is_color', getter=get_is_color)

doc = nlp("The sky is blue")
print(doc[3]._.is_color, '-', doc[3].text)

#Property extensions (2)

from spacy.tokens import Span

#Define getter function
def get_has_color(span):
    colors = ['red', 'yellow', 'blue']
    return any(token.text in colors for token in span)

#Set extension on the Span with getter
Span.set_extension('has_color', getter=get_has_color)

print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

#Method extensions

from spacy.tokens import Doc

#Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

#Set extension on the Doc with method
Doc.set_extension('has_token', method=has_token)
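#Usage sketch (added): call the method extension on the doc defined above
print(doc._.has_token('blue'), '-', doc.text)
print(doc._.has_token('cloud'), '-', doc.text)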
Example #29
import requests
from requests.adapters import HTTPAdapter
import os
from sqlitedict import SqliteDict
import hashlib
from spacy.tokens import Span
import json
import time
import diffbot_nlapi
import logging

from config import MODEL, NUMBER_URI_CANDIDATES, SOFT_COREF_CANDIDATES

# el_candidate has types, uri, score
Span.set_extension("el_candidates", default=[])
Span.set_extension("uri_candidates", default=[])

db = SqliteDict(os.path.join('tmp', 'el.db'), autocommit=True)

configuration = diffbot_nlapi.Configuration()
api_instance = diffbot_nlapi.NaturalLanguageApi(
    diffbot_nlapi.ApiClient(configuration))


def _get_uri_candidates_from_mention_with_score(mention, score):
    return [{
        'types': elc["types"],
        'uri': elc["uri"],
        'score': (2 * score) + elc["score"],
        'coref_score': score,
        'el_score': elc["score"]
Example #30
import warnings
from typing import Any, Callable, Dict, List, Union

from functional import seq
from jsonschema import validate
from spacy.tokens import Doc, Span

from replacy.db import get_match_dict_schema

# set known extensions:
known_string_extensions = ["description", "match_name", "category", "comment"]
known_list_extensions = ["suggestions"]
for ext in known_list_extensions:
    Span.set_extension(ext, default=[], force=True)
for ext in known_string_extensions:
    Span.set_extension(ext, default="", force=True)

expected_properties = (["patterns", "match_hook", "test"] +
                       known_list_extensions + known_string_extensions)


# set custom extensions for any unexpected keys found in the match_dict
def get_novel_prop_defaults(match_dict):
    """
    Also mutates the global Span to add any needed extensions
    """
    novel_properties = (seq(match_dict.values()).flat_map(
        lambda x: x.keys()).distinct().difference(expected_properties))
    novel_prop_defaults: Dict[str, Any] = {}
    for x in match_dict.values():
        for k, v in x.items():
Example #31
with open("exercises/de/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = spacy.blank("de")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", list(nlp.pipe(COUNTRIES)))


@Language.component("countries_component")
def countries_component_function(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="LOC") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe("countries_component")
print(nlp.pipe_names)

# Getter function that looks up the span text in the dictionary of capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension "capital" with the getter function get_capital
Span.set_extension("capital", getter=get_capital)

# Process the text and print the text, label and capital attribute of each entity
doc = nlp("Tschechien könnte der Slowakei dabei helfen, ihren Luftraum zu schützen")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
Example #32
    def __init__(self, model, vocab, sentence_length, get_features):
        self._model = model
        self._vocab = vocab
        self._get_features = get_features
        self.sentence_length = sentence_length
        Span.set_extension(KerasPipe.extension_name, default=0.0)