def __init__(self):
    import pywikibot

    self.spacy = SpacyInstance()
    self.pywikibot = pywikibot
    self.cache = dict()
    self.site = pywikibot.Site('en', 'wikipedia')  # The site we want to run our bot on
def __init__(self, params_dict, embeddings):
    """
    Args:
        params_dict: Dictionary containing the following keys-
            'max_question': max length of all questions in the dataset
            'max_para': max length of all paragraphs in the dataset
            'hidden_size': number of hidden units in the network
            'batch_size': batch size defined by user
        embeddings: Glove pretrained embedding matrix
    """
    # Assign variables
    self.max_question = params_dict['max_question']
    self.max_para = params_dict['max_para']
    self.hidden_size = params_dict['hidden_size']
    self.batch_size = params_dict['batch_size']
    self.embeddings = embeddings
    self.inference_only = params_dict['inference_only']
    self.G_i = None
    self.attn = None
    self.stacked_lists_forward = None
    self.stacked_lists_reverse = None
    self.logits_withsf = None

    # init tokenizer
    self.tokenizer = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    # Create placeholders
    # Question ids
    self.question_ids = tf.placeholder(tf.int32, shape=[None, self.max_question],
                                       name="question_ids")
    # Paragraph ids
    self.para_ids = tf.placeholder(tf.int32, shape=[None, self.max_para], name="para_ids")
    # Length of question
    self.question_length = tf.placeholder(tf.int32, shape=[None], name="question_len")
    # Length of paragraph
    self.para_length = tf.placeholder(tf.int32, shape=[None], name="para_len")
    # Mask for paragraph
    self.para_mask = tf.placeholder(tf.float32, shape=[None, self.max_para], name="para_mask")
    # Mask for question
    self.ques_mask = tf.placeholder(tf.float32, shape=[None, self.max_question],
                                    name="ques_mask")
    # Answer spans
    if self.inference_only is False:
        self.labels = tf.placeholder(tf.int32, shape=[None, 2], name="labels")
    # Dropout value
    self.dropout = tf.placeholder(tf.float32, shape=[], name="dropout")
    self.global_step = tf.Variable(0, name='global')

    # Get variables
    self.create_variables()
    # Define model
    self.create_model()
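For orientation, a hedged construction sketch follows; the class name and the embedding variable are placeholders, since the snippet above shows only the constructor.

# Hypothetical usage sketch (not from the source): `ReadingComprehensionModel` stands in
# for whichever class defines the __init__ above; `glove_embeddings` is assumed to be a
# pre-loaded GloVe matrix of shape [vocab_size, embedding_dim].
params_dict = {
    "max_question": 30,       # longest question in the dataset (tokens)
    "max_para": 300,          # longest paragraph in the dataset (tokens)
    "hidden_size": 150,
    "batch_size": 64,
    "inference_only": False,  # required by the code above although absent from the docstring
}
# model = ReadingComprehensionModel(params_dict, glove_embeddings)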
def __init__(self, prompt=True):
    self.model = None
    self.model_info = None
    self.word_vocab = None
    self.y_vocab = None
    self.char_vocab = None
    self._download_pretrained_model(prompt)
    self.nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
def __init__(self, prompt=False):
    self.model = None
    self.model_info = None
    self.word_vocab = None
    self.y_vocab = None
    self.char_vocab = None
    self._download_pretrained_model(prompt)
    self.nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])
def process_inference_input(input_file):
    with io.open(input_file) as fp:
        texts = [l.strip() for l in fp.readlines()]
    tokenizer = SpacyInstance(disable=["tagger", "parser", "ner"])
    examples = []
    for i, t in enumerate(texts):
        examples.append(TokenClsInputExample(str(i), t, tokenizer.tokenize(t)))
    return examples
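A brief usage sketch, under the assumption that the input is a plain-text file with one raw sentence per line; the file name is a placeholder.

# Hedged usage sketch: "sentences.txt" is a placeholder path; each line becomes one
# TokenClsInputExample carrying a running id, the raw text, and its spaCy tokens.
examples = process_inference_input("sentences.txt")
print(len(examples))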
def __init__(self, prompt=True):
    self.model = None
    self.model_type = None
    self.word_vocab = None
    self.tags_vocab = None
    self.char_vocab = None
    self.intent_vocab = None
    self._download_pretrained_model(prompt)
    self.nlp = SpacyInstance(
        disable=["tagger", "ner", "parser", "vectors", "textcat"])
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner", "parser"]).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm", disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
def load_parser(chunker):
    # load spacy parser
    logger.info('loading spacy. chunker=%s', chunker)
    if 'nlp_arch' in chunker:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner', 'parser']).parser
        parser.add_pipe(parser.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect'
                ' word chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        parser = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    return parser
def __init__(self):
    try:
        import pywikibot
    except (AttributeError, ImportError):
        logger.error(
            "pywikibot is not installed, please install nlp_architect with [all] package. "
            + "for example: pip install nlp_architect[all]")
        sys.exit()
    self.spacy = SpacyInstance()
    self.pywikibot = pywikibot
    self.cache = dict()
    self.site = pywikibot.Site("en", "wikipedia")  # The site we want to run our bot on
def _parse_json(self, data):
    tok = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    sentences = []
    for s in data:
        tokens = []
        tags = []
        for t in s:
            new_tokens = tok.tokenize(t['text'].strip())
            tokens += new_tokens
            ent = t.get('entity', None)
            if ent is not None:
                tags += self._create_tags(ent, len(new_tokens))
            else:
                tags += ['O'] * len(new_tokens)
        sentences.append((tokens, tags))
    return sentences
def _parse_json(self, data):
    tok = SpacyInstance(
        disable=["tagger", "ner", "parser", "vectors", "textcat"])
    sentences = []
    for s in data:
        tokens = []
        tags = []
        for t in s:
            new_tokens = tok.tokenize(t["text"].strip())
            tokens += new_tokens
            ent = t.get("entity", None)
            if ent is not None:
                tags += self._create_tags(ent, len(new_tokens))
            else:
                tags += ["O"] * len(new_tokens)
        sentences.append((tokens, tags))
    return sentences
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
def __init__(self, parser=None):
    if parser is None:
        self.nlp = SpacyInstance(
            disable=['ner', 'parser', 'vectors', 'textcat']).parser
    else:
        self.nlp = parser
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
    _path_to_model = path.join(chunker_local_path, chunker_model_file)
    if not path.exists(chunker_local_path):
        makedirs(chunker_local_path)
    if not path.exists(_path_to_model):
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
    _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
    if not path.exists(_path_to_params):
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
    self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
def __init__(
    self,
    aspect_lex: Union[str, PathLike],
    opinion_lex: Union[str, PathLike, dict],
    parse: bool = True,
    parser="spacy",
    spacy_model="en_core_web_sm",
):
    """Inits SentimentInference with given aspect and opinion lexicons."""
    INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
    self.opinion_lex = (
        opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
    )
    self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
    self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
    self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
    self.parser_name = parser
    if parse:
        if parser == "bist":
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

            self.parser = SpacyBISTParser(spacy_model=spacy_model)
        elif parser == "spacy":
            from nlp_architect.utils.text import SpacyInstance

            disable = [
                "merge_noun_chunks",
                "ner",
                "entity_linker",
                "textcat",
                "entity_ruler",
                "sentencizer",
                "merge_entities",
            ]
            self.parser = SpacyInstance(
                model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
            )
    else:
        self.parser = None
def __init__(self, verbose=False, spacy_model="en", bist_model=None):
    validate(
        (verbose, bool), (spacy_model, str, 0, 1000), (bist_model, (type(None), str), 0, 1000)
    )
    if not bist_model:
        print("Using pre-trained BIST model.")
        _download_pretrained_model()
        bist_model = SpacyBISTParser._pretrained

    self.verbose = verbose
    self.bist_parser = BISTModel()
    self.bist_parser.load(bist_model if bist_model else SpacyBISTParser._pretrained)
    self.spacy_parser = SpacyInstance(spacy_model, disable=["ner", "vectors", "textcat"]).parser
class StringUtils:
    spacy_no_parser = SpacyInstance(disable=['parser'])
    spacy_parser = SpacyInstance()
    stop_words = None
    pronouns = None
    preposition = None

    def __init__(self):
        pass

    @staticmethod
    def is_stop(token: str) -> bool:
        if not StringUtils.stop_words:
            StringUtils.stop_words = load_json_file(STOP_WORDS_FILE)
            StringUtils.stop_words.extend(DISAMBIGUATION_CATEGORY)
        if token not in StringUtils.stop_words:
            return False
        return True

    @staticmethod
    def normalize_str(in_str: str) -> str:
        str_clean = re.sub('[' + string.punctuation + string.whitespace + ']', ' ',
                           in_str).strip().lower()
        if isinstance(str_clean, str):
            str_clean = str(str_clean)
        doc = StringUtils.spacy_no_parser.parser(str_clean)
        ret_clean = []
        for token in doc:
            lemma = token.lemma_.strip()
            if not StringUtils.is_pronoun(lemma) and not StringUtils.is_stop(lemma):
                ret_clean.append(token.lemma_)
        return ' '.join(ret_clean)

    @staticmethod
    def is_pronoun(in_str: str) -> bool:
        if not StringUtils.pronouns:
            StringUtils.pronouns = load_json_file(PRONOUN_FILE)
        tokens = in_str.split()
        if len(tokens) == 1:
            if tokens[0] in StringUtils.pronouns:
                return True
        return False

    @staticmethod
    def is_preposition(in_str: str) -> bool:
        if not StringUtils.preposition:
            StringUtils.preposition = load_json_file(PREPOSITION_FILE)
        tokens = in_str.split()
        if len(tokens) == 1:
            if tokens[0] in StringUtils.preposition:
                return True
        return False

    @staticmethod
    def normalize_string_list(str_list: str) -> List[str]:
        ret_list = []
        for _str in str_list:
            normalize_str = StringUtils.normalize_str(_str)
            if normalize_str != '':
                ret_list.append(normalize_str)
        return ret_list

    @staticmethod
    def find_head_lemma_pos_ner(x: str):
        '''
        :param x: mention
        :return: the head word and the head word lemma of the mention
        '''
        head = None
        lemma = None
        pos = None
        ner = None
        doc = StringUtils.spacy_parser.parser(x)
        for tok in doc:
            if tok.head == tok:
                head = tok.text
                lemma = tok.lemma_
                pos = tok.pos_
        for ent in doc.ents:
            if ent.root.text == head:
                ner = ent.label_
        return head, lemma, pos, ner
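An illustrative sketch of how these helpers might be called; the example strings are arbitrary, and the exact lemmas and labels depend on the loaded spaCy model.

# Illustrative only: normalize_str lowercases, lemmatizes and drops stop words and
# pronouns; find_head_lemma_pos_ner returns the dependency head of a mention together
# with its lemma, POS tag and entity label (if spaCy assigns one).
print(StringUtils.normalize_str("The United States of America"))
head, lemma, pos, ner = StringUtils.find_head_lemma_pos_ner("Barack Obama")
print(head, lemma, pos, ner)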
def initiate_parser():
    return SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat']).parser
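Since SpacyInstance(...).parser exposes the underlying spaCy pipeline, a call like the following sketch (sample text is arbitrary) yields a tokenized Doc.

# Hedged usage sketch: the returned object is the wrapped spaCy Language pipeline,
# so calling it on raw text produces a Doc whose tokens can be read directly.
parser = initiate_parser()
doc = parser("NLP Architect wraps spaCy behind SpacyInstance.")
print([token.text for token in doc])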
class NerApi(AbstractApi): """ NER model API """ model_dir = path.join(LIBRARY_STORAGE_PATH, 'ner-pretrained') pretrained_model = path.join(model_dir, 'model.h5') pretrained_model_info = path.join(model_dir, 'model_info.dat') def __init__(self, prompt=True): self.model = None self.model_info = None self.word_vocab = None self.y_vocab = None self.char_vocab = None self._download_pretrained_model(prompt) self.nlp = SpacyInstance( disable=['tagger', 'ner', 'parser', 'vectors', 'textcat']) @staticmethod def _prompt(): response = input( '\nTo download \'{}\', please enter YES: '.format('ner')) res = response.lower().strip() if res == "yes" or (len(res) == 1 and res == 'y'): print('Downloading {}...'.format('ner')) responded_yes = True else: print('Download declined. Response received {} != YES|Y. '.format( res)) responded_yes = False return responded_yes def _download_pretrained_model(self, prompt=True): """Downloads the pre-trained BIST model if non-existent.""" model_exists = path.isfile(self.pretrained_model) model_info_exists = path.isfile(self.pretrained_model_info) if not model_exists or not model_info_exists: print( 'The pre-trained models to be downloaded for the NER dataset ' 'are licensed under Apache 2.0. By downloading, you accept the terms ' 'and conditions provided by the license') makedirs(self.model_dir, exist_ok=True) if prompt is True: agreed = NerApi._prompt() if agreed is False: sys.exit(0) download_unlicensed_file( 'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/', 'model.h5', self.pretrained_model) download_unlicensed_file( 'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/', 'model_info.dat', self.pretrained_model_info) print('Done.') def load_model(self): self.model = NERCRF() self.model.load(self.pretrained_model) with open(self.pretrained_model_info, 'rb') as fp: model_info = pickle.load(fp) self.word_vocab = model_info['word_vocab'] self.y_vocab = {v: k for k, v in model_info['y_vocab'].items()} self.char_vocab = model_info['char_vocab'] @staticmethod def pretty_print(text, tags): spans = [] for s, e, tag in bio_to_spans(text, tags): spans.append({'start': s, 'end': e, 'type': tag}) ents = dict((obj['type'].lower(), obj) for obj in spans).keys() ret = { 'doc_text': ' '.join(text), 'annotation_set': list(ents), 'spans': spans, 'title': 'None' } print({"doc": ret, 'type': 'high_level'}) return {"doc": ret, 'type': 'high_level'} def process_text(self, text): input_text = ' '.join(text.strip().split()) return self.nlp.tokenize(input_text) def vectorize(self, doc, vocab, char_vocab): words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc]) \ .reshape(1, -1) sentence_chars = [] for w in doc: word_chars = [] for c in w: if c in char_vocab: _cid = char_vocab[c] else: _cid = 1 word_chars.append(_cid) sentence_chars.append(word_chars) sentence_chars = np.expand_dims(pad_sentences(sentence_chars, self.model.word_length), axis=0) return words, sentence_chars def inference(self, doc): text_arr = self.process_text(doc) doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab) seq_len = np.array([len(text_arr)]).reshape(-1, 1) inputs = list(doc_vec) if self.model.crf_mode == 'pad': inputs = list(doc_vec) + [seq_len] doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten() tags = [self.y_vocab.get(n, None) for n in doc_ner] return self.pretty_print(text_arr, tags)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import pickle
import sys
from os import makedirs, path

import numpy as np

from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.ner_crf import NERCRF
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance

nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])


class NerApi(AbstractApi):
    """
    NER model API
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
# limitations under the License.
# ******************************************************************************
from __future__ import division, print_function, unicode_literals, absolute_import

import argparse
import pickle

import numpy as np

from nlp_architect.models.ner_crf import NERCRF
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import validate_existing_filepath
from nlp_architect.utils.text import SpacyInstance

nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])


def read_input_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path", type=validate_existing_filepath, required=True, help="Path of model weights"
    )
    parser.add_argument(
        "--model_info_path",
        type=validate_existing_filepath,
        required=True,
        help="Path of model topology",
    )
    input_args = parser.parse_args()
    return input_args
class SentimentInference:
    """Main class for sentiment inference execution.

    Attributes:
        opinion_lex: Opinion lexicon as outputted by TrainSentiment module.
        aspect_lex: Aspect lexicon as outputted by TrainSentiment module.
        intensifier_lex (dict): Pre-defined intensifier lexicon.
        negation_lex (dict): Pre-defined negation lexicon.
    """

    def __init__(
        self,
        aspect_lex: Union[str, PathLike],
        opinion_lex: Union[str, PathLike, dict],
        parse: bool = True,
        parser="spacy",
        spacy_model="en_core_web_sm",
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (
            opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
        )
        self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
        self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
        self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
        self.parser_name = parser
        if parse:
            if parser == "bist":
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                self.parser = SpacyBISTParser(spacy_model=spacy_model)
            elif parser == "spacy":
                from nlp_architect.utils.text import SpacyInstance

                disable = [
                    "merge_noun_chunks",
                    "ner",
                    "entity_linker",
                    "textcat",
                    "entity_ruler",
                    "sentencizer",
                    "merge_entities",
                ]
                self.parser = SpacyInstance(
                    model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
                )
        else:
            self.parser = None

    def parse_data(self, data: Union[PathLike, PosixPath], out_dir: Union[str, PathLike]):
        if out_dir:
            Path(out_dir).mkdir(parents=True, exist_ok=True)
        parse_func = parse_docs_bist if self.parser_name == "bist" else parse_docs
        parse_func(self.parser, data, out_dir=out_dir)
        return out_dir

    def run(self, doc: str = None, parsed_doc: CoreNLPDoc = None) -> SentimentDoc:
        """Run SentimentInference on a single document.

        Returns:
            The sentiment annotated document, which contains the detected events per sentence.
        """
        if not parsed_doc:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            parsed_doc = self.parser.parse([doc])[0]

        sentiment_doc = None
        for sentence in parsed_doc.sentences:
            events = []
            scores = []
            for aspect_row in self.aspect_lex:
                _, asp_events = self._extract_event(aspect_row, sentence)
                for asp_event in asp_events:
                    events.append(asp_event)
                    scores += [term.score for term in asp_event if term.type == TermType.ASPECT]

            if events:
                if not sentiment_doc:
                    sentiment_doc = SentimentDoc(parsed_doc.doc_text)
                sentiment_doc.sentences.append(
                    SentimentSentence(
                        sentence[0]["start"],
                        sentence[-1]["start"] + sentence[-1]["len"] - 1,
                        events,
                    )
                )
        return sentiment_doc

    def run_multiple(
        self,
        data: Union[str, PathLike] = None,
        parsed_data: Union[str, PathLike] = None,
        out_dir: Union[str, PathLike] = INFERENCE_OUT,
    ):
        if not parsed_data:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            parsed_dir = Path(out_dir) / "parsed" / Path(data).stem
            parsed_data = self.parse_data(data, out_dir=parsed_dir)
        sentiment_docs = {}

        for f, parsed_doc in tqdm(_load_parsed_docs_from_dir(out_dir)):
            sentiment_doc = self.run(parsed_doc=parsed_doc)
            sentiment_docs[f] = sentiment_doc
        return sentiment_docs

    def _extract_intensifier_terms(self, toks, sentiment_index, polarity, sentence):
        """Extract intensifier events from sentence."""
        count = 0
        terms = []
        for intens_i, intens in [(i, x) for i, x in enumerate(toks) if x in self.intensifier_lex]:
            if math.fabs(sentiment_index - intens_i) == 1:
                score = self.intensifier_lex[intens].score
                terms.append(
                    Term(
                        intens,
                        TermType.INTENSIFIER,
                        polarity,
                        score,
                        sentence[intens_i]["start"],
                        sentence[intens_i]["len"],
                    )
                )
                count += abs(score + float(INTENSIFIER_FACTOR))
        return count if count != 0 else 1, terms

    def _extract_neg_terms(self, toks: list, op_i: int, sentence: list) -> tuple:
        """Extract negation terms from sentence.

        Args:
            toks: Sentence text broken down to tokens (words).
            op_i: Index of opinion term in sentence.
            sentence: parsed sentence

        Returns:
            List of negation terms and its aggregated sign (positive or negative).
        """
        sign = 1
        terms = []
        gov_op_i = sentence[op_i]["gov"]
        dep_op_indices = [sentence.index(x) for x in sentence if x["gov"] == op_i]
        for neg_i, negation in [(i, x) for i, x in enumerate(toks) if x in self.negation_lex]:
            position = self.negation_lex[negation].position
            dist = op_i - neg_i
            before = position == "before" and (dist == 1 or neg_i in dep_op_indices)
            after = position == "after" and (dist == -1 or neg_i == gov_op_i)
            both = position == "both" and dist in (1, -1)
            if before or after or both:
                terms.append(
                    Term(
                        negation,
                        TermType.NEGATION,
                        Polarity.NEG,
                        self.negation_lex[negation].score,
                        sentence[toks.index(negation)]["start"],
                        sentence[toks.index(negation)]["len"],
                    )
                )
                sign *= self.negation_lex[negation].score
        return terms, sign

    def _extract_event(self, aspect_row: LexiconElement, parsed_sentence: list) -> tuple:
        """Extract opinion and aspect terms from sentence."""
        event = []
        sent_aspect_pair = None
        real_aspect_indices = _consolidate_aspects(aspect_row.term, parsed_sentence)
        aspect_key = aspect_row.term[0]
        for aspect_index_range in real_aspect_indices:
            for word_index in aspect_index_range:
                sent_aspect_pair, event = self._detect_opinion_aspect_events(
                    word_index, parsed_sentence, aspect_key, aspect_index_range
                )
                if sent_aspect_pair:
                    break
        return sent_aspect_pair, event

    @staticmethod
    def _modify_for_multiple_word(cur_tkn, parsed_sentence, index_range):
        """Modify multiple-word aspect tkn length and start index.

        Args:
            index_range: The index range of the multi-word aspect.

        Returns:
            The modified aspect token.
        """
        if len(index_range) >= 2:
            cur_tkn["start"] = parsed_sentence[index_range[0]]["start"]
            cur_tkn["len"] = len(parsed_sentence[index_range[0]]["text"])
            for i in index_range[1:]:
                cur_tkn["len"] = int(cur_tkn["len"]) + len(parsed_sentence[i]["text"]) + 1
        return cur_tkn

    def _detect_opinion_aspect_events(self, aspect_index, parsed_sent, aspect_key, index_range):
        """Extract opinion-aspect events from sentence.

        Args:
            aspect_index: index of aspect in sentence.
            parsed_sent: current sentence parse tree.
            aspect_key: main aspect term serves as key in aspect dict.
            index_range: The index range of the multi word aspect.

        Returns:
            List of aspect sentiment pair, and list of events extracted.
        """
        all_pairs, events = [], []
        sentence_text_list = [x["text"] for x in parsed_sent]
        sentence_text = " ".join(sentence_text_list)
        for tok_i, tok in enumerate(parsed_sent):
            aspect_op_pair = []
            terms = []
            gov_i = tok["gov"]
            gov = parsed_sent[gov_i]
            gov_text = gov["text"]
            tok_text = tok["text"]

            # 1st order rules
            # Is cur_tkn an aspect and gov an opinion?
            if tok_i == aspect_index:
                if gov_text.lower() in self.opinion_lex:
                    aspect_op_pair.append(
                        (self._modify_for_multiple_word(tok, parsed_sent, index_range), gov)
                    )
            # Is gov an aspect and cur_tkn an opinion?
            if gov_i == aspect_index and tok_text.lower() in self.opinion_lex:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(gov, parsed_sent, index_range), tok)
                )
            # If not found, try 2nd order rules
            if not aspect_op_pair and tok_i == aspect_index:
                # 2nd order rule #1
                for op_t in parsed_sent:
                    if op_t["gov"] == gov_i and op_t["text"].lower() in self.opinion_lex:
                        aspect_op_pair.append(
                            (self._modify_for_multiple_word(tok, parsed_sent, index_range), op_t)
                        )
                # 2nd order rule #2
                gov_gov = parsed_sent[parsed_sent[gov_i]["gov"]]
                if gov_gov["text"].lower() in self.opinion_lex:
                    aspect_op_pair.append(
                        (self._modify_for_multiple_word(tok, parsed_sent, index_range), gov_gov)
                    )

            # if aspect_tok found
            for aspect, opinion in aspect_op_pair:
                op_tok_i = parsed_sent.index(opinion)
                score = self.opinion_lex[opinion["text"].lower()].score
                neg_terms, sign = self._extract_neg_terms(sentence_text_list, op_tok_i, parsed_sent)
                polarity = Polarity.POS if score * sign > 0 else Polarity.NEG
                intensifier_score, intensifier_terms = self._extract_intensifier_terms(
                    sentence_text_list, op_tok_i, polarity, parsed_sent
                )
                over_all_score = score * sign * intensifier_score
                terms.append(
                    Term(
                        aspect_key,
                        TermType.ASPECT,
                        polarity,
                        over_all_score,
                        aspect["start"],
                        aspect["len"],
                    )
                )
                terms.append(
                    Term(
                        opinion["text"],
                        TermType.OPINION,
                        polarity,
                        over_all_score,
                        opinion["start"],
                        opinion["len"],
                    )
                )
                if len(neg_terms) > 0:
                    terms = terms + neg_terms
                if len(intensifier_terms) > 0:
                    terms = terms + intensifier_terms
                all_pairs.append(
                    [aspect_key, opinion["text"], over_all_score, polarity, sentence_text]
                )
                events.append(terms)
        return all_pairs, events
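A possible end-to-end call, sketched with placeholder lexicon paths (real files would come from the TrainSentiment flow).

# Hedged usage sketch: "aspects.csv" and "opinions.csv" are placeholders for the
# lexicons produced by TrainSentiment; run() returns a SentimentDoc, or None when
# no aspect/opinion event is detected in the document.
inference = SentimentInference("aspects.csv", "opinions.csv", parse=True, parser="spacy")
sentiment_doc = inference.run(doc="The food was delicious but the service was painfully slow.")
if sentiment_doc is not None:
    for sentence in sentiment_doc.sentences:
        print(sentence)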
class IntentExtractionApi(AbstractApi):
    model_dir = str(LIBRARY_OUT / "intent-pretrained")
    pretrained_model_info = path.join(model_dir, "model_info.dat")
    pretrained_model = path.join(model_dir, "model.h5")

    def __init__(self, prompt=False):
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def process_text(self, text):
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        response = input("\nTo download '{}', please enter YES: ".format(
            "intent_extraction"))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == "y"):
            print("Downloading {}...".format("ner"))
            responded_yes = True
        else:
            print("Download declined. Response received {} != YES|Y. ".format(
                res))
            responded_yes = False
        return responded_yes

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_info_exists = path.isfile(
            IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the intent extraction dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(IntentExtractionApi.model_dir, exist_ok=True)
            if prompt is True:
                agreed = IntentExtractionApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model_info.dat",
                IntentExtractionApi.pretrained_model_info,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model.h5",
                IntentExtractionApi.pretrained_model,
            )
            print("Done.")

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        ret = {
            "annotation_set": [],
            "doc_text": " ".join([t for t in text_str])
        }
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({"start": s, "end": e, "type": tag})
            available_tags.add(tag)
        ret["annotation_set"] = list(available_tags)
        ret["spans"] = spans
        ret["title"] = intent_type
        return {"doc": ret, "type": "high_level"}

    def vectorize(self, doc, vocab, char_vocab=None):
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        if char_vocab is not None:
            sentence_chars = []
            for w in doc:
                word_chars = []
                for c in w:
                    if c in char_vocab:
                        _cid = char_vocab[c]
                    else:
                        _cid = 1
                    word_chars.append(_cid)
                sentence_chars.append(word_chars)
            sentence_chars = np.expand_dims(pad_sentences(
                sentence_chars, self.model.word_length), axis=0)
            return [words, sentence_chars]
        return words

    def inference(self, doc):
        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == "mtl":
            doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            intent = int(intent.argmax(1).flatten())
            intent_type = self.intent_vocab.get(intent, None)
            print("Detected intent type: {}".format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print("{}\t{}\t".format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        with open(IntentExtractionApi.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info["type"]
        self.word_vocab = model_info["word_vocab"]
        self.tags_vocab = {v: k for k, v in model_info["tags_vocab"].items()}
        if self.model_type == "mtl":
            self.char_vocab = model_info["char_vocab"]
            self.intent_vocab = {
                v: k for k, v in model_info["intent_vocab"].items()
            }
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model
class IntentExtractionApi(AbstractApi): model_dir = path.join(LIBRARY_STORAGE_PATH, 'intent-pretrained') pretrained_model_info = path.join(model_dir, 'model_info.dat') pretrained_model = path.join(model_dir, 'model.h5') def __init__(self, prompt=True): self.model = None self.model_type = None self.word_vocab = None self.tags_vocab = None self.char_vocab = None self.intent_vocab = None self._download_pretrained_model(prompt) self.nlp = SpacyInstance( disable=['tagger', 'ner', 'parser', 'vectors', 'textcat']) def process_text(self, text): input_text = ' '.join(text.strip().split()) return self.nlp.tokenize(input_text) @staticmethod def _prompt(): response = input('\nTo download \'{}\', please enter YES: '.format( 'intent_extraction')) res = response.lower().strip() if res == "yes" or (len(res) == 1 and res == 'y'): print('Downloading {}...'.format('ner')) responded_yes = True else: print('Download declined. Response received {} != YES|Y. '.format( res)) responded_yes = False return responded_yes @staticmethod def _download_pretrained_model(prompt=True): """Downloads the pre-trained BIST model if non-existent.""" model_info_exists = path.isfile( IntentExtractionApi.pretrained_model_info) model_exists = path.isfile(IntentExtractionApi.pretrained_model) if not model_exists or not model_info_exists: print( 'The pre-trained models to be downloaded for the intent extraction dataset ' 'are licensed under Apache 2.0. By downloading, you accept the terms ' 'and conditions provided by the license') makedirs(IntentExtractionApi.model_dir, exist_ok=True) if prompt is True: agreed = IntentExtractionApi._prompt() if agreed is False: sys.exit(0) download_unlicensed_file( 'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/', 'model_info.dat', IntentExtractionApi.pretrained_model_info) download_unlicensed_file( 'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/', 'model.h5', IntentExtractionApi.pretrained_model) print('Done.') @staticmethod def display_results(text_str, predictions, intent_type): ret = { 'annotation_set': [], 'doc_text': ' '.join([t for t in text_str]) } spans = [] available_tags = set() for s, e, tag in bio_to_spans(text_str, predictions): spans.append({'start': s, 'end': e, 'type': tag}) available_tags.add(tag) ret['annotation_set'] = list(available_tags) ret['spans'] = spans ret['title'] = intent_type return {'doc': ret, 'type': 'high_level'} def vectorize(self, doc, vocab, char_vocab=None): words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc])\ .reshape(1, -1) if char_vocab is not None: sentence_chars = [] for w in doc: word_chars = [] for c in w: if c in char_vocab: _cid = char_vocab[c] else: _cid = 1 word_chars.append(_cid) sentence_chars.append(word_chars) sentence_chars = np.expand_dims(pad_sentences( sentence_chars, self.model.word_length), axis=0) return [words, sentence_chars] return words def inference(self, doc): text_arr = self.process_text(doc) intent_type = None if self.model_type == 'mtl': doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab) intent, tags = self.model.predict(doc_vec, batch_size=1) intent = int(intent.argmax(1).flatten()) intent_type = self.intent_vocab.get(intent, None) print('Detected intent type: {}'.format(intent_type)) else: doc_vec = self.vectorize(text_arr, self.word_vocab, None) tags = self.model.predict(doc_vec, batch_size=1) tags = tags.argmax(2).flatten() tag_str = [self.tags_vocab.get(n, None) for n in tags] for t, n in zip(text_arr, tag_str): print('{}\t{}\t'.format(t, n)) return 
self.display_results(text_arr, tag_str, intent_type) def load_model(self): with open(IntentExtractionApi.pretrained_model_info, 'rb') as fp: model_info = pickle.load(fp) self.model_type = model_info['type'] self.word_vocab = model_info['word_vocab'] self.tags_vocab = {v: k for k, v in model_info['tags_vocab'].items()} if self.model_type == 'mtl': self.char_vocab = model_info['char_vocab'] self.intent_vocab = { v: k for k, v in model_info['intent_vocab'].items() } model = MultiTaskIntentModel() else: model = Seq2SeqIntentModel() model.load(self.pretrained_model) self.model = model
                            'chunker or \'nlp_arch\' for NLP Architect NP Extractor')
    args = arg_parser.parse_args()

    if args.corpus.endswith('gz'):
        corpus_file = gzip.open(args.corpus, 'rt', encoding='utf8', errors='ignore')
    else:
        corpus_file = open(args.corpus, 'r', encoding='utf8', errors='ignore')
    with open(args.marked_corpus, 'w', encoding='utf8') as marked_corpus_file:
        # load spacy parser
        logger.info('loading spacy')
        if 'nlp_arch' in args.chunker:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner', 'parser']).parser
            nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            _path_to_model = path.join(cur_dir, chunker_model_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
            _path_to_params = path.join(cur_dir, chunker_model_dat_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
            logger.info('Done.')
            nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
        else:
            nlp = SpacyInstance(model='en_core_web_sm',
class NerApi(AbstractApi):
    """
    NER model API
    """
    model_dir = str(LIBRARY_OUT / "ner-pretrained")
    pretrained_model = path.join(model_dir, "model_v4.h5")
    pretrained_model_info = path.join(model_dir, "model_info_v4.dat")

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    @staticmethod
    def _prompt():
        response = input(
            "\nTo download '{}', please enter YES: ".format("ner"))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == "y"):
            print("Downloading {}...".format("ner"))
            responded_yes = True
        else:
            print("Download declined. Response received {} != YES|Y. ".format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the NER dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_v4.h5",
                self.pretrained_model,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_info_v4.dat",
                self.pretrained_model_info,
            )
            print("Done.")

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        with open(self.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info["word_vocab"]
        self.y_vocab = {v: k for k, v in model_info["y_vocab"].items()}
        self.char_vocab = model_info["char_vocab"]

    @staticmethod
    def pretty_print(text, tags):
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({"start": s, "end": e, "type": tag})
        ents = dict((obj["type"].lower(), obj) for obj in spans).keys()
        ret = {
            "doc_text": " ".join(text),
            "annotation_set": list(ents),
            "spans": spans,
            "title": "None",
        }
        print({"doc": ret, "type": "high_level"})
        return {"doc": ret, "type": "high_level"}

    def process_text(self, text):
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars,
                                                      self.model.word_length), axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        inputs = list(doc_vec)
        # pylint: disable=no-member
        inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
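For context, a hedged inference sketch using only the methods defined above; the input sentence is arbitrary.

# Hedged usage sketch: NerApi downloads the pre-trained model on first use
# (prompt=False skips the license prompt), loads it, and inference() returns the
# annotation dictionary built by pretty_print().
api = NerApi(prompt=False)
api.load_model()
result = api.inference("John Smith works at Intel in Santa Clara")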
                             '{}.params'.format(str(args.model_name)))
    validate_existing_filepath(model_path)
    validate_existing_filepath(settings_path)

    # load model and parameters
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len
    with open(settings_path, 'rb') as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params['word_vocab']
        chunk_vocab = model_params['chunk_vocab']
        char_vocab = model_params.get('char_vocab', None)

    # parse documents and get tokens
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    with open(args.input_file) as fp:
        document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()]

    # vectorize input tokens and run inference
    doc_vecs = vectorize(document_texts, word_vocab, char_vocab)
    document_annotations = []
    for vec in doc_vecs:
        doc_chunks = model.predict(vec, batch_size=args.b)
        chunk_a = [
            chunk_vocab.id_to_word(l) for l in doc_chunks.argmax(2).flatten()
        ]
        document_annotations.append(chunk_a)

    # print document text and annotations
    build_annotation(document_texts, document_annotations)
class WikiOnline(object): def __init__(self): import pywikibot self.spacy = SpacyInstance() self.pywikibot = pywikibot self.cache = dict() self.site = pywikibot.Site( 'en', 'wikipedia') # The site we want to run our bot on def get_pages(self, phrase): if phrase in self.cache: return self.cache[phrase] ret_pages = set() word_clean = phrase.replace('-', ' ') word_lower = word_clean.lower() word_upper = word_clean.upper() word_title = word_clean.title() words_set = {phrase, word_clean, word_lower, word_upper, word_title} for appr in words_set: try: page_result = self.get_page_redirect(appr) if page_result.pageid != 0: full_page = self.get_wiki_page_with_items( phrase, page_result) ret_pages.add(WikipediaSearchPageResult(appr, full_page)) except Exception as e: logger.error(e) self.cache[phrase] = ret_pages return ret_pages # pylint: disable=protected-access def get_wiki_page_with_items(self, phrase, page): item = self.get_wiki_page_item(page) pageid = page.pageid aliases = self.get_aliases(item) description = self.get_description(item) text = page.text page_title = page._link._title relations = WikipediaPageExtractedRelations() relations.is_disambiguation = self.is_disambiguation_page(item) relations.is_part_name = self.is_name_description( text, item, relations.is_disambiguation) relations.aliases = aliases relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text) relations.extract_relations_from_text_v0(text) ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations) logger.debug('Page: {}. Extracted successfully'.format(ret_page)) return ret_page def get_wiki_page_item(self, page): if page is not None: try: item = self.pywikibot.ItemPage.fromPage( page) # this can be used for any page object item.get() # need to call it to access any data. 
return item except (self.pywikibot.NoPage, AttributeError, TypeError, NameError): pass return None def get_page_redirect(self, word): page = self.pywikibot.Page(self.site, word) if page.pageid != 0 and page.isRedirectPage(): return page.getRedirectTarget() return page @staticmethod def get_aliases(item): if item is not None and item.aliases is not None: if 'en' in item.aliases: aliases = item.aliases['en'] return aliases return None @staticmethod def get_description(item): description = {} if item is not None: item_desc = item.get() if 'desctiptions' in item_desc and 'en' in item_desc[ 'descriptions']: dict([("age", 25)]) description['descriptions'] = dict([ ('en', item_desc['descriptions']['en']) ]) return description @staticmethod def is_disambiguation_page(item): if item is not None: dic = item.get() if dic is not None and 'descriptions' in dic: desc = dic['descriptions'] if desc is not None and 'en' in desc: return desc['en'].lower() in DISAMBIGUATE_PAGE return False @staticmethod def is_name_description(text, item, is_disambiguation): if item is not None: if is_disambiguation: if WikipediaPageExtractedRelations.is_name_part(text): return True else: dic = item.get() if dic is not None and 'descriptions' in dic: desc = dic['descriptions'] if desc is not None and 'en' in desc: if [ s for s in NAME_DESCRIPTIONS if s in desc['en'].lower() ]: return True return False # pylint: disable=no-else-return def extract_be_comp(self, text): first_sentence_start_index = text.index("'''") if first_sentence_start_index >= 0: last_temp_index = text.find('\n', first_sentence_start_index) if last_temp_index == -1: last_temp_index = len(text) first_paragraph = text[first_sentence_start_index:last_temp_index] if WikiOnline.extract_be_a_index( first_paragraph) == -1 and last_temp_index != len(text): return self.extract_be_comp(text[last_temp_index:]) elif last_temp_index == len(text): return None, None first_paragraph_clean = re.sub(r'\([^)]*\)', '', first_paragraph) first_paragraph_clean = re.sub(r'<[^>]*>', '', first_paragraph_clean) first_paragraph_clean = re.sub(r'{[^}]*}', '', first_paragraph_clean) first_paragraph_clean = re.sub(r'\[\[[^]]*\]\]', '', first_paragraph_clean) first_paragraph_clean = re.sub(r'[\']', '', first_paragraph_clean) first_paragraph_clean = re.sub(r' ', ' ', first_paragraph_clean) return self.extract_be_comp_relations(first_paragraph_clean) # pylint: disable=not-callable def extract_be_comp_relations(self, first_paragraph): be_comp = set() be_comp_norm = set() if first_paragraph: doc = self.spacy.parser(first_paragraph) for token in doc: target = token.text target_lemma = token.lemma_ relation = token.dep_ governor = token.head.text governor_lemma = token.head.lemma_ if relation == 'acl': break if relation == 'punct' and target == '.': break elif relation == 'cop': be_comp.add(governor) be_comp_norm.add(governor_lemma) elif relation == 'nsubj': be_comp.add(target) be_comp_norm.add(target_lemma) elif relation == 'dep': be_comp.add(governor) be_comp_norm.add(governor_lemma) elif relation == 'compound': be_comp.add(target + ' ' + governor) be_comp_norm.add(target_lemma + ' ' + governor_lemma) elif relation == 'amod': be_comp.add(target + ' ' + governor) be_comp_norm.add(target_lemma + ' ' + governor_lemma) elif relation in ['conj', 'appos']: be_comp.add(target) be_comp_norm.add(target_lemma) return be_comp, be_comp_norm @staticmethod def extract_be_a_index(sentence): result = None if 'is a' in sentence: result = sentence.index("is a") elif 'are a' in sentence: result = 
sentence.index("are a") elif 'was a' in sentence: result = sentence.index("was a") elif 'were a' in sentence: result = sentence.index("were a") elif 'be a' in sentence: result = sentence.index("be a") elif 'is the' in sentence: result = sentence.index("is the") elif 'are the' in sentence: result = sentence.index("are the") elif 'was the' in sentence: result = sentence.index("was the") elif 'were the' in sentence: result = sentence.index("were the") elif 'be the' in sentence: result = sentence.index("be the") return result
class NPScorer(object):
    def __init__(self, parser=None):
        if parser is None:
            self.nlp = SpacyInstance(
                disable=["ner", "parser", "vectors", "textcat"]).parser
        else:
            self.nlp = parser
        self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_local_path, chunker_model_file)
        if not path.exists(chunker_local_path):
            makedirs(chunker_local_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect word"
                " chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)

    def score_documents(self, texts: list, limit=-1, return_all=False, min_tf=5):
        documents = []
        assert len(texts) > 0, "texts should contain at least 1 document"
        assert min_tf > 0, "min_tf should be at least 1"
        with tqdm(total=len(texts), desc="documents scoring progress", unit="docs") as pbar:
            for doc in self.nlp.pipe(texts, n_threads=-1):
                if len(doc) > 0:
                    documents.append(doc)
                pbar.update(1)
        corpus = []
        for doc in documents:
            spans = get_noun_phrases(doc)
            if len(spans) > 0:
                corpus.append((doc, spans))
        if len(corpus) < 1:
            return []
        documents, doc_phrases = list(zip(*corpus))
        scorer = TextSpanScoring(documents=documents, spans=doc_phrases, min_tf=min_tf)
        tfidf_scored_list = scorer.get_tfidf_scores()
        if len(tfidf_scored_list) < 1:
            return []
        cvalue_scored_list = scorer.get_cvalue_scores()
        freq_scored_list = scorer.get_freq_scores()
        if limit > 0:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            tfidf_scored_list_limit = []
            cvalue_scored_list_limit = []
            freq_scored_list_limit = []
            for phrase in list(zip(*tfidf_scored_list))[0][:limit]:
                tfidf_scored_list_limit.append((phrase, tf[tuple(phrase)]))
                cvalue_scored_list_limit.append((phrase, cv[tuple(phrase)]))
                freq_scored_list_limit.append((phrase, fr[tuple(phrase)]))
            tfidf_scored_list = tfidf_scored_list_limit
            cvalue_scored_list = cvalue_scored_list_limit
            freq_scored_list = freq_scored_list_limit
        tfidf_scored_list = scorer.normalize_l2(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_l2(cvalue_scored_list)
        freq_scored_list = scorer.normalize_minmax(freq_scored_list, invert=True)
        tfidf_scored_list = scorer.normalize_minmax(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_minmax(cvalue_scored_list)
        if return_all:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            final_list = []
            for phrases in tf.keys():
                final_list.append(([p for p in phrases], tf[phrases], cv[phrases], fr[phrases]))
            return final_list
        merged_list = scorer.interpolate_scores(
            [tfidf_scored_list, cvalue_scored_list], [0.5, 0.5])
        merged_list = scorer.multiply_scores([merged_list, freq_scored_list])
        merged_list = scorer.normalize_minmax(merged_list)
        final_list = []
        for phrases, score in merged_list:
            if any([len(p) > 1 for p in phrases]):
                final_list.append(([p for p in phrases], score))
        return final_list
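A small scoring sketch, assuming the chunker model can be downloaded; min_tf is lowered because the toy corpus is tiny, and the exact shape of each result follows the return statements above.

# Hedged usage sketch: score_documents returns (phrase_forms, score) pairs after
# interpolating TF-IDF, C-Value and frequency scores; min_tf=1 so that two short
# documents are not filtered out entirely.
scorer = NPScorer()
texts = [
    "Deep learning models need large training corpora.",
    "Noun phrase extraction is a common preprocessing step for training corpora.",
]
for phrase_forms, score in scorer.score_documents(texts, limit=10, min_tf=1):
    print(phrase_forms, score)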
"{}.params".format(str(args.model_name))) validate_existing_filepath(model_path) validate_existing_filepath(settings_path) # load model and parameters model = SequenceChunker() model.load(model_path) word_length = model.max_word_len with open(settings_path, "rb") as fp: model_params = pickle.load(fp) word_vocab = model_params["word_vocab"] chunk_vocab = model_params["chunk_vocab"] char_vocab = model_params.get("char_vocab", None) # parse documents and get tokens nlp = SpacyInstance( disable=["tagger", "ner", "parser", "vectors", "textcat"]) with open(args.input_file) as fp: document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()] # vectorize input tokens and run inference doc_vecs = vectorize(document_texts, word_vocab, char_vocab) document_annotations = [] for vec in doc_vecs: doc_chunks = model.predict(vec, batch_size=args.b) chunk_a = [ chunk_vocab.id_to_word(l) for l in doc_chunks.argmax(2).flatten() ] document_annotations.append(chunk_a) # print document text and annotations build_annotation(document_texts, document_annotations)
class WikiOnline(object): def __init__(self): try: import pywikibot except (AttributeError, ImportError): logger.error( "pywikibot is not installed, please install nlp_architect with [all] package. " + "for example: pip install nlp_architect[all]") sys.exit() self.spacy = SpacyInstance() self.pywikibot = pywikibot self.cache = dict() self.site = pywikibot.Site( "en", "wikipedia") # The site we want to run our bot on def get_pages(self, phrase): if phrase in self.cache: return self.cache[phrase] ret_pages = set() word_clean = phrase.replace("-", " ") word_lower = word_clean.lower() word_upper = word_clean.upper() word_title = word_clean.title() words_set = {phrase, word_clean, word_lower, word_upper, word_title} for appr in words_set: try: page_result = self.get_page_redirect(appr) if page_result.pageid != 0: full_page = self.get_wiki_page_with_items( phrase, page_result) ret_pages.add(WikipediaSearchPageResult(appr, full_page)) except Exception as e: logger.error(e) self.cache[phrase] = ret_pages return ret_pages # pylint: disable=protected-access def get_wiki_page_with_items(self, phrase, page): item = self.get_wiki_page_item(page) pageid = page.pageid aliases = self.get_aliases(item) description = self.get_description(item) text = page.text page_title = page._link._title relations = WikipediaPageExtractedRelations() relations.is_disambiguation = self.is_disambiguation_page(item) relations.is_part_name = self.is_name_description( text, item, relations.is_disambiguation) relations.aliases = aliases relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text) relations.extract_relations_from_text_v0(text) ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations) logger.debug("Page: {}. Extracted successfully".format(ret_page)) return ret_page def get_wiki_page_item(self, page): if page is not None: try: item = self.pywikibot.ItemPage.fromPage( page) # this can be used for any page object item.get() # need to call it to access any data. 
return item except (self.pywikibot.NoPage, AttributeError, TypeError, NameError): pass return None def get_page_redirect(self, word): page = self.pywikibot.Page(self.site, word) if page.pageid != 0 and page.isRedirectPage(): return page.getRedirectTarget() return page @staticmethod def get_aliases(item): if item is not None and item.aliases is not None: if "en" in item.aliases: aliases = item.aliases["en"] return aliases return None @staticmethod def get_description(item): description = {} if item is not None: item_desc = item.get() if "desctiptions" in item_desc and "en" in item_desc[ "descriptions"]: dict([("age", 25)]) description["descriptions"] = dict([ ("en", item_desc["descriptions"]["en"]) ]) return description @staticmethod def is_disambiguation_page(item): if item is not None: dic = item.get() if dic is not None and "descriptions" in dic: desc = dic["descriptions"] if desc is not None and "en" in desc: return desc["en"].lower() in DISAMBIGUATE_PAGE return False @staticmethod def is_name_description(text, item, is_disambiguation): if item is not None: if is_disambiguation: if WikipediaPageExtractedRelations.is_name_part(text): return True else: dic = item.get() if dic is not None and "descriptions" in dic: desc = dic["descriptions"] if desc is not None and "en" in desc: if [ s for s in NAME_DESCRIPTIONS if s in desc["en"].lower() ]: return True return False # pylint: disable=no-else-return def extract_be_comp(self, text): first_sentence_start_index = text.index("'''") if first_sentence_start_index >= 0: last_temp_index = text.find("\n", first_sentence_start_index) if last_temp_index == -1: last_temp_index = len(text) first_paragraph = text[first_sentence_start_index:last_temp_index] if WikiOnline.extract_be_a_index( first_paragraph) == -1 and last_temp_index != len(text): return self.extract_be_comp(text[last_temp_index:]) elif last_temp_index == len(text): return None, None first_paragraph_clean = re.sub(r"\([^)]*\)", "", first_paragraph) first_paragraph_clean = re.sub(r"<[^>]*>", "", first_paragraph_clean) first_paragraph_clean = re.sub(r"{[^}]*}", "", first_paragraph_clean) first_paragraph_clean = re.sub(r"\[\[[^]]*\]\]", "", first_paragraph_clean) first_paragraph_clean = re.sub(r"[\']", "", first_paragraph_clean) first_paragraph_clean = re.sub(r" ", " ", first_paragraph_clean) return self.extract_be_comp_relations(first_paragraph_clean) # pylint: disable=not-callable def extract_be_comp_relations(self, first_paragraph): be_comp = set() be_comp_norm = set() if first_paragraph: doc = self.spacy.parser(first_paragraph) for token in doc: target = token.text target_lemma = token.lemma_ relation = token.dep_ governor = token.head.text governor_lemma = token.head.lemma_ if relation == "acl": break if relation == "punct" and target == ".": break elif relation == "cop": be_comp.add(governor) be_comp_norm.add(governor_lemma) elif relation == "nsubj": be_comp.add(target) be_comp_norm.add(target_lemma) elif relation == "dep": be_comp.add(governor) be_comp_norm.add(governor_lemma) elif relation == "compound": be_comp.add(target + " " + governor) be_comp_norm.add(target_lemma + " " + governor_lemma) elif relation == "amod": be_comp.add(target + " " + governor) be_comp_norm.add(target_lemma + " " + governor_lemma) elif relation in ["conj", "appos"]: be_comp.add(target) be_comp_norm.add(target_lemma) return be_comp, be_comp_norm @staticmethod def extract_be_a_index(sentence): result = None if "is a" in sentence: result = sentence.index("is a") elif "are a" in sentence: result = 
sentence.index("are a") elif "was a" in sentence: result = sentence.index("was a") elif "were a" in sentence: result = sentence.index("were a") elif "be a" in sentence: result = sentence.index("be a") elif "is the" in sentence: result = sentence.index("is the") elif "are the" in sentence: result = sentence.index("are the") elif "was the" in sentence: result = sentence.index("was the") elif "were the" in sentence: result = sentence.index("were the") elif "be the" in sentence: result = sentence.index("be the") return result