Example #1
def load_final_pipe():
    import string
    import en_core_sci_lg
    import en_core_web_md
    from collections import Counter
    from tqdm import tqdm

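    # load both spaCy model packages up front (the returned objects are discarded)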
    en_core_sci_lg.load()
    en_core_web_md.load()
    return load_object(FINAL_PIPE_FILE_PATH)
Example #2
 def __init__(self, model_size='en_core_web_md'):
     try:
         self.nlp = spacy.load(model_size)
     except Exception:  # fall back to loading the packaged model directly
         self.nlp = en_core_web_md.load()
     finally:
         self.nlp.tokenizer = self.custom_tokenizer(self.nlp)
Example #3
def entity_analyzer(my_text):
    nlp = en_core_web_md.load()
    docx = nlp(my_text)
    tokens = [token.text for token in docx]
    entities = [(entity.text, entity.label_) for entity in docx.ents]
    allData = ['"Token":{},\n"Entities":{}'.format(tokens, entities)]
    return allData
Example #4
def text_analyzer(my_text):
    nlp = en_core_web_md.load()
    docx = nlp(my_text)

    allData = [('"Token":{},\n"Lemma":{}'.format(token.text, token.lemma_))
               for token in docx]
    return allData
Example #5
def do_eval(test_data_path, shuffle=False):
    if FLAGS.load_model is None:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load Testing Data
    question_1, question_2, labels = get_input_from_csv(test_data_path)

    if shuffle:
        question_1, question_2, labels = shuffle_data(question_1, question_2, labels)

    # Load Pre-trained Model
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)

    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    labels = to_categorical(np.asarray(labels, dtype='int32'))

    scores = model.evaluate([q1_test, q2_test], labels, batch_size=FLAGS.batch_size, verbose=1)

    print("=================== RESULTS =====================")
    print("[*] LOSS OF TEST DATA: %.4f" % scores[0])
    print("[*] ACCURACY OF TEST DATA: %.4f" % scores[1])
Example #6
    def test_predict(self):
        nlp = en_core_web_md.load()
        classifier = TextClassifier(nlp.vocab, n_classes=2)

        text = "Sample text to test the classifier"
        output = classifier.predict(text)
        self.assertTrue(isinstance(output.data, torch.Tensor))
Example #7
    def test_get_ids(self):
        nlp = en_core_web_md.load()
        classifier = TextClassifier(nlp.vocab, n_classes=2)

        text = "Sample text to test the classifier"
        ids = classifier.get_ids(text)
        self.assertTrue(isinstance(ids, list))
Example #8
def get_spacy_model():
    global _spacy_cache
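    # load the model only once and cache it for subsequent calls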
    if _spacy_cache is None:
        print("Initializing spaCy model...")
        _spacy_cache = en_core_web_md.load()

    return _spacy_cache
Example #9
    def __init__(self,
                 dictionary_loader: DictionaryLoader,
                 language="en",
                 filters: List[AnnotationFilter] = None):
        """
        Parameters
        ----------
            dictionary_loader: DictionaryLoader
                The dictionary loader that will provide the dictionary contents
            language: str
                The language of the text that will be processed (affects the choice of tokenizer and stemmer).
            filters: List[AnnotationFilter]
                A list of filters to apply post recognition
        """
        super().__init__(dictionary_loader, language=language, filters=filters)
        self.punctuation_remove = regex.compile(
            r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
        self.label_concept_index = {}
        self.label_token_counts = {}
        self.label_lengths = {}
        self.trie = Trie()

        if language == 'en':
            import en_core_web_md
            self.spacy = en_core_web_md.load()
        elif language == 'fr':
            import fr_core_news_md
            self.spacy = fr_core_news_md.load()
        else:
            raise ValueError(f"Unsupported language: {language}")
Example #10
def train(train_data, val_data, batch_size, n_epochs, save_dir=None):
    # Stage 1: Read training data (csv) && Preprocessing them
    tf.logging.info('Loading training and validation data ...')
    train_question_1, train_question_2, train_labels = get_input_from_csv(train_data)
    # val_question_1, val_question_2, val_labels = get_input_from_csv(val_data)

    # Stage 2: Load Pre-trained embedding matrix (using GloVe here)
    tf.logging.info('Loading pre-trained embedding matrix ...')
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    # Stage 3: Build Model
    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)

    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        model = esim.build_model()

    # Stage 4: Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    tf.logging.info('Converting questions into ids ...')
    q1_train, q2_train = convert_questions_to_word_ids(train_question_1, train_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    train_labels = to_categorical(np.asarray(train_labels, dtype='int32'))

    # q1_val, q2_val = convert_questions_to_word_ids(val_question_1, val_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    # val_labels = to_categorical(np.asarray(val_labels, dtype='int32'))

    # Stage 5: Training
    tf.logging.info('Start training ...')

    callbacks = []
    save_dir = save_dir if save_dir is not None else 'checkpoints'
    filepath = os.path.join(save_dir, "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks.append(checkpoint)

    if FLAGS.tensorboard:
        graph_dir = os.path.join('.', 'GRAPHs')
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        tb = TensorBoard(log_dir=graph_dir, histogram_freq=0, write_graph=True, write_images=True)
        callbacks.append(tb)

    model.fit(
        x=[q1_train, q2_train],
        y=train_labels,
        batch_size=batch_size,
        epochs=n_epochs,
        # validation_data=([q1_val, q2_val], val_labels),
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
        verbose=FLAGS.verbose
    )
Example #11
    def __init__(self):
        ifninstall(self.__class__.md_pkg)
        print('initiating NLP language pipeline...', end='', flush=True)
        import en_core_web_md
        self._nlp = en_core_web_md.load()
        print('done')

        self._corpus = None
Example #12
def _get_spacy_wmd_model():
    global _spacy_wmd_cache
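    # on first use, load the model and append the WMD similarity hook to the pipeline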
    if _spacy_wmd_cache is None:
        _spacy_wmd_cache = en_core_web_md.load()
        _spacy_wmd_cache.add_pipe(
            wmd.WMD.SpacySimilarityHook(_spacy_wmd_cache), last=True)

    return _spacy_wmd_cache
Example #13
 def __init__(self):
     # init model and disable unused module(s)
     try:
         self.model = en_core_web_md.load()
         print("Set SpaCy model to medium model")
     except Exception:
         self.model = en_core_web_sm.load()
         print("Set SpaCy model to small model")
Example #14
    def __init__(self, lang):
        if lang == LANG.EN:
            self.nlp = en_core_web_md.load()
        else:
            self.nlp = de_core_news_sm.load()

        self.stanford_ner = StanfordNERTagger(model,
                                              '../models/stanford-ner.jar',
                                              encoding='utf-8')
Example #15
 def embbeding(self):
     pretrained_emb = []
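     # build the embedding matrix from spaCy vectors: PAD/UNK/CLS first, then every dictionary word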
     spacy_tool = en_core_web_md.load()
     pretrained_emb.append(spacy_tool('PAD').vector)
     pretrained_emb.append(spacy_tool('UNK').vector)
     pretrained_emb.append(spacy_tool('CLS').vector)
     for i in range(len(self.dictionary.idx2word)):
         word = self.dictionary.idx2word[i]
         pretrained_emb.append(spacy_tool(word).vector)
     return np.array(pretrained_emb)
Example #16
def create_corpus(text_list, target_path):

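    # only sentence segmentation is needed, so tagger/ner/textcat are disabled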
    nlp = en_core_web_md.load(disable=['tagger', 'ner', 'textcat'])

    with open(target_path, 'w') as f:
        #  Split sentences for each document
        for text in tqdm(text_list):
            text_lines = [str(sent)
                          for i, sent in enumerate(nlp(str(text)).sents)]
            f.write('\n'.join(text_lines))
            f.write("\n  \n")
Example #17
    def en_lang(cls):
        me_list = ['i', 'my']
        embeddings_model = FlairEmbeddingModels().en_lang()
        nlp = en_core_web_md.load()
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]

        return cls(me_list, embeddings_model, nlp, relationship_list)
Example #18
    def __init__(self, n_classes, arch="cnn", vocab=None, hparams={}):
        """
        Constructor for TextClassifier
        Args:
            n_classes (int): Number of output classes
            vocab (spacy.vocab): spaCy vocabulary class
            arch (string/pl.LightningModule): The underlying text classifier model architecture
            hparams (dict): Dictionary of hyperparameters for the underlying model
        Returns:
            None
        """

        if isinstance(vocab, Vocab):
            self._vocab = vocab
        else:
            self._vocab = en_core_web_md.load().vocab

        self._vocab.set_vector("@pad@",
                               vector=np.zeros(self.vocab.vectors.shape[1]))
        self._vocab.set_vector("@oov@",
                               vector=np.zeros(self.vocab.vectors.shape[1]))

        oov_orth = self._vocab["@oov@"].orth
        self.oov_id = self._vocab.vectors.key2row[oov_orth]

        self.tokenizer = Tokenizer(self.vocab)

        input_dim, embedding_dim = self.vocab.vectors.shape
        output_dim = n_classes
        pad_idx = self.vocab.vectors.key2row[self.vocab["@pad@"].orth]

        hparams["pad_idx"] = pad_idx
        hparams["input_dim"] = input_dim
        hparams["embedding_dim"] = embedding_dim

        if isinstance(arch, BaseModel):
            self._model = arch

        elif isinstance(arch, str):

            if arch == "cnn":

                self._model = CNN2D(output_dim, hparams)

            elif arch == "bilstm":

                self._model = BiLSTM(output_dim, hparams)

            else:
                print("No such architecture exists")

        else:
            print("arch should be string or a torch file duh")
Example #19
def spacy_keyList(q):
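    # extract multi-word key phrases by matching entity-type patterns with spaCy's Matcher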
    nlp = en_core_web_md.load()
    doc = nlp(q)
    # for w in doc:
    #     print(f'{w.text:15s}[{w.tag_:5s}|{w.lemma_:20s}|{w.pos_:6s}]')
    # for ent in doc.ents:
    #     print(f'{ent.text:15s}[{ent.label_}]')

    pattern1 = [{'ENT_TYPE': 'ORG', "OP": "+"}]
    pattern2 = [{'ENT_TYPE': 'PERSON', "OP": "+"}]
    pattern3 = [{'ENT_TYPE': 'FAC', "OP": "+"}]
    pattern4 = [{'ENT_TYPE': 'LOC', "OP": "+"}]
    pattern5 = [{'ENT_TYPE': 'GPE', "OP": "+"}]
    pattern6 = [{
        'ENT_TYPE': 'CARDINAL',
        "OP": "*"
    }, {
        'ENT_TYPE': 'LOC',
        "OP": "+"
    }, {
        "IS_PUNCT": True
    }, {
        'ENT_TYPE': 'GPE',
        "OP": "+"
    }, {
        "IS_PUNCT": True
    }, {
        'ENT_TYPE': 'GPE',
        "OP": "+"
    }]
    pattern7 = [{
        'ENT_TYPE': 'FAC',
        "OP": "+"
    }, {
        "IS_LOWER": True
    }, {
        'ENT_TYPE': 'CARDINAL',
        "OP": "?"
    }, {
        'ENT_TYPE': 'GPE',
        "OP": "+"
    }]
    matcher = Matcher(nlp.vocab)
    matcher.add("name1", None, pattern1, pattern2, pattern3, pattern4,
                pattern5, pattern6, pattern7)
    matches = matcher(doc)
    keyList = []
    for m_id, s, e in matches:
        keyword = doc[s:e].text
        if len(keyword) > 1 and len(doc[s:e].text.split()) > 1:
            keyList.append(keyword)
    return keyList
Example #20
	def __init__(self, title, path, verbose = False, testing = 0):
		self.verbose = verbose
		self.testing = testing
		self.home = path
		self.title = title
		self.date = self.get_date()
		self.today = date.today()
		self.pdf_path = os.path.join(self.home, self.title)
		self.img_path = os.path.join(self.pdf_path + 'convertedimages')
		self.words = []
		self.num_pages = self.get_pdf_images()
		self.text = self.get_text()
		self.nlp = en_core_web_md.load()
		self.doc = self.nlp(self.text)
Example #21
    def get_entities(self, document: str, language: str = 'english'):
        ''' Takes a document and returns a list of extracted entities '''
        if language == 'spanish':
            try:
                import es_core_news_md
                logger.info("Success importing en_core_web_md")
            except ImportError:
                logger.error("Error importing es_core_news_md")
                sys.exit(-1)
        else:
            try:
                import en_core_web_md
                logger.info("Success importing en_core_web_md")
            except ImportError:
                logger.error("Error importing en_core_web_md")
                sys.exit(-1)

        if isinstance(document, List):
            document = " ".join(document)
        # if language given is not the name of a spacy parser, try to convert it to one
        parser_name = (language if language in LANG_TO_PARSER.values()
                       else LANG_TO_PARSER.get(language.lower()))
        if not parser_name:
            raise Exception('language not supported')

        # if requested parser is not already in memory, try to load from spacy
        if parser_name not in PARSERS:
            try:
                logger.info("Trying to load parser.")
                if language == 'spanish':
                    PARSERS[parser_name] = es_core_news_md.load()
                else:
                    PARSERS[parser_name] = en_core_web_md.load()
                #PARSERS[parser_name] = spacy.load(parser_name)
            except Exception:
                logger.exception("Error loading parser")
                sys.exit(-1)
        else:
            logger.info("Found parser %s in memory" % parser_name)

        def get_ents(doc):
            ''' prep, parse, then extract entities from doc text '''
            doc = prep_text(doc)  # preprocess string
            doc = PARSERS[parser_name](doc)  # parse prepped doc
            ents = set(ent.text for ent in doc.ents
                       if not self.filter_entity(ent))  # extract entities
            return list(ents)

        return get_ents(document)
Example #22
    def __init__(self, name="", motorcycles=[], closest_track=None, **kwargs):
        print("Loading in NLP models... (this could take a minute or two)")
        self.nlp = en_core_web_md.load()
        self.sbert_model = SentenceTransformer('stsb-roberta-base')
        # For NER (at least for detecting a person's name) it seems to perform a lot better with small, it wouldn't detect my name on medium
        self.ner = en_core_web_sm.load()

        # user data
        self.name = name
        self.motorcycles = motorcycles if motorcycles is not None else []  # avoid sharing a mutable default
        self.closest_track = closest_track

        self.conn = sqlite3.connect("knowledge_base.db").cursor()
        self.motorcycle_finder = MotorcycleFinder(ner=self.ner)
        self.new_user = False if name else True  # will eventually be something like True if name else False
Example #23
def tokenize_string_details(s):
    global nlp
    if nlp is None:
        # not doing it here causes some errors about en_models.vector not being able to be loaded
        # essentially this causes the models to be loaded at most once in the executors
        # not very good, but it will do for this use case
        nlp = en_core_web_md.load()
    doc = nlp(s)
    tokens = []
    for token in doc:
        if token.ent_type_ in ['GPE', 'LOC', 'DATE', 'TIME']:
            tokens.append(f'-{token.ent_type_}-')
        elif token.like_url:
            tokens.append(f'-URL-')
        else:
            tokens.append(token.lower_)
    return tokens
Example #24
def main():
    logging.getLogger().setLevel('INFO')
    nlp = en_core_web_md.load()
    split = 'val'
    sent = sentences(json.load(open(coco_path + '/dataset.json')), split=split)
    writer = csv.writer(open(data_path + '/depparse_coco_val.csv', "w"))
    writer.writerow(
        ["sentid", "position", "word", "postag", "postag_c", "dep", "head"])
    for sent_i in sent:
        #logging.info("Parsing: {} {}".format(sent_i['sentid'], ' '.join(sent_i['tokens'])))
        logging.info("Parsing: {} {}".format(sent_i['sentid'], sent_i['raw']))
        postag, postag_c, label, head = parse(nlp, sent_i['tokens'])
        for i in range(len(sent_i['tokens'])):
            writer.writerow([
                sent_i['sentid'], i, sent_i['tokens'][i], postag[i],
                postag_c[i], label[i], head[i]
            ])
Example #25
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
    return nlp
Example #26
    def en_lang(cls):
        nlp = en_core_web_md.load()
        embeddings_model = FlairEmbeddingModels().en_lang()
        # PP: e.g. 'I have a son', 'I have a smaller brother', 'I have a 9 year old son'
        # NP: e.g. 'My (little) sister'
        grammar = r"""
                    PP: {<PRON><VERB><NUM>?<DET>?<ADJ>?<NOUN>}
                    NP: {<ADJ><ADJ>?<NOUN>}            
                    REL: {<PP>|<NP>}"""
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]
        me_list = ['i', 'my', 'me']

        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)
Example #27
def process_page(row):
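    # parse a single WARC record: strip markup, run NER, then look up candidate entities via es/sparql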
    nlp = en_core_web_md.load()
    warc_record = WarcRecord(row)
    warc_payload = warc_record.payload
    if warc_record.broken or not warc_payload or warc_record.id is None:
        return (yield "")
    
    ents = []
    soup = None
    try:
        soup = BeautifulSoup(warc_payload, "html.parser")
    except Exception:
        raise Exception("SOUP FAILED:  %s %s\n" % (warc_payload, warc_record.id) )
    for script in soup(['style', 'script', 'head', 'title', 'meta', '[document]', 'code', 'blockquote', 'cite']):
        script.extract()
    text = " ".join(re.findall(r'\w+',  soup.get_text()))
    article = nlp(text)

    raw_words = []
    for ent in article.ents:
        if ent.label_ not in ["CARDINAL", "DATE", "QUANTITY", "TIME", "ORDINAL", "MONEY", "PERCENT"] and len(ent.text.split(" ")) < 4:
            raw_words.append(ent.text)
            ents.append(ent)

    processed_results = {}
    output = []
    for ent in ents:
        stripped_ent = ent.text.strip()
        if stripped_ent == '':
            continue
        es_results = es.search(stripped_ent)
        sims = []
        abstracts = []
        processed_results = {}
        for es_result in es_results:
            if es_result.label in processed_results:
                continue
            else:
                processed_results[es_result.label] = True
                abstract = sparql.search(es_result.id, es_result.label, es_result.score)
                if abstract:
                    abstracts.append([es_result.id, abstract])
        if len(abstracts) > 0:
            output.append(stringify_reply(warc_record.id, stripped_ent, get_sim(text, abstracts)))
    yield output
Example #28
def load_model(model_str):
    """
    Loads a model given an input string (could work differently)
    """
    if model_str == "en_core_web_md":
        import en_core_web_md
        model_func = en_core_web_md.load()
    if model_str == "en_resume_model":
        import en_resume_model
        model_func = en_resume_model.load()

    if model_str == "seed_data":
        import spacy
        from spacy.matcher import PhraseMatcher
        from spacy.tokens import Span

        class EntityMatcher(object):
            """
            Matcher with multi-case
            """
            def __init__(self, nlp, terms, label):
                patterns = [nlp(text.lower()) for text in terms]
                patterns += [nlp(text.upper()) for text in terms]
                patterns += [nlp(text.title()) for text in terms]
                self.matcher = PhraseMatcher(nlp.vocab)
                self.matcher.add(label, None, *patterns)
                self.name = '{}_matcher'.format(label)

            def __call__(self, doc):
                matches = self.matcher(doc)
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=match_id)
                    doc.ents = list(doc.ents) + [span]
                return doc

        nlp = spacy.load('en_core_web_md', disable=['ner'])
        with open("../data/seeds.json", 'r') as f:
            seed_data_json = json.loads(f.read())
        for key in seed_data_json.keys():
            terms = seed_data_json[key]
            #terms = spacy_similarity(terms, nlp, num_similar=2)
            entity_matcher = EntityMatcher(nlp, terms, key)
            nlp.add_pipe(entity_matcher)
        model_func = nlp
    return model_func
Example #29
def eventLister(text, subject):
    # Load the medium English NLP model
    nlp = en_core_web_md.load()

    # Parse the document with spaCy
    doc = nlp(text)

    # Extract semi-structured statements
    statements = textacy.extract.semistructured_statements(doc, subject)

    # Return the results
    str = "Here are the things I know about " + subject + ":"

    for statement in statements:
        subject, verb, fact = statement
        str += f"\n - {fact}"

    return str
Example #30
def get_people_orgs_from_text(text, model="large"):
    """
    Loads and runs spaCy NLP object on the text provided.
    
    returns two lists:
        (1) tuples of people (person, number of appearances)
        (2) tuples of organizations (organization, number of appearances)
    """

    # Step 1: Check model input and load nlp object
    if model.lower() == "large": nlp = en_core_web_lg.load()
    elif model.lower() == "medium": nlp = en_core_web_md.load()
    elif model.lower() == "small": nlp = en_core_web_sm.load()
    else:
        print("Invalid NLP model.")
        return [], []

    # Step 2: Process the text using spaCy and get list of people and
    #         organization objects
    doc = nlp(text)
    orgs = [x.text for x in doc.ents if x.label_ == 'ORG']
    people = [x.text for x in doc.ents if x.label_ == 'PERSON']
    #pprint([(x.text, x.label_) for x in doc.ents])

    # Step 3: Process and filter the list of organizations
    orgdict = dict()
    for item in orgs:
        if item in orgdict.keys(): orgdict[item] += 1
        else: orgdict[item] = 1
    orgs = list(orgdict.items())
    orgs.sort(key=lambda x: x[1], reverse=True)
    orgs = filter_people_orgs(orgs)

    # Step 4: Process and filter the list of people
    peopledict = dict()
    for item in people:
        if item in peopledict.keys(): peopledict[item] += 1
        else: peopledict[item] = 1
    people = list(peopledict.items())
    people.sort(key=lambda x: x[1], reverse=True)
    people = filter_people_orgs(people)

    return people, orgs