Example #1
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        n = []
        for nt in NERTags:
            n.extend(nt)

        # get the indexes of all words that carry an NER tag
        ids = [i for i, a in enumerate(n) if a[1] != "O"]
        a = np.array(ids)

        consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)

        phrases = []
        for ci in consecutive_ids:
            phrase = ""
            for id_ in ci:
                phrase += "{} ".format(n[id_][0])

            # NER label of the phrase, taken from its last token; currently unused
            tag = "{}".format(n[id_][1])
            phrases.append(phrase)

        cleaned_phrases = self.del_repeat(phrases)
        return cleaned_phrases
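The index grouping in Example #1 leans on a numpy idiom (np.diff + np.where + np.split); a minimal, self-contained sketch with made-up indexes, in case the one-liner is unclear:

import numpy as np

# Hypothetical indexes of NER-tagged tokens, purely for illustration.
ids = np.array([2, 3, 4, 9, 10, 15])

# Split wherever the gap between neighbouring indexes is greater than 1.
runs = np.split(ids, np.where(np.diff(ids) != 1)[0] + 1)
print(runs)  # three runs: [2 3 4], [9 10], [15]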
Example #2
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        tags = [nt for nt in NERTags]
        ids = [[i for a, i in zip(t, range(len(t))) if a[1] != "O"]
               for t in tags]

        phrases = []
        for i, t in zip(ids, tags):
            phrase = ""
            tt = "N/A"
            for p, index in zip(i, range(len(i))):
                if index == len(i) - 1:
                    phrase += "{}".format(t[p][0])
                    tt = phrase, t[p][1]
                else:
                    phrase += "{} ".format(t[p][0])

            phrases.append(tt)
        return phrases
Example #3
def NERWithOldStanford(input_sample):
    java_path = "C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"  #"C:/Program Files/Java/jdk1.8.0_161/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar',
                               encoding='utf-8')
    tokenized_text = word_tokenize(input_sample)
    classified_paragraphs_list = tagger.tag_sents([tokenized_text])
    formatted_result = formatted_entities(classified_paragraphs_list)
    return formatted_result
Example #4
    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events
                            bestSentenceText[i] = bestSentenceText[i].replace(" ,", ",")
                            bestSentenceTokensNoStopWords.append(bestSentenceText[i])

                allBestSentences.append(bestSentenceTokensNoStopWords) #####################
                best = result[0][0]
                bestSentence[articleNo,questionNo] = best
                if qa['answer'] in bestSentenceText:
                    correctSentence += 1
            else:
                allBestSentences.append([]) #to preserve question sequence


    print("The accuracy on dev set is", (correctSentence/float(totalQuestions)))


    NER_tagged = st.tag_sents(allBestSentences)

    print('NER Tagging Done')




    f = open('bestSentencesTagged.bin', 'wb')  # 'wb' instead 'w' for binary file
    pickle.dump(NER_tagged, f, -1)  # -1 specifies highest binary protocol
    f.close()
    print("NER Saved")


else: #NER tagged found
    f = open('bestSentencesTagged.bin', 'rb')  # 'rb' for reading binary file
    NER_tagged = pickle.load(f)
    print(idx, sep=' ')

    # replace by hyper-tags
    text = re.sub(r'http[^\s]+', 'URL', u)
    text = re.sub(r'www\.[^\s]+', 'URL', text)
    text = re.sub(r'[^\s]+@[^\s]+\.[^\s]+', 'EMAIL', text)
    text = re.sub(r'\d+', 'NUM', text)
    while re.findall(r'[\.,\?\(\)\[\]:/\!_\"]', text) != []:
        # text = re.sub(r'([\s])[\.,\?\(\)\[\]\:]', '\1', text)
        # text = re.sub(r'[\.,\?\(\)\[\]\:]([\s])', '\1', text)
        text = re.sub(r'[\.,\?\(\)\[\]:/\!_\"]', ' ', text)

    tokenized_utterance = []
    tokenized_utterance.append(word_tokenize(text))

    classified_paragraphs_list = tagger.tag_sents(list(tokenized_utterance))

    # replace NER tokens by their NER-type
    processed = [
        stemmer.stem(x[0]) if x[1] == 'O' else x[1]
        for x in classified_paragraphs_list[0]
    ]
    ner_processed = [x.lower() if x.upper() != x else x for x in processed]

    # remove immediately repeated NER tags (the except guards against running
    # past the end of the list after pop() shrinks it):
    for i in range(len(ner_processed)):
        try:
            if ner_processed[i - 1] == ner_processed[i] and i > 0:
                ner_processed.pop(i)
        except IndexError:
            pass
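The hyper-tag substitutions in the snippet above can be checked in isolation; a small sketch with a made-up utterance (the utterance and the expected output are illustrative only):

import re

u = "Mail me at jo@example.com or visit http://example.com, room 42!"
text = re.sub(r'http[^\s]+', 'URL', u)
text = re.sub(r'www\.[^\s]+', 'URL', text)
text = re.sub(r'[^\s]+@[^\s]+\.[^\s]+', 'EMAIL', text)
text = re.sub(r'\d+', 'NUM', text)
print(text)  # -> 'Mail me at EMAIL or visit URL room NUM!'
# (the while-loop in the snippet above then strips the remaining punctuation)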
Example #7
class BasicDataProcessor:
    def __init__(self, config, data):
        self.config = config
        self.data = data
        self.lemmatizer = WordNetLemmatizer()
        self.tagger = StanfordNERTagger(
            model_filename=self.config.ner_model_path)
        self.postagger = StanfordPOSTagger(
            path_to_jar=self.config.pos_jar_path,
            model_filename=self.config.pos_model_path)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.config.parser_jar_path,
            path_to_models_jar=self.config.parser_model_path)
        self.nlp = StanfordCoreNLP("stanford/stanford-corenlp-full")
        # self.nlp = StanfordCoreNLP("http://localhost", port=9000)

        self.punc = r"""!"#&'()*+;<=>?[]^`{}~"""

    def preprocess_questions(self, questions):
        return [self.preprocess_question(q) for q in questions]

    def process_docs(self, docs):
        return [self.preprocess_doc(doc) for doc in docs]

    def preprocess_question(self, question):
        normal_tokens = word_tokenize(
            question.replace("\u200b", '').replace("\u2014", ''))
        remove_punc_tokens = [
            token for token in normal_tokens if not self.is_pure_puncs(token)
        ]
        remove_punc_in_tokens = [
            self.remove_punc_in_token(token) for token in remove_punc_tokens
        ]
        lower_tokens = self.lower_tokens(remove_punc_in_tokens)
        remove_stop_tokens = self.remove_stop_words(lower_tokens)
        for i in range(len(remove_stop_tokens)):
            if remove_stop_tokens[i] == 'where':
                remove_stop_tokens[i] = 'location'
            if remove_stop_tokens[i] == 'when':
                remove_stop_tokens[i] = 'time'
            if remove_stop_tokens[i] == 'who' or remove_stop_tokens[
                    i] == 'whom':
                remove_stop_tokens[i] = 'person'
            if remove_stop_tokens[i] == 'why':
                remove_stop_tokens[i] = 'reason'
        lemmatized_tokens = self.lemmatize_tokens(remove_stop_tokens)
        return lemmatized_tokens

    def is_pure_puncs(self, token):
        if all([c in punctuation for c in token]):
            return True
        return False

    # remove punctuations within a token
    def remove_punc_in_token(self, token):
        return ''.join([x for x in token if x not in punctuation]).strip()

    # remove punctuations within a token if the punctuation is not in puc set
    def remove_punc_in_token_for_rule(self, token):
        return ''.join([x for x in token if x not in self.punc]).strip()

    def remove_stop_words(self, words):
        return [
            word for word in words
            if word.lower() not in stopwords.words("english")
        ]

    def lemmatize_tokens(self, words):
        return [self.lemmatize(word.lower()) for word in words]

    def lemmatize(self, word):
        word = word.lower()
        lemma = self.lemmatizer.lemmatize(word, 'v')
        if lemma == word:
            lemma = self.lemmatizer.lemmatize(word, 'n')
        return lemma

    def preprocess_doc(self, doc):
        normal_tokens = [
            word_tokenize(par.replace(u"\u200b", '').replace(u"\u2014", ''))
            for par in doc
        ]
        remove_punc_tokens = [[
            token for token in tokens if not self.is_pure_puncs(token)
        ] for tokens in normal_tokens]
        remove_punc_in_tokens = [[
            self.remove_punc_in_token(token) for token in tokens
        ] for tokens in remove_punc_tokens]
        lower_tokens = [
            self.lower_tokens(tokens) for tokens in remove_punc_in_tokens
        ]
        remove_stop_tokens = [
            self.remove_stop_words(tokens) for tokens in lower_tokens
        ]
        lemmatized_tokens = [
            self.lemmatize_tokens(tokens) for tokens in remove_stop_tokens
        ]
        return lemmatized_tokens

    def lower_tokens(self, words):
        return [word.lower() for word in words]

    def process_sent(self, sens):
        normal_tokens = word_tokenize(
            sens.replace("\u200b", '').replace("\u2014", ''))
        remove_punc_tokens = [
            token for token in normal_tokens if not self.is_pure_puncs(token)
        ]
        remove_punc_in_tokens = [
            self.remove_punc_in_token(token) for token in remove_punc_tokens
        ]
        ner_tags = self.sens_ner_tagging(remove_punc_in_tokens)
        replaced_tokens = [
            'number' if tup[1] == 'NUMBER' else 'person' if tup[1] == 'PERSON'
            else 'location' if tup[1] == 'LOCATION' else tup[0].lower()
            for tup in ner_tags
        ]
        lower_tokens = self.lower_tokens(replaced_tokens)
        remove_stop_tokens = self.remove_stop_words(lower_tokens)
        lemmatized_tokens = self.lemmatize_tokens(remove_stop_tokens)
        return lemmatized_tokens

    def lower_tokens(self, words):
        return [word.lower() for word in words]

    def sens_ner_tagging(self, sent):
        ner_sents = self.tagger.tag_sents([sent])
        pos_sent = pos_tag(sent)
        ner_sent = ner_sents[0]
        processed_ner_sent = []
        for j in range(len(ner_sent)):
            span, tag = ner_sent[j]
            _, pos = pos_sent[j]
            if span.isdigit() or pos == 'CD':
                processed_ner_sent.append((span, 'NUMBER'))
            elif tag == 'PERSON':
                processed_ner_sent.append((span, 'PERSON'))
            elif tag == 'LOCATION':
                processed_ner_sent.append((span, 'LOCATION'))
            elif tag == 'ORGANIZATION':
                processed_ner_sent.append((span, 'OTHER'))
            elif j != 0 and tag == 'O' and span[0].isupper():
                processed_ner_sent.append((span, 'OTHER'))
            else:
                processed_ner_sent.append((span, tag))
        return processed_ner_sent

    def lemmatize_entity_name(self, entity_name):
        tokens = entity_name.split()
        tokens = self.lemmatize_tokens(tokens)
        return ' '.join(tokens)
def formatted_entities(classified_paragraphs_list):
    entities = {'organizations': list()}

    for classified_paragraph in classified_paragraphs_list:
        for entry in classified_paragraph:
            entry_value = entry[0]
            entry_type = entry[1]

            if entry_type == 'ORGANIZATION':
                entities['organizations'].append(entry_value)

    return entities


tagger = StanfordNERTagger(
    '/Users/tomer.bendavid/Downloads/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    '/Users/tomer.bendavid/Downloads/stanford-ner-2017-06-09/stanford-ner.jar',
    encoding='utf-8')

paragraphs = ['I just bought these Thomson Reuters shoes']

tokenized_paragraphs = list()

for text in paragraphs:
    tokenized_paragraphs.append(word_tokenize(text))

classified_paragraphs_list = tagger.tag_sents(tokenized_paragraphs)

formatted_result = formatted_entities(classified_paragraphs_list)
print(formatted_result)
Example #9
                    bestSentenceTokensNoStopWords)  #######------------
                allBestSentencesText.append(bestSentenceText)
                best = result[0][0]
                bestSentence[articleNo, questionNo] = best
                if qa['answer'] in bestSentenceText:
                    correctSentence += 1
            else:
                allBestSentences.append([])  #to preserve question sequence
                allBestSentencesText.append(" ")  ###########---------

            allQuestionText.append(qa['question'])

    print("The accuracy on dev set is",
          (correctSentence / float(totalQuestions)))

    NER_tagged = stanford_NER_tagger.tag_sents(allBestSentences)
    print("NER Time:", ctime())
    print("NER Tagging Done, Now doing POS tagging")
    POS_taggedAnswers = []
    # POS_taggedAnswers = stanford_POS_tagger.tag_sents(allBestSentencesText)
    print("POS answer tagging Done")
    print("POS answer Time:", ctime())
    POS_taggedQuestions = []
    # POS_taggedQuestions = stanford_POS_tagger.tag_sents(allQuestionText)
    print("POS question tagging Done")
    print("POS question Time:", ctime())

    f = open(fname, 'wb')  # 'wb' instead 'w' for binary file
    pickle.dump(
        {
            'NER_tagged': NER_tagged,
Example #10
def runstanfordmodel(sents, tagger, model):
    # Prepare NER tagger with english model
    ner_tagger = StanfordNERTagger(model, tagger, encoding='utf8')
    # Run NER tagger on words
    return ner_tagger.tag_sents(sents)
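A minimal usage sketch for runstanfordmodel from Example #10; the paths below are assumptions about where the Stanford NER distribution was unpacked, not taken from the source:

from nltk.tokenize import word_tokenize

# Illustrative paths; adjust to the local Stanford NER installation.
TAGGER_JAR = 'stanford-ner/stanford-ner.jar'
MODEL = 'stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'

sents = [word_tokenize("Angela Merkel met Emmanuel Macron in Berlin.")]
print(runstanfordmodel(sents, TAGGER_JAR, MODEL))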
Example #11
import os
import json
import helpers
from nltk.tag.stanford import StanfordNERTagger

if not os.path.exists('scenes'):
  print('Scenes folder doesn\'t exist :( Run prepare.py first')
  raise SystemExit(1)

ner_tagger = StanfordNERTagger(r'english.all.3class.distsim.crf.ser.gz')
#ner_tagger = StanfordNERTagger(r'ner-model.ser.gz')

for filename in sorted(os.listdir('scenes'), key=helpers.natural_keys):
  scenefile = open('scenes/' + filename, 'r+')
  scene = json.load(scenefile)
  processed = [[word[0] for word in sentence] for sentence in scene['processed']]
  ner_tags = ner_tagger.tag_sents(processed)
  for sindex, sentence in enumerate(ner_tags):
    for windex, word in enumerate(sentence):
      scene['processed'][sindex][windex].append(ner_tags[sindex][windex][1])
  
  #write to test file for validating
  scenefile.seek(0)
  #scenefile.close()
  #scenefile = open('scenes/' + filename + '-test', 'w')
  json.dump(scene, scenefile, indent=2)
  scenefile.truncate()
  scenefile.close()
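Example #11 imports a project-local helpers module that is not shown; helpers.natural_keys is presumably a natural-sort key for the scene filenames. A hypothetical stand-in, only to make the snippet runnable in isolation:

import re

def natural_keys(text):
    # Hypothetical: sort 'scene2.json' before 'scene10.json' by comparing digits numerically.
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r'(\d+)', text)]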
Example #12
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger('data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', 'data/stanford-ner-2015-12-09/stanford-ner.jar')
tagger._stanford_jar = ':'.join(find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))
sentences = sent_tokenize(input('Enter a sentence: '))
cleaned = [''.join(c for c in s if c not in '",:.?/!@#$%^&*()][{}~').split() for s in sentences]
print(tagger.tag_sents(cleaned))
    articleNo = -1
    print("Computing NER start at:", ctime())
    i = -1
    for article in data:
        articleNo += 1
        currentCorrectAnswerSents = []
        currentCorrectAnswerText = []
        print("Reading Article: ", articleNo + 1, '/', len(data))
        for qa in article['qa']:
            i += 1
            currentCorrectAnswerSents.append(
                article['sentences'][qa['answer_sentence']].split())
            currentCorrectAnswerText.append(qa['answer'].split())
            questions.append(qa['question'])
            answers.append(qa['answer'])
        NER_TaggedAnswerSents = NER_TaggedAnswerSents + stanford_NER_tagger.tag_sents(
            currentCorrectAnswerSents)
        NER_TaggedAnswerText = NER_TaggedAnswerText + stanford_NER_tagger.tag_sents(
            currentCorrectAnswerText)
        correctAnswerSents.append(currentCorrectAnswerSents)

    print("Computing NER end at:", ctime())
    #saving the computed NER tags and sentences
    f = open(NER_CacheFIle, 'wb')  # 'wb' instead 'w' for binary file
    pickle.dump(
        {
            'NER_TaggedAnswerSents': NER_TaggedAnswerSents,
            'correctAnswerSents': correctAnswerSents,
            'NER_TaggedAnswerText': NER_TaggedAnswerText,
            'questions': questions,
            'answers': answers
        }, f, -1)  # -1 specifies highest binary protocol
Example #14
class EventDetector:
    def __init__(self):
        self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        self.ner_tagger = StanfordNERTagger(self.path_to_ner_model,
                                            self.path_to_ner_tagger)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        self.lemmatizer = WordNetLemmatizer()
        self.utilities = Utilities()

    def extract_events_from_stanford_dependencies(self, dependencies,
                                                  ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]

                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}

                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value

                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'you', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    dobj_value = ner_tag[1]

                        raw_events[event_keyword]['dobj'] = dobj_value

                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]

        event = None
        for verb in list(raw_events.keys()):
            event_info = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event_info.keys()) or len(event_info['subj']) < 2 \
                    or 'dobj' not in list(event_info.keys()) or len(event_info['dobj']) < 2:
                continue

            event_info['keyword'] = verb
            event = event_info
            break  # return only one event

        return event

    def extract_soft_events(self, dependency_tree, dependency_relations,
                            ner_tags):

        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        accepted_relation_keys = [
            'nsubj', 'nsubjpass', 'amod', 'dobj', 'advmod', 'nmod', 'xcomp',
            'compound:prt', 'compound', 'neg'
        ]

        keyword = self.lemmatizer.lemmatize(dependency_tree.label(), 'v')

        event = {'keyword': keyword}
        for dependency_relation in dependency_relations:
            if len(dependency_relation) == 3:
                head = dependency_relation[0]
                relation = dependency_relation[1]
                tail = dependency_relation[2]

                if head[0] == keyword and relation in accepted_relation_keys:
                    event[relation] = self.lemmatizer.lemmatize(
                        tail[0].lower())
        # print(event)
        return event

    def extract_event_from_sentence(self, sentence):
        event = None
        sentence_preprocessor = Preprocessor(['remove_non_letters'])

        processed_sentence = sentence_preprocessor.preprocess(sentence)

        sent_dependencies = self.dependency_parser.raw_parse(
            processed_sentence)
        sent_ner_tags = self.ner_tagger.tag_sents([processed_sentence.split()])
        dependencies = [list(parse.triples()) for parse in sent_dependencies]

        if len(dependencies) > 0 and dependencies[0] is not None:
            event = self.extract_events_from_stanford_dependencies(
                dependencies[0], sent_ner_tags)
        else:
            event = {'keyword': sentence}

        return event

    def extract_event_from_sentences(self, sentences):
        events = []
        sentence_preprocessor = Preprocessor(['remove_non_letters'])

        chunks = list(
            self.utilities.chunkify_list(data_list=sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            sentences = []
            for chunk_item in chunk:
                sentences.append(sentence_preprocessor.preprocess(chunk_item))

            chunk_sent_dependencies = self.dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = self.ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, sentence in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, sentences):
                temp_sent_dependencies_1, temp_sent_dependencies_2 = itertools.tee(
                    sent_dependencies, 2)
                dependency_relations = [
                    list(parse.triples()) for parse in temp_sent_dependencies_1
                ]
                dependency_tree = [
                    parse.tree() for parse in temp_sent_dependencies_2
                ][0]

                if len(dependency_relations) > 0 and dependency_relations[
                        0] is not None and len(dependency_relations[0]) > 0:
                    # print(sentence)
                    event = self.extract_soft_events(dependency_tree,
                                                     dependency_relations[0],
                                                     sent_ner_tags)
                else:
                    event = {'keyword': sentence}

                events.append(event)

        return events
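A minimal usage sketch for the EventDetector class in Example #14, assuming the jar/model paths hard-coded in __init__ exist and that Preprocessor and Utilities come from the same project:

detector = EventDetector()
# Illustrative sentence; the result is either an event dict (keyword/subj/dobj/...) or None.
event = detector.extract_event_from_sentence("John bought a new car yesterday")
print(event)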
Example #15
def GetSummary(Filename, SummaryType, Section='All'):
    """
        GetSummary() is used to determine the summary by product, opinions and general discussion points.

        Parameters
        ----------
        Filename : path to the file to assess.
        SummaryType: The summary type aim to identify products(NNP), opinions(JJ) and general discussion points(NN).

        Returns
        -------
        Description.
    """
    # #read File
    file = open(Filename, 'r')
    Audio_Text = file.read()
    file.close()

    # #confirm section
    if (Section == 'Q & A'):
        Audio_Text = Get_QA(Audio_Text)
    elif (Section == 'Exec Speech'):
        Audio_Text = Get_Speech(Audio_Text)

    #***********word tokens***********#
    #word_tokenize based on the text(unit is whole text)
    words = word_tokenize(Audio_Text)
    #function to test whether the POS tag matches the requested summary type
    is_noun = lambda pos: pos[:3] == SummaryType
    #nouns = pd.Series([word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)]).value_counts()
    #nltk.pos_tag(words):part-of-speech tagging , or word classes
    nouns = pd.Series(
        [word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)])
    # #clean
    temp = pd.Series(nouns).str.replace(r'\W', ' ', regex=True)
    temp = temp.replace(' ', np.nan).dropna()
    temp = " ".join(temp)  #connect nouns
    #tokenize nouns
    Nouns_Cleaned = word_tokenize(temp)
    #convert to lower case
    Nouns_Cleaned = [w.lower() for w in Nouns_Cleaned]
    # #get standard stop words. Review final results for other additional stop words.
    stop_words = stopwords.words('english')
    custom_words = ['']
    stop_words.extend(custom_words)
    #drop stopwords
    Nouns_Cleaned = [i for i in Nouns_Cleaned if i not in stop_words]
    #lemmetize words
    nouns_lemmatized = [
        WordNetLemmatizer().lemmatize(w) for w in Nouns_Cleaned
    ]

    #***********find person names***********#
    #word_tokenize based on the sent_tokenize(unit is each sentence in the text)
    tokenized_sents = [
        word_tokenize(sent) for sent in sent_tokenize(Audio_Text)
    ]

    # #1. Stanford Method
    #Stanford Method (Note: this needs to be commented out and converted to an API version, and/or a cloud version investigated)
    #java_path = C:/Program Files/Java/jdk-14.0.1/bin/java.exe
    #java_path = '/Library/Java/JavaVirtualMachines/jdk-11.0.2.jdk/Contents/Home/bin/java'
    java_path = '/usr/lib/jvm/java-11-openjdk-amd64'
    os.environ['JAVAHOME'] = java_path
    #nltk.internals.config_java('C:/Program Files/Java/jdk-14.0.1/bin/java.exe')
    #nltk.internals.config_java("/Library/Java/JavaVirtualMachines/jdk-11.0.2.jdk/Contents/Home/bin/java")
    nltk.internals.config_java('/usr/lib/jvm/java-11-openjdk-amd64/bin/java')
    #st = StanfordNERTagger(stanford-ner/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz,stanford-ner/stanford-ner-2017-06-09//stanford-ner.jar)
    st = StanfordNERTagger(
        '/content/drive/MyDrive/Earnings_call_NLP/stanford-ner/english.all.3class.distsim.crf.ser.gz',
        '/content/drive/MyDrive/Earnings_call_NLP/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar'
    )
    tags = st.tag_sents(
        tokenized_sents)  #('With', 'O'),('that', 'O'),('Mike', 'PERSON'),
    #get person names list
    names_stanford = []
    for tag in tags:
        for content in tag:
            if content[1] == 'PERSON':
                names_stanford.extend(
                    content)  #'Mike','PERSON', 'Spencer', 'PERSON',
    #keep names and remove 'PERSON' tag
    names_stanford = [i for i in names_stanford if i not in ['PERSON']]

    # #2.WordNet Method
    person_list = []
    names_wordnet = person_list

    def get_human_names(text):
        tokens = nltk.tokenize.word_tokenize(text)
        #tag
        pos = nltk.pos_tag(tokens)
        sentt = nltk.ne_chunk(pos, binary=False)
        person = []
        name = ""
        for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
            for leaf in subtree.leaves():
                person.append(leaf[0])
            if len(person) > 1:  #avoid grabbing lone surnames
                for part in person:
                    name += part + ' '
                if name[:-1] not in person_list:
                    person_list.append(name[:-1])
                name = ''
            person = []

    #     print (person_list)
    names = get_human_names(Audio_Text)  # returns None; the call fills person_list (aliased by names_wordnet) in place
    for person in person_list:
        person_split = person.split(" ")
        for name in person_split:
            if wordnet.synsets(name):
                if (name in person):
                    names_wordnet.remove(person)
                    break
    names_wordnet = [word_tokenize(w) for w in names_wordnet]
    names_wordnet = [item for sublist in names_wordnet for item in sublist]

    #***********remove names***********#
    #Remove Duplicates
    names_stanford = list(dict.fromkeys(names_stanford))
    names_wordnet = list(dict.fromkeys(names_wordnet))
    #Convert to Lower
    names_stanford = [w.lower() for w in names_stanford]
    names_wordnet = [w.lower() for w in names_wordnet]
    #same names in 2 methods
    common_names = [i for i in names_stanford if i in names_wordnet]
    #Remove names from tokens
    Nouns_Cleaned = [i for i in Nouns_Cleaned if i not in common_names]
    #Clean to remove any orphaned works split as part of special character removals
    Nouns_Cleaned = [i for i in Nouns_Cleaned if len(i) > 2]
    #count frequency
    Description = pd.Series(Nouns_Cleaned).value_counts()
    return Description
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os

java_path = r"C:\Program Files (x86)\Java\jre1.8.0_241\bin\java.exe"
os.environ['JAVAHOME'] = java_path
english_tagger = StanfordNERTagger(
    'stanford-ner-2018-10-16\\classifiers\\english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16\\stanford-ner.jar')

infile = open("sample.txt", "r")
my_list = [i.split('\t')[0] for i in infile.readlines()]
print(my_list)
list2 = list()
for text in my_list:
    list2.append(word_tokenize(text))
print(list2)
op = english_tagger.tag_sents(list2)
print(op)
Example #17
class StanfordNer():
    def __init__(self) -> None:
        jar = './my_nltk/stanford-ner.jar'
        model = './my_nltk/english.all.3class.distsim.crf.ser.gz'

        # Prepare NER tagger with english model
        self._ner = StanfordNERTagger(model, jar, encoding='utf8')

    def recognize_file(self,
                       input_filename: str,
                       output_filename: str,
                       token_id=0) -> None:
        with open(input_filename, mode="r") as input, open(output_filename,
                                                           mode="w") as output:
            for line in input:
                # Tokenize: Split sentence into words
                words = nltk.word_tokenize(line)

                print(self._ner.tag(words))

    def recognize(self, input: TextIO, writer: csv.DictWriter) -> None:
        lines = ''.join(input.readlines())
        tokens = [
            nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(lines)
        ]

        name_entities = self._ner.tag_sents(tokens)

        text_possition = 0
        for sentence in name_entities:
            current_ne_type = None
            current_len = 0
            for token, ne_type in sentence:
                text_possition = lines.find(token, text_possition)
                if ne_type != 'O':
                    if current_ne_type == ne_type and lines[
                            end:text_possition].isspace():
                        end = text_possition + len(token)
                        current_len += 1
                    else:
                        if current_len > 0:
                            # Write current text
                            writer.writerow({
                                "start": start,
                                "end": end,
                                "text": lines[start:end],
                                "type": current_ne_type,
                                "token_len": current_len
                            })
                        # Update stats
                        current_ne_type = ne_type
                        current_len = 1
                        start = text_possition
                        end = start + len(token)
                else:
                    if current_len > 0:
                        # Write current text
                        writer.writerow({
                            "start": start,
                            "end": end,
                            "text": lines[start:end],
                            "type": current_ne_type,
                            "token_len": current_len
                        })
                    # Update stats
                    current_ne_type = None
                    current_len = 0
                text_possition += len(token)
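A minimal usage sketch for StanfordNer.recognize from Example #17, assuming the jar/model paths in __init__ exist; the CSV field names mirror the keys written by writer.writerow above:

import csv
import sys

ner = StanfordNer()
with open('input.txt') as text_file:  # illustrative input file
    writer = csv.DictWriter(
        sys.stdout, fieldnames=['start', 'end', 'text', 'type', 'token_len'])
    writer.writeheader()
    ner.recognize(text_file, writer)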
Example #18
            # if len(result) > 3:
            #     bestSentenceText = bestSentenceText + " " + article['sentences'][result[3][0]] #######
            # if len(result) > 4:
            #     bestSentenceText = bestSentenceText + " " + article['sentences'][result[4][0]] #######

            allQuestionText.append(
                qa['question'])  #saving questions too for later usage

    #printing out retrieval accuracy; not much used, but it indicates the theoretical accuracy ceiling of the final QA system
    print("The retrieval accuracy on test set is",
          (correctSentence / float(totalQuestions)))

    #Now computing NER and other tags (like POS if needed)
    print("Computing NER start at:", ctime())

    NER_tagged = stanford_NER_tagger.tag_sents(allBestSentences)
    print("NER Time 1:", ctime())
    NER_tagged2 = stanford_NER_tagger.tag_sents(allSecondBestSentences)

    print("NER Time 2:", ctime())
    print("NER Tagging Done, Now doing POS tagging")
    POS_taggedAnswers = []
    # POS_taggedAnswers = stanford_POS_tagger.tag_sents(allBestSentencesText) ####Maybe needed for the 3rd answer ranking rule
    print("POS answer tagging Done")
    print("POS answer Time:", ctime())
    POS_taggedQuestions = []
    # POS_taggedQuestions = stanford_POS_tagger.tag_sents(allQuestionText) ####Maybe needed for the 3rd answer ranking rule
    print("POS question tagging Done")
    print("POS question Time:", ctime())

    #saving the computed NER tags and sentences
class StanfordNERTaggerExtractor(object):
    """Wrapper around NLTK's StanfordNERTagger for tagging text and collecting entities of a given type."""
    def __init__(self):
        self.st = StanfordNERTagger('intent_class_models/stanford-jars/english.all.3class.distsim.crf.ser.gz' ,
            "intent_class_models/stanford-jars/stanford-ner.jar" )
        # self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz' ,
        #     'stanford-ner.jar' )

    def tag_text_single(self,text):
        '''
        :param text:
        :return:
        '''
        # assert type(text) == str
        sents = self.st.tag(nltk.word_tokenize(text))
        return sents

    def identify_NER_tags_single(self,text_tag,tag_to_find):
        '''
        :param text_tag: Tagged text
        :param tag_to_find:
        :return:
        '''
        tag_strs = []
        prev_wrd_tag = False
        for wrd,tag in text_tag:
            if tag == tag_to_find:
                if not prev_wrd_tag:
                    tag_strs.append(wrd)
                else:
                    prev_wrd = tag_strs.pop()
                    new_wrd = prev_wrd+' '+wrd
                    tag_strs.append(new_wrd)
                prev_wrd_tag = True
            else:
                prev_wrd_tag = False
        tags_final = []
        for wrd in tag_strs:
            if wrd not in tags_final:
                tags_final.append(wrd)
        return tags_final

    def tag_text_multi(self,text):
        ''' '''
        tokenized_sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
        return self.st.tag_sents(tokenized_sents)

    def identify_NER_tags_multi(self,text_tag,tag_to_find):
        ''' '''
        tag_strs = []
        for sent_tag in text_tag:
            for wrd in self.identify_NER_tags_single(sent_tag,tag_to_find):
                if wrd not in tag_strs:
                    tag_strs.append(wrd)
        return tag_strs

    def tag_text_multi_from_single(self,ner_tags):
        '''Convert the tags produced for one huge text into per-sentence tags.
        Tagging sentence by sentence is slow, so the entire text is tagged at
        once and the result is split afterwards.'''
        sents = ''
        for wrd,_ in ner_tags:
            sents += wrd+' '
        sent_tags = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(sents)]
        cnt = 0
        final_tags = []
        for sent_ind in range(len(sent_tags)):
            sent_tag_list = []
            for wrd_ind in range(len(sent_tags[sent_ind])):
                try:
                    sent_tag_list.append(ner_tags[cnt])
                    cnt += 1
                except:
                    break
            final_tags.append(sent_tag_list)
        return final_tags
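A minimal usage sketch for StanfordNERTaggerExtractor, assuming the model and jar paths hard-coded in __init__ exist relative to the working directory:

extractor = StanfordNERTaggerExtractor()
tagged = extractor.tag_text_multi("Barack Obama was born in Hawaii. He worked in Washington.")
print(extractor.identify_NER_tags_multi(tagged, 'PERSON'))    # e.g. ['Barack Obama']
print(extractor.identify_NER_tags_multi(tagged, 'LOCATION'))  # e.g. ['Hawaii', 'Washington']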