def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens"""
    tagger = PerceptronTagger()
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        tags.append(tagged + (tokens_span[i], []))
    return tags
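# Usage sketch for tag_text_en, assuming the 'averaged_perceptron_tagger' NLTK data is
# installed and PerceptronTagger is imported in the defining module; the spans below are
# hypothetical character offsets, since the original project's span format is not shown here.
tokens = ["The", "cat", "sat"]
spans = [(0, 3), (4, 7), (8, 11)]
print(tag_text_en(tokens, spans))
# e.g. [('The', 'DT', (0, 3), []), ('cat', 'NN', (4, 7), []), ('sat', 'VBD', (8, 11), [])]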
def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'
    test_sentences = list(gen_corpus(test_path))
    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))
        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)
    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))
    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
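# gen_corpus and convert_sents_to_zipped are project-specific helpers not shown here;
# whatever they produce, PerceptronTagger.train expects a list of sentences where each
# sentence is a list of (word, tag) pairs, e.g.:
training_sentences = [
    [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
    [('Dogs', 'NNS'), ('bark', 'VBP')],
]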
def pos_titles_from(input_path, output_path=None, options=None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    # Keep only verb forms (VB, VBD, VBG, VBN).
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index
        if plot:
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()
    return top_k_by_class
def test_perceptron_tagging(self):
    sentence = "This is a test sentence to test if the testing works."
    tokens = word_tokenize(sentence)
    pt = PerceptronTagger(load=True)
    tag_result1 = [x[1] for x in pt.tag(tokens)]
    pt2 = perctagger()
    pt2.load()
    tag_result2 = pt2.tag(tokens)
    self.assertListEqual(tag_result1, tag_result2)
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    pos_sequence = list(map(lambda i: tags[i][1], keyphrase["tokens-indices"]))
    # Special case when tokenization doesn't match the annotation
    if pos_sequence == []:
        tagger = PerceptronTagger()
        keyphrase_tags = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = list(map(lambda t: t[1], keyphrase_tags))
    return pos_sequence
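# Illustrative call with hypothetical inputs shaped the way pos_sequence_from reads them
# (a pre-tagged token list plus a keyphrase dict with "tokens-indices" and "keyphrase-text"):
tags = [('Deep', 'JJ'), ('learning', 'NN'), ('works', 'VBZ')]
keyphrase = {"tokens-indices": [0, 1], "keyphrase-text": "Deep learning"}
print(pos_sequence_from(keyphrase, tags))  # ['JJ', 'NN']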
class HmmSeqRecognizer(object):
    def __init__(self):
        self.hmm_models = []
        self.n_hmm = 0
        self.hmm2idx = {}
        self.idx2hmm = {}
        self.tagger = PerceptronTagger()
        return

    def batch_test(self, samples, label):
        tp, ns = 0, len(samples)
        for i in xrange(ns):
            idx = self.predict_sample(samples[i])
            if idx == label:
                tp += 1
        return tp, float(tp) / ns

    def predict_sample(self, sample):
        sample = [sample]
        probs = [model.test(sample) for model in self.hmm_models]
        return probs.index(max(probs))

    def predict_sentence(self, sentence):
        sample = [[tag for _, tag in self.tagger.tag(word_tokenize(sentence))]]
        probs = [model.test(sample) for model in self.hmm_models]
        return probs.index(max(probs))

    def add_model(self, name, model):
        self.hmm_models.append(model)
        self.hmm2idx[name] = self.n_hmm
        self.idx2hmm[self.n_hmm] = name
        self.n_hmm += 1

    def new_hmm(self, name, datapath, nhs, ne):
        print '=> adding HMM model \'%s\'...' % name
        hmm_model = HmmModel(nhs)
        hmm_model.train(datapath, ne)
        self.add_model(name, hmm_model)
        print '| done'
        return

    def save_hmm(self, name, hmm_path):
        print '=> saving HMM model \'%s\'...' % name
        f = open(hmm_path, 'wb')
        pickle.dump(self.hmm_models[self.hmm2idx[name]], f)
        f.close()
        print '| done'
        return

    def load_hmm(self, name, hmm_path):
        # print '=> adding HMM model \'%s\'...' % name
        f = open(hmm_path, 'rb')
        hmm_model = pickle.load(f)
        f.close()
        self.add_model(name, hmm_model)
        # print '| done'
        return
class Tagger(AbstractTagger):
    def __init__(self):
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        # Reset tagger.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
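# Usage sketch for the wrapper above, assuming AbstractTagger needs no constructor
# arguments; the training data uses the list-of-tagged-sentences shape that
# PerceptronTagger.train consumes.
train_data = [
    [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')],
    [('a', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
]
t = Tagger()
t.train(train_data)                     # fits a fresh perceptron model
print(t.tag(['the', 'dog', 'sleeps']))  # a list of predicted tags, one per token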
def pos_tokenizer(text):
    word_tokens = tokenize(text)
    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    results = pretrained_tagger.tag(word_tokens)
    # collecting pos from resulting tuples
    pos_tokens = []
    for word_pos in results:
        pos_tokens.append(word_pos[1])
    return pos_tokens
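# Quick check of pos_tokenizer, assuming `tokenize` refers to a word tokenizer such as
# nltk.word_tokenize (the original import is not shown in this snippet):
from nltk.tokenize import word_tokenize as tokenize

print(pos_tokenizer("The quick brown fox jumps."))
# e.g. ['DT', 'JJ', 'JJ', 'NN', 'VBZ', '.']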
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        # pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token) for token in tagger.tag(tokens)])
            print " ".join([token[1] for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
class NLTK(BaseTagger):
    def __init__(self):
        import nltk
        nltk.download('averaged_perceptron_tagger')
        from nltk.tag.perceptron import PerceptronTagger
        self.inst = PerceptronTagger()

    def __call__(self, *args, **kwargs):
        return self.inst.tag(args[0])
def ie_preprocess(document):
    tagger = PerceptronTagger()
    tagged = []
    sentences = document.split("\n")
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    for sent in sentences:
        tagged_tokens = tagger.tag(sent)
        tagged.append(tagged_tokens)
    return tagged
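# Minimal usage sketch for ie_preprocess: it treats each newline-separated line as one
# sentence and returns a list of (token, tag) lists (needs the 'punkt' and perceptron
# tagger NLTK data).
doc = "NLTK makes tagging easy.\nEach line is treated as one sentence."
for tagged_sent in ie_preprocess(doc):
    print(tagged_sent)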
class NltkTagger(Tagger):
    '''Require maxtree'''
    #import nltk

    def __init__(self, *args, **kwargs):
        self.tagr = PerceptronTagger()
        super(self.__class__, self).__init__(*args, **kwargs)

    #~ def __start__(self):
    #~     self.tagr = PerceptronTagger()

    def tag_tokens(self, tokens, single=True):
        return self.tagr.tag(tokens)
def _perform_analysis(self, tokenized_sents):
    res = []
    if len(self.precalced_data):
        return self.precalced_data
    else:
        for tokens in tokenized_sents:
            tagger = PerceptronTagger()
            tags = tagger.tag(tokens)
            res += tags
        self.precalced_data = res
    return res
class NERTagger:
    def __init__(self):
        self.pos_tagger = PerceptronTagger()

    def tag(self, tokens):
        tree = nltk.ne_chunk(self.pos_tagger.tag(tokens))
        tagged_tokens = []
        for t in tree:
            if type(t) == nltk.tree.Tree:
                label = t.label()
                for token in t:
                    tagged_tokens.append((token[0], label))
            else:
                tagged_tokens.append(t)
        return tagged_tokens
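# Usage sketch for NERTagger; nltk.ne_chunk additionally needs the 'maxent_ne_chunker'
# and 'words' NLTK data packages. The expected output is illustrative.
from nltk.tokenize import word_tokenize

ner = NERTagger()
print(ner.tag(word_tokenize("Barack Obama visited Paris.")))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'VBD'), ('Paris', 'GPE'), ('.', '.')]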
def test_lesk():
    words = get_sample_words()
    print("Words =%s" % words)
    tagger = PerceptronTagger()
    tags = tagger.tag(words)
    print('Tags=%s' % tags[:5])
    lemmatizer = WordNetLemmatizer()
    for word, tag in tags:
        pos = get_wordnet_pos(tag)
        if pos is None:
            continue
        # print("word=%s,tag=%s,pos=%s" % (word, tag, pos))
        lemma = lemmatizer.lemmatize(word, pos)
        # print("Lemma=%s" % lemma)
        synset = lesk(words, lemma, pos)
        if synset is None:
            print('No synsetid for word=%s' % word)
        else:
            print('word=%s, synsetname=%s, synsetid=%d' %
                  (word, synset.name(), synset.offset()))
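# The snippet above relies on a get_wordnet_pos helper that is not shown; a plausible
# implementation, matching the Treebank-to-WordNet mapping used by the NLP class later in
# this collection, might look like this (the original may differ in detail):
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None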
def pos_title_abstract(resource):
    xml_file = resource[0]
    pos_data = []
    try:
        xmldoc = minidom.parse(xml_file)
        elements_title = xmldoc.getElementsByTagName("Title")
        title = elements_title.item(0).childNodes[0].nodeValue
        elements_list = xmldoc.getElementsByTagName("S")
        sentences = [e[0].nodeValue
                     for e in [element.childNodes for element in elements_list]
                     if e[0].nodeType == e[0].TEXT_NODE]
        sentences.insert(0, title)
        # raw text
        txt_file = change_file_extention(resource[1], "xml", "txt")
        save_to_file(txt_file, sentences)
        # pos
        tagger = PerceptronTagger()
        for s in sentences:
            tokens = word_tokenize(s)
            pos_data.append(tagger.tag(tokens))
        pos_file = change_file_extention(resource[1], "xml", "pos")
        save_to_file(pos_file, pos_data)
    except:
        print >> sys.stderr, "Error pos_title_abstract:", resource, sys.exc_info()
    return pos_data
def get_instances(self, folder):
    instances = []
    labels = set()
    tagger = PerceptronTagger()  # load nltk perceptron just once to speed up tagging
    with io.open(folder, encoding="utf-8") as f:
        # with open(folder) as f:
        for line in f:
            # line = unicode(line).encode("utf-8")
            line_split = line.rstrip().split("\t")
            if len(line_split) != 3:
                continue
            id, text, label = line_split
            id = id.rstrip(":")
            text = re.sub('[#]', '', text.rstrip())
            label = re.sub('[^a-z]', '', label)
            inst = Instance(text, label)
            inst_tokenized = word_tokenize(text)
            inst_tagged = tagger.tag(inst_tokenized)
            for tokentag in inst_tagged:
                token = Token(tokentag[0], tokentag[1])
                inst.add_token(token)
            instances.append(inst)
            labels.add(label)
    return instances, labels
class TextAnalyser(object): """Text analyser""" MULTICLASS_NE_CHUNKER = \ "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle" def __init__(self, keyword_stop_list=None): self.__keyword_stop_list = keyword_stop_list self.__pos_tagger = PerceptronTagger() self.__ne_chunker = nltk_data_load(self.MULTICLASS_NE_CHUNKER) def analyse_file(self, filename): """Analyse the contents of a file :param str filename: the path to a file :rtype: TextAnalysisResult :return: the analysis result """ with open(filename, "r") as f: return self.analyse(f.read()) def _calculate_text_statistics(self, sentences, sentence_words): """Calculate the text statistics :param list[str] sentences: a list with the text sentences :param list[list[str]] sentence_words: a list with the sentences that have been tokenized into separate words :rtype: TextStatistics :return: the calculated text statistics """ words = list(itertools.chain(*sentence_words)) sentence_word_counts = np.array( [len(sentence) for sentence in sentence_words]) return TextStatistics( sentence_count=len(sentences), word_count=len(words), mean_sentence_word_count=float(sentence_word_counts.mean()), median_sentence_word_count=float(np.median(sentence_word_counts)), min_sentence_word_count=int(sentence_word_counts.min()), max_sentence_word_count=int(sentence_word_counts.max()), average_sentence_word_count=float( np.average(sentence_word_counts)), sentence_word_count_std=float(sentence_word_counts.std()), sentence_word_count_variance=float(sentence_word_counts.var())) def _extract_named_entities(self, sentence_words): """Extract the named entities from the sentences The result from this method is a dictionary with the named entity types as keys and a set of the names entities as values :param list[list[str]] sentence_words: a list with the sentences that have been tokenized into separate words :rtype: dict[str: set] :return: the extracted dictionary words """ tagged_sentences = [ self.__pos_tagger.tag(sentence) for sentence in sentence_words ] chunked_sentences = [ self.__ne_chunker.parse(sentence) for sentence in tagged_sentences ] named_entities = defaultdict(set) for sentence in chunked_sentences: for item in sentence: if isinstance(item, Tree): ne_type = item.label() entity = " ".join([ entity_component[0] for entity_component in item.leaves() ]) named_entities[ne_type].add(entity) return dict(named_entities) def analyse(self, text): """Analyse the given text :param str text: the text to analyse :rtype: TextAnalysisResult :return: the analysis result """ readability_scores = calculate_readability_scores(text) keywords = None if isinstance(text, str) and len(text) != 0: keywords = extract_keywords( text=text, keyword_stop_list=self.__keyword_stop_list) sentences = sent_tokenize(text) sentence_words = [word_tokenize(sentence) for sentence in sentences] statistics = self._calculate_text_statistics(sentences, sentence_words) summary = create_summary(text) named_entities = self._extract_named_entities(sentence_words) return TextAnalysisResult(text=text, keywords=keywords, readability_scores=readability_scores, statistics=statistics, summary=summary, named_entities=named_entities)
https://github.com/evanmiltenburg/Dutch-tagger
We got a POS tagger and a CHUNK tagger. In combination they can be used to apply NER...
"""
import os
import nltk
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp

# Trained on ALP data.
tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')
tagger.load('model.perc.dutch_tagger_small.pickle')  # I don't know the source of training data

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)
perctagger = PerceptronTagger(load=True)  # load=True loads the pretrained English model first, so train() continues from those weights
perctagger.train(training_corpus)

sent = 'NLTK is een goeda taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
('clf', OneVsRestClassifier(LinearSVC()))]) print "Classifier pipeline initialized." labels = [] data = [] print "Processing the training file." with open (file_path, 'r') as readFile : for row in readFile : arr = [] sentences = [] arr = row.split(",,,") arr[1] = arr[1].strip(' ') arr[1] = arr[1].strip('\n') labels.append(arr[1]) sentences = nltk.sent_tokenize(arr[0]) tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] tagged_sentences = [tagger.tag(sentence) for sentence in tokenized_sentences] print tagged_sentences ner_sentence= [nltk.chunk.ne_chunk(pts) for pts in tagged_sentences] sent = '' for tokens in ner_sentence : word = "" for t in tokens : m = re.search('\(\'(.+?)\', \'(.+?)\'\)', str(t)) e = re.search('\(([A-Z]+?) (.+?)\)', str(t)) if m: word = m.group(1) pos = m.group(2) if pos == "JJ" : word = "JJ" elif pos == "CC" : word = "CC"
titles+=[title] bodies+=[standard_body(body)] else: titles+=['Unknown'] bodies+=[standard_body(story)] ### -------------------------- ### # Tokenize and tag text # ### -------------------------- ### stories = pd.DataFrame({'title':titles,'body':bodies}) stories['sents'] = stories['body'].map(lambda x: nltk.sent_tokenize(x)) stories['words'] = stories['sents'].map(lambda x: [[w.lower() for w in nltk.word_tokenize(s)] for s in x]) from nltk.tag.perceptron import PerceptronTagger tagger = PerceptronTagger() stories['tags'] = stories['words'].map(lambda x: [tagger.tag(s) for s in x]) ### -------------------------- ### # Categorize tags into genres # ### -------------------------- ### # Note: Standards replace 'TG' with '{CLASS}' i.e. 'NN' -> '{ANML}' """ Genres: - occupation - animals: is_animal - body parts: - exclamation - food - location """ is_animal = lambda x: is_hyper_of(x,'animal') is_bodypart = lambda x: is_hyper_of(x,'body_part')
type_indexes_tmp = ann_items[1].split(" ") type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:] else: type_indexes = ann_items[1].split(" ") type_indexes[1] = int(type_indexes[1]) type_indexes[2] = int(type_indexes[2]) indexes_kp_tmp.setdefault(type_indexes[1], -1) if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]: indexes_kp_tmp[type_indexes[1]] = type_indexes[2] ann_text = ann_items[2] tokens = tokenizer.tokenize(ann_text) if without_types: annotation_type = 'KeyPhrase' else: annotation_type = type_indexes[0] pos_tags = [t + (annotation_type,) for t in tagger.tag(tokens)] if pos_tags: pos_tags[0] = pos_tags[0][0:2] + ("B-" + pos_tags[0][2],) if debug: print >> sys.stderr, pos_tags annotations[" ".join([str(ti) for ti in type_indexes[1:]])] = pos_tags #print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags ann_file.close() #ann_ext_file.close() if debug: pass #print indexes_kp_tmp #print annotations file_text = os.path.join(dirname, f[:-4] + ".txt")
# for (index,line) in enumerate(data_file): # line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding! # sents = nltk.sent_tokenize(line) # nltk.pos_tag_sents(sents) # WRONG!!!! # print index # WAY-2: Faster 20s/1000 tagger = PerceptronTagger() with open(input_filename) as data_file: for (index,line) in enumerate(data_file): line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding! # sents = nltk.sent_tokenize(line) # print sents # sentences_pos=tagger.tag_sents(sents) word_list=nltk.word_tokenize(line) line_tagged=tagger.tag(word_list) if index in range(5000,60001,5000): print index # print line_tagged for t in line_tagged: output.write('_'.join(t)+' ') output.write('\n') # # WAY-3: More precise but slower 21s/1000 # tagger = PerceptronTagger() # with open(input_filename) as data_file: # for (index,line) in enumerate(data_file): # line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding! # sents = nltk.sent_tokenize(line) # tokenized_sents = [nltk.word_tokenize(i) for i in sents]
test_tokens = [] test_labels = [] dummy_label = "Dummy" last_label = "" if prev_token: test_tokens.append(prev_token) last_label = dummy_label test_labels.append(last_label) for pjtk in projection_tokens: test_tokens.append(pjtk) test_labels.append("None") if next_token: test_tokens.append(next_token) test_labels.append(dummy_label) test_pos_tags = tagger.tag(test_tokens) tagged_text = [tpt + (test_labels[i],) for i, tpt in enumerate(test_pos_tags)] if debug and False: print >> std.stderr, "Tagged projection", tagged_text if extra_features: X_test = kpc.sent2features_extra(tagged_text, qr) else: X_test = kpc.sent2features(tagged_text) is_not_kp = "None" tmp_label = is_not_kp new_kp = [] new_list_labels = [] X_labeled = crftagger.tag(X_test)
class NLP: def __init__(self, nodes): self.mwe_tokenizer = MWETokenizer(self._build_mwe(nodes.nodelist)) self.lemmatizer = WordNetLemmatizer() self.tagger = PerceptronTagger() self.stop_words = set(stopwords.words("english")) @staticmethod def _build_mwe(nodelist): """Builds multi word expressions based on synonyms of nodes in nodelist. Parameters ---------- nodelist: list(Node) Returns ------- multi_word_expressions : list(str) """ multi_word_expressions = [] # iterate through synonyms in nodelist and append joined n-grams # (separator = "_") to above list. for node in nodelist: for idx in range(len(node.synonyms)): mwe = [word_tokenize(node.synonyms[idx].lower())] multi_word_expressions.append(tuple(mwe[0])) return multi_word_expressions def tokenize_into_words(self, text_input): """Splits strings into list of words with regard to multi word expressions generated from nodelist. Parameters ---------- text_input : str Returns ------- tokenized_string : list(str) """ tokenized_string = self.mwe_tokenizer.tokenize( word_tokenize(text_input.lower())) return tokenized_string @staticmethod def tokenize_into_sentences(text_input): """Splits strings into list of sentences. Parameters ---------- text_input: str Returns ------- tokenized_string : list(str) """ tokenized_string = sent_tokenize(text_input) return tokenized_string @staticmethod def get_wordnet_pos(treebank_tag): """Used to translate part of speech tags (wordnet vs. nltk). Parameters ---------- treebank_tag : str Returns ------- wordnet.* : str References ------- https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python """ if treebank_tag.startswith('J'): return wordnet.ADJ elif treebank_tag.startswith('V'): return wordnet.VERB elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return wordnet.NOUN # Alternatively: "" or None. def pos_tag_words(self, text_input): """Assigns part of speech tags to tokenized words. Parameters ---------- text_input : str Returns ------- pos_tagged_words : list(str) """ sentences = sent_tokenize(text_input) pos_tagged_words = [] for sentence in sentences: sentence_pos = self.tagger.tag( self.mwe_tokenizer.tokenize(word_tokenize(sentence))) for t in sentence_pos: pos_tagged_words.append(t) return pos_tagged_words def lemmatize_words(self, text_input): """Identifies the lemma of each word based on specified lemmatizer and part of speech tagger. Parameters ---------- text_input : str Returns ------- lemmatized_words : list(str) """ lemmatized_words = [] for w in self.pos_tag_words(text_input): if w[0] not in string.punctuation: lemmatized_word = self.lemmatizer.lemmatize( word=w[0], pos=self.get_wordnet_pos(w[1])) lemmatized_words.append(lemmatized_word.lower()) return lemmatized_words def list_based_stopword_removal(self, text_input): """Removes stopwords based on specified stopword list. Parameters ---------- text_input : list(str) Returns ------- filtered_words . list(str) """ filtered_words = [] for word in text_input: if word not in self.stop_words: filtered_words.append(word) return filtered_words def list_based_stopword_removal_for_pos_tagged_words(self, text_input): """Removes stopwords based on specified stopword list (specified for pos_tagged_words) Parameters ---------- text_input : str Returns ------- filtered_words : list(str) """ filtered_words = [] for word in text_input: if word[0] not in self.stop_words: filtered_words.append(word) return filtered_words
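# A small, self-contained sketch of the tokenize-then-tag path used by NLP.pos_tag_words,
# without the Node objects its constructor expects; the multi-word expression below is
# illustrative.
from nltk.tokenize import MWETokenizer, word_tokenize
from nltk.tag.perceptron import PerceptronTagger

mwe_tokenizer = MWETokenizer([('machine', 'learning')])  # joins the pair with '_' by default
tagger = PerceptronTagger()
tokens = mwe_tokenizer.tokenize(word_tokenize("machine learning is fun".lower()))
print(tagger.tag(tokens))  # e.g. [('machine_learning', ...), ('is', 'VBZ'), ('fun', 'NN')]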
class HearstPatterns(object): def __init__(self): self.__chunk_patterns = r""" # helps us find noun phrase chunks NP: {<DT|PP\$>?<JJ>*<NN>+} {<NNP>+} {<NNS>+} """ self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns) # create a chunk parser # now define the Hearst patterns # format is <hearst-pattern>, <general-term> # so, what this means is that if you apply the first pattern, the firsr Noun Phrase (NP) # is the general one, and the rest are specific NPs self.__hearst_patterns = [ ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"), ("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"), ] self.__pos_tagger = PerceptronTagger() def prepare(self, rawtext): sentences = nltk.sent_tokenize(rawtext.strip()) # NLTK default sentence segmenter sentences = [nltk.word_tokenize(sent) for sent in sentences] # NLTK word tokenizer sentences = [self.__pos_tagger.tag(sent) for sent in sentences] # NLTK POS tagger return sentences def chunk(self, rawtext): sentences = self.prepare(rawtext.strip()) all_chunks = [] for sentence in sentences: chunks = self.__np_chunker.parse(sentence) # parse the example sentence #for chunk in chunks: # print str(chunk) all_chunks.append(self.prepare_chunks(chunks)) return all_chunks def prepare_chunks(self, chunks): # basically, if the chunk is NP, keep it as a string taht starts w/ NP and replace " " with _ # otherwise, keep the word. # remove punct # this is all done to make it super easy to apply the Hearst patterns... terms = [] for chunk in chunks: label = None try: # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call label = chunk.label() except: pass if label is None: #means one word... token = chunk[0] pos = chunk[1] if pos in ['.', ':', '-', '_']: continue terms.append(token) else: np = "NP_"+"_".join([a[0] for a in chunk]) #This makes it easy to apply the Hearst patterns later terms.append(np) return ' '.join(terms) """ This is the main entry point for this code. It takes as input the rawtext to process and returns a list of tuples (specific-term, general-term) where each tuple represents a hypernym pair. """ def find_hyponyms(self, rawtext): hyponyms = [] np_tagged_sentences = self.chunk(rawtext) for raw_sentence in np_tagged_sentences: # two or more NPs next to each other should be merged into a single NP, it's a chunk error # find any N consecutive NP_ and merge them into one... # So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah" sentence = re.sub(r"(NP_\w+ NP_\w+)+", lambda m: m.expand(r'\1').replace(" NP_", "_"), raw_sentence) for (hearst_pattern, parser) in self.__hearst_patterns: matches = re.search(hearst_pattern, sentence) if matches: match_str = matches.group(0) nps = [a for a in match_str.split() if a.startswith("NP_")] if parser == "first": general = nps[0] specifics = nps[1:] else: general = nps[-1] specifics = nps[:-1] for i in range(len(specifics)): hyponyms.append((self.clean_hyponym_term(specifics[i]), self.clean_hyponym_term(general))) return hyponyms def clean_hyponym_term(self, term): # good point to do the stemming or lemmatization return term.replace("NP_","").replace("_", " ")
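# Usage sketch for the basic HearstPatterns extractor above; the sentence is an
# illustrative input, not taken from the original project, and the returned pairs are
# (specific, general) tuples.
hp = HearstPatterns()
text = "He studies algorithms such as quicksort and mergesort."
print(hp.find_hyponyms(text))  # e.g. includes ('quicksort', 'algorithms')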
class HearstPatterns(object): def __init__(self, extended=False): self.__chunk_patterns = r""" # helps us find noun phrase chunks NP: {<DT|PP\$>?<JJ>*<NN>+} {<NNP>+} {<NNS>+} """ self.__np_chunker = nltk.RegexpParser( self.__chunk_patterns) # create a chunk parser # now define the Hearst patterns # format is <hearst-pattern>, <general-term> # so, what this means is that if you apply the first pattern, the firsr Noun Phrase (NP) # is the general one, and the rest are specific NPs self.__hearst_patterns = [ ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"), ("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"), ] if extended: self.__hearst_patterns.extend([ ("((NP_\w+ ?(, )?)+(and |or )?any other NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?some other NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?is a NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?was a NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?were a NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?are a NP_\w+)", "last"), ("(NP_\w+ (, )?like (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("such (NP_\w+ (, )?as (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?like other NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?one of the NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?one of these NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?one of those NP_\w+)", "last"), ("examples of (NP_\w+ (, )?is (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("examples of (NP_\w+ (, )?are (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?are examples of NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?is example of NP_\w+)", "last"), ("(NP_\w+ (, )?for example (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?wich is called NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?which is named NP_\w+)", "last"), ("(NP_\w+ (, )?mainly (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?mostly (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?notably (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?particularly (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?principally (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?in particular (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?except (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?other than (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?e.g. (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?i.e. (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?a kind of NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?kinds of NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?form of NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?forms of NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?which looks like NP_\w+)", "last"), ("((NP_\w+ ?(, )?)+(and |or )?which sounds like NP_\w+)", "last"), ("(NP_\w+ (, )?which are similar to (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?which is similar to (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?examples of this is (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?examples of this are (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?types (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )? NP_\w+ types)", "last"), ("(NP_\w+ (, )?whether (NP_\w+ ? 
(, )?(and |or )?)+)", "first"), ("(compare (NP_\w+ ?(, )?)+(and |or )?with NP_\w+)", "last"), ("(NP_\w+ (, )?compared to (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?among them (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?as NP_\w+)", "last"), ("(NP_\w+ (, )? (NP_\w+ ? (, )?(and |or )?)+ for instance)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?sort of NP_\w+)", "last"), ]) self.__pos_tagger = PerceptronTagger() def prepare(self, rawtext): sentences = nltk.sent_tokenize( rawtext.strip()) # NLTK default sentence segmenter sentences = [nltk.word_tokenize(sent) for sent in sentences] # NLTK word tokenizer sentences = [self.__pos_tagger.tag(sent) for sent in sentences] # NLTK POS tagger return sentences def chunk(self, rawtext): sentences = self.prepare(rawtext.strip()) all_chunks = [] for sentence in sentences: chunks = self.__np_chunker.parse( sentence) # parse the example sentence #for chunk in chunks: # print(str(chunk)) all_chunks.append(self.prepare_chunks(chunks)) return all_chunks def prepare_chunks(self, chunks): # basically, if the chunk is NP, keep it as a string taht starts w/ NP and replace " " with _ # otherwise, keep the word. # remove punct # this is all done to make it super easy to apply the Hearst patterns... terms = [] for chunk in chunks: label = None try: # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call label = chunk.label() except: pass if label is None: #means one word... token = chunk[0] pos = chunk[1] if pos in ['.', ':', '-', '_']: continue terms.append(token) else: np = "NP_" + "_".join([ a[0] for a in chunk ]) #This makes it easy to apply the Hearst patterns later terms.append(np) return ' '.join(terms) def replace_np_sequences(self, sentence): words = "" first_word_in_sequence = False for word in nltk.word_tokenize(sentence.replace("NP_", "_")): if word[0] == "_": if not first_word_in_sequence: word = "NP" + word first_word_in_sequence = True words = words + " " + word else: words += word else: words = words + " " + word first_word_in_sequence = False return words.strip() """ This is the main entry point for this code. It takes as input the rawtext to process and returns a list of tuples (specific-term, general-term) where each tuple represents a hypernym pair. """ def find_hyponyms(self, rawtext): hyponyms = [] np_tagged_sentences = self.chunk(rawtext) #print "NP tagged-->",np_tagged_sentences for raw_sentence in np_tagged_sentences: # two or more NPs next to each other should be merged into a single NP, it's a chunk error # find any N consecutive NP_ and merge them into one... 
# So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah" sentence = self.replace_np_sequences(raw_sentence) for (hearst_pattern, parser) in self.__hearst_patterns: matches = re.search(hearst_pattern, sentence) if matches: match_str = matches.group(0) nps = [a for a in match_str.split() if a.startswith("NP_")] if parser == "first": general = nps[0] specifics = nps[1:] else: general = nps[-1] specifics = nps[:-1] #print(str(general)) #print(str(nps)) for i in range(len(specifics)): #print("%s, %s" % (specifics[i], general)) hs = self.clean_hyponym_term( specifics[i]) + "-" + self.clean_hyponym_term( general) hyponyms.append(hs) # hyponyms.append((self.clean_hyponym_term(general),self.clean_hyponym_term(general))) # return hyponyms return hyponyms # from pyspark.sql.functions import udf # from pyspark.sql.types import ArrayType, StringType # find_hyponyms_udf = udf(lambda x: find_hyponyms(x), ArrayType(StringType())) def clean_hyponym_term(self, term): # good point to do the stemming or lemmatization return term.replace("NP_", "").replace("_", " ")
class ActionDetection(object): def __init__(self, train=False): self.tagger = PerceptronTagger() self.model = None # BOW: triangle, rectangle, circle, hand # verbs: draw, wave, rotate self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand'] self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')] self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS) if train: self.train_svm() else: self.load_model() return def save_model(self): f = open(MODEL_PATH + 'action_detection.model', 'wb') pickle.dump(self.model, f) f.close() return def train_svm(self): with open(DATA_PATH+'action_detection_training_set.txt') as f: data = f.readlines() X, y = [],[] for line in data: line = line.strip() if not line: continue line = line.split(' ',1) X.append(self.extract_feature(line[1])) y.append(int(line[0])) lin_clf = svm.LinearSVC() lin_clf.fit(X, y) self.model = lin_clf self.save_model() return def load_model(self): f = open(MODEL_PATH + 'action_detection.model', 'rb') self.model = pickle.load(f) f.close() return def extract_feature(self, sent): feature = [0] * (self.n_bow+self.n_verbs) verbs = [ w for w,pos in self.tagger.tag(word_tokenize(sent)) if pos=='VB' ] words = set(sent.split()) for i in xrange(self.n_bow): feature[i] = 1 if self.BOW[i] in words else 0 for i in xrange(self.n_verbs): if not verbs: feature[self.n_bow+i] = 0 else: similarities = [ wn.path_similarity(self.VERBS[i],wn.synset(v+'.v.01')) for v in verbs ] feature[self.n_bow+i] = max(similarities) return feature def predict(self, sent): # classes: 0(rectangle), 1(circle), 2(triangle), 3(wave), 4(rotate) feature = self.extract_feature(sent) idx = self.model.predict([feature])[0] probs = self.model._predict_proba_lr([feature])[0] # return value: 0(none), 1-5(classes+1) if probs[idx]>CONFIDENCE_THRESHOLD: return idx+1 else: return 0
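# ActionDetection.extract_feature scores sentence verbs against reference verb synsets with
# WordNet path similarity; a standalone sketch of just that scoring step (no SVM or model
# files needed), assuming every tagged verb has a '<verb>.v.01' entry in WordNet:
from nltk.corpus import wordnet as wn
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import word_tokenize

tagger = PerceptronTagger()
reference = wn.synset('draw.v.01')
sentence = "You must draw a circle"
verbs = [w for w, pos in tagger.tag(word_tokenize(sentence)) if pos == 'VB']
print([(v, wn.path_similarity(reference, wn.synset(v + '.v.01'))) for v in verbs])
# e.g. [('draw', 1.0)]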
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
tagger = PerceptronTagger(False)
tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
tagged = tagger.tag(tokens)
print tagged
class TermExtractor: def __init__(self, stopwords=[], term_patterns=[], min_term_length=3, min_term_words=2): #StopWordsDetector self.stopwords = set(stopwords) self.min_term_length = min_term_length self.term_patterns = term_patterns self.min_term_words = min_term_words self.detectors = [] self.pos_tagger=PerceptronTagger() for tp in term_patterns: self.detectors.append(POSSequenceDetector(tp)) self.swd = StopWordsDetector(self.stopwords) def extract_terms(self, doc_txt, trace=False): sent_tokenize_list = filter(lambda x: len(x) > 0, map(lambda s: nltk.tokenize.sent_tokenize(s), doc_txt)) sentences = [] _ = [sentences.extend(lst) for lst in sent_tokenize_list] if trace: print('len(sentences)=' + str(len(sentences))) terms = [] #pd.DataFrame(columns=['term']) #sentences = sentences[:30] i = 1 filter_fn = lambda x: len(x) >= self.min_term_length max_i = len(sentences) for s in sentences: text = nltk.word_tokenize(s) #sent_pos_tags=nltk.pos_tag(text, tagset='universal') sent_pos_tags = self.pos_tagger.tag(text) sentence_terms = set() for fsa1 in self.detectors: stn = filter(filter_fn, [' '.join(t) for t in fsa1.detect(sent_pos_tags) if len(t) >= self.min_term_words and len(self.swd.detect(t)) == 0]) sentence_terms.update(stn) terms.extend([str(trm).strip() for trm in sentence_terms]) if trace: print(i, '/', max_i, s) i = i + 1 return terms ''' ''' def c_values(self, terms, trace=False): terms_df = pd.DataFrame(terms, columns=['term']) terms_df['w'] = 1 terms_df['len'] = len(terms_df['term']) term_stats = terms_df.groupby(['term'])['w'].agg([np.sum]) term_stats['len'] = list(pd.Series(term_stats.index).apply(lambda x:len(x))) term_series = list(term_stats.index) n_terms = len(term_series) for i in range(0, n_terms): term_series[i]=' '+str(term_series[i])+' ' term_stats['trm']=term_series term_stats.set_index('trm', inplace=True) A = ahocorasick.Automaton() for i in range(0, n_terms): A.add_word(term_series[i], (i, term_series[i])) A.make_automaton() is_part_of = [] for i in range(0, n_terms): haystack=term_series[i] for end_index, (insert_order, original_value) in A.iter(haystack): if original_value!=haystack: #print original_value, "insideof ", haystack is_part_of.append((original_value, haystack, 1)) subterms = pd.DataFrame(is_part_of, columns=['term', 'part_of', 'w']).set_index(['term', 'part_of']) if trace: print("terms/subterms relations discovered ...") c_values = [] # term_series=['time'] for t in term_series: if t in term_stats.index: current_term = term_stats.loc[t] # average frequency of the superterms c_value = 0 if t in subterms.index: subterm_of = list(subterms.loc[t].index) for st in subterm_of: c_value -= term_stats.loc[st]['sum'] c_value /= float(len(subterm_of)) # add current term frequency c_value += current_term['sum'] # multiply to log(term length) c_value = c_value * np.log(current_term['len']) if current_term['len']>0 else 0 if trace: print(t, 'freq=', current_term['sum'], ' cvalue=', c_value) c_values.append(c_value) # break return sorted(zip( [x.strip() for x in term_series], c_values), key=lambda x: x[1], reverse=True)
#
# read/analyse SENTENCES
sents = []
for textli in textlines:
    sents += sent_tokenize(textli)
del textlines
t2 = datetime.now()

#
# read/analyse WORDS + POS CATEGORIES
pos_sents = []
for s in sents:
    tok_sent = []
    wtoks = word_tokenize(s)
    # A
    pos_sents.append(tagr.tag(wtoks))
    # B
    # for w in wtoks:
    #     tok_sent += pos_tag(w)
    # pos_sents.append(tok_sent)
del sents
t3 = datetime.now()

# pos_sents =
# [[('In', 'IN'),
#   ('the', 'DT'),
#   ('land', 'NN'),
#   ('of', 'IN'),
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tag.perceptron import PerceptronTagger
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from config import dataset_path

lemmatizer = WordNetLemmatizer()
tagger = PerceptronTagger()
pos_tag = lambda x: tagger.tag([x])

preprocessing_steps = ["remove_punctuation", "lowercase", "remove_numbers",
                       "remove_stop_words", "keep_only_nouns", "lemmatization",
                       "remove_infrequent_words", "keep_most_frequent_words",
                       "make_bigrams"]
tag_to_keep = ['FW', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS']
#tag_to_keep = ['NN', 'NNS', 'NNP', 'NNPS']

preprocessed_dict = {}  # save the preprocessing of each word to avoid re-computing it

def countWordsOnTexts(df):
    wordCounter = Counter()
    for i, row in df.iterrows():
from nltk.tag.perceptron import PerceptronTagger  # importing a tagger
from nltk.tokenize import word_tokenize  # tokenizing the input

tagger = PerceptronTagger()  # init the tagger in default mode

for i in join:
    text_orig = i[1]
    year = i[0]
    text = text_orig
    text = nltk.word_tokenize(text)
    if keyword in text:
        ind = text.index(keyword)
        w1 = ind - 1
        w2 = ind + 1
        text = " ".join(text[w1:w2])
        for word, tag in tagger.tag(word_tokenize(text)):
            if tag == "JJ":  # check if the word is an adjective
                # SentiWordNet
                if len(list(swn.senti_synsets(word, "a"))) > 0:
                    synset = list(swn.senti_synsets(word, "a"))[0]  # get the most likely synset
                    if synset.pos_score() > synset.neg_score():
                        positive.append([year, word, keyword, text_orig])

# counting positive by decade
decade_counts = [0] * 16
decade_words = [[]] * 16
for i in positive:
class LemmaWordpieceTokenizer(BertTokenizer): def __init__(self, vocab_file, do_lower_case=True, never_split=None, additional_special_tokens=[ "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]", "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]" ], **kwargs): self.inflection_tokens = additional_special_tokens self.tagger = PerceptronTagger() super().__init__(vocab_file, do_lower_case=do_lower_case, never_split=never_split, additional_special_tokens=additional_special_tokens, **kwargs) self.have_inflections = {'NOUN', 'ADJ', 'VERB'} self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"} self.do_lower_case = do_lower_case if do_lower_case: self.cased_tokenizer = BasicTokenizer(do_lower_case=False, never_split=never_split) else: self.cased_tokenizer = self.basic_tokenizer def _tokenize(self, text): tokenized = self.cased_tokenizer.tokenize( text, never_split=self.all_special_tokens) #print(tokenized) ptb_pos_tagged = self.tagger.tag(tokenized) #print(pos_tagged) #print(pos_tagged) universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag)) for (token, tag) in ptb_pos_tagged] #print(universal_pos_tagged) split_tokens = [] for i, (word, pos) in enumerate(ptb_pos_tagged): if self.do_lower_case: word = word.lower() if universal_pos_tagged[i][ 1] in self.have_inflections and word not in ( string.punctuation + '—') and pos not in self.lemma_tags: # (universal_)pos_tagged in the form of [(word, pos),(word, pos),...] # getLemma returns a tuple (lemma,) lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0] if not lemma: lemma = word wordpieced = self.wordpiece_tokenizer.tokenize(lemma) #print(wordpieced) split_tokens.extend(wordpieced) else: wordpieced = self.wordpiece_tokenizer.tokenize(word) split_tokens.extend(wordpieced) return split_tokens def convert_tokens_to_string(self, tokens): result = [] for i, token in enumerate(tokens): # combine wordpiece tokens if len(token) > 2 and token[:2] == '##': if result: result[-1] += token[2:] else: result.append(token[2:]) continue if token in self.inflection_tokens: if i != 0: inflected = getInflection(result[-1], tag=token[1:-1]) if inflected: result[-1] = inflected[0] else: result.append(token) return ' '.join(result)
for (dirname, _, filenames) in os.walk(dir_corpus): for f in filenames: ext = f[-4:] if ext == '.ann': file_count += 1 if debug and file_count > debug_tests: break file_text = os.path.join(dirname, f[:-4] + ".txt") text_file = open(file_text, "r") file_kpe = os.path.join(dir_output, f[:-4] + ".ann") kpe_file = open(file_kpe, "w") raw_text = unicode(text_file.read(), encoding="utf-8") tokens = tokenizer.tokenize(raw_text) tagged_text = [t + ("None",) for t in tagger.tag(tokens)] text_file.close() #test_sents.append(tagged_text) if extra_features: X_test = kpc.sent2features_extra(tagged_text, qr) else: X_test = kpc.sent2features(tagged_text) is_not_kp = "None" tmp_label = is_not_kp new_kp = [] kp_list = [] for kp in zip(crftagger.tag(X_test), [tt[0] for tt in tagged_text]): if debug and False: print >> sys.stderr, " ---- ", kp if kp[0][0:2] == "B-":
class HearstPatterns(object): def __init__(self): self.__chunk_patterns = r""" # helps us find noun phrase chunks NP: {<DT|PP\$>?<JJ>*<NN>+} {<NNP>+} {<NNS>+} """ self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns) # create a chunk parser # now define the Hearst patterns # format is <hearst-pattern>, <general-term> # so, what this means is that if you apply the first pattern, the firsr Noun Phrase (NP) # is the general one, and the rest are specific NPs self.__hearst_patterns = [ ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?((and |or )NP_\w+)?)+)", "first"), # ("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"), ("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"), ("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"), ] self.__pos_tagger = PerceptronTagger() # divid text into sentences, tokenze the setnences and add par of speech tagging def prepare(self, rawtext): sentences = nltk.sent_tokenize(rawtext.strip()) # NLTK default sentence segmenter sentences = [nltk.word_tokenize(sent) for sent in sentences] # NLTK word tokenizer sentences = [self.__pos_tagger.tag(sent) for sent in sentences] # NLTK POS tagger return sentences # apply chunking step on the setences using the defined grammer for extracting nounphrases def chunk(self, rawtext): sentences = self.prepare(rawtext.strip()) all_chunks = [] for sentence in sentences: chunks = self.__np_chunker.parse(sentence) # parse the example sentence all_chunks.append(self.prepare_chunks(chunks)) return all_chunks # annotate the np with a prefiy NP_ also exclude the other and such from NP to be used in the patterns def prepare_chunks(self, chunks): # basically, if the chunk is NP, keep it as a string that starts w/ NP and replace " " with _ # otherwise, keep the word. # remove punct # this is all done to make it super easy to apply the Hearst patterns... terms = [] for chunk in chunks: label = None try: # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call label = chunk.label() except: pass if label is None: # means one word... token = chunk[0] pos = chunk[1] if pos in ['.', ':', '-', '_']: continue terms.append(token) else: if chunk[0][0]=='such': np = "such NP_" + "_".join([a[0] for a in chunk[1:]]) elif chunk[0][0]=='other': np = "other NP_" + "_".join([a[0] for a in chunk[1:]]) else: np = "NP_" + "_".join([a[0] for a in chunk]) # This makes it easy to apply the Hearst patterns later terms.append(np) return ' '.join(terms) # main method for extracting hyponym relations based on hearst patterns def find_hyponyms(self, folderpath,stopWord): all_sentences=list() hyponyms = [] filelist = os.listdir(folderpath) for filePath in filelist: print ("processing file ","........................",filePath) file = codecs.open(folderpath + "//" + filePath, "r", encoding='utf-8', errors='ignore') lines = file.readlines() rawtext=(''.join(lines)) rawtext=rawtext.lower() np_tagged_sentences = self.chunk(rawtext) for raw_sentence in np_tagged_sentences: # two or more NPs next to each other should be merged into a single NP, it's a chunk error # find any N consecutive NP_ and merge them into one... 
# So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah" sentence = re.sub(r"(NP_\w+ NP_\w+)+", lambda m: m.expand(r'\1').replace(" NP_", "_"), raw_sentence) # print sentence for (hearst_pattern, parser) in self.__hearst_patterns: matches = re.search(hearst_pattern, sentence) if matches: match_str = matches.group(1) nps = [a for a in match_str.split() if a.startswith("NP_")] if parser == "first": general = nps[0] specifics = nps[1:] else: general = nps[-1] specifics = nps[:-1] for i in range(len(specifics)): if i==0: e2=general.replace("NP_", "").replace("_", " ") e1=specifics[0].replace("NP_", "").replace("_", " ") clean_sen=sentence.replace("NP_", "").replace("_", " ") clean_sen=clean_sen.replace(e1,"<e1>"+e1+"</e1>").replace(e2,"<e2>"+e2+"</e2>") all_sentences.append(clean_sen.replace("NP_", "").replace("_", " ")) #print(clean_sen) #print(general, specifics[i]) hyponyms.append(( self.clean_hyponym_term(general),self.clean_hyponym_term(specifics[i]))) file.close() return self.refine_hyponym_term(hyponyms,stopWord),all_sentences def clean_hyponym_term(self, term): return term.replace("NP_", "").replace("_", " ") # remove stopwprds and sniguralize specfic and general concepts def refine_hyponym_term(self,hyponyms,stopWord): cleanedHyponyms=[] with open(stopWord) as f: stopWords = f.read().splitlines() for hyponym in hyponyms: #print(hyponym) specific = ' '.join([i for i in hyponym[1].split(' ') if not any(w == i.lower() for w in stopWords)]) general = ' '.join([i for i in hyponym[0].split(' ') if not any(w == i.lower() for w in stopWords)]) if specific == '' or general=='': print ('skipped relation: ', hyponym[1], 'is a ', hyponym[0]) continue cleanedHyponyms.append((singularize(general) ,singularize(specific))) cleanedHyponyms.sort() return self.remove_duplicates(cleanedHyponyms) # remove duplicates in hyonym list def remove_duplicates(self, seq): seen = set() seen_add = seen.add return [x for x in seq if not (x in seen or seen_add(x))]
class BITETokenizer(object): inflection_tokens = [ "[JJR]", "[JJS]", "[NNS]", "[NNPS]", "[RBR]", "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]" ] single_char_map = { "[JJR]": chr(9774), "[JJS]": chr(9775), "[NNS]": chr(9776), "[NNPS]": chr(9777), "[RBR]": chr(9778), "[RBS]": chr(9779), "[VBD]": chr(9780), "[VBG]": chr(9781), "[VBN]": chr(9782), "[VBP]": chr(9783), "[VBZ]": chr(9784) } reverse_single_char_map = {v: k for k, v in single_char_map.items()} lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"} have_inflections = {'NOUN', 'ADJ', 'VERB'} def __init__(self, pretokenizer='moses'): self.tagger = PerceptronTagger() self.pretok_type = pretokenizer if pretokenizer == 'bertpretokenizer': self.pretokenizer = BertPreTokenizer() elif pretokenizer == 'moses': self.pretokenizer = MosesTokenizer() self.detokenizer = MosesDetokenizer() elif pretokenizer == 'whitespace': pass else: raise ValueError( "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'." ) def _pretokenize(self, sentence: str) -> List[str]: if self.pretok_type == 'bertpretokenizer': return [tup[0] for tup in self.pretokenizer.pre_tokenize(sentence)] elif self.pretok_type == 'whitespace': return sentence.split() else: return self.pretokenizer.tokenize(sentence) def tokenize(self, sentence: Union[str, List[str]], pretokenize: bool = True, map_to_single_char: bool = False) -> List[str]: if pretokenize: pretokenized = self._pretokenize(sentence) else: # Allow users to pass in a list of tokens if using custom pretokenizers pretokenized = sentence ptb_pos_tagged = self.tagger.tag(pretokenized) universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag)) for (token, tag) in ptb_pos_tagged] tokenized = [] for i, (word, pos) in enumerate(ptb_pos_tagged): if universal_pos_tagged[i][ 1] in self.have_inflections and word not in ( string.punctuation + '—') and pos not in self.lemma_tags: lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0] if not lemma: lemma = word tokenized.append(lemma) tokenized.append('[' + pos + ']') else: tokenized.append(word) if map_to_single_char: tokenized = [ self.single_char_map[token] if token in self.inflection_tokens else token for token in tokenized ] return tokenized def detokenize(self, tokens: List[str], as_list: bool = False) -> Union[str, List[str]]: result = [] for i, token in enumerate(tokens): # combine wordpiece tokens if token in self.reverse_single_char_map: token = self.reverse_single_char_map[token] if token in self.inflection_tokens: if i != 0: inflected = getInflection(result[-1], tag=token[1:-1]) if inflected: result[-1] = inflected[0] else: result.append(token) if as_list: # Allow users to detokenize using their own detokenizers return result if self.pretok_type == 'moses': return self.detokenizer.detokenize(result) return ' '.join(result)
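# The BITETokenizer above (like the LemmaWordpieceTokenizer earlier) hinges on one step:
# replace an inflected word by its lemma plus a separate '[TAG]' marker, and invert that at
# detokenization time. A standalone sketch, assuming getLemma/getInflection come from the
# lemminflect package (their signatures here match it):
from lemminflect import getLemma, getInflection
from nltk.tag.mapping import map_tag

word, ptb_tag = "running", "VBG"
upos = map_tag("en-ptb", "universal", ptb_tag)  # 'VERB'
lemma = getLemma(word, upos=upos)[0]            # 'run'
print([lemma, "[" + ptb_tag + "]"])             # ['run', '[VBG]']
print(getInflection(lemma, tag=ptb_tag)[0])     # 'running' (round-trip)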
ann = unicode(ann, encoding="utf-8") if ann[0] not in ["R", "*"]: ann_items = ann.strip().split("\t") if ann_items[1].find(";") >= 0: type_indexes_tmp = ann_items[1].split(" ") type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:] else: type_indexes = ann_items[1].split(" ") type_indexes[1] = int(type_indexes[1]) type_indexes[2] = int(type_indexes[2]) indexes_kp_tmp.setdefault(type_indexes[1], -1) if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]: indexes_kp_tmp[type_indexes[1]] = type_indexes[2] ann_text = ann_items[2] tokens = tokenizer.tokenize(ann_text) pos_tags = " ".join([pos[1] for pos in tagger.tag(tokens)]) ann_text = ann_text.encode("utf-8") print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags ann_file.close() ann_ext_file.close() file_text = os.path.join(dirname, f[:-4] + ".txt") text_file = open(file_text, "r") file_nokp = os.path.join(dir_output, f[:-4] + ".nann") nokp_file = open(file_nokp, "w") raw_text = unicode(text_file.read(), encoding="utf-8") indexes_kp_tmp = sorted(indexes_kp_tmp.items(), key=operator.itemgetter(0,1)) indexes_kp = [] ikp_tmp = (-1, -1)
class TermExtractor: def __init__(self, stopwords=[], term_patterns=[], min_term_length=3, min_term_words=2): #StopWordsDetector self.stopwords = set(stopwords) self.min_term_length = min_term_length self.term_patterns = term_patterns self.min_term_words = min_term_words self.detectors = [] self.pos_tagger=PerceptronTagger() for tp in term_patterns: self.detectors.append(POSSequenceDetector(tp)) self.swd = StopWordsDetector(self.stopwords) def extract_terms(self, doc_txt, trace=False): sent_tokenize_list = filter(lambda x: len(x) > 0, map(lambda s: nltk.tokenize.sent_tokenize(s), doc_txt)) sentences = [] _ = [sentences.extend(lst) for lst in sent_tokenize_list] terms = [] #pd.DataFrame(columns=['term']) #sentences = sentences[:30] i = 1 filter_fn = lambda x: len(x) >= self.min_term_length max_i = len(sentences) for s in sentences: text = nltk.word_tokenize(s) #sent_pos_tags=nltk.pos_tag(text, tagset='universal') sent_pos_tags = self.pos_tagger.tag(text) sentence_terms = set() for fsa1 in self.detectors: stn = filter(filter_fn, [' '.join(t) for t in fsa1.detect(sent_pos_tags) if len(t) >= self.min_term_words and len(self.swd.detect(t)) == 0]) sentence_terms.update(stn) terms.extend(sentence_terms) if trace: print(i, '/', max_i, s) i = i + 1 return terms ''' ''' def c_values(self, terms, trace=False): terms_df = pd.DataFrame(terms, columns=['term']) terms_df['w'] = 1 terms_df['len'] = len(terms_df['term']) term_stats = terms_df.groupby(['term'])['w'].agg([np.sum]) term_stats['len'] = list(pd.Series(term_stats.index).apply(lambda x:len(x))) term_stats.sort_values(by=['len'], ascending=True, inplace=True) term_series = list(term_stats.index) vectorizer = CountVectorizer(analyzer='word') vectorizer.fit(term_series) term_vectors = vectorizer.transform(term_series) n_terms = len(term_series) is_part_of = [] for i in range(0, n_terms): for j in range(i + 1, n_terms): if scipy.spatial.distance.cosine(term_vectors[i].toarray(), term_vectors[j].toarray()) < 1: # i may be inside j if term_series[j].find(term_series[i]) >= 0: # i is inside j if trace: print term_series[i], " -- ",term_series[j] is_part_of.append((term_series[i], term_series[j], 1)) subterms = pd.DataFrame(is_part_of, columns=['term', 'part_of', 'w']).set_index(['term', 'part_of']) if trace: print "terms/subterms relations discovered ..." c_values = [] # term_series=['time'] for t in term_series: # print t current_term = term_stats.loc[t] # average frequency of the superterms c_value = 0 if t in subterms.index: subterm_of = list(subterms.loc[t].index) for st in subterm_of: c_value -= term_stats.loc[st]['sum'] c_value /= float(len(subterm_of)) # add current term frequency c_value += current_term['sum'] # multiply to log(term length) c_value = c_value * np.log(current_term['len']) if trace: print(t, 'freq=', current_term['sum'], ' cvalue=', c_value) c_values.append(c_value) # break return sorted(zip(term_series, c_values), key=lambda x: x[1], reverse=True)
class WN_Linker():
    def __init__(self, w2v, stopwords=None, lemmatizer=None):
        if stopwords is None:
            stopwords = nltk.corpus.stopwords.words('english')
        self.w2v, self.stopwords, self.unfound_words, self.unfound_defs, self.no_synsets = w2v, stopwords, [], [], []
        # Define map from Penn Treebank tags to WordNet POS tags
        from nltk.tag.perceptron import PerceptronTagger
        self.tagger = PerceptronTagger()
        pt_pss = self.tagger.classes
        self.pos_map = {k: 'a' if k.startswith('J') else k[0].lower() if k[0] in ['N', 'V', 'R'] else None
                        for k in pt_pss}
        self.pos_map[None] = None
        self.lemmatizer = lemmatizer if lemmatizer else WordNetLemmatizer()
        try:
            self.lemmatizer.lemmatize('rowing')
        except LookupError:
            nltk.download('wordnet')
            self.lemmatizer.lemmatize('rowing')

    def convert_to_uni_tag(self, token):
        return '_'.join([token[0], nltk.map_tag('en-ptb', 'universal', token[1])])

    def tokenize(self, s):
        return [w for w in nltk.word_tokenize(s) if w not in self.stopwords]

    def tokenize_and_tag(self, sentence):
        return [self.convert_to_uni_tag(token)
                for token in nltk.pos_tag(nltk.word_tokenize(sentence))
                if token[0] not in self.stopwords]

    def wv_similarity(self, w1, w2):
        return self.w2v.similarity(w1, w2)

    def wv_n_similarity(self, s1, s2):
        if not (len(self.tokenize_and_tag(s1)) and len(self.tokenize_and_tag(s2))):
            return 0.
        try:
            return self.w2v.n_similarity(self.tokenize_and_tag(s1), self.tokenize_and_tag(s2))
        except KeyError:
            # fall back to averaging only the vectors that are in the vocabulary
            s1vecs = []
            s2vecs = []
            for w in self.tokenize_and_tag(s1):
                try:
                    s1vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            for w in self.tokenize_and_tag(s2):
                try:
                    s2vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            if not (len(s1vecs) and len(s2vecs)):
                self.unfound_defs.append(s2)
                return 0.
            return dot(matutils.unitvec(array(s1vecs).mean(axis=0)),
                       matutils.unitvec(array(s2vecs).mean(axis=0)))

    def get_vecs_for_BOW(self, bag):
        try:
            return [self.w2v[word] for word in self.tokenize_and_tag(bag)]
        except KeyError:
            vecs = []
            for w in self.tokenize_and_tag(bag):
                try:
                    vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            if vecs == []:
                return [np.zeros(300)]
            return vecs

    def compute_similarity(self, word_context, synset):
        return self.wv_n_similarity(word_context, synset.definition())

    def link_word_to_wn(self, word, context, pos=None, context_as_vec=False):
        orig_synsets = wordnet.synsets(word)
        synsets = [ss for ss in orig_synsets if ss.pos() == pos] if pos else orig_synsets
        if len(synsets) == 0:
            synsets = orig_synsets
        if len(synsets) == 0:
            self.no_synsets.append(word)
            return None
        if len(synsets) == 1:
            return synsets[0]
        synsets = synsets[:5]  # Ignore rare senses in highly polysemous words
        if context_as_vec:
            synsets_vecs = [self.get_vecs_for_BOW(ss.definition()) for ss in synsets]
            synsets_vecs = [array(vecs).mean(0) for vecs in synsets_vecs if vecs]
            synsets_vecs = [normalize_vec(vec) for vec in synsets_vecs]
            sims = np.matmul(array(synsets_vecs), context)
        else:
            sims = [self.compute_similarity(context, ss) for ss in synsets]
        # compare on the similarity score only, so ties do not try to order Synset objects
        _, sense = max(zip(sims, synsets), key=lambda pair: pair[0])
        return sense

    def lemmatize(self, w):
        return self.lemmatizer.lemmatize(w)

    def get_synsets_of_rule_parse(self, dp, use_offset=True, convert=False):
        try:
            context = ' '.join(dp['captions'])
        except TypeError:  # NaN is in the list
            context = ' '.join([item for item in dp['captions'] if isinstance(item, str)])
        tokens = self.tokenize(context)
        vecs = self.get_vecs_for_BOW(context)
        context_vec = normalize_vec(array(vecs).mean(axis=0))
        context_vec = matutils.unitvec(context_vec)
        pos_tagged_context_dict = {self.lemmatize(k): v for k, v in self.tagger.tag(tokens)}  # <token>: <pos>
        new_atoms = []
        unique_entities = set([x for atom in dp['atoms'] for x in atom])
        entity_id_dict = {}
        for entity in unique_entities:
            try:
                pt_pos = pos_tagged_context_dict[entity]
            except KeyError:
                pt_pos = self.tagger.tag([entity])[0][-1]
            pos = self.pos_map[pt_pos]
            synset = self.link_word_to_wn(entity, context_vec, context_as_vec=True, pos=pos)
            if synset is None:
                offset = None
            else:
                offset = synset.offset()
            if use_offset:
                entity_id_dict[entity] = offset
            else:
                entity_id_dict[entity] = synset
        for atom in dp['atoms']:
            new_atom = []
            for entity in atom:
                new_atom.append((entity, entity_id_dict[entity]))
            new_atoms.append(new_atom)
        return convert_logical_caption(new_atoms) if convert else new_atoms
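# A minimal usage sketch for WN_Linker, assuming gensim word vectors; the file path and the
# example sentence are placeholders. Meaningful similarities require a model whose vocabulary
# contains the POS-suffixed tokens produced by tokenize_and_tag (e.g. 'bank_NOUN'); otherwise
# the fallback path returns 0 and the first candidate sense is kept.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('path/to/pos_tagged_word2vec.bin', binary=True)  # placeholder path
linker = WN_Linker(w2v)
sense = linker.link_word_to_wn('bank', 'They moored the boat on the bank of the river', pos='n')
print(sense, sense.definition() if sense else None)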
class WordnetUtils():
    def __init__(self, path='../data/', win_size=4):
        self.path = path
        self.win_size = win_size
        self.create_synsets_dictionaries()
        self.create_lemma_unique_synset_dictionary()
        self.synsetid_synsetname_d = None
        self.synsetname_synsetid_d = None
        self.lemma_synsetid_d = None
        self.synsetid_lemma_d = None
        self.word_lemma_d = None
        self.tagger = PerceptronTagger()
        self.lemmatizer = WordNetLemmatizer()

    def create_synsets_dictionaries(self):
        '''
        From WordNet creates two dictionaries and pickles them to disk:
        synsetid -> synset name, e.g. 2084071 -> 'dog.n.01'
        synset name -> synsetid, e.g. 'dog.n.01' -> 2084071
        '''
        f = '%s%s' % (self.path, synsetid_synset_name_d_filename)
        if not os.path.isfile(f):
            print("Generating synsetid to synset name dictionary")
            syns = list(wn.all_synsets())
            print('#of senses=' + str(len(syns)))
            offsets_list = [(s.offset(), s.name()) for s in syns]
            pdict = dict(offsets_list)
            pickle.dump(pdict, open(f, "wb"))
        else:
            print("Synset id to synset name dictionary already existing, skipping...")
            pdict = self.load_dict(synsetid_synset_name_d_filename)
        f = '%s%s' % (self.path, synset_name_synsetid_d_filename)
        if not os.path.isfile(f):
            print("Generating synset name to synset id dictionary")
            reverse_dictionary = dict(zip(pdict.values(), pdict.keys()))
            pickle.dump(reverse_dictionary, open(f, "wb"))
        else:
            print("Synset name to synset id dictionary already existing, skipping...")

    def create_lemma_unique_synset_dictionary(self):
        '''
        From WordNet creates a dictionary that keeps track of all the lemmas which are
        not ambiguous, i.e. each lemma has exactly one synsetid, plus the reverse
        dictionary, and saves both to disk. For example:
        pingpong_table -> 4381587 and 4381587 -> pingpong_table
        '''
        f = '%s%s' % (self.path, lemma_synsetid_d_filename)
        if not os.path.isfile(f):
            print("Generating lemma to synset id dictionary")
            lemmas_in_wordnet = set(chain(*[ss.lemma_names() for ss in wn.all_synsets()]))
            dictionary = dict()
            for lemma in lemmas_in_wordnet:
                synsets = wn.synsets(lemma)
                if len(synsets) == 1:
                    print("Lemma=%s, synset=%s" % (lemma, synsets))
                    dictionary[lemma] = synsets[0].offset()
            print('Size of lemma unique synsets=' + str(len(dictionary)))
            pickle.dump(dictionary, open(f, "wb"))
        else:
            print("lemma to synset id dictionary already existing, skipping...")
            dictionary = self.load_dict(lemma_synsetid_d_filename)
        f = '%s%s' % (self.path, synsetid_lemma_d_filename)
        if not os.path.isfile(f):
            print("Generating synset id to lemma dictionary")
            reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
            pickle.dump(reverse_dictionary, open(f, "wb"))
        else:
            print("synset id to lemma dictionary already existing, skipping...")

    def lookup_synset_name_from_synsetid(self, synsetid):
        '''
        If not loaded, loads the synsetid_synset_name dictionary,
        then returns the short name for the given synsetid.
        :param synsetid:
        :returns part_name
        '''
        if self.synsetid_synsetname_d is None:
            self.synsetid_synsetname_d = self.load_dict(synsetid_synset_name_d_filename)
        if synsetid in self.synsetid_synsetname_d:
            name = self.synsetid_synsetname_d[synsetid]
            if name is None:
                return synsetid
            else:
                part_name = name.split('.')[0]
                return part_name
        else:
            return synsetid

    def lookup_synsetid_from_synset_name(self, name):
        '''
        If not loaded, loads the synset_name_synsetid dictionary,
        then returns the synsetid for the given synset name.
        :param name:
        :returns synsetid
        '''
        if self.synsetname_synsetid_d is None:
            self.synsetname_synsetid_d = self.load_dict(synset_name_synsetid_d_filename)
        if name in self.synsetname_synsetid_d:
            synsetid = self.synsetname_synsetid_d[name]
            return synsetid
        return None

    def lookup_lemma_from_synsetid(self, synsetid):
        '''
        Given a synsetid, looks up its lemma.
        :param synsetid:
        :returns lemma
        '''
        if self.synsetid_lemma_d is None:
            self.synsetid_lemma_d = self.load_dict(synsetid_lemma_d_filename)
        if synsetid in self.synsetid_lemma_d:
            return self.synsetid_lemma_d[synsetid]
        return None

    def lookup_synsetid_from_lemma(self, lemma):
        '''
        Given a lemma, looks up its synset id.
        :param lemma:
        :returns synsetid
        '''
        if self.lemma_synsetid_d is None:
            self.lemma_synsetid_d = self.load_dict(lemma_synsetid_d_filename)
        if lemma in self.lemma_synsetid_d:
            return self.lemma_synsetid_d[lemma]
        return None

    def create_word_to_lemma_dictionary(self, words):
        '''
        Given a list of words, creates a word -> lemma dictionary in order to
        speed up mapping of a word to a synsetid.
        :param words:
        '''
        tags = self.tagger.tag(words)
        print(tags[:5])
        word_lemma_d = dict()
        for word, tag in tags:
            pos = get_wordnet_pos(tag)
            if pos is not None:
                lemma = self.lemmatizer.lemmatize(word, pos)
                if lemma is not None:
                    word_lemma_d[word] = lemma
                else:
                    print("Lemma not found for word=" + word)
        print("Word_lemma_dictionary size=%s" % (len(word_lemma_d.items())))
        return word_lemma_d

    def lookup_lemma_from_word(self, words, word):
        '''
        Given a word, looks up its lemma.
        :param word:
        :returns lemma
        '''
        if self.word_lemma_d is None:
            self.word_lemma_d = self.create_word_to_lemma_dictionary(words)
        if word in self.word_lemma_d:
            return self.word_lemma_d[word]
        return None

    def clean_data(self, words):
        '''
        Given a list of words, removes the stop words and returns the most common words.
        :param words:
        :returns filtered_words
        '''
        stops = set(stopwords.words("english"))
        filtered_words = [word for word in words if word not in stops]
        print(filtered_words[:10])
        count = collections.Counter(filtered_words).most_common()
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        for word in words:
            if word in dictionary:
                data.append(word)
        return data

    def load_dict(self, filename):
        '''
        Loads a pickled dictionary from disk.
        :param filename:
        :returns senses_names_d
        '''
        f = '%s%s' % (self.path, filename)
        senses_names_d = pickle.load(open(f, "rb"))
        return senses_names_d

    def generate_synsetids(self, words):
        '''
        Generates a list of synsetids for a given list of words by mapping each word
        to its synsetid. Given a list of words:
        1. Removes stop words and keeps the most common words.
        2. For each word:
           - finds the lemma
           - using the lemma, finds the synsetid
           - if there is no synsetid, tries to disambiguate the word with Lesk; since
             disambiguating a large corpus is slow, only a window around the word is
             given as disambiguation context
           - if a synset is found, its id is added to the list of synsetids, otherwise
             the ambiguous word itself is added
        :param words:
        :return: synsetids
        '''
        filtered_words = self.clean_data(words)
        print("Words", filtered_words[:5])
        unknown_count = 0
        synsetids = []
        i = 0
        words_sz = len(words)
        for word in words:
            lemma = self.lookup_lemma_from_word(words, word)
            if lemma is None:
                unknown_count += 1
                synsetid = word
            else:
                synsetid = self.lookup_synsetid_from_lemma(lemma)
                if synsetid is None:
                    # synset = lesk(sentence, word)
                    (li, ri) = get_window_indices(words_sz, i, self.win_size)
                    win_words = words[li: i] + words[i: ri + 1]
                    synset = lesk(win_words, word)
                    if synset is None:
                        unknown_count += 1
                        synsetid = word
                    else:
                        synsetid = synset.offset()
            if synsetid == word:
                print("No synsetid for=", word)
            else:
                print("word=%s, synsetName=%s, synsetId=%d" % (
                    word, self.lookup_synset_name_from_synsetid(synsetid), synsetid))
            synsetids.append(synsetid)
            i += 1
        synsetids_size = len(synsetids)
        print(synsetids_size)
        if synsetids_size != 0:
            rejected_percent = 100.0 * (unknown_count / synsetids_size)
        else:
            rejected_percent = 0
        print("Synsetids_size list size=%d, unknown=%.3f" % (synsetids_size, rejected_percent))
        return synsetids

    def generate_lemma_pos_with_win(self, words):
        '''
        Generates (lemma, POS) pairs using the selected tagger (by default the Perceptron
        tagger) and a lemmatizer, finding each word's lemma and POS within a context
        window. Returns a list of (lemma, POS).
        :param words:
        :return: tuples
        '''
        filtered_words = words
        tuples = []
        words_sz = len(filtered_words)
        print("Processing %d words" % words_sz)
        i = 0
        unknown_count = 0
        for word in filtered_words:
            (li, ri) = get_window_indices(words_sz, i, self.win_size)
            win_words = filtered_words[li: i] + filtered_words[i: ri + 1]
            logging.debug("Word=%s, win_words=%s" % (word, win_words))
            tags = []
            try:
                tags = self.tagger.tag(win_words)
            except:
                print("Unexpected error:%s" % sys.exc_info()[0])
            if len(tags) == 0:
                i += 1  # keep the window index aligned even when tagging fails
                continue
            word_tag_d = dict(tags)
            w_tag = word_tag_d[word]
            if w_tag is None:
                i += 1
                unknown_count += 1
                tuples.append((word, None))
                continue
            pos = get_wordnet_pos(w_tag)
            if pos is None:
                i += 1
                unknown_count += 1
                tuples.append((word, None))
                continue
            logging.debug("word=%s,tag=%s,pos=%s" % (word, w_tag, pos))
            lemma = word
            try:
                lemma = self.lemmatizer.lemmatize(word, pos)
            except:
                print("Unexpected error:%s" % sys.exc_info()[0])
            tuples.append((lemma, pos))
            i += 1
            logging.debug("Processing i=%d,total=%d" % (i, words_sz))
        if words_sz != 0:
            rejected_percent = 100.0 * (unknown_count / words_sz)
        else:
            rejected_percent = 0
        print("word_size=%d, unknown=%.3f" % (words_sz, rejected_percent))
        return tuples

    def generate_lemma_pos(self, words):
        '''
        Same as generate_lemma_pos_with_win, but without the contextual window.
        :param words:
        :return: tuples
        '''
        words_sz = len(words)
        tags = []
        try:
            tags = self.tagger.tag(words)
        except:
            print("Unexpected error:%s" % sys.exc_info()[0])
        i = 0
        tuples = []
        unknown_count = 0
        for word, tag in tags:
            pos = get_wordnet_pos(tag)
            if pos is None:
                tuples.append((word, None))
                i += 1
                unknown_count += 1
                continue
            lemma = word
            try:
                lemma = self.lemmatizer.lemmatize(word, pos)
            except:
                print("Unexpected error:%s" % sys.exc_info()[0])
            tuples.append((lemma, pos))
            i += 1
        if words_sz != 0:
            rejected_percent = 100.0 * (unknown_count / words_sz)
        else:
            rejected_percent = 0
        return tuples

    def generate_lemma_given_pos(self, words, pos):
        '''
        Helper used for the question-answer challenge. The tagger is sometimes wrong,
        so this lemmatizes every word with the given POS. Returns a list of (lemma, pos).
        :param words:
        :return: tuples
        '''
        tuples = []
        for word in words:
            lemma = self.lemmatizer.lemmatize(word, pos)
            tuples.append((lemma, pos))
        return tuples

    def generate_lemma_adj_adv(self, words):
        '''
        Helper used for the question-answer challenge.
        Returns (lemma1, 'a'), (lemma2, 'r'), (lemma3, 'a'), (lemma4, 'r').
        :param words:
        :return: tuples
        '''
        tuples = []
        if len(words) != 4:
            return tuples
        lemma = self.lemmatizer.lemmatize(words[0], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[1], 'r')
        tuples.append((lemma, 'r'))
        lemma = self.lemmatizer.lemmatize(words[2], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[3], 'r')
        tuples.append((lemma, 'r'))
        return tuples

    def generate_lemma_noun_adj(self, words):
        '''
        Helper used for the question-answer challenge.
        Returns (lemma1, 'n'), (lemma2, 'a'), (lemma3, 'n'), (lemma4, 'a').
        :param words:
        :return: tuples
        '''
        tuples = []
        if len(words) != 4:
            return tuples
        lemma = self.lemmatizer.lemmatize(words[0], 'n')
        tuples.append((lemma, 'n'))
        lemma = self.lemmatizer.lemmatize(words[1], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[2], 'n')
        tuples.append((lemma, 'n'))
        lemma = self.lemmatizer.lemmatize(words[3], 'a')
        tuples.append((lemma, 'a'))
        return tuples

    def transform(self, synsetids):
        words = []
        for s in synsetids:
            w = self.lookup_synset_name_from_synsetid(s)
            words.append(str(w))
        return words
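# A minimal usage sketch for WordnetUtils, assuming the pickle path '../data/' exists and the
# module-level helpers referenced above (get_wordnet_pos, get_window_indices, lesk and the
# *_filename constants) are defined elsewhere in this project; the word list is illustrative.
# Note that the first instantiation builds the WordNet dictionaries, which takes a while.
wu = WordnetUtils(path='../data/', win_size=4)
words = ['The', 'dog', 'chased', 'the', 'ball', 'across', 'the', 'park']
print(wu.generate_lemma_pos(words))      # e.g. [('The', None), ('dog', 'n'), ('chase', 'v'), ...]
print(wu.generate_synsetids(words)[:5])  # synset offsets, or the raw word when unresolved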
df.columns = docs.get('ColNames')
print(df)

# ================================================================
# ****************************************************************
# PerceptronTagger
# Need to import and download averaged_perceptron_tagger
import nltk
nltk.download('averaged_perceptron_tagger')

# Process the text data
from nltk.tag.perceptron import PerceptronTagger
PT = PerceptronTagger()
print(PT.tag('This is a sample English sentence'.split()))
# ----output----
# [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('English', 'JJ'), ('sentence', 'NN')]

# To get help about tags
nltk.help.upenn_tagset('NNP')  # can run this in IPython Console

# Alternatively, use this method involving the tokenizer
nltk.download('punkt')  # can run this in IPython Console
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
Percept_list = pos_tag(word_tokenize("John's big idea isn't all that bad."))
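# A small follow-on sketch: mapping the Penn Treebank tags returned above to the coarser
# universal tagset via nltk.map_tag; the 'universal_tagset' resource is assumed to be downloadable.
nltk.download('universal_tagset')
universal = [(w, nltk.map_tag('en-ptb', 'universal', t)) for w, t in Percept_list]
print(universal)  # e.g. [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ...]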
import nltk
import numpy as np
from matplotlib import pyplot as plt
from nltk.tag.perceptron import PerceptronTagger
from sklearn.preprocessing import label_binarize

tagger = PerceptronTagger()
name_tag = []
non_name_tag = []
full_names = []
non_names = []
with open('full_names.txt') as infile:
    full_names = infile.read().splitlines()
with open('non_names.txt') as infile:
    non_names = infile.read().splitlines()
for x in full_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    name_tag.append(tags[0][0][1] + ' ' + tags[0][1] + ' ' + tags[1][0][1] + ' ' + tags[1][1])
for x in non_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    non_name_tag.append(tags[0][0][1] + ' ' + tags[0][1] + ' ' + tags[1][0][1] + ' ' + tags[1][1])
name_tag = [w.split() for w in name_tag]
non_name_tag = [w.split() for w in non_name_tag]
train_data = np.vstack((np.array(name_tag), np.array(non_name_tag)))
train_f1 = label_binarize(train_data[:, 0], classes=list(set(train_data[:, 0])))
train_f2 = label_binarize(train_data[:, 1], classes=list(set(train_data[:, 1])))
train_f3 = label_binarize(train_data[:, 2], classes=list(set(train_data[:, 2])))
train_f4 = label_binarize(train_data[:, 3], classes=list(set(train_data[:, 3])))
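# A follow-on sketch, assuming the goal is a binary name vs. non-name classifier over the
# binarized POS/chunk features above; LogisticRegression is an illustrative choice, not
# prescribed by the snippet. Rows were stacked names-first, which fixes the label order.
from sklearn.linear_model import LogisticRegression

train_X = np.hstack((train_f1, train_f2, train_f3, train_f4))
train_y = np.array([1] * len(name_tag) + [0] * len(non_name_tag))
clf = LogisticRegression(max_iter=1000).fit(train_X, train_y)
print('training accuracy:', clf.score(train_X, train_y))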
out_filename = './sentences/subj-verb.txt'
f = open(out_filename, 'w')
with_verb_count = 0  # count how many sentences match our search
for i, sent in enumerate(corpus, start=1):
    # Logging
    if i % 10000 == 0:
        print('{} sentences analysed.'.format(i))
    # Prune sentences with unknown tokens
    if '<unk>' in sent:
        continue
    tagged_sent = tagger.tag(sent.split())
    with_verb = False
    for w, t in tagged_sent:
        # 3rd-singular or non-3rd-singular, present tense
        if t in ('VBZ', 'VBP'):
            with_verb = True
            sent = sent.replace(w, '*{}*'.format(w), 1)
    if with_verb:
        print(sent, file=f)
        with_verb_count += 1
print('>> {} sentences with VBZs and VBPs were saved to {}'.format(with_verb_count, out_filename))
f.close()
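# A small follow-on sketch: the saved lines can be read back and the *marked* verbs recovered
# with a regex, e.g. to spot-check which present-tense verbs were flagged.
import re

with open(out_filename) as marked:
    for line in marked:
        verbs = re.findall(r'\*([^*]+)\*', line)
        if verbs:
            print(verbs, '<-', line.strip())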
print file_count, f[:-4]
try:
    file_text = os.path.join(dirname, f[:-4] + ".txt")
    text_file = open(file_text, "r")
    raw_text = unicode(text_file.read(), encoding="utf-8")
    file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
    kpe_file = open(file_kpe, "w")
except:
    print >> sys.stderr, "E) Open files: ", sys.exc_info()
text_tokens = tokenizer.tokenize(raw_text)
text_tokens = kpcommon.escape_not_abbreviations(text_tokens)
pos_tags = tagger.tag(text_tokens)
pos_tags_string = " ".join([pt[1] for pt in pos_tags])
kps_candidates = {}
test_match = 0
for ct in pos_sequences:
    # ct[2] holds the POS sequence, either as a regex or as a literal tag string
    if is_posregex:
        pos_regex = re.compile(r"\s?(" + ct[2] + r")\s")
    else:
        pos_regex = re.compile(r"\s?(" + re.escape(ct[2]) + r")\s")
    for pos_match in re.finditer(pos_regex, pos_tags_string):
        pos_match_string = pos_match.group(1)
        pos_seq_start = pos_match.start(1)
        pos_seq_end = pos_match.end(1)
        # recover token-level offsets from character offsets in the tag string
        token_index_start = len(pos_tags_string[:pos_seq_start].split())
        token_offset = len(pos_tags_string[pos_seq_start:pos_seq_end].split())