def get_instances(self, label_file, xml_file):
    """Build labelled Instance objects from a label file and a parallel XML file.

    Each XML <sent> element is paired line-by-line (izip — Python 2) with a
    label-file line of the form "<id> <score0> ... <score5>"; the emotion with
    the highest score wins.  Returns (instances, labels_final) where
    labels_final is the set of winning label *indices* seen.
    """
    instances = []
    labels_final = set()
    tagger = PerceptronTagger(
    )  # load nltk perceptron just once to speed up tagging
    # Index -> emotion name, matching the column order of the label file.
    labels_dict = {
        0: "anger",
        1: "disgust",
        2: "fear",
        3: "joy",
        4: "sadness",
        5: "surprise"
    }
    tree = ET.parse(xml_file)
    root = tree.getroot()
    with open(label_file) as f:
        for sent, line in izip(root, f):
            id_xml = sent.attrib.values()[0]  # Python 2: .values() is a list
            id_labels = line.rstrip().split()
            id_file = id_labels[0]
            # Only process entries whose XML id matches the label-file id.
            if id_xml == id_file:
                # Keep the (last) text fragment of the sentence element.
                # NOTE(review): assumes each <sent> holds one text node — confirm.
                for i in sent.itertext():
                    text = i
                labels = id_labels[1:]
                # Index of the maximum score = winning emotion.
                label = labels.index(
                    str(max([int(label) for label in labels])))
                inst = Instance(text, labels_dict[label])
                inst_tokenized = word_tokenize(text)
                inst_tagged = tagger.tag(inst_tokenized)
                for tokentag in inst_tagged:
                    token = Token(tokentag[0], tokentag[1])
                    inst.add_token(token)
                instances.append(inst)
                # NOTE(review): stores the numeric index, not the emotion name.
                labels_final.add(label)
    return instances, labels_final
def __init__(self, extended=False): self.__chunk_patterns = r""" # helps us find noun phrase chunks NP: {<DT>?<JJ.*>*<NN.*>+} {<NN.*>+} """ # create a chunk parser self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns) # now define the Hearst patterns # format is <hearst-pattern>, <hypernym_location> # so, what this means is that if you apply the first pattern, self.__hearst_patterns = [ ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"), ("(NP_such_\w+ as (NP_\w+ ? (, )?(and |or )?)+\w+)", "first"), ("((NP_\w+ (, )?)+( )?(and |or )?NP_other_\w+)", "last"), ("(NP_\w+ (, )?including (NP_\w+ (, )?(and |or )?)+\w+)", "first"), ("(NP_\w+ (, )?especially (NP_\w+ (, )?(and |or )?)+\w+)", "first") # ''' IMPLEMENT ADDITIONAL HEARST PATTERNS HERE ''' ] if extended: self.__hearst_patterns.extend([ ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"), #''' IMPLEMENT ADDITIONAL PATTERNS HERE ''' ]) self.__pos_tagger = PerceptronTagger()
def get_all_terms_in_sent(reg_exp, sent):
    """Return all phrases of *sent* whose POS-tag string matches *reg_exp*.

    The sentence is tokenized and POS-tagged, the tags are folded into a
    compact tag string (via get_tag_string), and every regex match is mapped
    back to its token span.  When a preposition ("P" after interpret_tag)
    occurs inside a matched phrase, the sub-phrases on each side of it are
    also returned.

    Fixed: the PerceptronTagger was re-instantiated on every call, reloading
    the pickled model from disk each time; it is now cached on the function.
    """
    tokens = nltk.word_tokenize(sent)
    tagger = getattr(get_all_terms_in_sent, "_tagger", None)
    if tagger is None:
        tagger = PerceptronTagger()
        get_all_terms_in_sent._tagger = tagger
    tags = [[tok, tag] for tok, tag in tagger.tag(tokens)]
    # Lower-case a sentence-initial token unless it is a proper noun, so
    # capitalization does not block downstream matching.
    if not tags[0][1].startswith("NNP"):
        tags[0][0] = tags[0][0].lower()
    tag_string = "".join(get_tag_string(tags))
    p = re.compile(reg_exp)
    res = []
    for m in p.finditer(tag_string):
        # Map the character match back onto the token list (one char per tag).
        span = tags[m.start():m.start() + len(m.group())]
        np_lst = [tok for tok, tag in span]
        tag_lst = [interpret_tag(tag) for tok, tag in span]
        res.append(" ".join(np_lst))
        if "P" in tag_lst:
            idx = tag_lst.index("P")
            res.append(" ".join(np_lst[:idx]))
            res.append(" ".join(np_lst[idx + 1:]))
    return res
def pos_titles_from(input_path, output_path = None, options = None):
    """Read "<paper_id> <title>" lines, POS-tag English titles, write results.

    Python 2 code (print >> stream).  Output format per accepted title:
    the paper id, then one "token tag" line per token, then a blank line.
    Non-English titles are counted as skipped; *options* may carry skip/end
    line bounds (see get_options).
    """
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)  # progress log every 1M lines
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            # NOTE(review): bare except — malformed lines are only logged.
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens.

    Each result item is (token, tag, span, []) — the trailing empty list is a
    placeholder slot filled in by later processing stages.

    Fixed: PerceptronTagger was constructed on every call, reloading the
    pickled model from disk each time; the instance is now cached on the
    function and reused.
    """
    tagger = getattr(tag_text_en, "_tagger", None)
    if tagger is None:
        tagger = PerceptronTagger()
        tag_text_en._tagger = tagger
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        # Extend the (token, tag) pair with its source span and an empty slot.
        tags.append(tagged + (tokens_span[i], []))
    return tags
def __init__(self):
    """Fetch (if missing) and load NLTK's averaged-perceptron POS model."""
    import nltk
    # Downloads only when the model is absent, but still performs a
    # metadata check on every construction.
    nltk.download('averaged_perceptron_tagger')
    from nltk.tag.perceptron import PerceptronTagger
    self.inst = PerceptronTagger()
def __init__(self, stop_words=None, sent_detector=None, documents=None):
    """Set up NLP components and, if *documents* is given, build the corpus.

    :param stop_words: custom stopword list; defaults to NLTK English plus
        a few parliamentary-domain words.
    :param sent_detector: custom sentence splitter; defaults to punkt.
    :param documents: optional iterable of raw documents to index at once.
    """
    # Load a persisted list of stopwords
    # unless something else is specified
    if not stop_words:
        self._stop_words = stopwords.words('english')
        # Domain-specific additions (parliamentary debate vocabulary).
        self._stop_words += ['minister', 'question', 'member', "member’s"]
    else:
        self._stop_words = stop_words
    # Load these to save time reading from
    # disk later
    if not sent_detector:
        self._sent_detector = nltk.data.load(
            'tokenizers/punkt/english.pickle')
    else:
        self._sent_detector = sent_detector
    self._tokenizor = TreebankWordTokenizer()
    self._lemmatizer = WordNetLemmatizer()
    self._tagger = PerceptronTagger()
    if documents:
        # Create a list of lists of tokens all lowercased
        # , lemmatized and filtered
        self.documents = self._make_tokens(documents)
        self.dictionary = corpora.Dictionary(self.documents)
        self.corpus = self._bag_o_words(self.documents)
        self._tf_idf_model = models.TfidfModel(self.corpus)
        self.transformed_corpus = self._tf_idf_model[self.corpus]
    else:
        self.documents = None
def pos_tagger(infile, outfile):
    """POS-tag (n-grams of) each line of *infile*, persist counts to disk.

    Writes a pickled Counter to *outfile* and a space-separated CSV next to
    it (outfile with a .csv extension).

    NOTE(review): relies on module-level globals `n` (n-gram order),
    `csv_headers`, and `get_ngrams` — confirm they are defined before calling.
    """
    start = time.time()
    tagged_ngrams_counter = Counter()
    tagger = PerceptronTagger()
    with io.open(infile, encoding='utf-8', mode='rt') as text_file:
        for i, line in enumerate(text_file):
            if i % 100000 == 0:
                # Lightweight progress log for long-running worker processes.
                print(
                    f'{os.getpid()} process, {i} lines, {time.time()-start:.1f} time'
                )
            if n == 1:
                tagged_ngrams = tagger.tag(line.rstrip().split(' '))
            else:
                tagged_ngrams = [
                    tuple(tagger.tag(ngram))
                    for ngram in get_ngrams(line.rstrip().split(' '), n)
                ]
            tagged_ngrams_counter.update(tagged_ngrams)
    # Persist both machine-readable (pickle) and human-readable (csv) forms.
    with open(outfile, mode='wb') as counter_pickle, \
            open(outfile[:-6]+'csv', 'w', encoding='utf-8') as counter_csv:
        pickle.dump(tagged_ngrams_counter, counter_pickle)
        counter_csv.write(csv_headers[n])
        if n == 1:
            for tagged_ngram, count in tagged_ngrams_counter.items():
                counter_csv.write('{} {} {}\n'.format(*tagged_ngram, count))
        else:
            for tagged_ngram, count in tagged_ngrams_counter.items():
                ngram, tags = zip(*tagged_ngram)
                counter_csv.write('{} {} {}\n'.format(
                    ' '.join(ngram), ' '.join(tags), count))
def __init__(self, vocab_file, do_lower_case=True, never_split=None,
             additional_special_tokens=None, **kwargs):
    """Tokenizer treating inflection markers ([VBD], [NNS], ...) as special tokens.

    Args mirror the underlying BERT tokenizer; *additional_special_tokens*
    defaults to the POS-inflection marker set below.

    Fixed: the default was a mutable list literal in the signature (the
    classic mutable-default-argument pitfall, shared across instances);
    it is now a None sentinel with the same effective default.
    """
    if additional_special_tokens is None:
        additional_special_tokens = [
            "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
            "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
        ]
    self.inflection_tokens = additional_special_tokens
    self.tagger = PerceptronTagger()
    super().__init__(vocab_file,
                     do_lower_case=do_lower_case,
                     never_split=never_split,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)
    # Universal POS classes that can inflect, and Penn tags treated as lemmas.
    self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
    self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
    self.do_lower_case = do_lower_case
    if do_lower_case:
        # Keep a cased tokenizer so POS-sensitive handling can see case.
        self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                              never_split=never_split)
    else:
        self.cased_tokenizer = self.basic_tokenizer
def get_instances(self, folder):
    """Parse an emotion-labelled corpus file into Instance objects.

    Each line is "<label-code> <id> <text>"; lines labelled "ne" (no emotion)
    are skipped.  Returns (instances, labels) where labels is the set of raw
    label codes encountered.
    """
    # happiness/joy???????????????????????????
    # Corpus label code -> emotion name.
    labels_dict = {
        "hp": "joy",
        "sd": "sadness",
        "ag": "anger",
        "dg": "disgust",
        "sp": "surprise",
        "fr": "fear"
    }
    instances = []
    labels = set()
    tagger = PerceptronTagger(
    )  # load nltk perceptron just once to speed up tagging
    with open(folder) as f:
        for line in f:
            label, id, text = line.strip().split(
                " ", 2)  # split by first two spaces only
            if label == "ne":  # ignore no emotion
                continue
            inst = Instance(text, labels_dict[label])
            inst_tokenized = word_tokenize(text)
            inst_tagged = tagger.tag(inst_tokenized)
            for tokentag in inst_tagged:
                token = Token(tokentag[0], tokentag[1])
                inst.add_token(token)
            instances.append(inst)
            labels.add(label)
    return instances, labels
def __init__(self):
    """Start with an empty recognizer: no HMM models registered yet."""
    # Parallel registries: model list plus name<->index lookup tables.
    self.hmm_models = []
    self.n_hmm = 0
    self.hmm2idx, self.idx2hmm = {}, {}
    # POS tagger shared by all sentence-level predictions.
    self.tagger = PerceptronTagger()
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    """Return the top-k TF-IDF-weighted verbs per class.

    Args:
        k: number of verbs to keep per class.
        df: input DataFrame.
        text_col: name of the text column.
        class_col: name of the class-label column.
        plot: when True, print and bar-plot the per-class fractions.

    Returns:
        dict mapping each class value to an Index of its top-k verbs.
    """
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    # Tag the vocabulary once, then keep only verb forms.
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index
        if plot:
            # Fixed: message previously said "nouns" although only verbs
            # survive the POS filter above.
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()
    return (top_k_by_class)
def _get_tagger(lang=None):
    """Return a PerceptronTagger; 'rus' loads the Russian model explicitly."""
    if lang != 'rus':
        # Default: English model bundled with NLTK.
        return PerceptronTagger()
    # Russian model is loaded from the nltk_data installation by URL.
    tagger = PerceptronTagger(False)
    tagger.load('file:' + str(find(RUS_PICKLE)))
    return tagger
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    indices = keyphrase["tokens-indices"]
    pos_sequence = [tags[i][1] for i in indices]
    # Special case when tokenization don't match with annotation:
    # fall back to tagging the raw keyphrase text directly.
    if not pos_sequence:
        tagger = PerceptronTagger()
        tagged = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = [tag for _, tag in tagged]
    return pos_sequence
def _get_tagger():
    """Load the small Dutch perceptron model from Data/, or None if missing."""
    # TODO: Instead of manually downloading the dutch_tagger, download it from an external source if it isn't installed at Data/
    try:
        # NOTE(review): os.chdir permanently changes the process cwd; the
        # relative load() below depends on it — confirm callers tolerate this.
        os.chdir(r"Data")
        tagger = PerceptronTagger(load=False)
        tagger.load('model.perc.dutch_tagger_small.pickle')
        return tagger
    except (IndexError, FileNotFoundError):
        # Model not installed: caller must handle a None tagger.
        return None
def __init__(self, df_train, df_valid, df_test, args, unk=True):
    """Build the vocabulary and tokenize the train/valid/test dataframes.

    Behavior is driven by *args*: BPE (SentencePiece), POS tagging,
    transfer learning (reuse a pickled dictionary), and classification vs
    language-modelling output layout.
    """
    self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
    self.unk = unk
    self.classification = args.classification
    self.transfer_learning = args.transfer_learning
    self.max_length = args.n_ctx
    self.pos = args.POS_tags
    self.pos_tags = set()
    self.bpe = args.bpe
    self.lang = args.lang
    if self.bpe:
        # SentencePiece BPE model for subword tokenization.
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(args.bpe_model_path)
    if self.pos:
        self.tagger = PerceptronTagger()
    if self.transfer_learning:
        # Reuse the dictionary built during pre-training.
        with open(args.dict_path, 'rb') as file:
            self.dictionary = pickle.load(file)
    else:
        # First pass: populate the dictionary from all three splits.
        self.tokenize_df(df_train)
        self.tokenize_df(df_valid)
        self.tokenize_df(df_test)
        self.dictionary.sort_words(unk=self.unk, pos_tags=self.pos_tags)
    if not self.transfer_learning:
        with open(args.dict_path, 'wb') as file:
            pickle.dump(self.dictionary, file)
    if not self.classification:
        # Language-modelling layout: flat token streams (+ optional POS).
        if self.pos:
            self.train, self.train_pos = self.tokenize_(df_train)
            self.valid, self.valid_pos = self.tokenize_(df_valid)
            self.test, self.test_pos = self.tokenize_(df_test)
        else:
            self.train = self.tokenize_(df_train)
            self.valid = self.tokenize_(df_valid)
            self.test = self.tokenize_(df_test)
    else:
        # Classification layout: documents with targets, keywords and
        # stemmed strings per split.
        if self.pos:
            self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
        else:
            self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
def test_perceptron_tagging(self):
    """NLTK's PerceptronTagger and our perctagger must agree on a sample."""
    tokens = word_tokenize(
        "This is a test sentence to test if the testing works.")
    # Reference tags from NLTK's pre-trained averaged perceptron.
    reference = PerceptronTagger(load=True)
    expected = [tag for _, tag in reference.tag(tokens)]
    # Our implementation, loaded from its own persisted model.
    ours = perctagger()
    ours.load()
    self.assertListEqual(expected, ours.tag(tokens))
class Tagger(AbstractTagger):
    """NLTK averaged-perceptron POS tagger wrapped in the AbstractTagger API."""

    def __init__(self):
        # Start untrained; load()/train() supply the actual model.
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        # dill rather than pickle: the perceptron model is not plain data.
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        """Load a model from *path*; an empty path loads NLTK's default model."""
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            # Empty path: fall back to the bundled pre-trained model.
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        """Return only the tag sequence for a pre-tokenized sentence."""
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        """Train a fresh model on *data* (list of tagged sentences)."""
        # Reset tagger.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
def _get_tagger(lang=None):
    """Return a POS tagger for *lang*.

    "rus" loads the Russian averaged-perceptron model from nltk_data; any
    other value (including "eng" and None) returns the default English model.

    Fixed: the previous `elif lang == "eng"` branch was byte-identical to the
    `else` branch; the duplicate has been collapsed.
    """
    if lang == "rus":
        # The Russian model is not auto-loaded; fetch it explicitly by URL.
        tagger = PerceptronTagger(False)
        tagger.load("file:" + str(find(RUS_PICKLE)))
    else:
        tagger = PerceptronTagger()
    return tagger
def pos_tokenizer(text):
    """Return the POS tag of every token in *text*, in order."""
    word_tokens = tokenize(text)
    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    tagged = pretrained_tagger.tag(word_tokens)
    # collecting pos from resulting tuples
    return [tag for _, tag in tagged]
def ap(train_path, test_path):
    """Train (or load a cached) averaged-perceptron tagger, then evaluate it.

    The trained model is cached on disk under a name derived from an MD5 of
    the training path, so repeated runs skip training.  Prints timing and a
    per-tag classification report.
    """
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'
    test_sentences = list(gen_corpus(test_path))
    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))
        ap_model = PerceptronTagger(load=False)
        # save_loc persists the model so the next run takes the else branch.
        ap_model.train(list(convert_sents_to_zipped(training_sentences)),
                       save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')
    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)
    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))
    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
def pos_per_line(text_file):
    """Print the POS-tag sequence for each line of *text_file* (Python 2)."""
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token) for token in tagger.tag(tokens)])
            print " ".join([token[1] for token in tagger.tag(tokens)])
    except:
        # NOTE(review): bare except — all failures are only logged to stderr.
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
def ie_preprocess(document):
    """Split *document* on newlines, tokenize each line, and POS-tag it.

    Returns a list with one tagged-token list per input line.
    """
    tagger = PerceptronTagger()
    tokenized = (nltk.word_tokenize(line) for line in document.split("\n"))
    return [tagger.tag(tokens) for tokens in tokenized]
def __init__(self, train=False):
    """Load the POS tagger and either train the SVM or load a saved model."""
    self.tagger = PerceptronTagger()
    self.model = None
    # Feature vocabulary: bag-of-words terms (verbs + shapes) and the
    # reference verb synsets used for similarity features.
    self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle',
                'circle', 'hand']
    self.VERBS = [wn.synset('draw.v.01'),
                  wn.synset('wave.v.01'),
                  wn.synset('rotate.v.01')]
    self.n_bow = len(self.BOW)
    self.n_verbs = len(self.VERBS)
    if train:
        self.train_svm()
    else:
        self.load_model()
def __init__(self, path='../data/', win_size=4):
    """Set up WordNet lookup tables, a POS tagger, and a lemmatizer.

    :param path: base directory for the data files.
    :param win_size: context window size — presumably tokens per side;
        TODO confirm against usage.
    """
    self.path = path
    self.win_size = win_size
    # Build the synset dictionaries first; the unique-synset map builds on them.
    self.create_synsets_dictionaries()
    self.create_lemma_unique_synset_dictionary()
    # Lookup tables initialised to None; populated elsewhere.
    self.synsetid_synsetname_d = None
    self.synsetname_synsetid_d = None
    self.lemma_synsetid_d = None
    self.synsetid_lemma_d = None
    self.word_lemma_d = None
    self.tagger = PerceptronTagger()
    self.lemmatizer = WordNetLemmatizer()
def __init__(self, stopwords=[], term_patterns=[], min_term_length=3,
             min_term_words=2):
    """Configure term extraction: stopwords, POS patterns, and size limits."""
    #StopWordsDetector
    self.stopwords = set(stopwords)
    self.min_term_length = min_term_length
    self.term_patterns = term_patterns
    self.min_term_words = min_term_words
    # One POS-sequence detector per configured pattern.
    self.detectors = [POSSequenceDetector(tp) for tp in term_patterns]
    self.pos_tagger = PerceptronTagger()
    self.swd = StopWordsDetector(self.stopwords)
def __init__(self, pretokenizer='moses'):
    """Select the pre-tokenizer backend: 'moses', 'bertpretokenizer', or 'whitespace'."""
    self.tagger = PerceptronTagger()
    self.pretok_type = pretokenizer
    if pretokenizer == 'moses':
        # Moses also needs a matching detokenizer for round-tripping.
        self.pretokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
    elif pretokenizer == 'bertpretokenizer':
        self.pretokenizer = BertPreTokenizer()
    elif pretokenizer != 'whitespace':
        # 'whitespace' needs no tokenizer object; anything else is an error.
        raise ValueError(
            "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
        )
def __init__(self, df_train, df_valid, df_test, args, unk=True):
    """Build the vocabulary and tokenize train/valid/test dataframes (GPT-2 BPE variant).

    Mirrors the SentencePiece corpus class but uses the HuggingFace GPT-2
    tokenizer for BPE and always rebuilds + pickles the dictionary.
    """
    self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
    self.unk = unk
    self.classification = args.classification
    self.transfer_learning = args.transfer_learning
    self.max_length = args.n_ctx
    self.pos = args.POS_tags
    self.pos_tags = set()
    self.bpe = args.bpe
    self.lang = args.lang
    if self.bpe:
        self.sp = GPT2Tokenizer.from_pretrained("gpt2", add_prefix_space=True)
    if self.pos:
        self.tagger = PerceptronTagger()
    # Populate the dictionary from all three splits, then persist it.
    # NOTE(review): nesting reconstructed from flattened source — confirm
    # these tokenize_df calls are unconditional, not under `if self.pos`.
    self.tokenize_df(df_train)
    self.tokenize_df(df_valid)
    self.tokenize_df(df_test)
    with open(args.dict_path, 'wb') as file:
        pickle.dump(self.dictionary, file)
    if not self.classification:
        # Language-modelling layout: flat token streams (+ optional POS).
        if self.pos:
            self.train, self.train_pos = self.tokenize_(df_train)
            self.valid, self.valid_pos = self.tokenize_(df_valid)
            self.test, self.test_pos = self.tokenize_(df_test)
        else:
            self.train = self.tokenize_(df_train)
            self.valid = self.tokenize_(df_valid)
            self.test = self.tokenize_(df_test)
    else:
        # Classification layout: documents with targets, keywords and
        # stemmed strings per split.
        if self.pos:
            self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
        else:
            self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
def _perform_analysis(self, tokenized_sents):
    """POS-tag every sentence, caching the flat result on the instance.

    Returns a single flat list of (token, tag) pairs across all sentences;
    subsequent calls return the cached list regardless of input.

    Fixed: PerceptronTagger() was constructed inside the loop — once per
    sentence — reloading the pickled model from disk each iteration.  It is
    now created once before the loop.
    """
    if self.precalced_data:
        return self.precalced_data
    tagger = PerceptronTagger()
    res = []
    for tokens in tokenized_sents:
        res += tagger.tag(tokens)
    self.precalced_data = res
    return res
def __init__(self, hparams, dataset_type="relocar"):
    """Prepare label/word/POS/NE vocabularies and the requested subword tokenizer."""
    self.hparams = hparams
    self.dataset_type = dataset_type
    self._get_labels()
    self._get_word_dict()
    self._pad_idx = self.hparams.pad_idx
    # POS Tagger
    # Build the pos-tag -> id map from the perceptron tagger's known classes.
    perceptron_tagger = PerceptronTagger()
    self.pos2id = {"<PAD>": 0}
    for pos_tag in list(perceptron_tagger.classes):
        self.pos2id[pos_tag] = len(self.pos2id)
    # Stanford NER
    self.ne2id = {
        "<PAD>": 0,
        "O": 1,
        "LOCATION": 2,
        "ORGANIZATION": 3,
        "PERSON": 4
    }
    # Tokenizer selection: BERT by default, RoBERTa/XLNet when flagged.
    if self.hparams.do_bert and not self.hparams.do_roberta and not self.hparams.do_xlnet:
        self._bert_tokenizer_init()
    elif self.hparams.do_bert and self.hparams.do_roberta:
        self._roberta_tokenizer_init()
    elif self.hparams.do_bert and self.hparams.do_xlnet:
        self._xlnet_tokenizer_init()
def __init__(self):
    """Wire up NLP tools, per-intent response handlers, and intent classifiers."""
    self.tagger = PerceptronTagger()
    # One response handler per supported intent.
    self.alarm_ai = AlarmResponse()
    self.converter_ai = ConversionResponse()
    self.summarize_ai = SummarizerResponse()
    self.news_ai = NewsResponse()
    self.maps_ai = MapsResponse()
    self.lemmatizer = WordNetLemmatizer()
    self.parser = spy.English()
    self.conversion = ConversionService()
    # Intent name -> handler dispatch table.
    self.ai = {
        "alarm_api": self.alarm_ai,
        "unit_conversion": self.converter_ai,
        "summarization": self.summarize_ai,
        "news": self.news_ai,
        "maps_api": self.maps_ai
    }
    # Ensemble of pre-trained intent classifiers loaded from disk.
    # NOTE(review): unpickling model files — ensure they come from a
    # trusted source.
    self.classifiers = []
    with open('nb_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    with open('sgd_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    with open('pla_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    # Last-seen entities, used to resolve pronouns in follow-up queries.
    self.previous_ents = {"PERSON": "", "GPE": ""}
    self.special_tags = {
        'PERSON': ['his', 'her', 'him', 'he', 'she'],
        'GPE': ['it', 'there']
    }
def wordTagging(self, sentence): posWords = [] # text = nltk.word_tokenize(sentence) tagset = None start_time = time.time() try: tokens = nltk.word_tokenize(sentence) except UnicodeDecodeError: tokens = nltk.word_tokenize(sentence.decode('utf-8')) tagger = PerceptronTagger() pos_arrs = nltk.tag._pos_tag(tokens, tagset, tagger) for pos_arr in pos_arrs: pos = [] pos.append(pos_arr[0]) pos.append(pos_arr[1]) # print pos_arr if pos_arr[1] in self.posDictionary: ppt = self.posDictionary[pos_arr[1]] else: ppt = "symbol" pos.append(ppt) posWords.append(pos) return posWords
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of tokens.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
        'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'),
        ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is',
        'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'),
        ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    # NOTE(review): constructs (and so loads) a fresh tagger per call;
    # batch callers should prefer pos_tag_sents or a shared tagger.
    tagger = PerceptronTagger()
    return _pos_tag(tokens, tagset, tagger)
def __init__(self, url, testrun):
    """Initialize the ShallowPipeline.

    Args:
        url (String) The Solr URL for the collection
        testrun (Boolean) True if it is a test run, False if need
            to index full corpus
    """
    self.solr = index.SolrSearch(url)
    self.testrun = testrun
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.tagger = PerceptronTagger()
    # Requires the Stanford parser jars on the Java classpath; gives the
    # JVM 4 GB of heap for the PCFG model.
    self.dep_parser = StanfordDependencyParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options=u'-mx4g')
def __init__(self, dbFilePath=None, revDbFilePath=None, pos=False):
    """Load the forward and backward Markov databases (or start empty).

    :param dbFilePath: path to the forward chain database; defaults to
        "markovdb" next to this module.
    :param revDbFilePath: path to the backward chain database; defaults to
        "revmarkovdb" next to this module.
    :param pos: when True, also load a POS tagger for tag-aware generation.

    Improvement: the two identical load-or-fallback sequences are factored
    into one local helper; behavior (paths, exceptions, warnings, empty-db
    fallback) is unchanged.
    """
    self.dbFilePath = dbFilePath
    self.revDbFilePath = revDbFilePath
    if pos:
        self.tagger = PerceptronTagger()
    self.pos = pos

    def _load(path, default_name):
        # Resolve the default location next to this module, then unpickle;
        # fall back to an empty database on any read/format error.
        if not path:
            path = os.path.join(os.path.dirname(__file__), default_name)
        try:
            with open(path, 'rb') as dbfile:
                return path, pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warn(
                'Database file corrupt or not found, using empty database')
            return path, _db_factory()

    self.dbFilePath, self.db = _load(dbFilePath, "markovdb")
    self.revDbFilePath, self.rev_db = _load(revDbFilePath, "revmarkovdb")
class HmmSeqRecognizer(object):
    """Recognize sequences by scoring them against a bank of HMMs (Python 2)."""

    def __init__(self):
        self.hmm_models = []   # registered HmmModel instances
        self.n_hmm = 0         # number of registered models
        self.hmm2idx = {}      # model name -> index
        self.idx2hmm = {}      # index -> model name
        self.tagger = PerceptronTagger()
        return

    def batch_test(self, samples, label):
        """Return (true positives, accuracy) of predicting *label* on *samples*."""
        tp, ns = 0, len(samples)
        for i in xrange(ns):
            idx = self.predict_sample(samples[i])
            if idx == label:
                tp += 1
        return tp, float(tp) / ns

    def predict_sample(self, sample):
        """Score *sample* under every HMM and return the best model's index."""
        sample = [sample]
        probs = [model.test(sample) for model in self.hmm_models]
        return probs.index(max(probs))

    def predict_sentence(self, sentence):
        """Convert *sentence* to its POS-tag sequence, then classify it."""
        sample = [[tag for _, tag in self.tagger.tag(word_tokenize(sentence))]]
        probs = [model.test(sample) for model in self.hmm_models]
        return probs.index(max(probs))

    def add_model(self, name, model):
        """Register *model* under *name*, keeping the lookup tables in sync."""
        self.hmm_models.append(model)
        self.hmm2idx[name] = self.n_hmm
        self.idx2hmm[self.n_hmm] = name
        self.n_hmm += 1

    def new_hmm(self, name, datapath, nhs, ne):
        """Train a fresh HMM (nhs hidden states; ne — presumably epochs,
        TODO confirm against HmmModel.train) and register it."""
        print '=> adding HMM model \'%s\'...' % name
        hmm_model = HmmModel(nhs)
        hmm_model.train(datapath, ne)
        self.add_model(name, hmm_model)
        print '| done'
        return

    def save_hmm(self, name, hmm_path):
        """Pickle the model registered as *name* to *hmm_path*."""
        print '=> saving HMM model \'%s\'...' % name
        f = open(hmm_path, 'wb')
        pickle.dump(self.hmm_models[self.hmm2idx[name]], f)
        f.close()
        print '| done'
        return

    def load_hmm(self, name, hmm_path):
        """Unpickle a model from *hmm_path* and register it as *name*."""
        # print '=> adding HMM model \'%s\'...' % name
        f = open(hmm_path, 'rb')
        hmm_model = pickle.load(f)
        f.close()
        self.add_model(name, hmm_model)
        # print '| done'
        return
def pos_title_abstract(resource):
    """Extract title + abstract sentences from an XML file; save text and POS tags.

    Python 2 code.  *resource* is (xml_path, output_base); writes a .txt with
    the raw sentences and a .pos with tagged tokens, returning the tag data.
    """
    xml_file = resource[0]
    pos_data = []
    try:
        xmldoc = minidom.parse(xml_file)
        elements_title = xmldoc.getElementsByTagName("Title")
        title = elements_title.item(0).childNodes[0].nodeValue
        elements_list = xmldoc.getElementsByTagName("S")
        # Keep only the text nodes of <S> (sentence) elements.
        sentences = [e[0].nodeValue
                     for e in [element.childNodes for element in elements_list]
                     if e[0].nodeType == e[0].TEXT_NODE]
        sentences.insert(0, title)
        #raw text
        txt_file = change_file_extention(resource[1], "xml", "txt")
        save_to_file(txt_file, sentences)
        #pos
        tagger = PerceptronTagger()
        for s in sentences:
            tokens = word_tokenize(s)
            pos_data.append(tagger.tag(tokens))
        pos_file = change_file_extention(resource[1], "xml", "pos")
        save_to_file(pos_file, pos_data)
    except:
        # NOTE(review): bare except — failures are only logged; an empty
        # pos_data list is returned in that case.
        print >> sys.stderr, "Error pos_title_abstract:", resource, sys.exc_info()
    return pos_data
class NERTagger:
    """Named-entity tagger built on NLTK's ne_chunk over perceptron POS tags."""

    def __init__(self):
        # POS tags are required input for nltk.ne_chunk.
        self.pos_tagger = PerceptronTagger()

    def tag(self, tokens):
        """Return [(token, label), ...] where label is an NE type for tokens
        inside a recognized entity and the plain POS tag otherwise.

        Fixed: subtree detection used `type(t) == nltk.tree.Tree`, which is
        brittle against Tree subclasses; it now uses isinstance().
        """
        tree = nltk.ne_chunk(self.pos_tagger.tag(tokens))
        tagged_tokens = []
        for node in tree:
            if isinstance(node, nltk.tree.Tree):
                # Subtrees mark named entities; copy the entity label onto
                # every member token (dropping the POS tag).
                label = node.label()
                for token in node:
                    tagged_tokens.append((token[0], label))
            else:
                # Plain leaf: keep the (token, POS) pair as-is.
                tagged_tokens.append(node)
        return tagged_tokens
class ActionDetection(object):
    """Classify action sentences (draw/wave/rotate shapes) with a linear SVM (Python 2)."""

    def __init__(self, train=False):
        """Load the POS tagger and either train the SVM or load a saved model."""
        self.tagger = PerceptronTagger()
        self.model = None
        # BOW: triangle, rectangle, circle, hand
        # verbs: draw, wave, rotate
        self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand']
        self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')]
        self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS)
        if train:
            self.train_svm()
        else:
            self.load_model()
        return

    def save_model(self):
        """Persist the trained SVM to MODEL_PATH."""
        f = open(MODEL_PATH + 'action_detection.model', 'wb')
        pickle.dump(self.model, f)
        f.close()
        return

    def train_svm(self):
        """Train a LinearSVC from the labelled sentence file, then save it."""
        with open(DATA_PATH + 'action_detection_training_set.txt') as f:
            data = f.readlines()
        X, y = [], []
        for line in data:
            line = line.strip()
            if not line:
                continue
            # Each line: "<label> <sentence>".
            line = line.split(' ', 1)
            X.append(self.extract_feature(line[1]))
            y.append(int(line[0]))
        lin_clf = svm.LinearSVC()
        lin_clf.fit(X, y)
        self.model = lin_clf
        self.save_model()
        return

    def load_model(self):
        """Load a previously trained SVM from MODEL_PATH."""
        f = open(MODEL_PATH + 'action_detection.model', 'rb')
        self.model = pickle.load(f)
        f.close()
        return

    def extract_feature(self, sent):
        """Binary BOW features plus max WordNet path similarity to each reference verb."""
        feature = [0] * (self.n_bow + self.n_verbs)
        # Verbs in the sentence, per the POS tagger ('VB' only).
        verbs = [w for w, pos in self.tagger.tag(word_tokenize(sent)) if pos == 'VB']
        words = set(sent.split())
        for i in xrange(self.n_bow):
            feature[i] = 1 if self.BOW[i] in words else 0
        for i in xrange(self.n_verbs):
            if not verbs:
                feature[self.n_bow + i] = 0
            else:
                # Similarity of the closest sentence verb to this reference verb.
                similarities = [wn.path_similarity(self.VERBS[i], wn.synset(v + '.v.01')) for v in verbs]
                feature[self.n_bow + i] = max(similarities)
        return feature

    def predict(self, sent):
        """Classify *sent*; 0 means "no confident action detected"."""
        # classes: 0(rectangle), 1(circle), 2(triangle), 3(wave), 4(rotate)
        feature = self.extract_feature(sent)
        idx = self.model.predict([feature])[0]
        probs = self.model._predict_proba_lr([feature])[0]
        # return value: 0(none), 1-5(classes+1)
        if probs[idx] > CONFIDENCE_THRESHOLD:
            return idx + 1
        else:
            return 0
# Training-data collection script for the keyphrase CRF (excerpt; the try
# block and loop body continue beyond this excerpt).
if __name__ == "__main__":
    try:
        # "debug" as the last CLI argument limits the run to a few files.
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0
        dir_corpus = sys.argv[1]
        extra_features = True
        qr = mdbcl.QueryResources()
        without_types = False
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        train_sents = []
        # Walk the corpus directory collecting .ann annotation files.
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
                    file_ann = os.path.join(dirname, f[:-4] + ".ann")
                    ann_file = open(file_ann, "r")
                    #file_ann_ext = os.path.join(dir_output, f[:-4] + ".anne")
                    #ann_ext_file = open(file_ann_ext, "w")
# input_filename="removed2_en.csv" input_filename="sample_removed2_en.csv" output_filename="sample_tag_nltk_en.csv" output=open(output_filename,'w') # WAY-1 200s/1000 # with open(input_filename) as data_file: # for (index,line) in enumerate(data_file): # line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding! # sents = nltk.sent_tokenize(line) # nltk.pos_tag_sents(sents) # WRONG!!!! # print index # WAY-2: Faster 20s/1000 tagger = PerceptronTagger() with open(input_filename) as data_file: for (index,line) in enumerate(data_file): line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding! # sents = nltk.sent_tokenize(line) # print sents # sentences_pos=tagger.tag_sents(sents) word_list=nltk.word_tokenize(line) line_tagged=tagger.tag(word_list) if index in range(5000,60001,5000): print index # print line_tagged for t in line_tagged: output.write('_'.join(t)+' ') output.write('\n')
# CRF-based keyphrase tagging script (excerpt; the try block and loop body
# continue beyond this excerpt).
if __name__ == "__main__":
    try:
        # "debug" as the last CLI argument limits the run to a few files.
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0
        dir_corpus = sys.argv[1]
        dir_output = sys.argv[2]
        # Optional third argument: path to a trained CRFsuite model.
        try:
            training_crfsuite = sys.argv[3]
        except:
            training_crfsuite = 'keyphrase.crfsuite'
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        extra_features = True
        qr = mdbcl.QueryResources()
        crftagger = pycrfsuite.Tagger()
        crftagger.open(training_crfsuite)
        #test_sents = []
        # Walk the corpus directory collecting .ann annotation files.
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
def __init__(self):
    """Create the wrapper, loading NLTK's averaged-perceptron POS model once."""
    self.pos_tagger = PerceptronTagger()
# CRF-based keyphrase tagging script, variant (excerpt; the try block and
# loop body continue beyond this excerpt).
if __name__ == "__main__":
    try:
        # "debug" as the last CLI argument limits the run to a few files.
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0
        dir_corpus = sys.argv[1]
        dir_output = sys.argv[2]
        # Optional third argument: path to a trained CRFsuite model.
        try:
            training_crfsuite = sys.argv[3]
        except:
            training_crfsuite = 'keyphrase.crfsuite'
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        crftagger = pycrfsuite.Tagger()
        crftagger.open(training_crfsuite)
        extra_features = True
        qr = mdbcl.QueryResources()
        #test_sents = []
        # Walk the corpus directory collecting .ann annotation files.
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
# Keyphrase extraction over a .txt corpus using common POS sequences
# (Python 2 excerpt; the inner try block continues beyond this excerpt).
try:
    dir_corpus = sys.argv[1]
    dir_output = sys.argv[2]
except:
    print >> sys.stderr, "E) Directories: ", sys.exc_info()
try:
    # POS-sequence patterns filtered to those seen at least count_limit times.
    pos_sequences_filename = sys.argv[3]
    count_limit = int(sys.argv[4])
    pos_sequences, is_posregex = kpcommon.get_pos_tags_by_count(pos_sequences_filename, count_limit)
except:
    print >> sys.stderr, "E) Common tags: ", sys.exc_info()
qr = mdbcl.QueryResources()
tokenizer = Tokenizer()
tagger = PerceptronTagger()
lemmatizer = WordNetLemmatizer()
stemmer = LancasterStemmer()
for (dirname, _, filenames) in os.walk(dir_corpus):
    for f in filenames:
        ext = f[-4:]
        if ext == '.txt':
            file_count += 1  #debug
            if debug and file_count > debug_tests:  #debug
                break  #debug
            print file_count, f[:-4]
            try:
                file_text = os.path.join(dirname, f[:-4] + ".txt")
                text_file = open(file_text, "r")
                raw_text = unicode(text_file.read(), encoding="utf-8")
#!/usr/bin/python import sys import os from nltk.tokenize import TreebankWordTokenizer as Tokenizer from nltk.tag.perceptron import PerceptronTagger import operator if __name__ == "__main__": try: dir_corpus = sys.argv[1] dir_output = sys.argv[2] tokenizer = Tokenizer() #pos tagger = PerceptronTagger() for (dirname, _, filenames) in os.walk(dir_corpus): for f in filenames: ext = f[-4:] if ext == '.ann': file_ann = os.path.join(dirname, f[:-4] + ".ann") ann_file = open(file_ann, "r") file_ann_ext = os.path.join(dir_output, f[:-4] + ".anne") ann_ext_file = open(file_ann_ext, "w") print f[:-4] indexes_kp_tmp = {} for ann in ann_file: ann = unicode(ann, encoding="utf-8") if ann[0] not in ["R", "*"]: ann_items = ann.strip().split("\t") if ann_items[1].find(";") >= 0:
import nltk from nltk.tag.perceptron import PerceptronTagger # nltk.download() sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good.""" tokens = nltk.word_tokenize(sentence) tagger = PerceptronTagger(False) tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle') tagged = tagger.tag(tokens) print tagged
(c) R. Loth ISCPIF-CNRS (UPS 3611)
"""
from re import sub, search
from glob import glob
from nltk import sent_tokenize
from nltk import word_tokenize
# from nltk import pos_tag
from nltk.tag.perceptron import PerceptronTagger
from nltk import RegexpParser
from json import load, dump
from datetime import datetime

# Shared POS tagger (loads the averaged-perceptron model once at import time).
tagr = PerceptronTagger()

# Document metadata loaded from the JSON test file.
doc_metas = {}
with open('work/meta/test.json') as metas:
    doc_metas = load(metas)

# Paths of the original corpus documents.
doc_paths = glob("work/corpus/01-originaux/*")

# total occurrences per term
term_totals = {}
# number of documents containing the term, per term
term_ndocs = {}
# number of tokens in the term
from sklearn.svm import LinearSVC from sklearn.feature_extraction.text import CountVectorizer from sklearn.externals import joblib import numpy as np import nltk import re from nltk.tag.perceptron import PerceptronTagger from sklearn import cross_validation import os dir_path = os.path.dirname(os.path.abspath(__file__)) file_path = os.path.join(dir_path, 'Data/LowerTestData.txt') classifier_path = os.path.join(dir_path, 'Classifier/Classifier.pkl') tagger = PerceptronTagger() classifier = Pipeline([ ('vectorizer', CountVectorizer(min_df=1, max_features=100, lowercase=False, decode_error='strict')), ('clf', OneVsRestClassifier(LinearSVC()))]) print "Classifier pipeline initialized." labels = [] data = [] print "Processing the training file." with open (file_path, 'r') as readFile : for row in readFile : arr = [] sentences = [] arr = row.split(",,,") arr[1] = arr[1].strip(' ') arr[1] = arr[1].strip('\n') labels.append(arr[1])
# Classify two-token strings as person names vs. non-names using POS tags and
# NE-chunk labels as features.
import numpy as np
import nltk
from nltk.tag.perceptron import PerceptronTagger
from sklearn.preprocessing import label_binarize
# FIX: the class is spelled MultinomialNB; "MultinominalNB" raised an
# ImportError. Alias keeps the original (misspelled) name working downstream.
from sklearn.naive_bayes import MultinomialNB as MultinominalNB
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
# FIX: matplotlib.pyplot has no attribute "pyplot"; import the module itself.
import matplotlib.pyplot as plt

tagger = PerceptronTagger()

name_tag = []
non_name_tag = []
full_names = []
non_names = []

# One example per line in each input file.
with open('full_names.txt') as infile:
    full_names = infile.read().splitlines()
with open('non_names.txt') as infile:
    non_names = infile.read().splitlines()

# Build "TAG CHUNK TAG CHUNK" feature strings for the first two tokens.
# NOTE(review): assumes every example has at least two tokens — confirm inputs.
for x in full_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])
for x in non_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    non_name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])

# Split each feature string back into its four components.
name_tag = [w.split() for w in name_tag]
non_name_tag = [w.split() for w in non_name_tag]
import cowparser as cp

# Sentence containers for training and held-out evaluation.
train_sents = []
test_sents = []

# A single generator is shared by both loops, so the evaluation set starts
# exactly where the training slice left off.
gen = cp.sentences_for_dir(separate=False)

# Consume sentences with index 0 .. 2000000 inclusive for training, keeping
# only the (word, tag) pairs from each (word, tag, extra) triple.
for i, (metadata, data) in enumerate(gen):
    pairs = [(word, tag) for word, tag, _extra in data]
    train_sents.append(pairs)
    if i == 2000000:
        break

# The next 5001 sentences from the same stream form the evaluation set.
for i, (metadata, data) in enumerate(gen):
    pairs = [(word, tag) for word, tag, _extra in data]
    test_sents.append(pairs)
    if i == 5000:
        break

from nltk.tag.perceptron import PerceptronTagger

# Train a fresh perceptron tagger (no pre-loaded model) and save it, then
# report accuracy on the held-out sentences.
pt = PerceptronTagger(load=False)
pt.train(train_sents, 'model2.perc.dutch_tagger')
print(pt.evaluate(test_sents))
        # NOTE(review): fragment starts mid-loop; nesting reconstructed from
        # collapsed source — confirm against the original file.
        title = title.replace('\n', '')
        titles += [title]
        bodies += [standard_body(body)]
    else:
        # No recognizable title: record a placeholder and keep the whole story.
        titles += ['Unknown']
        bodies += [standard_body(story)]

### -------------------------- ###
#    Tokenize and tag text       #
### -------------------------- ###

stories = pd.DataFrame({'title': titles, 'body': bodies})
# Sentence-split each body, then lower-case the word tokens of each sentence.
stories['sents'] = stories['body'].map(lambda x: nltk.sent_tokenize(x))
stories['words'] = stories['sents'].map(lambda x: [[w.lower() for w in nltk.word_tokenize(s)] for s in x])

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()
# POS-tag every tokenized sentence of every story.
stories['tags'] = stories['words'].map(lambda x: [tagger.tag(s) for s in x])

### -------------------------- ###
# Categorize tags into genres #
### -------------------------- ###
# Note: Standards replace 'TG' with '{CLASS}' i.e. 'NN' -> '{ANML}'
"""
Genres:
 - occupation
 - animals: is_animal
 - body parts:
 - exclamation
 - food
 - location
"""
# Predicate: is the word a hyponym of 'animal' (via is_hyper_of, defined elsewhere).
is_animal = lambda x: is_hyper_of(x, 'animal')