class NltkToolsStemmer(LemmatizerWrapper):
    """Wraps the NltkTools stemmer; currently the WordnetLemmatizer, so
    English only.

    @warning This is the original implementation as used in our English
    Wikipedia parser: neither the code nor the hardwired indexing has
    been cleaned up. The data must already be POS tagged, and the POS
    field must be the last one in each token.
    """
    def __init__(self, params):
        # params is part of the common wrapper interface but is unused;
        # only a stemmer-enabled NltkTools instance is needed.
        self.nt = NltkTools(stem=True)

    @staticmethod
    def _decap(word):
        # "Word" -> "word"; all-caps and mixed-case tokens are left alone.
        if word[0].isupper() and word[1:].islower():
            return word[0].lower() + word[1:]
        return word

    def lemmatize(self, tokens):
        # HACK
        for sen_idx, sentence in enumerate(tokens):
            plain = self.nt.stem((tok[0], tok[-1]) for tok in sentence)
            # Second pass with sentence-case words decapitalized, so that
            # e.g. a sentence-initial "The" is stemmed as "the".
            lowered = self.nt.stem(
                (self._decap(tok[0]), tok[-1]) for tok in sentence)
            for tok_idx, (p, l) in enumerate(zip(plain, lowered)):
                # NOTE(review): index 2 is presumably the lemma field of
                # the (word, pos, stem) triples stem() yields — confirm.
                tokens[sen_idx][tok_idx].append(p[2])
                tokens[sen_idx][tok_idx].append(l[2])
def _join_condition(self, sentences, current):
    """Return @c True when _join_sentences should merge sentence
    @p current with the sentence that follows it."""
    ends_in_abbrev = self._end_in_abbrev(sentences[current])
    if ends_in_abbrev is None:
        # No abbreviation information: fall back to the pattern
        # heuristics, joining only when the next sentence starts
        # with an upper-case letter.
        return (self._match_patterns(sentences[current]) and
                NltkTools.starts_with_upper(sentences[current + 1]))
    # Join after an abbreviation, or when the next sentence does not
    # look like a real sentence start (no leading upper-case letter).
    if ends_in_abbrev or not NltkTools.starts_with_upper(
            sentences[current + 1]):
        return True
    return False
def _join_condition(self, sentences, current):
    """If this method returns @c True, _join_sentences joins the
    current and the next sentence."""
    sen = sentences[current]
    abbrev_end = self._end_in_abbrev(sen)
    if abbrev_end is not None:
        # Abbreviation info available: join on an abbreviated ending,
        # or when the following sentence lacks an upper-case start.
        return bool(abbrev_end or
                    not NltkTools.starts_with_upper(sentences[current + 1]))
    # No abbreviation info: rely on the pattern heuristics instead.
    return (self._match_patterns(sen) and
            NltkTools.starts_with_upper(sentences[current + 1]))
class HunposPosTagger(PosTaggerWrapper):
    """Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS
    environment variable must point to the directory with the hunpos-tag
    executable in it. The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file.
      Default is iso-8859-1.
    """
    def __init__(self, params):
        self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        for sen_idx, sentence in enumerate(tokens):
            # HunPos consumes bytes in the model's encoding.
            encoded_words = [tok[0].encode(self.encoding) for tok in sentence]
            for tok_idx, tagged in enumerate(self.nt.pos_tag(encoded_words)):
                try:
                    # Decode both fields; a malformed pair (or bytes that
                    # fail to decode — UnicodeDecodeError is a ValueError)
                    # leaves the token untagged.
                    word, pos = [field.decode(self.encoding)
                                 for field in tagged]
                except ValueError:
                    continue
                tokens[sen_idx][tok_idx].append(pos)
class HunposPosTagger(PosTaggerWrapper):
    """Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS
    environment variable must point to the directory with the hunpos-tag
    executable in it. The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file.
      Default is iso-8859-1.
    """

    def __init__(self, params):
        self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        enc = self.encoding
        for i, sen in enumerate(tokens):
            # Encode the surface forms for HunPos, tag, then decode.
            words = [tok[0].encode(enc) for tok in sen]
            for j, pair in enumerate(self.nt.pos_tag(words)):
                try:
                    # ValueError covers both a malformed word/tag pair
                    # and UnicodeDecodeError from decode(); either way
                    # the token is simply left without a POS tag.
                    word, tag = (x.decode(enc) for x in pair)
                except ValueError:
                    continue
                tokens[i][j].append(tag)
class NltkToolsStemmer(LemmatizerWrapper):
    """Wraps the NltkTools stemmer (currently the English-only
    WordnetLemmatizer).

    @warning This is the original implementation from our English
    Wikipedia parser; no effort has been made to clean up the code or to
    fix the hardwired indexing. Tokens must already carry a POS tag as
    their last field.
    """

    def __init__(self, params):
        # params is unused; only a stemmer-enabled NltkTools is needed.
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK
        def uncap(w):
            # Decapitalize sentence-case words ("Word" -> "word").
            return (w[0].lower() + w[1:]
                    if w[0].isupper() and w[1:].islower() else w)

        for i, sen in enumerate(tokens):
            first_pass = self.nt.stem((t[0], t[-1]) for t in sen)
            second_pass = self.nt.stem((uncap(t[0]), t[-1]) for t in sen)
            for j, (norm, hard) in enumerate(zip(first_pass, second_pass)):
                # NOTE(review): field 2 of the stem() output is presumably
                # the lemma — confirm against NltkTools.stem.
                tokens[i][j].append(norm[2])  # lemma of the surface form
                tokens[i][j].append(hard[2])  # lemma of the decapitalized form
class NltkToolsTokenizer(SentenceTokenizerWrapper, WordTokenizerWrapper):
    """Wraps the NltkTools sentence and word tokenizer.

    The only parameter used is
    - abbrevs: a file that lists abbreviations and other problematic
      tokens that, because they include punctuation marks, can be
      mistaken for a sentence ending. Optional.
    """
    def __init__(self, params):
        SentenceTokenizerWrapper.__init__(self, params)
        WordTokenizerWrapper.__init__(self, params)
        self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)

    def sen_tokenize(self, raw):
        """@note Does not use the abbrev_set."""
        sentences = self.nt.sen_tokenize(raw)
        self._join_sentences(sentences)
        return sentences

    def word_tokenize(self, sen):
        tokens = self.nt.wordTokenizer.tokenize(sen)
        if not tokens:
            return []
        # Punctuation handling: strip quotes and wiki markup; a single
        # token may be replaced by several cleaned tokens.
        tokens = list(chain.from_iterable(
            remove_quot_and_wiki_crap_from_word(token) for token in tokens))
        last_idx, trailing = self.__get_last_token(tokens)
        match = self.nt.punktSplitter.match(tokens[last_idx])
        if match is not None and not self._is_abbrev(tokens[last_idx]):
            # Split sentence-final punctuation off the last real token,
            # then re-attach the trailing quote/garbage tokens.
            tokens = tokens[:last_idx] + list(match.groups()) + trailing
        return tokens

    def __get_last_token(self, tokens):
        # Step back over trailing quote/garbage tokens; returns the
        # (negative) index of the last content token and what follows it.
        last_idx = -1
        while len(tokens) > last_idx * -1 and is_quote_or_garbage(
                tokens[last_idx]):
            last_idx -= 1
        trailing = tokens[last_idx + 1:] if last_idx != -1 else []
        return last_idx, trailing
class NltkToolsTokenizer(SentenceTokenizerWrapper, WordTokenizerWrapper):
    """Wraps the NltkTools sentence and word tokenizer.

    The only parameter used is
    - abbrevs: a file that lists abbreviations and other problematic
      tokens that, because they include punctuation marks, can be
      mistaken for a sentence ending. Optional.
    """

    def __init__(self, params):
        SentenceTokenizerWrapper.__init__(self, params)
        WordTokenizerWrapper.__init__(self, params)
        self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)

    def sen_tokenize(self, raw):
        """@note Does not use the abbrev_set."""
        result = self.nt.sen_tokenize(raw)
        self._join_sentences(result)
        return result

    def word_tokenize(self, sen):
        words = self.nt.wordTokenizer.tokenize(sen)
        if not words:
            return []
        # Punctuation handling: cleaning may split one token into many.
        words = [clean
                 for token in words
                 for clean in remove_quot_and_wiki_crap_from_word(token)]
        last, tail = self.__get_last_token(words)
        m = self.nt.punktSplitter.match(words[last])
        if m is not None and not self._is_abbrev(words[last]):
            # Detach the sentence-final punctuation from the last real
            # token, then restore the trailing quotes/garbage after it.
            words = words[:last] + list(m.groups()) + tail
        return words

    def __get_last_token(self, tokens):
        # Walk backwards over quotes/garbage to find the last real token;
        # the returned index is negative (Python-style from-the-end).
        idx = -1
        while len(tokens) > -idx and is_quote_or_garbage(tokens[idx]):
            idx -= 1
        tail = tokens[idx + 1:] if idx != -1 else []
        return idx, tail
def __init__(self, params):
    # params is part of the common wrapper-constructor interface but is
    # not used here; only a stemmer-enabled NltkTools instance is needed.
    self.nt = NltkTools(stem=True)
def __init__(self, params):
    # Build a HunPos-backed tagger; the 'hunpos_model' parameter is
    # required (KeyError if missing).
    self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
    # Encoding of the model file; defaults to Latin-1.
    self.encoding = params.get('hunpos_encoding', 'iso-8859-1')
)
print(' considered to be titles, and will be processed accordingly.')
print(' -a: the output is appended to output_file, instead of overwriting it.')
sys.exit()

# Output goes either to the named file (append or overwrite) or stdout.
if 'o' in params:
    output_mode = 'a' if 'a' in params else 'w'
    out = FileWriter(params['o'], output_mode).open()
else:
    out = StreamWriter(sys.stdout)

nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
# Tag every regular file directly inside the input directory.
for infile in filter(os.path.isfile, [
    os.path.join(params['i'], infile)
    for infile in os.listdir(params['i'])
]):
    doc = FieldedDocument(infile)
    doc.fields = {}
    for field, raw_text in read_file(infile, True).iteritems():
        # Drop overlong sentences before tagging; report how much was cut.
        filtered = nt.filter_long_sentences(raw_text)
        diff = len(raw_text) - len(filtered)
        if diff > 0:
            sys.stderr.write("{0}: {1} bytes filtered.\n".format(
                infile, diff))
        if len(filtered) > 0:
            doc.fields[field] = nt.tag_raw(filtered)
    # Only emit documents that still have content after filtering.
    if len(doc.fields) > 0:
""" This script reads normal parsed Wikipedia pages in Conll-like format and transforms it to format needed by ndavid """ parser = OptionParser() parser.add_option("-m", "--model", dest="model", help="the hunpos model file. Default is $HUNPOS/english.model", metavar="MODEL_FILE") parser.add_option("-e", "--encoding", dest="encoding", help="the encoding used by the hunpos model file. Default is utf-8", default='utf-8') options, args = parser.parse_args() from langtools.nltk.nltktools import NltkTools nt = NltkTools(tok=True, pos=True, stem=True, pos_model=options.model) pageSep = "%%#PAGE" actPage = None starter = False for line in sys.stdin: l = line.strip().decode("utf-8") if l.startswith(pageSep): if actPage is not None: print actPage = l.split(" ", 1)[1] starter = True print l.encode("utf-8").replace(" ", "\t", 1) print "%%#Field\tTitle" titleTokens = nt.word_tokenize(actPage)
def __init__(self, params):
    # Initialize both wrapper bases; each reads its own settings from
    # params. NOTE(review): self.abbrevs is presumably set up by one of
    # the base constructors — confirm.
    SentenceTokenizerWrapper.__init__(self, params)
    WordTokenizerWrapper.__init__(self, params)
    self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)
    raise ValueError('Input must be a directory of files.')
except ValueError as err:
    # Invalid command line: report the problem, print usage, and bail out.
    print('Error: {0}'.format(err))
    print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' +
           '[-a]').format(sys.argv[0]))
    print(' input_dir: the directory with the input text files.')
    print(' hunpos_model: the hunpos model file.')
    print(' output_file: the conll2 output file. If omitted, the result will')
    print(' be written to stdout.')
    # NOTE(review): 'hunpos_model' is listed twice in this usage text.
    print(' hunpos_model: the hunpos model file.')
    print(' -a: the output is appended to output_file, instead of overwriting it.')
    sys.exit()

# Output goes either to the named file (append or overwrite) or stdout.
if 'o' in params:
    output_mode = 'a' if 'a' in params else 'w'
    out = FileWriter(params['o'], output_mode).open()
else:
    out = StreamWriter(sys.stdout)

nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
# Recursively tag every file below the input directory.
for infile in (os.path.join(d, f)
               for d, _, fs in os.walk(params['i']) for f in fs):
    print "File " + infile
    doc = FieldedDocument(infile)
    doc.fields = {}
    for field, raw_text in read_file(infile).iteritems():
        doc.fields[field] = nt.tag_raw(raw_text)
    write_doc(doc, out)

if 'o' in params:
    out.close()
print(' hunpos_model: the hunpos model file.')
print(' output_file: the conll2 output file. If omitted, the result will')
print(' be written to stdout.')
# NOTE(review): 'hunpos_model' is listed twice in this usage text.
print(' hunpos_model: the hunpos model file.')
print(' -t: If specified, the first non-empty line of the the text files are')
print(' considered to be titles, and will be processed accordingly.')
print(' -a: the output is appended to output_file, instead of overwriting it.')
sys.exit()

# Output goes either to the named file (append or overwrite) or stdout.
if 'o' in params:
    output_mode = 'a' if 'a' in params else 'w'
    out = FileWriter(params['o'], output_mode).open()
else:
    out = StreamWriter(sys.stdout)

nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
# Tag every regular file directly inside the input directory (non-recursive).
for infile in filter(os.path.isfile,
                     [os.path.join(params['i'], infile)
                      for infile in os.listdir(params['i'])]):
    doc = FieldedDocument(infile)
    doc.fields = {}
    for field, raw_text in read_file(infile, True).iteritems():
        # Drop overlong sentences before tagging; report how much was cut.
        filtered = nt.filter_long_sentences(raw_text)
        diff = len(raw_text) - len(filtered)
        if diff > 0:
            sys.stderr.write("{0}: {1} bytes filtered.\n".format(infile, diff))
        if len(filtered) > 0:
            doc.fields[field] = nt.tag_raw(filtered)
    # Only emit documents that still have content after filtering.
    if len(doc.fields) > 0:
        write_doc(doc, out)

if 'o' in params: