def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    Falsy input (``None`` or an empty string) short-circuits to ``''``.
    Otherwise the parenthetical phrases are identified on the raw text, each
    parser stage is applied in sequence, and the output is finally passed
    through ``remove_stopwords`` and then ``lemmatize``.
    """
    if not text:
        return ''

    # Collected from the untouched input, before any stage rewrites it; the
    # result is handed to replace_acronyms as its counter.
    acronym_counts = identify_parenthetical_phrases()(text)

    # All stages are constructed up front and then run in this order, each
    # one consuming the previous stage's output.
    pipeline = (
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=acronym_counts, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    )

    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)

    return lemmatize(remove_stopwords(cleaned))
示例#2
0
def clean_text(text):
    """Spell out integer tokens, then run ``text`` through nlpre parsers.

    The text is split on single spaces; every token for which
    ``str.isdigit()`` is true is replaced by the output of inflect's
    ``number_to_words``. The re-joined text is then passed through
    ``token_replacement``, ``dedash``, ``separated_parenthesis`` and
    ``replace_acronyms`` (fed with the parenthetical phrases found in the
    re-joined text). Finally a trailing period is removed when the last
    input token did not end with one, and newlines become spaces.

    Args:
        text: The string to clean. Falsy values (``None``, ``''``) are
            accepted and yield ``''``.

    Returns:
        The cleaned string.
    """
    if not text:
        # Nothing to clean; also avoids indexing into an empty string below.
        return ''

    engine = None  # created on first use and reused for every numeric token
    tokens = []
    for token in text.split(' '):
        if token.isdigit():
            if engine is None:
                engine = inflect.engine()
            token = engine.number_to_words(token)
        tokens.append(token)
    text = ' '.join(tokens)

    # decaps_text() and titlecaps() were left disabled in the original
    # pipeline and are still not applied here.
    parsers = [
        nlpre.token_replacement(),
        nlpre.dedash(),
        nlpre.separated_parenthesis(),
        nlpre.replace_acronyms(nlpre.identify_parenthetical_phrases()(text)),
    ]
    for parser in parsers:
        text = parser(text)

    # Drop a final period that the last input token did not carry
    # (presumably appended by one of the parsers -- confirm against nlpre).
    # endswith() is safe when the text or the last token is empty, e.g. when
    # the input ends with a space.
    if text.endswith('.') and not tokens[-1].endswith('.'):
        text = text[:-1]

    return text.replace('\n', ' ')
示例#3
0
  def call(self, data):
    """Apply the enabled nlpre parsers to ``data`` and return one-line text.

    The parsers run in the order listed below, each receiving the previous
    parser's output; newlines in the final result are replaced by spaces.
    """
    # NOTE: this result is only needed by the replace_acronyms step, which
    # is currently disabled below, so the value is unused at the moment.
    abbreviations = identify_parenthetical_phrases()(data)

    pipeline = (
        dedash(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        # Disabled steps, kept for reference:
        #   titlecaps()  -- originally listed right after dedash()
        #   replace_acronyms(abbreviations, underscore=False)
        #   separated_parenthesis()
        #   replace_from_dictionary(prefix="MeSH_")
    )

    text = data
    for step in pipeline:
      text = step(text)

    return text.replace('\n', ' ')
示例#4
0
    def setup_class(cls):
        """Create the parser fixtures shared by the tests of this class.

        ``parser0`` is a ``titlecaps`` instance built with ``min_length=1``
        and ``parser1`` is a default ``dedash`` instance.
        """
        # dedash is imported locally; titlecaps is expected to be in scope
        # already (its import is not visible in this block).
        from nlpre import dedash

        # The right-hand side is evaluated left to right, so titlecaps is
        # still constructed before dedash.
        cls.parser0, cls.parser1 = titlecaps(min_length=1), dedash()
示例#5
0
from argparse import ArgumentParser
from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
    # Command-line entry point: clean the string passed via -t/--text with a
    # fixed sequence of nlpre parsers and write the result to stdout.

    # sys is needed for the final write and is not imported at the top of
    # this script, so bring it into scope here.
    import sys

    parser = ArgumentParser()
    parser.add_argument(
        "-t", "--text", dest="text", help="The text to clean", metavar="TEXT")
    args = parser.parse_args()
    # Fall back to an empty string when no text was supplied.
    data = args.text or ''

    # Parenthetical phrases are collected from the raw input and handed to
    # replace_acronyms below.
    ABBR = identify_parenthetical_phrases()(data)
    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        replace_acronyms(ABBR, underscore=False),
        separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]

    # Apply the parsers in order, each one consuming the previous output.
    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)

    # Emit the result on a single line, without a trailing newline.
    sys.stdout.write(cleansed.replace('\n', ' '))