def clean_text(text):
    """Run ``text`` through the cleaning pipeline and return the result.

    Falsy input (``None``, an empty string, ...) short-circuits to ``''``.
    Otherwise the parenthetical phrases found in the raw text are collected
    first and handed to ``replace_acronyms``; the parsers are then applied
    one after another, followed by stop-word removal and lemmatization.
    """
    if not text:
        return ''

    # Must be computed on the untouched input, before any parser rewrites it.
    phrase_counter = identify_parenthetical_phrases()(text)

    pipeline = (
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=phrase_counter, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    )

    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)

    return lemmatize(remove_stopwords(cleaned))
Exemplo n.º 2
0
  def call(self, data):
    """Apply the enabled cleaning stages to ``data``, then flatten newlines.

    Stages run in this order: dedash, separate_reference, unidecoder,
    token_replacement, url_replacement. Every newline in the result is
    replaced with a single space before returning.
    """
    # NOTE(review): the result is unused while replace_acronyms is disabled
    # below; the call is kept so behaviour stays exactly as before.
    ABBR = identify_parenthetical_phrases()(data)

    stages = (
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        # replace_acronyms(ABBR, underscore=False),
        # separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    )

    text = data
    for stage in stages:
      text = stage(text)

    flattened = text.replace('\n', ' ')
    return flattened
Exemplo n.º 3
0
 def setup_class(cls):
     # Build one unidecoder instance and store it on the class as `parser`,
     # so it is reachable through `cls.parser` / `self.parser`.
     # NOTE(review): presumably a test-framework class-level setup hook --
     # confirm against the enclosing class, which is not visible here.
     cls.parser = unidecoder()
Exemplo n.º 4
0
import csv
import os
import itertools

from utils.os_utils import mkdir, grab_files
from utils.parallel_utils import jobmap
import nlpre

import logging
import sys  # required by csv.field_size_limit(sys.maxsize) below

logger = logging.getLogger(__name__)

# Fix for pathological csv files: raise the csv module's per-field size
# limit so that very large fields do not abort parsing.
csv.field_size_limit(sys.maxsize)

# Module-wide counter yielding 0, 1, 2, ... on successive next() calls.
_ref_counter = itertools.count()

# Shared unidecoder instance, created once at import time.
parser_unicode = nlpre.unidecoder()


def map_to_unicode(s):
    '''
    Convert input to a unicode text string.

    Byte strings are decoded as UTF-8, with undecodable bytes replaced by
    U+FFFD. Text strings are returned unchanged. Any other object is
    converted through the text type's constructor.

    Args:
        s: an input document (bytes, text, or any other object)

    Returns:
        s: a copy of the input as a unicode text string
    '''
    # type(u'') is `str` on Python 3 and `unicode` on Python 2, which keeps
    # this helper usable on both without extra imports.
    text_type = type(u'')

    # Only byte strings have anything to decode; calling .decode on a text
    # string fails on Python 3 because `str` has no such method.
    if isinstance(s, (bytes, bytearray)):
        return s.decode('utf-8', errors='replace')
    if isinstance(s, text_type):
        return s
    return text_type(s)
Exemplo n.º 5
0
from argparse import ArgumentParser
from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
  # Command-line entry point: clean the text given with -t/--text and write
  # the result to stdout on a single line.
  # `sys` is needed for the final stdout write and is not imported at the
  # top of this script, so it is imported here.
  import sys

  parser = ArgumentParser()
  parser.add_argument(
      "-t", "--text", dest="text", help="The text to clean", metavar="TEXT")
  args = parser.parse_args()
  # Fall back to an empty string when no text was supplied.
  data = args.text or ''

  # Collected from the raw input, before any parser rewrites it, and passed
  # to replace_acronyms below.
  ABBR = identify_parenthetical_phrases()(data)
  parsers = [
      dedash(),
      # titlecaps(),
      separate_reference(),
      unidecoder(),
      token_replacement(),
      url_replacement(),
      replace_acronyms(ABBR, underscore=False),
      separated_parenthesis(),
      # replace_from_dictionary(prefix="MeSH_")
  ]

  # Apply the parsers in list order, each one consuming the previous output.
  cleansed = data
  for f in parsers:
    cleansed = f(cleansed)

  # Flatten newlines so the cleaned text is emitted as one line.
  sys.stdout.write(cleansed.replace('\n', ' '))