from nlpre import (dedash, titlecaps, decaps_text, unidecoder,
                   separate_reference, url_replacement, replace_acronyms,
                   pos_tokenizer, token_replacement, replace_from_dictionary,
                   identify_parenthetical_phrases)


def clean_text(text):
    if not text:
        return ''

    # Count abbreviation definitions, e.g. "systemic lupus erythematosus (SLE)"
    abbreviations = identify_parenthetical_phrases()(text)
    parsers = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),  # pre/post POS blacklists are defined elsewhere in the source project
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist)
    ]

    for parser in parsers:
        text = parser(text)

    # remove_stopwords() and lemmatize() are helpers from the source project,
    # not part of NLPre
    text = remove_stopwords(text)
    text = lemmatize(text)

    return text
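For context on the abbreviations counter built above: identify_parenthetical_phrases() returns a collections.Counter keyed by (word-tuple, abbreviation) pairs, the same key shape that Example #6 builds by hand. A minimal sketch (the example sentence is illustrative):

from nlpre import identify_parenthetical_phrases

parser = identify_parenthetical_phrases()
counter = parser("Patients with systemic lupus erythematosus (SLE) were enrolled.")
# counter is a collections.Counter, e.g.
# Counter({(('systemic', 'lupus', 'erythematosus'), 'SLE'): 1})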
Example #2
  def call(self, data):
    ABBR = identify_parenthetical_phrases()(data)  # unused while replace_acronyms stays commented out below
    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        # replace_acronyms(ABBR, underscore=False),
        # separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]

    cleansed = data
    for f in parsers:
      cleansed = f(cleansed)

    return cleansed.replace('\n', ' ')
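dedash() runs early in these chains because it repairs words hyphenated across line breaks before any further tokenization. A minimal illustration, with the expected output hedged as a comment (exact whitespace handling may vary by NLPre version):

from nlpre import dedash

text = "How is the treat-\nment going?"
print(dedash()(text))  # expected, roughly: "How is the treatment going?"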
Example #3
import inflect
import nlpre


def clean_text(text):
    # Spell out standalone digits, e.g. "42" -> "forty-two"
    p = inflect.engine()
    no_digits = []
    for s in text.split(' '):
        if s.isdigit():
            no_digits.append(p.number_to_words(s))
        else:
            no_digits.append(s)
    text = ' '.join(no_digits)
    # nlpre.decaps_text() and nlpre.titlecaps() could be added to this chain as well
    for f in [
            nlpre.token_replacement(),
            nlpre.dedash(),
            nlpre.separated_parenthesis(),
            nlpre.replace_acronyms(
                nlpre.identify_parenthetical_phrases()(text))
    ]:
        text = f(text)
    # Strip a trailing period the parsers may have added if the input had none;
    # the emptiness checks guard against IndexError on short inputs
    if text and text[-1] == '.' and no_digits[-1] and no_digits[-1][-1] != '.':
        text = text[:-1]
    text = text.replace('\n', ' ')
    return text
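As a quick check of the digit handling above, inflect spells out numbers like so (number_to_words is part of inflect's public API):

import inflect

p = inflect.engine()
print(p.number_to_words("42"))  # "forty-two"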
Example #4
import utils.db_utils as db_utils
from utils.os_utils import grab_files, mkdir

import pandas as pd
import collections
import nlpre
import os

import logging

logger = logging.getLogger(__name__)

# NLPre is too noisy at the info level
logging.getLogger("nlpre").setLevel(logging.WARNING)

parser_parenthetical = nlpre.identify_parenthetical_phrases()


def phrases_from_config(config):
    """
    Identify parenthetical phrases in the documents as they are being
    imported to the pipeline.

    import_data_from_config() and phrases_from_config() are the entry
    points for this step of the pipeline.

    Args:
        config: a config file
    """
Example #5
def setup_class(cls):
    # pytest class-level setup: share one parser instance across the tests
    cls.parser = identify_parenthetical_phrases()
Example #6
import nlpre

keys = [
    'identify_parenthetical_phrases',
    'pos_tokenizer',
    'replace_acronyms',
    'separated_parenthesis',
    'titlecaps',
    'token_replacement',
    'replace_from_dictionary',
]


POS_Blacklist = ["connector","cardinal",
                 "pronoun","adverb",
                 "symbol","verb",
                 "punctuation","modal_verb","w_word"]

# doc2 is a sample document defined elsewhere in the benchmark script
ABR = nlpre.identify_parenthetical_phrases()(doc2)

# Stress-test replace_acronyms by padding the counter with 50,000 synthetic
# variants of a single key
key0 = (('systemic', 'lupus', 'erythematosus'), 'SLE')
for n in range(50000):
    ABR[(key0[0], key0[1] + str(n))] += 1

n = 50
data = []
for key in keys:
    if key == 'pos_tokenizer':
        parser = nlpre.pos_tokenizer(POS_Blacklist)
    elif key == 'replace_acronyms':
        parser = nlpre.replace_acronyms(ABR)
    else:
        parser = getattr(nlpre, key)()

    if key == 'unidecoder':
Example #7
import logging
# logging.basicConfig(level=logging.INFO)
from argparse import ArgumentParser
from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
  parser = ArgumentParser()
  parser.add_argument(
      "-t", "--text", dest="text", help="The text to clean", metavar="TEXT")
  args = parser.parse_args()
  data = args.text or ''

  ABBR = identify_parenthetical_phrases()(data)
  parsers = [
      dedash(),
      # titlecaps(),
      separate_reference(),
      unidecoder(),
      token_replacement(),
      url_replacement(),
      replace_acronyms(ABBR, underscore=False),
      separated_parenthesis(),
      # replace_from_dictionary(prefix="MeSH_")
  ]

  cleansed = data
  for f in parsers:
    cleansed = f(cleansed)
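The listing stops before the cleaned text is used. A plausible final line, mirroring the return in Example #2 (this line and the script name clean.py are assumptions, not part of the source):

  print(cleansed.replace('\n', ' '))

A sample invocation via the -t/--text flag defined above:

  python clean.py -t "The treat- ment of systemic lupus erythematosus (SLE) improved."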