def clean_text(text):
    """Run *text* through the full nlpre cleaning pipeline.

    Returns '' immediately for falsy input. Otherwise, abbreviations are
    collected first (so acronym replacement can use them), then each
    pipeline stage is applied in order and the transformed text returned.
    """
    if not text:
        return ''

    # Acronym/abbreviation counter must be built from the raw text
    # before any stage rewrites it.
    abbreviations = identify_parenthetical_phrases()(text)

    # Ordered stages; the final two are plain callables rather than
    # instantiated parser objects, but every stage is applied the same
    # way: text = stage(text).
    pipeline = (
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
        remove_stopwords,
        lemmatize,
    )
    for stage in pipeline:
        text = stage(text)
    return text
def setup_class(cls):
    """Build one shared pos_tokenizer for the whole test class.

    The blacklist names the POS categories this suite expects the
    tokenizer to strip from its input.
    """
    blacklist = {
        "connector",
        "cardinal",
        "pronoun",
        "adverb",
        "symbol",
        "verb",
        "punctuation",
        "modal_verb",
        "w_word",
    }
    cls.parser = pos_tokenizer(blacklist)
def setup_class(cls):
    """Create the class-level parser used by every test in this suite.

    This variant blacklists adjectives, possessives, and unknown tags
    (instead of modal verbs and w-words) in addition to the common set.
    """
    excluded_tags = {
        "connector", "cardinal", "pronoun", "adverb", "symbol",
        "verb", "adjective", "punctuation", "possessive", "unknown",
    }
    cls.parser = pos_tokenizer(excluded_tags)
def cardinal_word_test(self):
    """Cardinal numbers ('two') should be dropped by the tokenizer."""
    text = "There are two phases."
    expected = "there be phase ."
    result = pos_tokenizer(["cardinal"])(text)
    assert_equal(expected, result)
def possesive_word_test(self):
    """Possessive markers ('s) should be stripped, keeping the noun."""
    text = "I am Jack's complete lack of surprise"
    expected = "i be Jack complete lack of surprise"
    result = pos_tokenizer(["possessive"])(text)
    assert_equal(expected, result)
def implied_verb_test(self):
    """A nonsense word used in a verb position should still be removed.

    'snarfed' is not a real word, but its sentence position implies a
    verb, so the verb blacklist should drop it.
    """
    text = "The boy snarfed the ball into the yard"
    expected = "the boy the ball into the yard"
    result = pos_tokenizer(["verb"])(text)
    assert_equal(expected, result)
def symbol_test(self):
    """Symbols ('#') should be removed while digits are kept."""
    text = "I am #1."
    expected = "i be 1 ."
    result = pos_tokenizer(["symbol"])(text)
    assert_equal(expected, result)
# Benchmark each nlpre parser over doc2, timing `n` repetitions apiece.
# Fixes vs. the original: Python 2 `print item` -> print(item), and the
# removed `unicode` builtin -> str (on Python 3 all str is unicode, so
# the unidecoder case only needs a plain str coercion).
POS_Blacklist = ["connector", "cardinal",
                 "pronoun", "adverb",
                 "symbol", "verb",
                 "punctuation", "modal_verb", "w_word"]

# Build an abbreviation counter, then inflate it with 50k synthetic
# variants of one key to stress replace_acronyms.
ABR = nlpre.identify_parenthetical_phrases()(doc2)
key0 = (('systemic', 'lupus', 'erythematosus'), 'SLE')
for i in range(50000):  # renamed from `n` to avoid clobbering the rep count below
    ABR[(key0[0], key0[1] + str(i))] += 1

n = 50  # timing repetitions per parser
data = []
for key in keys:
    # Some parsers need special constructor arguments.
    if key == 'pos_tokenizer':
        parser = nlpre.pos_tokenizer(POS_Blacklist)
    elif key == "replace_acronyms":
        parser = nlpre.replace_acronyms(ABR)
    else:
        parser = getattr(nlpre, key)()

    if key == 'unidecoder':
        # unidecoder expects unicode input; str() is the Py3 equivalent
        # of the old unicode() coercion.
        func = lambda: [parser(str(x)) for x in [doc2]]
    else:
        func = lambda: [parser(x) for x in [doc2]]

    cost = timeit.timeit(func, number=n) / n
    item = {'function': key, "time": cost}
    print(item)
    data.append(item)

df = pd.DataFrame(data)
df = df.set_index('function').sort_values('time')
def w_word_test(self):
    """W-words ('that') should be removed from the lemmatized output."""
    text = "Transcriptions that are observed."
    expected = "Transcription be observe ."
    result = pos_tokenizer(["w_word"])(text)
    assert_equal(expected, result.text)
def possesive_word_test(self):
    """Pronouns ('I') should be dropped when 'pronoun' is blacklisted."""
    text = "I am Jack's complete lack of surprise"
    expected = "be jack complete lack of surprise"
    result = pos_tokenizer(["pronoun"])(text)
    assert_equal(expected, result.text)
def symbol_test(self):
    """Symbols ('#') should be removed; here the leading 'I' is kept."""
    text = "I am #1."
    expected = "I be 1 ."
    result = pos_tokenizer(["symbol"])(text)
    assert_equal(expected, result.text)
def quoted_word_test(self):
    """Quotation marks should be stripped while the quoted words remain."""
    text = 'We find the answer is "not quite".'
    expected = "We find the answer be not quite ."
    result = pos_tokenizer(["quote"])(text)
    assert_equal(expected, result.text)